In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the toy COVID dataset (Colab sample_data directory).
DATA_PATH = "/content/sample_data/covid_toy.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# Peek at 4 random rows; no seed is set, so the rows shown differ on every run.
df.sample(n=4)

Unnamed: 0,age,gender,fever,cough,city,has_covid
52,47,Female,100.0,Strong,Bangalore,Yes
97,20,Female,101.0,Mild,Bangalore,No
24,13,Female,100.0,Strong,Kolkata,No
47,18,Female,104.0,Mild,Bangalore,No


In [6]:
# Count missing values per column (axis=0 sums down each column).
df.isnull().sum(axis=0)

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


## Manual transformation, one column at a time

In [7]:
# The fever column contains null values, so we have to impute them.
from sklearn.impute import SimpleImputer
# "mean" is SimpleImputer's default strategy; it is spelled out here for clarity.
si = SimpleImputer(strategy="mean")

In [10]:
# Before fitting the imputer we need to split X and y into train and test sets, because we fit on the training data only and then use those fitted statistics to transform every dataset (train and test).

from sklearn.model_selection import train_test_split

In [8]:
# Separate the features from the label column.
target_col = "has_covid"
X = df.drop(target_col, axis="columns")
y = df[target_col]

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Learn the fill value from the training rows only.
# fit() returns the imputer itself, so si_X_train_fever refers to the same object as si.
fever_train = X_train[['fever']]
si_X_train_fever = si.fit(fever_train)

In [14]:
# Fill missing fever values in both splits using the imputer fitted on the training split.
X_train_fever, X_test_fever = (
    si_X_train_fever.transform(split[['fever']]) for split in (X_train, X_test)
)

In [16]:
# The dataset has 3 categorical columns: gender, cough, and city, so we have to encode them. gender and city are nominal, so they get one-hot encoding; cough is ordinal (Mild < Strong), so it gets ordinal encoding.

In [17]:
# List the distinct cough levels so the ordinal encoder's category order can be set explicitly.
print(df.loc[:, 'cough'].unique())

['Mild' 'Strong']


In [19]:
# OHE => One Hot Encoding
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [21]:
# Step 1: fit the one-hot encoder on the training data only.
# drop='first' drops one dummy column per feature; sparse_output=False returns a dense array.
nominal_cols = ['gender', 'city']
ohe = OneHotEncoder(sparse_output=False, drop='first')

ohe_gender_city = ohe.fit(X_train[nominal_cols])

In [24]:
# Step 2: apply the encoder fitted on the training split to both splits.
X_train_gender_city = ohe_gender_city.transform(X_train[['gender', 'city']])
X_test_gender_city = ohe_gender_city.transform(X_test[['gender', 'city']])
# Backward-compatible alias: the original variable name was misspelled ("gerder")
# and later cells still reference it, so keep it pointing at the same array.
X_test_gerder_city = X_test_gender_city

In [25]:
# Ordinal encoder: categories are given in rank order, so 'Mild' -> 0 and 'Strong' -> 1.
cough_order = ['Mild', 'Strong']
ode = OrdinalEncoder(categories=[cough_order])

In [26]:
# Fit and transform the training split in one call, then reuse the fitted encoder on the test split.
cough_train, cough_test = X_train[['cough']], X_test[['cough']]
X_train_cough = ode.fit_transform(cough_train)
X_test_cough = ode.transform(cough_test)

In [27]:
# All the columns that needed a transform are now transformed, so we can concatenate them. Before that, we have to separate the age column from the X_train and X_test datasets.


In [29]:
# Everything that was transformed separately is dropped; what remains (age) is kept as a 2-D array.
already_transformed = ['gender', 'city', 'cough', 'fever']
X_train_age = X_train.drop(columns=already_transformed).values
X_test_age = X_test.drop(columns=already_transformed).values

In [30]:
# Stack the pieces side by side (column-wise): age, imputed fever, one-hot gender/city, ordinal cough.
train_parts = [X_train_age, X_train_fever, X_train_gender_city, X_train_cough]
new_X_train = np.hstack(train_parts)

test_parts = [X_test_age, X_test_fever, X_test_gerder_city, X_test_cough]
new_X_test = np.hstack(test_parts)

In [31]:
# Compare the training shape before and after the manual transforms.
(X_train.shape, new_X_train.shape)

((80, 5), (80, 7))

In [32]:
# Compare the test shape before and after the manual transforms.
(X_test.shape, new_X_test.shape)

((20, 5), (20, 7))

In [34]:
# Show the distinct categories seen in the training split for each one-hot encoded column.
for _col in ('city', 'gender'):
    print(X_train[_col].unique())

['Mumbai' 'Kolkata' 'Delhi' 'Bangalore']
['Female' 'Male']


In [49]:
# Why did the number of columns change?
# If every categorical column were fully one-hot encoded:
# => Gender = Male & Female => 2 columns
# => City = Mumbai, Kolkata, Delhi, Bangalore => 4 columns
# => Cough = Mild & Strong => 2 columns

## Total new columns = 2 + 4 + 2 = 8. We remove the 3 original categorical columns, so X_train goes from 5 columns to 2 (age, fever), and adding 8 would give 10.

## But we used drop='first' in the one-hot encoder to avoid multicollinearity, so gender gives 2 - 1 = 1 column and city gives 4 - 1 = 3 columns. Cough was ordinal encoded (Mild = 0, Strong = 1), so it stays a single column. That is 1 + 3 + 1 = 5 encoded columns, which is why the result has 2 + 5 = 7 columns.

## Using ColumnTransformer

In [39]:
from sklearn.compose import ColumnTransformer

In [46]:
# Declare every transformer, and the columns it applies to, in a single object.
# Output columns follow the order of this list (fever, gender/city dummies, cough),
# with the passthrough columns appended on the right -- a different order from the
# manual concatenation above, which put age first.
transform = ColumnTransformer(
    transformers=[
        ('trn1', SimpleImputer(), ['fever']),
        ('trn2', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
        ('trn3', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ],
    # remainder='passthrough' keeps the remaining columns unchanged instead of dropping them.
    remainder='passthrough',
)

In [47]:
# Fit every transformer on the training split only, then apply the fitted transformers to the test split.
new_X_train_transform = transform.fit_transform(X=X_train)
new_X_test_transform = transform.transform(X=X_test)

In [48]:
# Both splits should have the same number of columns as the manual approach produced.
(new_X_train_transform.shape, new_X_test_transform.shape)

((80, 7), (20, 7))