COLUMN TRANSFORMER

In [48]:
import pandas as pd
import numpy as np


In [26]:
# Load the toy COVID dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact file exists.
df = pd.read_csv(
    filepath_or_buffer="C:\\Users\\Admin\\OneDrive\\Desktop\\DataScience\\DataSets\\covid_toy.csv"
)

In [27]:
df.sample(5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
34,74,Male,102.0,Mild,Mumbai,Yes
97,20,Female,101.0,Mild,Bangalore,No
45,72,Male,99.0,Mild,Bangalore,No
23,80,Female,98.0,Mild,Delhi,Yes
96,51,Female,101.0,Strong,Kolkata,Yes


In [28]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [29]:
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

# Column types: age (numerical), fever (numerical, float), gender (categorical, nominal), city (categorical, nominal), cough (categorical, ordinal). Target column (label encoding): has_covid (categorical, nominal).

# fever has missing values, so it is imputed with SimpleImputer

In [30]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [31]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [32]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [33]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every downstream output, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),  # features: every column except the target
    df['has_covid'],                 # target
    test_size=0.2,
    random_state=42,
)

In [34]:
X_train

Unnamed: 0,age,gender,fever,cough,city
36,38,Female,101.0,Mild,Bangalore
23,80,Female,98.0,Mild,Delhi
68,54,Female,104.0,Strong,Kolkata
74,34,Female,104.0,Strong,Delhi
5,84,Female,,Mild,Bangalore
...,...,...,...,...,...
83,17,Female,104.0,Mild,Kolkata
67,65,Male,99.0,Mild,Bangalore
3,31,Female,98.0,Mild,Kolkata
77,8,Female,101.0,Mild,Kolkata


In [35]:
# Imputer for the numeric `fever` column. 'mean' is SimpleImputer's default
# strategy; it is spelled out here so the choice is visible to the reader.
si = SimpleImputer(strategy='mean')

In [36]:
# Learn the fill value from the training rows only, then apply that same
# fitted imputer to both splits so no test-set statistics are used.
si.fit(X_train[['fever']])
X_train_fever = si.transform(X_train[['fever']])
X_test_fever = si.transform(X_test[['fever']])

In [37]:
# Display the imputed training column (a 2-D, single-column array). The
# repeated non-integer entries in the output are presumably the imputed mean
# that replaced the missing fever readings.
X_train_fever

array([[101.        ],
       [ 98.        ],
       [104.        ],
       [104.        ],
       [100.67605634],
       [102.        ],
       [102.        ],
       [101.        ],
       [101.        ],
       [ 98.        ],
       [ 99.        ],
       [100.        ],
       [100.67605634],
       [100.        ],
       [100.67605634],
       [101.        ],
       [100.        ],
       [103.        ],
       [103.        ],
       [104.        ],
       [ 98.        ],
       [ 99.        ],
       [ 98.        ],
       [ 98.        ],
       [ 99.        ],
       [101.        ],
       [100.        ],
       [100.        ],
       [100.67605634],
       [100.        ],
       [100.67605634],
       [101.        ],
       [100.        ],
       [104.        ],
       [ 99.        ],
       [102.        ],
       [100.67605634],
       [ 99.        ],
       [ 98.        ],
       [101.        ],
       [ 98.        ],
       [ 98.        ],
       [ 99.        ],
       [100

In [40]:
df.shape

(100, 6)

In [41]:
# Ordinal encoding -> cough, with the category order given explicitly
# (Mild is encoded as 0, Strong as 1).
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# Test data: reuse the encoder that was fitted on the training split.
# An encoder must never be refitted on test rows, so this is transform(),
# not fit_transform().
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [42]:
# One-hot encoding -> gender, city (the first category of each feature is
# dropped to avoid a redundant column).
ohe = OneHotEncoder(drop='first',sparse_output=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# Test data: transform with the encoder fitted on the training split.
# Refitting on the test rows would rebuild the category list from the test
# split alone, so a city missing from that split would change the number and
# order of output columns and misalign the test matrix with the training one.
X_test_gender_city = ohe.transform(X_test[['gender','city']])

X_train_gender_city.shape

(80, 4)

In [43]:
X_train_gender_city 

array([[0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 0.],
       [1., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [1., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],


In [44]:
X_train_gender_city.shape

(80, 4)

In [45]:
# Extract age: select the column by name instead of dropping every other
# column, so a column added to the data later cannot silently end up in this
# array. The double brackets keep the result 2-D (one column), which is the
# layout needed for the column-wise concatenation that follows.
X_train_age = X_train[['age']].to_numpy()

# Same selection for the test data
X_test_age = X_test[['age']].to_numpy()

X_train_age.shape

(80, 1)

In [49]:

# Stitch the separately transformed pieces back together column-wise.
# Resulting column order: age | fever | gender + city one-hot | cough.
X_train_transformed = np.hstack([
    X_train_age,
    X_train_fever,
    X_train_gender_city,
    X_train_cough,
])

# Same layout for the test data
X_test_transformed = np.hstack([
    X_test_age,
    X_test_fever,
    X_test_gender_city,
    X_test_cough,
])

X_train_transformed.shape

(80, 7)

# Using ColumnTransformer

In [51]:
from sklearn.compose import ColumnTransformer

In [53]:
# A single ColumnTransformer bundles the per-column preprocessing steps:
# each entry is (step name, transformer, columns it applies to).
# Columns not named in any step (here only `age`) are kept unchanged because
# remainder='passthrough'.
transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # Fill missing fever readings (SimpleImputer default strategy).
        ('tnf1', SimpleImputer(), ['fever']),
        # Encode cough with the explicit order Mild, Strong.
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # One-hot encode the nominal columns, dropping the first category of each.
        ('tnf3', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
    ],
)

In [54]:
# Fit all steps on the training split and transform it in one call; only the
# shape of the resulting matrix is displayed.
transformer.fit_transform(X_train).shape

(80, 7)

In [55]:
transformer

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [58]:
print(transformer)

ColumnTransformer(remainder='passthrough',
                  transformers=[('tnf1', SimpleImputer(), ['fever']),
                                ('tnf2',
                                 OrdinalEncoder(categories=[['Mild',
                                                             'Strong']]),
                                 ['cough']),
                                ('tnf3',
                                 OneHotEncoder(drop='first',
                                               sparse_output=False),
                                 ['gender', 'city'])])
