# Column Transformer

In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("covid_toy.csv")
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [4]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

## TODO
<pre>
    1. Impute the Fever
    2. Ordinal Encode the: cough
    3. One Hot Encode the: gender, city
    4. Label Encode the  : has_covid

In [5]:
df['gender'].value_counts(),"---------", df['cough'].value_counts(),"----------", df['city'].value_counts(),"--------",df['has_covid'].value_counts()

(gender
 Female    59
 Male      41
 Name: count, dtype: int64,
 '---------',
 cough
 Mild      62
 Strong    38
 Name: count, dtype: int64,
 '----------',
 city
 Kolkata      32
 Bangalore    30
 Delhi        22
 Mumbai       16
 Name: count, dtype: int64,
 '--------',
 has_covid
 No     55
 Yes    45
 Name: count, dtype: int64)

In [6]:
# fever ---> simple imputer
si = SimpleImputer()
fever_si = si.fit_transform(df[['fever']])

In [7]:
# Ordinalencoding -> cough
oe = OrdinalEncoder(categories=[['Mild','Strong']])
cough_oe = oe.fit_transform(df[['cough']])

In [8]:
# gender, city ---> ohe
ohe = OneHotEncoder()
gender_ohe = ohe.fit_transform(df[['gender']])
city_ohe = ohe.fit_transform(df[['city']])


In [9]:
lb = LabelEncoder()
has_covid_lb = lb.fit_transform(df[['has_covid']].values.ravel())
# .values.ravel() is option, but a good practice

In [10]:
# age           gender        fever        cough         city          has_covid     
# concatenate the numpy arrays
X_tranformed = df[['age']].values+gender_ohe+ fever_si+ cough_oe+ city_ohe+has_covid_lb 

ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (100,2)  and requested shape (100,4)

In [27]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(drop='first'),['gender','city'])
],remainder='passthrough')

In [28]:
df.columns

Index(['age', 'gender', 'fever', 'cough', 'city', 'has_covid'], dtype='object')

How to determine the columns?

In [29]:
df_transformed = transformer.fit_transform(df)
df_transformed
# col1: fever, col2: cough, col3: gender, col4,col5,col6:city, col7:age, col8:has_covid
# there are 2 genders and 4 cities; 2 gender cols and 4 city cols; drop first: 1 gender col and 3 city cols;
# transformer data is organized according to the transformer objects

array([[103.0, 0.0, 1.0, 0.0, 1.0, 0.0, 60, 'No'],
       [100.0, 0.0, 1.0, 1.0, 0.0, 0.0, 27, 'Yes'],
       [101.0, 0.0, 1.0, 1.0, 0.0, 0.0, 42, 'No'],
       [98.0, 0.0, 0.0, 0.0, 1.0, 0.0, 31, 'No'],
       [101.0, 0.0, 0.0, 0.0, 0.0, 1.0, 65, 'No'],
       [100.84444444444445, 0.0, 0.0, 0.0, 0.0, 0.0, 84, 'Yes'],
       [101.0, 1.0, 1.0, 0.0, 0.0, 0.0, 14, 'No'],
       [100.84444444444445, 1.0, 0.0, 0.0, 0.0, 1.0, 20, 'Yes'],
       [100.0, 1.0, 0.0, 0.0, 0.0, 0.0, 19, 'No'],
       [101.0, 0.0, 0.0, 1.0, 0.0, 0.0, 64, 'No'],
       [100.84444444444445, 0.0, 0.0, 1.0, 0.0, 0.0, 75, 'No'],
       [98.0, 0.0, 0.0, 0.0, 0.0, 1.0, 65, 'Yes'],
       [99.0, 1.0, 0.0, 0.0, 1.0, 0.0, 25, 'No'],
       [102.0, 0.0, 1.0, 0.0, 0.0, 0.0, 64, 'Yes'],
       [104.0, 0.0, 1.0, 0.0, 0.0, 0.0, 51, 'No'],
       [103.0, 1.0, 1.0, 0.0, 1.0, 0.0, 70, 'Yes'],
       [103.0, 0.0, 0.0, 0.0, 1.0, 0.0, 69, 'Yes'],
       [98.0, 1.0, 0.0, 1.0, 0.0, 0.0, 40, 'No'],
       [98.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6

In [35]:
# Index(['age', 'gender', 'fever', 'cough', 'city', 'has_covid'], dtype='object')

# df_transformed: col1: fever, col2: cough, col3: gender, col4,col5,col6:city, col7:age, col8:has_covid

df_transformed_df = pd.DataFrame(df_transformed, columns=[['fever','cough','gender','city1','city2','city3','age','has_covid']])
df_transformed_df

Unnamed: 0,fever,cough,gender,city1,city2,city3,age,has_covid
0,103.0,0.0,1.0,0.0,1.0,0.0,60,No
1,100.0,0.0,1.0,1.0,0.0,0.0,27,Yes
2,101.0,0.0,1.0,1.0,0.0,0.0,42,No
3,98.0,0.0,0.0,0.0,1.0,0.0,31,No
4,101.0,0.0,0.0,0.0,0.0,1.0,65,No
...,...,...,...,...,...,...,...,...
95,104.0,0.0,0.0,0.0,0.0,0.0,12,No
96,101.0,1.0,0.0,0.0,1.0,0.0,51,Yes
97,101.0,0.0,0.0,0.0,0.0,0.0,20,No
98,98.0,1.0,0.0,0.0,0.0,1.0,5,No
