In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In this notebook, I apply two approaches to feature engineering on a toy COVID dataset: a tedious, manual approach and an efficient approach that does the same work using sklearn's ColumnTransformer. Cells 2 to 22 show the 
manual approach, and a single cell (cell 23) is all that is needed for the ColumnTransformer, so we can see the difference.

In [2]:
# Location of the toy COVID dataset, kept in one named constant so the notebook can be
# pointed at another copy of the file without hunting through the code.
# NOTE(review): this is an absolute, machine-specific Windows path; a path relative to the
# project would let the notebook run unchanged on other machines.
DATA_PATH = 'D:/datasets/covidtoy.csv'

# Load the raw data and preview the first three rows.
a = pd.read_csv(DATA_PATH)
a.head(3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [3]:
# Dimensions of the raw dataset as (rows, columns).
a.shape

(100, 6)

In [4]:
# Column dtypes and non-null counts; a non-null count below the row count flags missing values.
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [5]:
# Number of missing values in each column.
a.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [6]:
# 10% of the data is missing in the fever column. As a rule of thumb, mean/median imputation
# should be avoided when more than 5% of the data is missing, because it distorts the
# distribution of the data. The mean is still used here for simplicity; KNNImputer or
# IterativeImputer would be a better choice.
#
# Percentage of missing fever values = 100 * (number missing) / (number of rows).
# Dividing the missing count by 100 and multiplying by the row count would only give the
# percentage for a dataset of exactly 100 rows.
100 * a['fever'].isnull().sum() / a.shape[0]

10.0

In [7]:
# Mean imputation for the fever column: learn the column mean first, then use it to
# fill the missing entries. Rows 3-7 are displayed as a spot check.
si = SimpleImputer(strategy='mean')
si.fit(a[['fever']])
transformed_fever = si.transform(a[['fever']])
transformed_fever[3:8]

array([[ 98.        ],
       [101.        ],
       [100.84444444],
       [101.        ],
       [100.84444444]])

The missing values have been replaced with the mean of the whole fever column.

In [8]:
# Frequency of each cough category, to see which labels need encoding.
a['cough'].value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [9]:
# Ordinal encoding for the cough column. The category order is passed explicitly so that
# 'Mild' is encoded as 0 and 'Strong' as 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])

In [10]:
# Fit the encoder on the cough column; double brackets select a one-column DataFrame.
oe.fit(a[['cough']])

OrdinalEncoder(categories=[['Mild', 'Strong']])

In [11]:
# Encode the cough column with the fitted encoder and peek at the first seven rows.
transformed_cough = oe.transform(a[['cough']])
transformed_cough[:7]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [12]:
# Original labels for the same seven rows, for comparison with the encoded values:
# 0 stands for Mild and 1 for Strong.
a['cough'].head(7)

0      Mild
1      Mild
2      Mild
3      Mild
4      Mild
5      Mild
6    Strong
Name: cough, dtype: object

In [13]:
# One-hot encoder for the nominal gender and city columns.
#   drop='first'   -> drop the first category of every feature (k categories give k-1 columns)
#   dense output   -> return a NumPy array rather than a SciPy sparse matrix
#   dtype=np.int32 -> emit the 0/1 indicators as 32-bit integers
# The dense-output flag was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and
# the old name was removed in 1.4, so the new keyword is tried first with a fallback for
# older installations.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)

In [14]:
# Frequency of each gender category.
a['gender'].value_counts()

Female    59
Male      41
Name: gender, dtype: int64

In [15]:
# Frequency of each city category.
a['city'].value_counts()

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [16]:
# Learn the categories of both columns, encode them, and show the first six encoded rows.
ohe.fit(a[['gender','city']])
transformed_gender_city = ohe.transform(a[['gender','city']])
transformed_gender_city[:6]

array([[1, 0, 1, 0],
       [1, 1, 0, 0],
       [1, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 0]])

OHE is applied to two columns at once. With full one-hot encoding, gender would become Female = [1,0] and Male = [0,1], 
whereas for the column 'city' the categories are sorted in ascending order, giving Bangalore = [1 0 0 0], 
Delhi = [0 1 0 0], Kolkata = [0 0 1 0], Mumbai = [0 0 0 1]. Adding drop='first' removes the first category of each column to avoid the dummy variable trap, AKA 
multicollinearity. So in the output of cell 16, the first index is for gender (1 means Male, 0 means Female) and the
remaining 3 indices are for the city column: three 0s mean Bangalore, [1 0 0] == Delhi, [0 1 0] == Kolkata and [0 0 1] == Mumbai.

In [17]:
# Original gender and city values for the same six rows, for comparison with the encoded array.
a[['gender','city']].head(6)

Unnamed: 0,gender,city
0,Male,Kolkata
1,Male,Delhi
2,Male,Delhi
3,Female,Kolkata
4,Female,Mumbai
5,Female,Bangalore


In [18]:
# Encode the target column as integers: learn the labels first, then map them,
# and display the first three encoded values.
le = LabelEncoder()
le.fit(a['has_covid'])
transformed_output = le.transform(a['has_covid'])
transformed_output[:3]

array([0, 1, 0])

In [19]:
# Original target labels for the same three rows: 0 stands for No and 1 for Yes.
a['has_covid'].head(3)

0     No
1    Yes
2     No
Name: has_covid, dtype: object

In [20]:
# Assemble the final feature matrix column-wise: the untouched age column, the one-hot
# gender/city block, the imputed fever column and the encoded cough column.
# Takeaway: np.concatenate is used because the transformed pieces are NumPy arrays;
# pd.concat only accepts Series or DataFrame objects.
finaldata = np.concatenate(
    (a[['age']], transformed_gender_city, transformed_fever, transformed_cough),
    axis=1,
)

In [21]:
# First five rows of the assembled feature matrix.
finaldata[:5]

array([[ 60.,   1.,   0.,   1.,   0., 103.,   0.],
       [ 27.,   1.,   1.,   0.,   0., 100.,   0.],
       [ 42.,   1.,   1.,   0.,   0., 101.,   0.],
       [ 31.,   0.,   0.,   1.,   0.,  98.,   0.],
       [ 65.,   0.,   0.,   0.,   1., 101.,   0.]])

In [22]:
# Report both shapes: the engineered feature matrix first, then the raw DataFrame.
print(f'The dataset after and before feature engineering respectively: {finaldata.shape}, {a.shape}')

The dataset after and before feature engineering respectively: (100, 7), (100, 6)


# Using ColumnTransformer

In [23]:
# Dense one-hot encoder for gender and city. The dense-output flag was renamed from `sparse`
# to `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4, so the new
# keyword is tried first with a fallback for older installations.
try:
    gender_city_ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    gender_city_ohe = OneHotEncoder(drop='first', sparse=False)

# One object that applies every preprocessing step to its own column(s):
#   tf1 - fill missing fever values with the column mean
#   tf2 - ordinal-encode cough (Mild -> 0, Strong -> 1)
#   tf3 - one-hot encode gender and city, dropping the first category of each
# remainder='passthrough' keeps the columns not listed above instead of discarding them.
transformer = ColumnTransformer(transformers=[
    ('tf1', SimpleImputer(strategy='mean'), ['fever']),
    ('tf2', OrdinalEncoder(categories=[['Mild','Strong']]), ['cough']),
    ('tf3', gender_city_ohe, ['gender','city'])
], remainder='passthrough')

A disadvantage of using a ColumnTransformer is that it returns the output as an array, which can make it hard to tell which output
column corresponds to which feature. <br> <br>
Another thing I found out is that the ColumnTransformer does not work with LabelEncoder(). I tried to use LabelEncoder inside the ColumnTransformer, but it threw an error saying "takes two arguments 3 given." LabelEncoder is meant for the target column and its fit_transform accepts only y, whereas the ColumnTransformer calls fit_transform(X, y). 
https://github.com/scikit-learn/scikit-learn/issues/12720

In [24]:
# Separate the predictors from the target column, then run the whole preprocessing
# in a single call and check the resulting shape.
features = a.drop(columns='has_covid')
transformer.fit_transform(features).shape

(100, 7)

We can compare the number of lines of code and the time taken to implement feature engineering with the two approaches.