In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the toy COVID dataset into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../data/covid_toy.csv')

In [3]:
# Preview the first five rows to see the columns and their value types.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Count the missing values in each column.
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

Operations we need to perform
1. Fill missing values (`fever`) -> SimpleImputer
2. Ordinal encoding (`cough`) -> OrdinalEncoder
3. Nominal (one-hot) encoding (`gender`, `city`) -> OneHotEncoder

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [6]:
# List the distinct cough categories and how often each occurs;
# these are the labels the ordinal encoder has to order.
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [15]:
# Manual preprocessing: each column group gets its own transformer,
# fitted and applied separately.

# fever: fill the missing readings (SimpleImputer's default strategy is the mean).
si = SimpleImputer()
fever = si.fit_transform(df.loc[:, ['fever']])

# gender, city: one-hot encode, dropping the first category of each feature.
# The result is a sparse matrix, so it needs .toarray() before use with NumPy.
ohe = OneHotEncoder(drop='first')
gender_city = ohe.fit_transform(df.loc[:, ['gender', 'city']])

# cough: ordinal encode with the explicit order Mild < Strong.
oe = OrdinalEncoder(categories=[['Mild', 'Strong']])
cough = oe.fit_transform(df.loc[:, ['cough']])

In [30]:
# Slicing with a range (not a bare index) keeps the column two-dimensional.
gender_city[:, :1].shape

(100, 1)

In [46]:
# Dense copy of the first two one-hot columns of the sparse gender/city matrix.
# Taking both columns in one slice is equivalent to concatenating the two
# single-column slices along axis 1.
df_new = gender_city[:, :2].toarray()
# NOTE: stacking every feature by hand (age, fever, gender_city, cough, has_covid)
# would require each piece to be a dense 2-D array first; np.concatenate cannot
# mix 1-D Series with 2-D arrays.

In [7]:
from sklearn.compose import ColumnTransformer

In [14]:
# Each step is a (name, transformer, columns) triple and is applied only to
# the columns it lists.
# remainder='passthrough' keeps every column not named in a step unchanged
# instead of dropping it.
transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # Fill missing fever values.
        ('tnf1', SimpleImputer(), ['fever']),
        # One-hot encode the nominal columns: dense output, first category dropped.
        ('tnf2', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
        # Ordinal encode cough with the order Mild < Strong.
        ('tnf3', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ],
)

In [None]:
# Fit every step on df and return the combined array: the transformed columns
# come first, followed by the passed-through columns (age, has_covid).
# NOTE(review): this displays the whole array; consider slicing (e.g. [:5]) to keep the output short.
transformer.fit_transform(df)

array([[103.0, 1.0, 0.0, 1.0, 0.0, 0.0, 60, 'No'],
       [100.0, 1.0, 1.0, 0.0, 0.0, 0.0, 27, 'Yes'],
       [101.0, 1.0, 1.0, 0.0, 0.0, 0.0, 42, 'No'],
       [98.0, 0.0, 0.0, 1.0, 0.0, 0.0, 31, 'No'],
       [101.0, 0.0, 0.0, 0.0, 1.0, 0.0, 65, 'No'],
       [100.84444444444445, 0.0, 0.0, 0.0, 0.0, 0.0, 84, 'Yes'],
       [101.0, 1.0, 0.0, 0.0, 0.0, 1.0, 14, 'No'],
       [100.84444444444445, 0.0, 0.0, 0.0, 1.0, 1.0, 20, 'Yes'],
       [100.0, 0.0, 0.0, 0.0, 0.0, 1.0, 19, 'No'],
       [101.0, 0.0, 1.0, 0.0, 0.0, 0.0, 64, 'No'],
       [100.84444444444445, 0.0, 1.0, 0.0, 0.0, 0.0, 75, 'No'],
       [98.0, 0.0, 0.0, 0.0, 1.0, 0.0, 65, 'Yes'],
       [99.0, 0.0, 0.0, 1.0, 0.0, 1.0, 25, 'No'],
       [102.0, 1.0, 0.0, 0.0, 0.0, 0.0, 64, 'Yes'],
       [104.0, 1.0, 0.0, 0.0, 0.0, 0.0, 51, 'No'],
       [103.0, 1.0, 0.0, 1.0, 0.0, 1.0, 70, 'Yes'],
       [103.0, 0.0, 0.0, 1.0, 0.0, 0.0, 69, 'Yes'],
       [98.0, 0.0, 1.0, 0.0, 0.0, 1.0, 40, 'No'],
       [98.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6