In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the toy COVID dataset (path is relative to the working directory) and preview the first rows.
df = pd.read_csv(filepath_or_buffer="covid_toy.csv")
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [18]:
# How often each distinct value of the "cough" column occurs.
df.loc[:, "cough"].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [19]:
# Number of missing values in each column (sum of the boolean NaN mask down the rows).
df.isna().sum(axis=0)

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

**As you can see here:**

1. age is a numerical value -> we leave it as is  
2. gender and city are nominal categorical values -> OneHotEncoding  
3. cough is an ordinal categorical value -> OrdinalEncoding  
4. fever column has 10 missing values -> SimpleImputer  
5. has_covid is a binary (Yes/No) categorical value -> OneHotEncoding

It would be a real pain if we had to follow the traditional approach to encode this data.  
For example:  
1. Separate the "fever" column and apply SimpleImputer to fill in the missing values  
2. Separate the "gender" and "city" columns and apply OneHotEncoding to handle the nominal values  
3. Separate the "cough" column and apply OrdinalEncoding to handle the ordinal values  
4. Merge all the columns back together
   
This looks easy, but as the number of columns grows it quickly becomes unmanageable.  
What to do instead? Use ColumnTransformer!

In [20]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [21]:
from sklearn.compose import ColumnTransformer

In [24]:
# Column-wise preprocessing. Each entry is (step name, estimator, columns it is applied to);
# the transformed columns come out in the order the steps are listed.
transformer = ColumnTransformer(
    [
        # Fill the missing "fever" values (SimpleImputer's default strategy is the column mean).
        ("tnf1", SimpleImputer(), ["fever"]),
        # "cough" is ordinal: the explicit category order maps Mild -> 0 and Strong -> 1.
        (
            "tnf2",
            OrdinalEncoder(categories=[["Mild", "Strong"]]),
            ["cough"],
        ),
        # Categorical columns: one indicator column per category, with the first category of
        # each column dropped; sparse_output=False yields a dense array.
        (
            "tnf3",
            OneHotEncoder(sparse_output=False, drop="first"),
            ["gender", "city", "has_covid"],
        ),
    ],
    # "passthrough" keeps the columns no step touches (here "age") and appends them after
    # the transformed ones; "drop" would discard them instead.
    remainder="passthrough",
)

In [25]:
# Fit every step of the transformer on df and return the transformed data in one call.
df_encoded = transformer.fit_transform(X=df)

In [28]:
# (rows, columns) of the encoded output.
np.shape(df_encoded)

(100, 8)