In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer         # To handle missing values
from sklearn.preprocessing import OneHotEncoder  # To perform OneHotEncoding
from sklearn.preprocessing import OrdinalEncoder # To perform Ordinal Encoding

In [2]:
# Read the toy COVID dataset into a DataFrame and preview five random rows.
csv_path = "../Datasets/Covid_Toy/covid_toy.csv"
df = pd.read_csv(csv_path)
df.sample(n = 5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
25,23,Male,,Mild,Mumbai,No
95,12,Female,104.0,Mild,Bangalore,No
8,19,Female,100.0,Strong,Bangalore,No
5,84,Female,,Mild,Bangalore,Yes
96,51,Female,101.0,Strong,Kolkata,Yes


In [3]:
# Number of missing values in each column (isna is the canonical name of isnull).
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

# Features are the first five columns (age, gender, fever, cough, city);
# the target is the last column (has_covid).
# random_state pins the shuffle so that a fresh "Restart & Run All" yields
# the same train/test rows (and therefore the same downstream outputs).
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :5],
    df.iloc[:, -1],
    test_size = 0.2,
    random_state = 42
)

X_train.head()

Unnamed: 0,age,gender,fever,cough,city
81,65,Male,99.0,Mild,Delhi
15,70,Male,103.0,Strong,Kolkata
19,42,Female,,Strong,Bangalore
21,73,Male,98.0,Mild,Bangalore
32,34,Female,101.0,Strong,Delhi


# Without Column Transformer

In [6]:
# Impute the missing values of the fever column with a default SimpleImputer.
# The imputer is fitted on the training split only and then applied,
# unchanged, to both splits.
si = SimpleImputer()
si.fit(X_train[["fever"]])

X_train_fever = si.transform(X_train[["fever"]])
X_test_fever  = si.transform(X_test[["fever"]])

X_train_fever.shape

(80, 1)

In [7]:
# Ordinal-encode the cough column using an explicit category order,
# so that "Mild" is ranked below "Strong".
cough_levels = ["Mild", "Strong"]
oe = OrdinalEncoder(categories = [cough_levels])
oe.fit(X_train[["cough"]])

X_train_cough = oe.transform(X_train[["cough"]])
X_test_cough  = oe.transform(X_test[["cough"]])

X_train_cough.shape

(80, 1)

In [8]:
# One-hot encode the nominal columns gender and city.
# drop = "first" omits the first category of each feature from the output.
# sparse_output = False returns a dense NumPy array; it replaces the old
# `sparse` keyword, which was deprecated in scikit-learn 1.2 and removed
# in 1.4 (passing `sparse=` now raises a TypeError).
ohe = OneHotEncoder(drop = "first", sparse_output = False)

X_train_gender_city = ohe.fit_transform(X_train[["gender", "city"]])
X_test_gender_city  = ohe.transform(X_test[["gender", "city"]])

X_train_gender_city.shape

(80, 4)

In [9]:
# age needs no preprocessing: select it directly as a 2-D (n_rows, 1) array.
# Selecting the column by name (instead of dropping every other column with a
# redundant `axis = 1`) keeps the result stable if more columns are added later.
X_train_age = X_train[["age"]].values
X_test_age  = X_test[["age"]].values

X_train_age.shape

(80, 1)

In [10]:
# Stitch the separately processed pieces back together column-wise,
# in the order: age, fever, gender/city dummies, cough.
train_parts = (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
test_parts  = (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)

X_train_transformed = np.concatenate(train_parts, axis = 1)
X_test_transformed  = np.concatenate(test_parts, axis = 1)

X_train_transformed.shape

(80, 7)

# With Column Transformer

In [11]:
from sklearn.compose import ColumnTransformer

# A single ColumnTransformer bundles all per-column preprocessing steps:
#   tnf1 - impute missing values in fever
#   tnf2 - ordinal-encode cough with the explicit order Mild < Strong
#   tnf3 - one-hot encode gender and city, omitting each feature's first category
# sparse_output = False returns a dense array; it replaces the old `sparse`
# keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4.
transformer = ColumnTransformer(
    transformers = [
        ("tnf1", SimpleImputer(), ["fever"]),
        ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        ("tnf3", OneHotEncoder(drop = "first", sparse_output = False), ["gender", "city"])
    ],
    remainder = "passthrough" # columns not listed above are passed through unchanged
)

In [14]:
# Fit every per-column step on the training split and transform it in one call,
# then apply the already-fitted transformer to the test split.
X_train_transformed = transformer.fit_transform(X = X_train)
X_test_transformed  = transformer.transform(X = X_test)

X_train_transformed.shape

(80, 7)