In [1]:
import numpy as np
import pandas as pd


In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [4]:
### load the toy covid dataset (path is relative to the notebook's working directory)
df = pd.read_csv("covid_toy.csv")

In [5]:
### preview the first rows to see the column names and value formats
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [6]:
### number of rows and columns
df.shape

(100, 6)

In [7]:
### count of missing values per column (decides which columns need imputation)
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [9]:
### frequency of each category in the gender column
df["gender"].value_counts()

Female    59
Male      41
Name: gender, dtype: int64

In [10]:
### frequency of each category in the cough column
df["cough"].value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [11]:
### frequency of each category in the city column
df["city"].value_counts()

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [13]:
### frequency of each value in the has_covid column
df["has_covid"].value_counts()

No     55
Yes    45
Name: has_covid, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split

### split features / target into train and test sets
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every shape or preview that depends on it) reproducible after a kernel
# restart; without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["has_covid"]),
    df["has_covid"],
    test_size=0.2,
    random_state=42,
)

In [16]:
### shapes of the four pieces produced by the split
X_train.shape, X_test.shape, y_train.shape,y_test.shape

((80, 5), (20, 5), (80,), (20,))

In [17]:
### preview the training features
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
44,20,Male,102.0,Strong,Delhi
79,48,Female,103.0,Mild,Kolkata
97,20,Female,101.0,Mild,Bangalore
34,74,Male,102.0,Mild,Mumbai
72,83,Female,101.0,Mild,Kolkata


In [18]:
### preview the test features
X_test.head()

Unnamed: 0,age,gender,fever,cough,city
28,16,Male,104.0,Mild,Kolkata
45,72,Male,99.0,Mild,Bangalore
10,75,Female,,Mild,Delhi
48,66,Male,99.0,Strong,Bangalore
78,11,Male,100.0,Mild,Bangalore


In [19]:
### preview the training labels
y_train.head()

44     No
79    Yes
97     No
34    Yes
72     No
Name: has_covid, dtype: object

In [20]:
### preview the test labels
y_test.head()

28     No
45     No
10     No
48     No
78    Yes
Name: has_covid, dtype: object

### Long Method

In [25]:
### adding simple imputer to fever column
si = SimpleImputer()
# Learn the fill value from the training split only.
X_train_fever = si.fit_transform(X_train[["fever"]])

### also the test data
# Use transform (not fit_transform): the test split must be filled with the
# statistic learned from the training data. Refitting here would leak test
# information and make train and test features inconsistent.
X_test_fever = si.transform(X_test[["fever"]])

X_train_fever.shape

(80, 1)

In [28]:
### ordinal encoding in cough column
# Explicit category order: "Mild" is encoded as 0 and "Strong" as 1.
oe = OrdinalEncoder(categories = [["Mild", "Strong"]])
X_train_cough = oe.fit_transform(X_train[["cough"]])

### also the test data
# Use transform (not fit_transform): the encoder is fitted once on the
# training split and then applied unchanged to the test split.
X_test_cough = oe.transform(X_test[["cough"]])

X_train_cough.shape

(80, 1)

In [32]:
### OneHotEncoding in city and gender column

# drop="first" removes one dummy column per feature; dense int32 output so
# the result can be concatenated with the other NumPy arrays.
ohe = OneHotEncoder(drop = "first", sparse_output = False, dtype = np.int32)
X_train_gender_city = ohe.fit_transform(X_train[["gender", "city"]])

### also the test data
# Use transform (not fit_transform): refitting on the test split would
# re-infer the categories from the test rows alone. If a gender or city is
# absent there, the number and meaning of the output columns would no longer
# match the training matrix.
X_test_gender_city = ohe.transform(X_test[["gender", "city"]])

X_test_gender_city.shape

(20, 4)

In [38]:
### Extracting age
# The other four feature columns are transformed separately in this section;
# dropping them leaves only age, kept as a 2-D array for concatenation.
handled_elsewhere = ["gender", "fever", "cough", "city"]
X_train_age = X_train.drop(columns=handled_elsewhere).values

### also the test data
X_test_age = X_test.drop(columns=handled_elsewhere).values

X_train_age.shape

(80, 1)

In [43]:
# Stitch the separately transformed pieces back together column-wise.
# Column order: age, fever, gender/city dummies, cough.
train_parts = (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
X_train_tranformed = np.concatenate(train_parts, axis=1)

### also the test data
test_parts = (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)
X_test_tranformed = np.concatenate(test_parts, axis=1)

X_train_tranformed.shape

(80, 7)

### ColumnTransformer

In [44]:
from sklearn.compose import ColumnTransformer

In [63]:
# One (name, transformer, columns) step per preprocessing rule.
impute_fever = ("tnf1", SimpleImputer(), ["fever"])
encode_cough = ("tnf2", OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"])
encode_gender_city = (
    "tnf3",
    OneHotEncoder(sparse_output=False, drop="first", dtype=np.int32),
    ["gender", "city"],
)

# remainder="passthrough" keeps the columns not listed above (here: age)
# instead of dropping them.
transformer = ColumnTransformer(
    transformers=[impute_fever, encode_cough, encode_gender_city],
    remainder="passthrough",
)

In [67]:
### fit every step on the training split and transform it in a single call
transformer.fit_transform(X_train).shape

(80, 7)

In [69]:
### apply the already-fitted transformer to the test split (transform only, no refit)
transformer.transform(X_test).shape

(20, 7)