### Import the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

### Load the dataset

In [2]:
# Read the toy COVID dataset from the repo-relative data directory.
df = pd.read_csv(filepath_or_buffer=r'data/covid_toy.csv')

In [3]:
# Peek at the first five rows to see the raw column values.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


* age, fever - numerical data - only fever has missing values (90 of 100 non-null), so use SimpleImputer on fever; age needs no imputation
* gender, city - nominal data - apply One-Hot Encoder
* cough - ordinal data - apply Ordinal Encoder
* has_covid - nominal data (output) - apply Label Encoder

### Train Test Split

In [5]:
# Pick the target by name and treat every other column as a feature.
# Selecting by label instead of by position keeps the split correct even if
# the CSV's column order changes or new columns are added.
X = df.drop(columns=['has_covid'])
y = df['has_covid']

In [6]:
# Feature frame: should contain every column except the target.
X.head(n=5)

Unnamed: 0,age,gender,fever,cough,city
0,60,Male,103.0,Mild,Kolkata
1,27,Male,100.0,Mild,Delhi
2,42,Male,101.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai


In [7]:
# Target series: the raw 'Yes'/'No' labels before encoding.
y.head(n=5)

0     No
1    Yes
2     No
3     No
4     No
Name: has_covid, dtype: object

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Handling different transformations separately

#### SimpleImputer

In [9]:
# Mean imputation for 'fever' (spelled out here; 'mean' is also the default strategy).
# The imputer is fitted on the training split only, then applied to both splits,
# so the fill value is never computed from test rows.
si = SimpleImputer(strategy='mean')
si.fit(X_train[['fever']])
X_train_si = si.transform(X_train[['fever']])
X_test_si = si.transform(X_test[['fever']])

X_train_si.shape, X_test_si.shape

((80, 1), (20, 1))

#### OneHotEncoder

In [10]:
# One-hot encode the nominal columns.
#   drop='first'        -> drop one dummy per feature to avoid a redundant column
#   sparse_output=False -> return a dense numpy array instead of a sparse matrix
#   dtype=np.int32      -> integer 0/1 indicators
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)
# Learn the categories from the training split only, then encode both splits.
ohe.fit(X_train[['gender', 'city']])
X_train_ohe = ohe.transform(X_train[['gender', 'city']])
X_test_ohe = ohe.transform(X_test[['gender', 'city']])

X_train_ohe.shape, X_test_ohe.shape

((80, 4), (20, 4))

#### OrdinalEncoder

In [11]:
# List the distinct cough levels so the ordinal order can be written out explicitly.
df['cough'].unique()

array(['Mild', 'Strong'], dtype=object)

In [12]:
# Encode 'cough' with an explicit severity order: 'Mild' -> 0, 'Strong' -> 1.
oe = OrdinalEncoder(
    categories=[['Mild', 'Strong']],
)
# Fit on the training split, then encode both splits with the same mapping.
oe.fit(X_train[['cough']])
X_train_oe = oe.transform(X_train[['cough']])
X_test_oe = oe.transform(X_test[['cough']])

X_train_oe.shape, X_test_oe.shape

((80, 1), (20, 1))

#### LabelEncoder

In [13]:
# Encode the target labels as integers; the mapping is learned from y_train
# and reused unchanged for y_test.
le = LabelEncoder()
le.fit(y_train)
y_train_le = le.transform(y_train)
y_test_le = le.transform(y_test)

y_train_le.shape, y_test_le.shape

((80,), (20,))

#### Concatenating all the encoded numpy arrays

In [14]:
# Get the age column.
# Select it by name rather than dropping every other column: the intent is
# explicit, and a column added to X later cannot slip into this array.
# The double brackets keep the result 2-D (n_rows, 1) so it can be concatenated
# column-wise with the encoded arrays.
X_train_age = X_train[['age']].to_numpy()
X_test_age = X_test[['age']].to_numpy()

X_train_age.shape, X_test_age.shape

((80, 1), (20, 1))

In [15]:
# Stack the individually transformed pieces side by side (column-wise).
# Column order here: age, fever (imputed), one-hot gender/city, cough (ordinal).
# NOTE: the ColumnTransformer cell below processes the passthrough remainder last,
# so its output may order the same columns differently - TODO confirm before
# comparing the two arrays element-wise.
X_train_all = np.hstack((X_train_age, X_train_si, X_train_ohe, X_train_oe))
X_test_all = np.hstack((X_test_age, X_test_si, X_test_ohe, X_test_oe))

X_train_all.shape, X_test_all.shape

((80, 7), (20, 7))

### ColumnTransformer

In [16]:
# One ColumnTransformer replaces the separate imputer / encoder steps above.
# Each entry is (step name, transformer, columns it is applied to).
transformer = ColumnTransformer(
    transformers=[
        # Fill missing fever readings (default strategy: column mean).
        (
            'SimpleImputer',
            SimpleImputer(),
            ['fever'],
        ),
        # Dense integer dummies for the nominal columns, first level dropped.
        (
            'OneHotEncoder',
            OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32),
            ['gender', 'city'],
        ),
        # Ordered encoding for cough severity: 'Mild' -> 0, 'Strong' -> 1.
        (
            'OrdinalEncoder',
            OrdinalEncoder(categories=[['Mild', 'Strong']]),
            ['cough'],
        ),
    ],
    # Columns not listed above (here: age) are passed through untouched.
    remainder='passthrough',
    verbose_feature_names_out=True,
    verbose=True,
)

# Fit on the training split only; the test split is transformed with the
# statistics and categories learned from training data.
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

X_train_transformed.shape, X_test_transformed.shape

[ColumnTransformer] . (1 of 4) Processing SimpleImputer, total=   0.0s
[ColumnTransformer] . (2 of 4) Processing OneHotEncoder, total=   0.0s
[ColumnTransformer]  (3 of 4) Processing OrdinalEncoder, total=   0.0s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s


((80, 7), (20, 7))