In [1]:
import numpy as np
import pandas as pd
# Load the toy COVID dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='covid_toy.csv')
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; `has_covid` is the target and every
# other column is a feature. random_state pins the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.3,
    random_state=2,
)

In [5]:
# Sanity check on the training split: (rows, feature columns).
x_train.shape

(70, 5)

In [7]:
# Count missing values per column to decide which features need imputation.
df.isnull().sum(axis=0)

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [11]:
# `fever` contains missing values, so fill them with a SimpleImputer.
# The mean strategy (the default) is spelled out here; the imputer is fitted
# on the training split only.
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='mean')
X_train_fever = si.fit_transform(x_train[['fever']])
X_train_fever.shape

(70, 1)

In [14]:
# Impute the test split with the statistic learned from the training split.
# Use transform(), not fit_transform(): refitting here would recompute the
# fill value from test rows, leaking test-set information and giving the
# train and test matrices different fill values.
X_test_fever = si.transform(x_test[['fever']])
X_test_fever.shape

(30, 1)

In [18]:
# Categorical features come next. Start with the ordinal column `cough`,
# whose levels are given an explicit order: Mild -> 0, Strong -> 1.
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories = [['Mild','Strong']])
X_train_Cough = oe.fit_transform(x_train[['cough']])
# Reuse the encoder fitted on the training split; the test split must only be
# transformed, never used to (re)fit a preprocessing step.
X_test_Cough = oe.transform(x_test[['cough']])

In [19]:
# Shape of the ordinal-encoded cough feature for the training split.
X_train_Cough.shape

(70, 1)

In [20]:
# Nominal (unordered) categorical columns, `gender` and `city`, are one-hot
# encoded. drop='first' removes one level per feature to avoid redundant
# columns; sparse_output=False returns a dense NumPy array.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop = 'first',sparse_output = False)
X_train_Gender_city = ohe.fit_transform(x_train[['gender','city']])
# Transform the test split with the categories learned from the training
# split. Refitting on the test rows could yield a different set of columns
# (or a different dropped baseline) if a category is absent from the test
# split, so the train and test matrices would no longer line up.
X_test_Gender_city = ohe.transform(x_test[['gender','city']])

In [21]:
# Shape of the one-hot encoded gender/city block for the training split.
X_train_Gender_city.shape

(70, 4)

In [23]:
# All columns that need preprocessing have been handled, so now assemble the
# transformed data. `age` needs no transformation and is taken as-is.
# Select it by name rather than dropping every other column: dropping would
# silently let any additional column slip into the "age" array.
X_train_age = x_train[['age']].values

# Same extraction for the test split.
X_test_age = x_test[['age']].values

In [33]:
# Concatenation: stack the per-feature arrays side by side (column-wise).
# Resulting column order: age, one-hot gender/city, cough, fever.
X_transformed = np.concatenate(
    [
        X_train_age,
        X_train_Gender_city,
        X_train_Cough,
        X_train_fever,
    ],
    axis=1,
)

In [34]:
# Build the test feature matrix with the same column order as the training
# matrix: age, one-hot gender/city, cough, fever.
X_test_transformed = np.concatenate(
    [
        X_test_age,
        X_test_Gender_city,
        X_test_Cough,
        X_test_fever,
    ],
    axis=1,
)

In [35]:
# Inspect the assembled training matrix. Columns follow the concatenation
# order used above: age, one-hot gender/city, cough (ordinal), fever (imputed).
X_transformed

array([[ 69.     ,   0.     ,   0.     ,   0.     ,   0.     ,   0.     ,
        102.     ],
       [ 27.     ,   1.     ,   1.     ,   0.     ,   0.     ,   0.     ,
        100.     ],
       [ 64.     ,   0.     ,   0.     ,   0.     ,   0.     ,   0.     ,
         98.     ],
       [ 66.     ,   1.     ,   0.     ,   0.     ,   0.     ,   1.     ,
         99.     ],
       [ 38.     ,   0.     ,   0.     ,   0.     ,   0.     ,   0.     ,
        101.     ],
       [ 11.     ,   1.     ,   0.     ,   0.     ,   0.     ,   0.     ,
        100.     ],
       [ 14.     ,   1.     ,   0.     ,   0.     ,   0.     ,   1.     ,
        101.     ],
       [ 46.     ,   1.     ,   0.     ,   0.     ,   0.     ,   1.     ,
        103.     ],
       [ 38.     ,   1.     ,   1.     ,   0.     ,   0.     ,   0.     ,
        100.78125],
       [ 75.     ,   0.     ,   1.     ,   0.     ,   0.     ,   0.     ,
        100.78125],
       [ 25.     ,   0.     ,   0.     ,   1.     ,   0.    

# The same preprocessing using scikit-learn's ColumnTransformer

In [36]:
from sklearn.compose import ColumnTransformer

In [41]:
# One ColumnTransformer replaces the manual per-column steps above: each entry
# is (name, transformer, columns it applies to). Columns not listed in any
# entry are kept unchanged because remainder='passthrough'.
# Note: the output column order follows the transformer list, with the
# passthrough columns appended last, so it differs from the manual
# concatenation order used earlier.
transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # Fill missing fever readings (SimpleImputer defaults).
        ('tf1', SimpleImputer(), ['fever']),
        # Ordinal-encode cough with the explicit order Mild < Strong.
        ('tf2', OrdinalEncoder(categories=[['Mild', "Strong"]]), ['cough']),
        # One-hot encode the nominal columns, dropping the first level of each.
        ('tf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
)

In [42]:
# Display the ColumnTransformer configured above.
transformer