In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the toy COVID dataset on disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this exact file exists.
covid_csv_path = r"C:\Users\patil\Desktop\Know IT CADC\Practical Machine Learning\Common_Folder\day2\dataset\covid_toy.csv"

# Read the CSV into a DataFrame and preview the first five rows.
covid = pd.read_csv(covid_csv_path)
covid.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [3]:
# Count the missing values in each column (isnull is an alias of isna).
covid.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [4]:
# Put the independent variables (features) in x and the dependent variable (the has_covid target) in y

In [5]:
# The label to predict; every other column is treated as a feature.
target_column = 'has_covid'

y = covid[target_column]
x = covid.drop(columns=[target_column])

In [6]:
# Split the features into (x_train, x_test) and the target into (y_train, y_test) in an 80:20 ratio
# The training data is used to fit the model
# The test data is used to evaluate the model

In [7]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [8]:
# Fill the missing values in the fever column with the column mean, using SimpleImputer
# The imputed values are written back into the same column, replacing the originals

In [9]:
# Fit a mean imputer on the training fever column (column position 2) and
# write the imputed values back into that column.
# np.nan is the missing-value marker: the np.NaN alias was removed in
# NumPy 2.0, while np.nan works on every NumPy version.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train.iloc[:, 2:3] = imputer.fit_transform(x_train.iloc[:, 2:3])

In [10]:
# Fill missing fever values in the test set (column position 2) with the mean
# learned from the training set. The `imputer` fitted on x_train is reused and
# only transform() is called: fitting a new imputer on x_test would compute
# the fill value from test data, which leaks test information into
# preprocessing and makes the test rows inconsistent with the training rows.
x_test.iloc[:, 2:3] = imputer.transform(x_test.iloc[:, 2:3])

In [12]:
# Print the (rows, columns) shape of each split on its own line, then display
# the test features as the cell output.
print(x_train.shape, x_test.shape, sep='\n')
x_test

(80, 5)
(20, 5)


Unnamed: 0,age,gender,fever,cough,city
80,14,Female,99.0,Mild,Mumbai
84,69,Female,98.0,Strong,Mumbai
33,26,Female,98.0,Mild,Kolkata
81,65,Male,99.0,Mild,Delhi
93,27,Male,100.0,Mild,Kolkata
17,40,Female,98.0,Strong,Delhi
36,38,Female,101.0,Mild,Bangalore
82,24,Male,98.0,Mild,Kolkata
69,73,Female,103.0,Mild,Delhi
65,69,Female,102.0,Mild,Bangalore


In [11]:
# ColumnTransformer applies a chosen transformer to chosen columns
# transformers=[(name, transformer object, list of column indices), ...]    multiple tuples can be passed

In [13]:
# ColumnTransformer is not a Pipeline: it applies each transformer to its own subset of columns and concatenates the results side by side

In [14]:
# Encoders for the categorical feature columns, selected by position:
#   1 -> gender, 4 -> city : one-hot encoded, dropping the first category
#   3 -> cough             : ordinal encoded with the order Mild < Strong
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# confirm which scikit-learn version this notebook targets before upgrading.
encoding_steps = [
    ('enc1', OneHotEncoder(sparse=False, drop='first'), [1]),
    ('enc2', OneHotEncoder(sparse=False, drop='first'), [4]),
    ('enc3', OrdinalEncoder(categories=[['Mild', 'Strong']]), [3]),
]

# remainder='passthrough' keeps the columns not listed above (age, fever)
# unchanged and appends them after the encoded columns.
ct = ColumnTransformer(transformers=encoding_steps, remainder='passthrough')

# Learn the categories from the training features and encode them.
x_train = ct.fit_transform(x_train)



In [15]:
x_train.shape

(80, 7)

In [16]:
# Encode the test features with the ColumnTransformer `ct` that was already
# fitted on the training features. Only transform() is called here: building
# and fitting a new transformer on x_test would learn the one-hot categories
# from the test set, so a category missing from (or only present in) the test
# split would give columns that no longer line up with the training matrix.
x_test = ct.transform(x_test)



In [17]:
x_test.shape

(20, 7)

In [18]:
x_test

array([[  0.        ,   0.        ,   0.        ,   1.        ,
          0.        ,  14.        ,  99.        ],
       [  0.        ,   0.        ,   0.        ,   1.        ,
          1.        ,  69.        ,  98.        ],
       [  0.        ,   0.        ,   1.        ,   0.        ,
          0.        ,  26.        ,  98.        ],
       [  1.        ,   1.        ,   0.        ,   0.        ,
          0.        ,  65.        ,  99.        ],
       [  1.        ,   0.        ,   1.        ,   0.        ,
          0.        ,  27.        , 100.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          1.        ,  40.        ,  98.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,  38.        , 101.        ],
       [  1.        ,   0.        ,   1.        ,   0.        ,
          0.        ,  24.        ,  98.        ],
       [  0.        ,   1.        ,   0.        ,   0.        ,
          0.    