<h2>Missing Values</h2>

In [1]:
import numpy as np
import pandas as pd

In [2]:
#Toy dataset with exactly one missing entry (in 'Position').
#Use 'np.nan': the 'np.NaN' alias was removed in NumPy 2.0, and the rest of this notebook already uses 'np.nan'.
df = pd.DataFrame({'Name':['Auba','Laca','Saka','Xhaka','Daka'],'Apps':[32,18,23,36,35],'Position':['ST','ST',np.nan,'DM','RW']})
df.nunique() #Number of distinct values in each column; the missing entry is not counted.

Name        5
Apps        5
Position    3
dtype: int64

In [3]:
#Subset the data on numeric columns only.
#select_dtypes(include='number') keeps exactly the numeric columns, whereas a "dtype is not object" test
#would also let boolean, datetime or categorical columns through.
num_cols = df.select_dtypes(include='number').columns.tolist()
df[num_cols]

Unnamed: 0,Apps
0,32
1,18
2,23
3,36
4,35


In [4]:
#Impute missing values with the most frequent item of each column.
from sklearn.impute import SimpleImputer
impute = SimpleImputer(strategy='most_frequent')
dx = impute.fit_transform(df) #Gives an object-dtype array with imputed values.
#Convert the array back to a dataframe with the original labels, then restore the original column dtypes:
#built from an object array, every column (including numeric 'Apps') would otherwise become dtype 'object'.
df1 = pd.DataFrame(dx,columns=df.columns,index=df.index).astype(df.dtypes.to_dict())
df1



Unnamed: 0,Name,Apps,Position
0,Auba,32,ST
1,Laca,18,ST
2,Saka,23,ST
3,Xhaka,36,DM
4,Daka,35,RW


In [5]:
#Fill value learned for each column when 'impute' was fitted (one entry per column of df, in column order).
impute.statistics_

array(['Auba', 18, 'ST'], dtype=object)

In [6]:
#Treat missingness as a level of its own: every missing entry is replaced by the constant string 'missing'.
imputer = SimpleImputer(
    strategy='constant',
    fill_value='missing',
).fit(df)
imputer.transform(df)

array([['Auba', 32, 'ST'],
       ['Laca', 18, 'ST'],
       ['Saka', 23, 'missing'],
       ['Xhaka', 36, 'DM'],
       ['Daka', 35, 'RW']], dtype=object)

In [7]:
#'strategy=constant' (with a fill value such as 'missing') and 'strategy=most_frequent' are used to impute categorical variables.
#For numeric variables, we can use 'strategy=mean' or 'strategy=median' instead.

In [8]:
#Load the cars dataset and show its dimensions as (rows, columns).
#NOTE(review): the path is absolute and machine-specific, and 'df1' replaces the imputed toy dataframe built earlier.
df1 = pd.read_csv('D:/pandas Practice/cars.csv')
df1.shape

(36, 3)

In [9]:
#Impute missing values using 'IterativeImputer'.
#For a column with missing values, a regression model is trained on the rows where that column is observed,
#with that column as the label and the other columns as features.
#The trained model then predicts the column's value for the rows where it is missing.
#NOTE: the 'enable_iterative_imputer' import must come before importing IterativeImputer; keep this order.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
i1 = IterativeImputer()
fitt = i1.fit_transform(df1)
df2 = pd.DataFrame(fitt,columns=df1.columns,index=df1.index) #Convert the imputed array back to a dataframe.
df2.isna().sum() #Count missing values in each column; all zeros confirms nothing is left to impute.

Volume    0
Weight    0
CO2       0
dtype: int64

In [10]:
#Impute missing values using 'KNNImputer', which fills each missing entry from the 3 nearest rows (n_neighbors=3).
#NOTE(review): this displays the whole imputed array; wrapping it in a dataframe and showing .head() would be easier to read.
from sklearn.impute import KNNImputer
i2 = KNNImputer(n_neighbors=3)
i2.fit_transform(df1)

array([[1000.        ,  790.        ,   99.        ],
       [1200.        , 1160.        ,   95.        ],
       [1000.        ,  929.        ,   95.        ],
       [ 900.        ,  865.        ,   90.        ],
       [1500.        , 1140.        ,  105.        ],
       [1000.        ,  929.        ,  105.        ],
       [1400.        , 1109.        ,   90.        ],
       [1500.        , 1365.        ,  104.        ],
       [1500.        , 1112.        ,   98.        ],
       [1600.        , 1150.        ,   99.        ],
       [1100.        ,  980.        ,   99.        ],
       [1300.        ,  990.        ,  101.        ],
       [1000.        , 1112.        ,   99.        ],
       [1600.        , 1252.        ,   94.        ],
       [1600.        , 1326.        ,   97.        ],
       [1600.        , 1330.        ,  100.33333333],
       [1600.        , 1365.        ,   99.        ],
       [2200.        , 1280.        ,  104.        ],
       [1600.        , 1119.

<h2>One-Hot Encoding of a categorical column</h2>

<h3>Using pandas</h3>

In [11]:
#Load the home prices dataset (columns: town, area, price) and display it.
#NOTE(review): the path is absolute and machine-specific.
home = pd.read_csv('D:/pandas Practice/homeprices.csv')
home

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [12]:
#One indicator column per town; 'drop_first=True' drops the first level to avoid the dummy variable trap.
#'dtype=int' keeps the indicators as 0/1 integers: pandas 2.0+ would otherwise return True/False columns.
dummies = pd.get_dummies(home['town'],drop_first=True,dtype=int)
dummies

Unnamed: 0,robinsville,west windsor
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,1
6,0,1
7,0,1
8,0,1
9,1,0


In [13]:
#Attach the dummy columns to the original dataframe side by side,
#then display the result without the categorical column they replace.
home_with_dummy = pd.concat([home,dummies],axis='columns')
home_with_dummy.drop(columns='town')

Unnamed: 0,area,price,robinsville,west windsor
0,2600,550000,0,0
1,3000,565000,0,0
2,3200,610000,0,0
3,3600,680000,0,0
4,4000,725000,0,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,1,0


<h3>Using OneHotEncoder</h3>

In [14]:
from sklearn.preprocessing import OneHotEncoder
#Keep the encoder's default sparse output and densify it with '.toarray()': the 'sparse=False' keyword was
#renamed to 'sparse_output' in scikit-learn 1.2 and removed in 1.4, whereas '.toarray()' works on every release.
ohe = OneHotEncoder(drop='first')
h1 = ohe.fit_transform(home[['town']]).toarray() #Specify a list of columns to be transformed (2-D input).
h1 #Dropped the first dummy to avoid dummy trap.

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [15]:
#Wrap the encoded array in a dataframe. Reusing home's index keeps the rows aligned in the concat below,
#because pd.concat(axis=1) matches rows on index labels rather than on position.
h2 = pd.DataFrame(h1,columns=['Robinsville','Westwindsor'],index=home.index)
h3 = pd.concat([home,h2],axis=1)
hf = h3.drop('town',axis=1) #Drop the original categorical column now that it is encoded.
hf

Unnamed: 0,area,price,Robinsville,Westwindsor
0,2600,550000,0.0,0.0
1,3000,565000,0.0,0.0
2,3200,610000,0.0,0.0
3,3600,680000,0.0,0.0
4,4000,725000,0.0,0.0
5,2600,585000,0.0,1.0
6,2800,615000,0.0,1.0
7,3300,650000,0.0,1.0
8,3600,710000,0.0,1.0
9,2600,575000,1.0,0.0


<h2>Label Encoding vs. Ordinal Encoding</h2>
<h3>Both are used to convert categorical columns to integer codes.</h3>
<h3>The difference is that OrdinalEncoder can be applied to multiple columns at once, so it is generally used to convert ordinal features.</h3>
<h3>LabelEncoder works on only one column, so it is used to convert a label/target column.</h3>

In [16]:
#Work on an independent copy so the original 'home' dataframe stays untouched.
home1 = home.copy(deep=True)
home1.iloc[:3]

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000


In [17]:
from sklearn.preprocessing import LabelEncoder
#LabelEncoder handles a single column: learn the distinct town names first, then map every row to its integer code.
encoder = LabelEncoder().fit(home1['town'])
home1['town'] = encoder.transform(home1['town'])
home1
#LabelEncoder is applied here to a feature rather than a label because 'town' has no specific order.
#For ordinal features, OrdinalEncoder is the better choice.

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [18]:
#Small example frame, written one row per player; 'size' and 'clas' are the ordinal features.
player_rows = [
    ('auba','m','first'),
    ('laca','s','third'),
    ('saka','s','second'),
    ('xhaka','xl','first'),
    ('daka','l','third'),
]
X = pd.DataFrame(player_rows,columns=['name','size','clas'])
X

Unnamed: 0,name,size,clas
0,auba,m,first
1,laca,s,third
2,saka,s,second
3,xhaka,xl,first
4,daka,l,third


In [19]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_cols = ['size','clas'] #The columns to be transformed.
#One explicit low-to-high ordering per column, given in the same order as 'ordinal_cols'.
encode = OrdinalEncoder(categories=[['s','m','l','xl'],['first','second','third']])
out = encode.fit_transform(X[ordinal_cols])
pd.DataFrame(out,columns=ordinal_cols)

Unnamed: 0,size,clas
0,1.0,0.0
1,0.0,2.0
2,0.0,1.0
3,3.0,0.0
4,2.0,2.0


<h2>ColumnTransformer and Pipeline</h2>
<h3>When we need to apply different transformations to the same set of columns (e.g. all columns), we use Pipeline. The transformations are applied sequentially.</h3>
<h3>When we need to apply the same or different transformations to different sets of columns, we use ColumnTransformer.</h3>

In [20]:
#Load the income dataset, reading the marker ' ?' (note the leading space) as missing.
#NOTE(review): the path is absolute and machine-specific, and 'df' replaces the toy dataframe used in the first section.
df = pd.read_csv('D:/pandas Practice/income.csv',na_values=' ?')
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [21]:
df.columns #Every column name except the first ('age') carries a leading space.

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [22]:
#Remove leading/trailing whitespace from every column name and write the cleaned names back.
df.columns = [name.strip() for name in df.columns]
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [23]:
#Seed NumPy's global random generator so the same rows are picked on every run.
np.random.seed(11)
r = np.random.choice(df.index,40,replace=False) #Sample without replacement, so the 40 labels are distinct.
r #An array of 40 row labels chosen at random from df's index.

array([24337, 17049, 21016,  2790, 13511, 29288,  2161, 12546, 20828,
         210, 20261, 27729, 30961, 10133, 24726,  1643, 15630, 21184,
       30282, 25322, 29499,  4497, 19846, 15838, 14172,  6806, 20417,
       27156,  1150, 12842, 11282, 26282, 16029, 16924,  9286, 11931,
        3866, 15826,  6487, 25119], dtype=int64)

In [24]:
#Set 'age' to missing for the 40 randomly chosen rows in 'r' to simulate missing numeric data.
df.loc[r,'age'] = np.nan

In [25]:
#Repeat with a different seed: pick another 40 row labels and set 'hours-per-week' to missing for them.
np.random.seed(15)
s = np.random.choice(df.index,40,replace=False)
df.loc[s,'hours-per-week'] = np.nan

In [26]:
#Count the missing values in each column.
df.isna().sum() 

age                 40
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week      40
native-country     583
income               0
dtype: int64

In [27]:
#Separate the target column 'income' from the predictor columns.
y = df['income']
X = df.drop(columns='income')

In [28]:
from sklearn.model_selection import train_test_split
#Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer,MissingIndicator
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [30]:
#Split the feature names into a numeric list and a categorical list according to each column's dtype.
numeric, categoric = [], []
for col in X_train.columns:
    #Object dtype ('O') marks the categorical columns; every other dtype goes to the numeric list.
    (categoric if X_train[col].dtype == 'O' else numeric).append(col)
categoric.remove('education') #As 'education' and 'education-num' convey same info.
print(numeric, categoric, sep='\n')

['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


In [31]:
#Since our data has missing values, each column group gets its own 'Pipeline' (impute first, then transform);
#the pipelines are routed to their columns by a 'ColumnTransformer' afterwards.
pipe_num = Pipeline([('imp_num',KNNImputer(n_neighbors=3)),('scalar',StandardScaler())])
#The dense-output keyword of OneHotEncoder was renamed from 'sparse' to 'sparse_output' in scikit-learn 1.2
#and the old name was removed in 1.4, so try the new spelling first and fall back for older releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False,drop='first')
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False,drop='first')
pipe_cat = Pipeline([('imp_cat',SimpleImputer(strategy='constant',fill_value='missing')),('ohe',cat_encoder)])
#Here we specified 'SimpleImputer' on categoric columns, so missing values get imputed with the string value 'missing'.
#Instead of replacing them with another existing value, missingness is kept as a level of its own.

In [32]:
#Each entry is (name, transformer, columns it is applied to).
transformer_specs = [
    ('mi',MissingIndicator(),X_train.columns), #Looks at every feature column.
    ('pp_num',pipe_num,numeric),
    ('pp_cat',pipe_cat,categoric),
]
ct = ColumnTransformer(transformer_specs)
#'MissingIndicator' creates a new 1/0 column for every column that has missing values.
#After imputation, these columns tell us which rows had a value imputed.

In [33]:
#Fit the ColumnTransformer on the training features and transform them in one step.
xt = ct.fit_transform(X_train)
pd.DataFrame(xt) #Wrap the result in a dataframe for display; the columns are unnamed positions.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,0.0,0.0,0.0,0.0,0.0,-0.044367,-0.328132,-0.035768,-0.14558,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,-0.849816,0.244047,-0.035768,0.23557,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,-0.996262,0.946616,-0.425272,-0.14558,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.907528,-0.139482,-0.425272,-0.14558,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,2.371981,-0.840666,-0.425272,-0.14558,5.985757,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.0,0.0,0.0,0.0,0.0,-0.849816,0.892095,-0.035768,-0.14558,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26044,0.0,0.0,0.0,0.0,0.0,-0.264035,0.919982,0.743239,-0.14558,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26045,0.0,0.0,0.0,0.0,0.0,-1.435598,0.702276,-0.035768,-0.14558,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26046,0.0,0.0,0.0,0.0,0.0,-0.337258,-0.669944,-1.204280,-0.14558,-0.214934,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
#Apply the transformer, already fitted on the training set, to the test features.
ct.transform(X_test)
#Rule of thumb: 'fit_transform' on the training set, but only 'transform' on the test set,
#so that nothing is learned from the test data.

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
from sklearn.tree import DecisionTreeClassifier
#Chain the preprocessing ('ct') and the model so that both are fitted together on the training data.
#A fixed random_state makes the fitted tree, and therefore the score, identical every time this cell is run;
#without it the tree depends on the state of NumPy's global random generator.
pipe_final = Pipeline([('step_ct',ct),('model',DecisionTreeClassifier(random_state=0))])
pipe_final.fit(X_train,y_train)



Pipeline(steps=[('step_ct',
                 ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                                 ('pp_num',
                                                  Pipeline(steps=[('imp_num',
                                                                   KNNImputer(n_neighbors=3)),...
                                                  ['age', 'fnlwgt',
                                                   'education-num',
                                                   'capital-gain',
                                                   'capital-loss',
                                                   'hours-per-week']),
                                

In [36]:
#Predict the income class for the test rows; the pipeline preprocesses X_test before the tree predicts.
pipe_final.predict(X_test)

array([' <=50K', ' <=50K', ' >50K', ..., ' >50K', ' <=50K', ' <=50K'],
      dtype=object)

In [37]:
#Score of the fitted pipeline on the held-out test set.
pipe_final.score(X_test,y_test)

0.8091509289114079

In [38]:
#Dictionary mapping each pipeline step name to its estimator; used below to drill into the fitted pipeline.
pipe_final.named_steps

{'step_ct': ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
       dtype='object')),
                                 ('pp_num',
                                  Pipeline(steps=[('imp_num',
                                                   KNNImputer(n_neighbors=3)),
                                                  ('scalar', StandardScaler())]),
                                  ['age', 'fnlwgt', 'education-num',
                                   'capital-gain', 'capital-loss',
                                   'hours-per-week']),
                                 ('pp_cat',
                                  Pipeline(steps=[('imp_cat',
                                                   SimpleImputer(fill_value='missing',
    

In [39]:
#Pick the ColumnTransformer step out of the pipeline by its name.
pipe_final.named_steps['step_ct']

ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                 Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                ('pp_num',
                                 Pipeline(steps=[('imp_num',
                                                  KNNImputer(n_neighbors=3)),
                                                 ('scalar', StandardScaler())]),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('pp_cat',
                                 Pipeline(steps=[('imp_cat',
                                                  SimpleImputer(fill_value='missing',
                              

In [40]:
#Dictionary of the transformers inside the ColumnTransformer, keyed by the names given when it was built.
pipe_final.named_steps['step_ct'].named_transformers_

{'mi': MissingIndicator(),
 'pp_num': Pipeline(steps=[('imp_num', KNNImputer(n_neighbors=3)),
                 ('scalar', StandardScaler())]),
 'pp_cat': Pipeline(steps=[('imp_cat',
                  SimpleImputer(fill_value='missing', strategy='constant')),
                 ('ohe', OneHotEncoder(drop='first', sparse=False))])}

In [41]:
#Pick the categorical pipeline ('pp_cat') out of the ColumnTransformer.
pipe_final.named_steps['step_ct'].named_transformers_['pp_cat']

Pipeline(steps=[('imp_cat',
                 SimpleImputer(fill_value='missing', strategy='constant')),
                ('ohe', OneHotEncoder(drop='first', sparse=False))])

In [42]:
#Steps of the categorical pipeline: the imputer ('imp_cat') and the one-hot encoder ('ohe').
pipe_final.named_steps['step_ct'].named_transformers_['pp_cat'].named_steps

{'imp_cat': SimpleImputer(fill_value='missing', strategy='constant'),
 'ohe': OneHotEncoder(drop='first', sparse=False)}

In [43]:
#Pick the one-hot encoder out of the categorical pipeline.
pipe_final.named_steps['step_ct'].named_transformers_['pp_cat'].named_steps['ohe']

OneHotEncoder(drop='first', sparse=False)

In [44]:
#Names of the one-hot-encoded columns produced by the categorical pipeline.
#'get_feature_names' was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of 'get_feature_names_out',
#so call the new method when the encoder has it and fall back to the old one otherwise.
ohe_step = pipe_final.named_steps['step_ct'].named_transformers_['pp_cat'].named_steps['ohe']
ohe_step.get_feature_names_out() if hasattr(ohe_step,'get_feature_names_out') else ohe_step.get_feature_names()
#This gives an array of one-hot-encoded column names ('x0', 'x1', ... stand for the encoded input columns in order).

array(['x0_ Local-gov', 'x0_ Never-worked', 'x0_ Private',
       'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc', 'x0_ State-gov',
       'x0_ Without-pay', 'x0_missing', 'x1_ Married-AF-spouse',
       'x1_ Married-civ-spouse', 'x1_ Married-spouse-absent',
       'x1_ Never-married', 'x1_ Separated', 'x1_ Widowed',
       'x2_ Armed-Forces', 'x2_ Craft-repair', 'x2_ Exec-managerial',
       'x2_ Farming-fishing', 'x2_ Handlers-cleaners',
       'x2_ Machine-op-inspct', 'x2_ Other-service',
       'x2_ Priv-house-serv', 'x2_ Prof-specialty', 'x2_ Protective-serv',
       'x2_ Sales', 'x2_ Tech-support', 'x2_ Transport-moving',
       'x2_missing', 'x3_ Not-in-family', 'x3_ Other-relative',
       'x3_ Own-child', 'x3_ Unmarried', 'x3_ Wife',
       'x4_ Asian-Pac-Islander', 'x4_ Black', 'x4_ Other', 'x4_ White',
       'x5_ Male', 'x6_ Canada', 'x6_ China', 'x6_ Columbia', 'x6_ Cuba',
       'x6_ Dominican-Republic', 'x6_ Ecuador', 'x6_ El-Salvador',
       'x6_ England', 'x6_ France', 'x6