<h2>Missing Values<h2>

In [54]:
import numpy as np
import pandas as pd

In [55]:
df = pd.DataFrame({'Name':['Auba','Laca','Saka','Xhaka','Daka'],'Apps':[32,18,23,36,35],'Position':['ST','ST',np.NaN,'DM','RW']})
df.nunique() #Returns unique values in each column.

Name        5
Apps        5
Position    3
dtype: int64

In [56]:
#Subset the data on numeric columns only.
#Filter out columns of data type 'object' i.e. character.
num_cols = [i for i in df.columns if df[i].dtypes != 'O']
df[num_cols]

Unnamed: 0,Apps
0,32
1,18
2,23
3,36
4,35


In [57]:
#Impute missing values by the most frequent item.
from sklearn.impute import SimpleImputer
impute = SimpleImputer(strategy='most_frequent')
dx = impute.fit_transform(df) #Gives an array with imputed values.
df = pd.DataFrame(dx,columns=df.columns,index=df.index) #Convert array to dataframe.
df

Unnamed: 0,Name,Apps,Position
0,Auba,32,ST
1,Laca,18,ST
2,Saka,23,ST
3,Xhaka,36,DM
4,Daka,35,RW


In [58]:
impute.statistics_

array(['Auba', 18, 'ST'], dtype=object)

In [59]:
#Impute the missing values as a new level called 'missing'. 
imputer = SimpleImputer(strategy='constant',fill_value='missing')
imputer.fit_transform(df)

array([['Auba', 32, 'ST'],
       ['Laca', 18, 'ST'],
       ['Saka', 23, 'ST'],
       ['Xhaka', 36, 'DM'],
       ['Daka', 35, 'RW']], dtype=object)

In [60]:
df1 = pd.read_csv('D:/pandas Practice/cars.csv')
df1.shape

(36, 3)

In [61]:
#Impute missing using 'IterativeImputer'.
#Works by passing the non-missing rows to a regression model where the missing column is label and others the features.
#Then passes the missing value rows to the trained model where the missing value column acts as a label to be predicted from the passed feature vector.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
i1 = IterativeImputer()
fitt = i1.fit_transform(df1)
df2 = pd.DataFrame(fitt,columns=df1.columns,index=df1.index) #Convert to a dataframe.
df2.isna().sum() #Count missing across each column of dataframe.

Volume    0
Weight    0
CO2       0
dtype: int64

In [62]:
#Impute missing values using 'KNNImputer'.
from sklearn.impute import KNNImputer
i2 = KNNImputer(n_neighbors=3)
i2.fit_transform(df1)

array([[1000.        ,  790.        ,   99.        ],
       [1200.        , 1160.        ,   95.        ],
       [1000.        ,  929.        ,   95.        ],
       [ 900.        ,  865.        ,   90.        ],
       [1500.        , 1140.        ,  105.        ],
       [1000.        ,  929.        ,  105.        ],
       [1400.        , 1109.        ,   90.        ],
       [1500.        , 1365.        ,  104.        ],
       [1500.        , 1112.        ,   98.        ],
       [1600.        , 1150.        ,   99.        ],
       [1100.        ,  980.        ,   99.        ],
       [1300.        ,  990.        ,  101.        ],
       [1000.        , 1112.        ,   99.        ],
       [1600.        , 1252.        ,   94.        ],
       [1600.        , 1326.        ,   97.        ],
       [1600.        , 1330.        ,  100.33333333],
       [1600.        , 1365.        ,   99.        ],
       [2200.        , 1280.        ,  104.        ],
       [1600.        , 1119.

<h2>Label Encoding of categorical columns.<h2>

In [63]:
#Create a new dataframe.
bridge_types = ('Arch','Beam','Truss','Cantilever','Tied Arch','Suspension','Cable')
bridge_df = pd.DataFrame(bridge_types, columns=['Bridge_Types'])
bridge_df

Unnamed: 0,Bridge_Types
0,Arch
1,Beam
2,Truss
3,Cantilever
4,Tied Arch
5,Suspension
6,Cable


In [64]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
bridge_df['Bridge_cat'] = encoder.fit_transform(bridge_df)
bridge_df

  return f(**kwargs)


Unnamed: 0,Bridge_Types,Bridge_cat
0,Arch,0
1,Beam,1
2,Truss,6
3,Cantilever,3
4,Tied Arch,5
5,Suspension,4
6,Cable,2


<h2>One-Hot Encoding of categorical column.<h2>

In [65]:
bridgeDF = pd.get_dummies(bridge_df,drop_first=True)
bridgeDF #Set 'drop_first=True' to avoid dummy variable trap.
#Hence we get 6 dummy variables for 7 levels of categorical column.

Unnamed: 0,Bridge_cat,Bridge_Types_Beam,Bridge_Types_Cable,Bridge_Types_Cantilever,Bridge_Types_Suspension,Bridge_Types_Tied Arch,Bridge_Types_Truss
0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0
2,6,0,0,0,0,0,1
3,3,0,0,1,0,0,0
4,5,0,0,0,0,1,0
5,4,0,0,0,1,0,0
6,2,0,1,0,0,0,0
