<h2>Missing Values<h2>

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.DataFrame({'Name':['Auba','Laca','Saka','Xhaka','Daka'],'Apps':[32,18,23,36,35],'Position':['ST','ST',np.NaN,'DM','RW']})
df.nunique() #Returns unique values in each column.

Name        5
Apps        5
Position    3
dtype: int64

In [4]:
#Subset the data on numeric columns only.
#Filter out columns of data type 'object' i.e. character.
num_cols = [i for i in df.columns if df[i].dtypes != 'O']
df[num_cols]

Unnamed: 0,Apps
0,32
1,18
2,23
3,36
4,35


In [5]:
#Impute missing values by the most frequent item.
from sklearn.impute import SimpleImputer
impute = SimpleImputer(strategy='most_frequent')
dx = impute.fit_transform(df) #Gives an array with imputed values.
df = pd.DataFrame(dx,columns=df.columns,index=df.index) #Convert array to dataframe.
df

Unnamed: 0,Name,Apps,Position
0,Auba,32,ST
1,Laca,18,ST
2,Saka,23,ST
3,Xhaka,36,DM
4,Daka,35,RW


In [6]:
impute.statistics_

array(['Auba', 18, 'ST'], dtype=object)

In [7]:
#Impute the missing values as a new level called 'missing'. 
imputer = SimpleImputer(strategy='constant',fill_value='missing')
imputer.fit_transform(df)

array([['Auba', 32, 'ST'],
       ['Laca', 18, 'ST'],
       ['Saka', 23, 'ST'],
       ['Xhaka', 36, 'DM'],
       ['Daka', 35, 'RW']], dtype=object)

In [8]:
df1 = pd.read_csv('D:/pandas Practice/cars.csv')
df1.shape

(36, 3)

In [9]:
#Impute missing using 'IterativeImputer'.
#Works by passing the non-missing rows to a regression model where the missing column is label and others the features.
#Then passes the missing value rows to the trained model where the missing value column acts as a label to be predicted from the passed feature vector.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
i1 = IterativeImputer()
fitt = i1.fit_transform(df1)
df2 = pd.DataFrame(fitt,columns=df1.columns,index=df1.index) #Convert to a dataframe.
df2.isna().sum() #Count missing across each column of dataframe.

Volume    0
Weight    0
CO2       0
dtype: int64

In [10]:
#Impute missing values using 'KNNImputer'.
from sklearn.impute import KNNImputer
i2 = KNNImputer(n_neighbors=3)
i2.fit_transform(df1)

array([[1000.        ,  790.        ,   99.        ],
       [1200.        , 1160.        ,   95.        ],
       [1000.        ,  929.        ,   95.        ],
       [ 900.        ,  865.        ,   90.        ],
       [1500.        , 1140.        ,  105.        ],
       [1000.        ,  929.        ,  105.        ],
       [1400.        , 1109.        ,   90.        ],
       [1500.        , 1365.        ,  104.        ],
       [1500.        , 1112.        ,   98.        ],
       [1600.        , 1150.        ,   99.        ],
       [1100.        ,  980.        ,   99.        ],
       [1300.        ,  990.        ,  101.        ],
       [1000.        , 1112.        ,   99.        ],
       [1600.        , 1252.        ,   94.        ],
       [1600.        , 1326.        ,   97.        ],
       [1600.        , 1330.        ,  100.33333333],
       [1600.        , 1365.        ,   99.        ],
       [2200.        , 1280.        ,  104.        ],
       [1600.        , 1119.

<h2>One-Hot Encoding of categorical column.<h2>

<h3>Using pandas.<h3>

In [20]:
home = pd.read_csv('D:/pandas Practice/homeprices.csv')
home

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [21]:
dummies = pd.get_dummies(home['town'],drop_first=True)
dummies #Include the 'drop_first=True' option to avoid dummy variable trap.

Unnamed: 0,robinsville,west windsor
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,1
6,0,1
7,0,1
8,0,1
9,1,0


In [26]:
home_with_dummy = pd.concat([home,dummies],axis =1)
home_with_dummy.drop('town',axis=1)
#Concatenate the dummy variables with original dataframe and then drop the original categorical variable.

Unnamed: 0,area,price,robinsville,west windsor
0,2600,550000,0,0
1,3000,565000,0,0
2,3200,610000,0,0
3,3600,680000,0,0
4,4000,725000,0,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,1,0


<h3>Using OneHotEncoder. <h3>

In [37]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first',sparse=False)
h1 = ohe.fit_transform(home[['town']]) #Specify list of columns to be transformed.
h1 #Dropped the first dummy to avoid dummy trap.

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [45]:
h2 = pd.DataFrame(h1,columns=['Robinsville','Westwindsor'])
h3 = pd.concat([home,h2],axis=1)
hf = h3.drop('town',axis=1)
hf

Unnamed: 0,area,price,Robinsville,Westwindsor
0,2600,550000,0.0,0.0
1,3000,565000,0.0,0.0
2,3200,610000,0.0,0.0
3,3600,680000,0.0,0.0
4,4000,725000,0.0,0.0
5,2600,585000,0.0,1.0
6,2800,615000,0.0,1.0
7,3300,650000,0.0,1.0
8,3600,710000,0.0,1.0
9,2600,575000,1.0,0.0


<h2>Label Encoding v/s Ordinal Encoding.<h2>
<h3>Both used to convert categorical columns having ordinal data.<h3>
<h3>Difference being, OrdinalEncoder can be used on multiple columns and hence generally used to convert features.
<h3>While the LabelEncoder is used on only one column and hence used to convert a label/target column.<h3>

In [30]:
home1 = home.copy()
home1.head(3)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000


In [46]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
home1['town'] = encoder.fit_transform(home1['town'])
home1
#We still used LabelEncoder to convert a feature rather than label as the feature 'town' doesn't have a specific order.
#However for ordinal features, OrdinalEncoder is a better choice.

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [50]:
X = pd.DataFrame({'name':['auba','laca','saka','xhaka','daka'],'size':['m','s','s','xl','l'],'clas':['first','third','second','first','third']})
X

Unnamed: 0,name,size,clas
0,auba,m,first
1,laca,s,third
2,saka,s,second
3,xhaka,xl,first
4,daka,l,third


In [52]:
from sklearn.preprocessing import OrdinalEncoder
encode = OrdinalEncoder(categories=[['s','m','l','xl'],['first','second','third']]) #Specify the order for each column to be transformed.
out = encode.fit_transform(X[['size','clas']]) #Specify the columns to be transformed.
pd.DataFrame(out,columns=['size','clas'])

Unnamed: 0,size,clas
0,1.0,0.0
1,0.0,2.0
2,0.0,1.0
3,3.0,0.0
4,2.0,2.0
