In [7]:
import pandas as pd
import numpy as np

#### Frequent Category Imputation

In [8]:
df = pd.read_csv('House Train.csv', usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


In [9]:
df.shape

(1460, 4)

In [10]:
df.isnull().sum()

BsmtQual        37
FireplaceQu    690
GarageType      81
SalePrice        0
dtype: int64

In [11]:
df.isnull().mean().sort_values(ascending=True)

SalePrice      0.000000
BsmtQual       0.025342
GarageType     0.055479
FireplaceQu    0.472603
dtype: float64

In [12]:
print(df['BsmtQual'].value_counts().index[0])
print(df['GarageType'].value_counts().index[0])
print(df['FireplaceQu'].value_counts().index[0])

TA
Attchd
Gd


In [13]:
def impute_nan(df,variable):
    most_frequent_cat=df[variable].value_counts().index[0]
    df[variable].fillna(most_frequent_cat, inplace=True)

In [14]:
for x in ['BsmtQual','GarageType','FireplaceQu']:
    impute_nan(df,x)

In [15]:
df.isnull().mean()

BsmtQual       0.0
FireplaceQu    0.0
GarageType     0.0
SalePrice      0.0
dtype: float64

#### Adding a New Feature to capture NAN


In [16]:
df = pd.read_csv('House Train.csv', usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


In [17]:
def impute_nan(df,variable):
    df[variable+'_new']= np.where(df[variable].isnull(),1,0)

In [18]:
for x in ['BsmtQual','FireplaceQu','GarageType']:
    impute_nan(df,x)

In [19]:
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_new,FireplaceQu_new,GarageType_new
0,Gd,,Attchd,208500,0,1,0
1,Gd,TA,Attchd,181500,0,0,0
2,Gd,TA,Attchd,223500,0,0,0
3,TA,Gd,Detchd,140000,0,0,0
4,Gd,TA,Attchd,250000,0,0,0


In [20]:
def impute_nan1(df,variables):
    most_frequent=df[variables].mode()[0]
    df[variables].fillna(most_frequent, inplace=True)

In [21]:
for x in ['BsmtQual','FireplaceQu','GarageType']:
    impute_nan1(df,x)

In [22]:
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_new,FireplaceQu_new,GarageType_new
0,Gd,Gd,Attchd,208500,0,1,0
1,Gd,TA,Attchd,181500,0,0,0
2,Gd,TA,Attchd,223500,0,0,0
3,TA,Gd,Detchd,140000,0,0,0
4,Gd,TA,Attchd,250000,0,0,0


#### Suppose if you have more frequent categories, we just replace NAN with a new category

In [23]:
df = pd.read_csv('House Train.csv', usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


In [24]:
def impute_nan(df,variable):
    df[variable]=np.where(df[variable].isnull(),'Unknown',df[variable])

In [25]:
for x in ['BsmtQual','FireplaceQu','GarageType','SalePrice']:
    impute_nan(df,x)

In [26]:
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,Unknown,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


#### OR we can also achive it by creating new variable and dropping the old one.

In [27]:
df = pd.read_csv('House Train.csv', usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


In [28]:
def impute_new_nan(df,varible):
    df[varible+'-New_Var']= np.where(df[varible].isnull(),'missing',df[varible])

In [29]:
for x in ['BsmtQual','FireplaceQu','GarageType']:
    impute_new_nan(df,x)

In [30]:
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual-New_Var,FireplaceQu-New_Var,GarageType-New_Var
0,Gd,,Attchd,208500,Gd,missing,Attchd
1,Gd,TA,Attchd,181500,Gd,TA,Attchd
2,Gd,TA,Attchd,223500,Gd,TA,Attchd
3,TA,Gd,Detchd,140000,TA,Gd,Detchd
4,Gd,TA,Attchd,250000,Gd,TA,Attchd


In [35]:
df.drop(['BsmtQual','FireplaceQu','GarageType','SalePrice'], axis=1)
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual-New_Var,FireplaceQu-New_Var,GarageType-New_Var
0,Gd,,Attchd,208500,Gd,missing,Attchd
1,Gd,TA,Attchd,181500,Gd,TA,Attchd
2,Gd,TA,Attchd,223500,Gd,TA,Attchd
3,TA,Gd,Detchd,140000,TA,Gd,Detchd
4,Gd,TA,Attchd,250000,Gd,TA,Attchd
