# Data Cleaning

In [141]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
import warnings
import math
from scipy import stats

In [142]:
df_train = pd.read_csv("test.csv")
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [143]:
ids = df_train.pop("Id")

In [144]:
df_train = df_train.drop(['Utilities', 'Street', 'PoolQC',], axis=1)
print('Shape of all_data= {}'.format(df_train.shape))

Shape of all_data= (1459, 76)


In [145]:
for col in ('MSSubClass', 'YrSold', 'MoSold'):
    df_train[col] = df_train[col].astype(str)

In [146]:
for col in df_train.select_dtypes(include = ["object", "category"]):
    df_train[col] = df_train[col].fillna("None")
    print("="*60)
    print(f"{col} : Null count: {df_train[col].isnull().sum()} Unique values: {df_train[col].unique()}")

MSSubClass : Null count: 0 Unique values: ['20' '60' '120' '160' '80' '30' '50' '90' '85' '190' '45' '70' '75' '180'
 '40' '150']
MSZoning : Null count: 0 Unique values: ['RH' 'RL' 'RM' 'FV' 'C (all)' 'None']
Alley : Null count: 0 Unique values: ['None' 'Pave' 'Grvl']
LotShape : Null count: 0 Unique values: ['Reg' 'IR1' 'IR2' 'IR3']
LandContour : Null count: 0 Unique values: ['Lvl' 'HLS' 'Bnk' 'Low']
LotConfig : Null count: 0 Unique values: ['Inside' 'Corner' 'FR2' 'CulDSac' 'FR3']
LandSlope : Null count: 0 Unique values: ['Gtl' 'Mod' 'Sev']
Neighborhood : Null count: 0 Unique values: ['NAmes' 'Gilbert' 'StoneBr' 'BrDale' 'NPkVill' 'NridgHt' 'Blmngtn'
 'NoRidge' 'Somerst' 'SawyerW' 'Sawyer' 'NWAmes' 'OldTown' 'BrkSide'
 'ClearCr' 'SWISU' 'Edwards' 'CollgCr' 'Crawfor' 'Blueste' 'IDOTRR'
 'Mitchel' 'Timber' 'MeadowV' 'Veenker']
Condition1 : Null count: 0 Unique values: ['Feedr' 'Norm' 'PosN' 'RRNe' 'Artery' 'RRNn' 'PosA' 'RRAn' 'RRAe']
Condition2 : Null count: 0 Unique values: ['Norm' 'F

In [147]:
df_train["LotFrontage"] = df_train.groupby("Neighborhood")["LotFrontage"].transform(lambda x : x.median())

In [148]:
df_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,73.0,11622,,Reg,Lvl,Inside,Gtl,NAmes,...,0,120,0,MnPrv,,0,6,2010,WD,Normal
1,20,RL,73.0,14267,,IR1,Lvl,Corner,Gtl,NAmes,...,0,0,0,,Gar2,12500,6,2010,WD,Normal
2,60,RL,63.0,13830,,IR1,Lvl,Inside,Gtl,Gilbert,...,0,0,0,MnPrv,,0,3,2010,WD,Normal
3,60,RL,63.0,9978,,IR1,Lvl,Inside,Gtl,Gilbert,...,0,0,0,,,0,6,2010,WD,Normal
4,120,RL,60.0,5005,,IR1,HLS,Inside,Gtl,StoneBr,...,0,144,0,,,0,1,2010,WD,Normal


In [149]:
for col in df_train.select_dtypes(include = ['int64', 'float64']):
    df_train[col] = df_train[col].fillna(method ='bfill')

In [150]:
for col in ('MSZoning','Electrical','KitchenQual','Exterior1st','Exterior2nd', 'SaleType'):
    df_train[col] = df_train[col].replace({"None" : df_train[col].mode()[0]})

In [151]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from statsmodels.tools.tools import add_constant

def get_highest_vif_feature(df, thresh=5):
   
    const = add_constant(df)
    print(f'Shape of data after adding const column: {const.shape}')
    cols = const.columns
      # Calculating VIF for each feature
    vif_df = pd.Series([ (variance_inflation_factor(const.values, i)) for i in range(const.shape[1]) ], index= const.columns).to_frame()
    
    vif_df = vif_df.sort_values(by=0, ascending=False).rename(columns={0: 'VIF'})
    vif_df = vif_df.drop('const')
    vif_df = vif_df[vif_df['VIF'] > thresh]

    if vif_df.empty:
        print('DataFrame is empty!')
        return None
    else:
        print(f'\nFeatures above VIF threshold: {vif_df.to_dict()}')       
        # Feature with hig
        return list(vif_df.index)[0]
        print(f'Lets delete the feature with highest VIF value: {list(vif_df.index)[0]}')
        
# Selecting only numeric features
print(f'Shape of input data: {df_train.shape}')
numeric_feats = df_train.dtypes[df_train.dtypes != "object"].index
print(f"Calculating VIF for {len(numeric_feats)} numerical features")

df_numeric = df_train[numeric_feats]
print(f'Shape of df_numeric: {df_numeric.shape}')
    
feature_to_drop = None
feature_to_drop_list = []
while True:
    feature_to_drop = get_highest_vif_feature(df_numeric, thresh=5)
    print(f'feature_to_drop: {feature_to_drop}')
    if feature_to_drop is None:
        print('No more features to drop!')
        break
    else:
        feature_to_drop_list.append(feature_to_drop)
        df_numeric = df_numeric.drop(feature_to_drop, axis=1)
        print(f'Feature {feature_to_drop} droped from df_numeric')

print(f'\nfeature_to_drop_list: {feature_to_drop_list}')

Shape of input data: (1459, 76)
Calculating VIF for 33 numerical features
Shape of df_numeric: (1459, 33)
Shape of data after adding const column: (1459, 34)


  vif = 1. / (1. - r_squared_i)



Features above VIF threshold: {'VIF': {'LowQualFinSF': inf, 'BsmtFinSF1': inf, 'GrLivArea': inf, '2ndFlrSF': inf, '1stFlrSF': inf, 'TotalBsmtSF': inf, 'BsmtUnfSF': inf, 'BsmtFinSF2': inf, 'GarageCars': 6.23525081555748, 'GarageArea': 6.0644990009608355, 'YearBuilt': 5.359795538560827}}
feature_to_drop: LowQualFinSF
Feature LowQualFinSF droped from df_numeric
Shape of data after adding const column: (1459, 33)

Features above VIF threshold: {'VIF': {'BsmtFinSF1': inf, 'TotalBsmtSF': inf, 'BsmtUnfSF': inf, 'BsmtFinSF2': inf, 'GrLivArea': 129.93909202508195, '2ndFlrSF': 96.48083163518426, '1stFlrSF': 89.69431493670268, 'GarageCars': 6.23525081555748, 'GarageArea': 6.064499000960832, 'YearBuilt': 5.3597955385608245}}
feature_to_drop: BsmtFinSF1
Feature BsmtFinSF1 droped from df_numeric
Shape of data after adding const column: (1459, 32)

Features above VIF threshold: {'VIF': {'GrLivArea': 129.93909202508195, '2ndFlrSF': 96.48083163518426, '1stFlrSF': 89.69431493670268, 'GarageCars': 6.235

In [152]:
df_train['GarageArea_GarageCars'] = df_train['GarageArea'] * df_train['GarageCars'] + df_train['GarageYrBlt']

df_train['YearBuilt_YearRemodAdd'] = df_train['YearBuilt'] * df_train['YearRemodAdd']

df_train['TotalBathrooms'] = (df_train['FullBath'] + (0.5 * df_train['HalfBath']) +
                               df_train['BsmtFullBath'] + (0.5 * df_train['BsmtHalfBath']))

df_train['TotalPorchSF'] = (df_train['OpenPorchSF'] + df_train['3SsnPorch'] +
                              df_train['EnclosedPorch'] + df_train['ScreenPorch'] +
                              df_train['WoodDeckSF'])

In [153]:
cat_feats = []
for col in df_train.select_dtypes(include = ['object', 'category']):
    cat_feats.append(col)
    
print(f"Number of categorical features: {len(cat_feats)} \n list of categorical features : {cat_feats}")
    
num_feats = []
for col in df_train.select_dtypes(include = ['int64', 'float64']):
    num_feats.append(col)
      
print(f"\nNumber of numeric features: {len(num_feats)} \n list of numeric features : {num_feats}")

Number of categorical features: 43 
 list of categorical features : ['MSSubClass', 'MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']

Number of numeric features: 37 
 list of numeric features : ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenA

In [154]:
cat_feats_ordinal = ['Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 
                     'BldgType', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond',
                     'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1',
                     'HeatingQC', 'KitchenQual', 'Functional', 
                     'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PavedDrive', 'Fence']

cat_feats_nominal = ['MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'HouseStyle', 
                     'CentralAir', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
                     'Electrical', 'MasVnrType', 'Exterior1st', 'Exterior2nd', 'Heating', 'Foundation']

In [155]:
df_train['Alley'].replace(to_replace = ['None', 'Grvl', 'Pave'], 
                          value = [0, 1, 2], inplace = True)
df_train['LotShape'].replace(to_replace = ['Reg', 'IR1', 'IR2', 'IR3'], 
                             value = [3, 2, 1,0], inplace = True)
df_train['LandContour'].replace(to_replace = ['Lvl', 'Bnk', 'Low', 'HLS'], 
                                value = [3, 2, 1,0], inplace = True)
df_train['LotConfig'].replace(to_replace = ['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'], 
                              value = [0, 3, 1, 2, 4], inplace = True)
df_train['LandSlope'].replace(to_replace = ['Gtl', 'Mod', 'Sev'], 
                              value =
                              [2, 1, 0], inplace = True)
df_train['BldgType'].replace(to_replace = ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
                             value = [4, 3, 2, 1, 0], inplace = True)
df_train['RoofStyle'].replace(to_replace = ['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed'], 
                              value = [4, 2, 3, 1, 5, 0], inplace = True)
df_train['RoofMatl'].replace(to_replace = ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv',
                                           'WdShake', 'WdShngl'], 
                             value = [7, 6, 5, 4, 3, 2, 1, 0], inplace = True)
df_train['ExterQual'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa'], 
                              value = [3, 2, 1, 0], inplace = True)
df_train['ExterCond'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 
                              value = [4, 3, 2, 1, 0], inplace = True)
df_train['BsmtQual'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa', 'None'], 
                             value = [4, 3, 2, 1, 0], inplace = True)
df_train['BsmtCond'].replace(to_replace = ['Gd', 'TA', 'Fa', 'Po', 'None'], 
                             value = [4, 3, 2, 1, 0], inplace = True)
df_train['BsmtExposure'].replace(to_replace = ['Gd', 'Av', 'Mn', 'No', 'None'], 
                                 value = [4, 3, 2, 1, 0], inplace = True)
df_train['BsmtFinType1'].replace(to_replace = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'None'], 
                                 value = [6, 5, 4, 3, 2, 1, 0], inplace = True)
df_train['BsmtFinType2'].replace(to_replace = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'None'], 
                                 value = [6, 5, 4, 3, 2, 1, 0], inplace = True)
df_train['HeatingQC'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 
                              value = [4, 3, 2, 1, 0], inplace = True)
df_train['KitchenQual'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa'], 
                                value = [3, 2, 1, 0], inplace = True)
df_train['Functional'].replace(to_replace = ['Typ', 'Min1', 'Min2', 'Mod',  'Maj1', 'Maj2', 'Sev', 'None'],
                               value = [6, 5, 4, 3, 2, 1, 0, 7], inplace = True)
df_train['FireplaceQu'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None'], 
                                value = [5, 4, 3, 2, 1, 0], inplace = True)
df_train['GarageType'].replace(to_replace = ['2Types', 'Attchd', 'Basment', 'BuiltIn', 
                                             'CarPort', 'Detchd', 'None'], 
                               value = [6, 5, 4, 3, 2, 1, 0], inplace = True)
df_train['GarageFinish'].replace(to_replace = ['Fin', 'RFn', 'Unf', 'None'], 
                                 value = [3, 2, 1, 0], inplace = True)
df_train['GarageQual'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None'], 
                               value = [5, 4, 3, 2, 1, 0], inplace = True)
df_train['GarageCond'].replace(to_replace = ['Ex', 'Gd', 'TA', 'Fa',  'Po', 'None'], 
                               value = [5, 4, 3, 2, 1, 0], inplace = True)
df_train['PavedDrive'].replace(to_replace = ['Y', 'P', 'N'], value = [2, 1, 0], inplace = True)
df_train['Fence'].replace(to_replace = ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'None'], 
                          value = [4, 3, 2, 1, 0], inplace = True)

In [156]:
df_train.shape

(1459, 80)

In [157]:
cat_feats_nominal_one_hot_encoded = pd.get_dummies(df_train[cat_feats_nominal], 
                                                   drop_first = True).reset_index(drop = True)

In [163]:
cat_feats_nominal_one_hot_encoded.shape

(1459, 133)

In [159]:
cat_feats_nominal_one_hot_encoded.head()

Unnamed: 0,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,...,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,Heating_GasW,Heating_Grav,Heating_Wall,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [160]:
df_train = df_train.drop(cat_feats_nominal, axis= 'columns')
df_train.head()

Unnamed: 0,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,BldgType,OverallQual,OverallCond,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,GarageArea_GarageCars,YearBuilt_YearRemodAdd,TotalBathrooms,TotalPorchSF
0,73.0,11622,0,3,3,0,2,4,5,6,...,0,0,120,0,3,0,2691.0,3845521,1.0,260
1,73.0,14267,0,2,3,1,2,4,6,6,...,0,0,0,0,0,12500,2270.0,3833764,1.5,429
2,63.0,13830,0,2,3,0,2,4,5,5,...,0,0,0,0,3,0,2961.0,3990006,2.5,246
3,63.0,9978,0,2,3,0,2,4,6,6,...,0,0,0,0,0,0,2938.0,3992004,2.5,396
4,60.0,5005,0,2,0,0,2,1,8,5,...,0,0,144,0,0,0,3004.0,3968064,2.0,226


In [161]:
df_train = pd.concat([df_train, cat_feats_nominal_one_hot_encoded], axis = 1)

In [162]:
df_train.shape

(1459, 195)