# Feature Engineering for our House Prediction Dataset
1. Missing Values
2. Temporal Variables
3. Categorical Variables
4. Standardize values of Variables to same range

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [84]:
df = pd.read_csv("../data/train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


There is a chance of data leakage, hence we first split the data and then apply feature engineering

In [85]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, df['SalePrice'], test_size=0.1, random_state=0)

In [86]:
X_train.shape, X_test.shape

((1314, 81), (146, 81))

##### Treating the missing values

In [87]:
nan_feat = [feature for feature in df.columns if df[feature].isnull().sum() > 1 and df[feature].dtypes == 'O']
for feat in nan_feat:
    print("{}: {}% missing values".format(feat, np.round(df[feat].isnull().mean() * 100), 4))


Alley: 94.0% missing values
MasVnrType: 60.0% missing values
BsmtQual: 3.0% missing values
BsmtCond: 3.0% missing values
BsmtExposure: 3.0% missing values
BsmtFinType1: 3.0% missing values
BsmtFinType2: 3.0% missing values
FireplaceQu: 47.0% missing values
GarageType: 6.0% missing values
GarageFinish: 6.0% missing values
GarageQual: 6.0% missing values
GarageCond: 6.0% missing values
PoolQC: 100.0% missing values
Fence: 81.0% missing values
MiscFeature: 96.0% missing values


In [88]:
def replace_cat_feat(df, nan_feat):
    dta = df.copy()
    dta[nan_feat] = dta[nan_feat].fillna('Missing')
    return dta

df = replace_cat_feat(df, nan_feat)
df[nan_feat].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [89]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,12,2008,WD,Normal,250000


In [90]:
num_na_feat = [feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes!='O']
for feat in num_na_feat:
    print("{}: {}% of missing numerical features".format(feat, np.around(df[feat].isnull().mean() * 100, 4)))

LotFrontage: 17.7397% of missing numerical features
MasVnrArea: 0.5479% of missing numerical features
GarageYrBlt: 5.5479% of missing numerical features


In [91]:
for feat in num_na_feat:
    med_val = df[feat].median()
    df[feat+'_nan'] = np.where(df[feat].isnull(), 1, 0)
    df[feat].fillna(med_val, inplace=True)
    
df[num_na_feat].isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feat].fillna(med_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feat].fillna(med_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves 

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [92]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,208500,0,0,0
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,181500,0,0,0
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,223500,0,0,0
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,140000,0,0,0
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,250000,0,0,0


In [93]:
for feat in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
    df[feat] = df['YrSold'] - df[feat]
df[['YearBuilt', 'YearRemodAdd', "GarageYrBlt"]].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0
