# House Prices Predictive Model

# 3 - Preprocessing Training Data

In [594]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from library.sb_utils import save_file

## Read the DataFrame from the cleaned CSV

In [595]:
# Load the cleaned house-price table written by an earlier step; the path is
# relative to the notebook's working directory.
df = pd.read_csv('../data/house_refined_data_cleaned.csv')

In [596]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,Street,Alley,Neighborhood,Condition1,Condition2,HouseStyle,OverallQual,...,Heating,GrLivArea,KitchenQual,GarageType,GarageFinish,GarageCars,SalePrice,TotalSF,HasPool,YearBuilt
0,1,60,RL,Pave,,CollgCr,Norm,Norm,2Story,7,...,GasA,1710,Gd,Attchd,RFn,2,208500,2566,False,2003
1,2,20,RL,Pave,,Veenker,Feedr,Norm,1Story,6,...,GasA,1262,TA,Attchd,RFn,2,181500,2524,False,1976
2,3,60,RL,Pave,,CollgCr,Norm,Norm,2Story,7,...,GasA,1786,Gd,Attchd,RFn,2,223500,2706,False,2001
3,4,70,RL,Pave,,Crawfor,Norm,Norm,2Story,7,...,GasA,1717,Gd,Detchd,Unf,3,140000,2473,False,1915
4,5,60,RL,Pave,,NoRidge,Norm,Norm,2Story,8,...,GasA,2198,Gd,Attchd,RFn,3,250000,3343,False,2000


In [597]:
# Treat MSSubClass as a categorical label rather than a magnitude: cast it to
# the pandas 'category' dtype, then overwrite the column with the integer code
# of each category.
subclass_as_category = df['MSSubClass'].astype('category')
df['MSSubClass'] = subclass_as_category.cat.codes

# Summarise dtypes and non-null counts after the re-encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            1460 non-null   int64 
 1   MSSubClass    1460 non-null   int8  
 2   MSZoning      1460 non-null   object
 3   Street        1460 non-null   object
 4   Alley         1460 non-null   object
 5   Neighborhood  1460 non-null   object
 6   Condition1    1460 non-null   object
 7   Condition2    1460 non-null   object
 8   HouseStyle    1460 non-null   object
 9   OverallQual   1460 non-null   int64 
 10  MasVnrType    1460 non-null   object
 11  ExterQual     1460 non-null   object
 12  ExterCond     1460 non-null   object
 13  TotalBsmtSF   1460 non-null   int64 
 14  Heating       1460 non-null   object
 15  GrLivArea     1460 non-null   int64 
 16  KitchenQual   1460 non-null   object
 17  GarageType    1460 non-null   object
 18  GarageFinish  1460 non-null   object
 19  Garage

### Get dummy variables

In [598]:
# One-hot encode the MSSubClass codes. drop_first=True omits the first level
# so the indicator columns are not perfectly collinear; the resulting columns
# are named 'Dummies_<code>'.
dummies_df = pd.get_dummies(
    df['MSSubClass'],
    prefix='Dummies',
    drop_first=True,
)

# Append the indicator columns to the right of the existing features.
# NOTE: re-running this cell without reloading df appends them a second time.
df = pd.concat([df, dummies_df], axis='columns')

In [599]:
# Inspect the column labels to confirm the 'Dummies_*' columns were appended.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'Street', 'Alley', 'Neighborhood',
       'Condition1', 'Condition2', 'HouseStyle', 'OverallQual', 'MasVnrType',
       'ExterQual', 'ExterCond', 'TotalBsmtSF', 'Heating', 'GrLivArea',
       'KitchenQual', 'GarageType', 'GarageFinish', 'GarageCars', 'SalePrice',
       'TotalSF', 'HasPool', 'YearBuilt', 'Dummies_1', 'Dummies_2',
       'Dummies_3', 'Dummies_4', 'Dummies_5', 'Dummies_6', 'Dummies_7',
       'Dummies_8', 'Dummies_9', 'Dummies_10', 'Dummies_11', 'Dummies_12',
       'Dummies_13', 'Dummies_14'],
      dtype='object')

In [600]:
# Encode the boolean pool flag as 0/1 with a direct cast.
# Assumes HasPool has no missing values (astype(int) raises on NaN) -- TODO confirm.
df['HasPool'] = df['HasPool'].astype(int)

# Shift these numeric columns so that each one starts at 0 (value minus the
# column minimum). The vectorized .min() replaces the Python builtin min(),
# which iterates the Series element by element.
shifted_cols = ['YearBuilt', 'GrLivArea', 'TotalSF']
df[shifted_cols] = df[shifted_cols] - df[shifted_cols].min()

# One-hot encode every remaining string/categorical column, dropping the first
# level of each to avoid perfectly collinear indicators.
df = pd.get_dummies(df, drop_first=True)

# 'Id' is a row identifier and 'MSSubClass' is already represented by the
# 'Dummies_*' columns, so neither is kept as a feature. Reassign instead of
# using inplace=True so the cell reads as a pure transformation of df.
df = df.drop(columns=['Id', 'MSSubClass'])

df.dtypes

OverallQual          int64
TotalBsmtSF          int64
GrLivArea            int64
GarageCars           int64
SalePrice            int64
                     ...  
GarageType_Detchd    uint8
GarageType_None      uint8
GarageFinish_None    uint8
GarageFinish_RFn     uint8
GarageFinish_Unf     uint8
Length: 102, dtype: object

In [601]:
# Re-check the column labels after one-hot encoding and dropping Id/MSSubClass.
df.columns

Index(['OverallQual', 'TotalBsmtSF', 'GrLivArea', 'GarageCars', 'SalePrice',
       'TotalSF', 'HasPool', 'YearBuilt', 'Dummies_1', 'Dummies_2',
       ...
       'KitchenQual_TA', 'GarageType_Attchd', 'GarageType_Basment',
       'GarageType_BuiltIn', 'GarageType_CarPort', 'GarageType_Detchd',
       'GarageType_None', 'GarageFinish_None', 'GarageFinish_RFn',
       'GarageFinish_Unf'],
      dtype='object', length=102)

## Train Test Split

In [602]:
# Separate the predictors from the target (SalePrice).
features = df.drop(columns='SalePrice')
target = df['SalePrice']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

## Scale Data

In [603]:
# Fit the scaler on the training features only, so no statistics from the
# test rows influence the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both partitions.
scaled_train_X, scaled_test_X = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

In [604]:
# Eyeball the standardized training matrix (NumPy abbreviates large arrays
# when printing, so only the corners are shown).
print(scaled_train_X)

[[ 2.13150648  1.86572881  0.60188649 ... -0.22858752 -0.63681669
   1.18044171]
 [-0.79485211 -0.38726187 -1.21671763 ... -0.22858752 -0.63681669
   1.18044171]
 [-0.79485211 -0.43096212 -1.08041967 ... -0.22858752 -0.63681669
   1.18044171]
 ...
 [-0.06326246 -2.55770764  0.7459729  ... -0.22858752 -0.63681669
   1.18044171]
 [ 0.66832719  0.77807813  0.0742187  ... -0.22858752 -0.63681669
  -0.84714052]
 [ 0.66832719  0.34350342  0.62330474 ... -0.22858752 -0.63681669
  -0.84714052]]


## Save pre-processed data

In [606]:
# Persist the scaled feature matrices and the targets as comma-separated text.
# np.savetxt is called without a header, so column names and the row index
# are not written to these files.
preprocessed_outputs = {
    '../data/scaled_train_X.csv': scaled_train_X,
    '../data/scaled_test_X.csv': scaled_test_X,
    '../data/train_y.csv': y_train,
    '../data/test_y.csv': y_test,
}
for out_path, values in preprocessed_outputs.items():
    np.savetxt(out_path, values, delimiter=",")