## House Pricing Model

This notebook is primary for model development. 


In [1]:
# Package Imports
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from scipy.stats import uniform
import math
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

# from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [6]:
## run once
# os.chdir("..")

In [9]:
# os.getcwd()

In [21]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [12]:
# Data Loading
df = pd.read_csv("./data/train.csv")
print(f"Data shape : {df.shape}")

Data shape : (1460, 81)


In [13]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [22]:
X.isna().sum().sort_values(ascending=False)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
LotFrontage       259
GarageType         81
GarageYrBlt        81
GarageQual         81
GarageCond         81
GarageFinish       81
BsmtFinType2       38
BsmtExposure       38
BsmtCond           37
BsmtFinType1       37
BsmtQual           37
MasVnrArea          8
MasVnrType          8
Electrical          1
HalfBath            0
BsmtFullBath        0
BsmtHalfBath        0
BedroomAbvGr        0
FullBath            0
TotRmsAbvGrd        0
Functional          0
KitchenAbvGr        0
KitchenQual         0
Id                  0
Fireplaces          0
LowQualFinSF        0
GarageCars          0
GarageArea          0
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
GrLivArea           0
HeatingQC 

In [23]:
drop_features = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", "LotFrontage"] 


In [27]:
for col in drop_features:
    print(col)
    print(f"No of unique values {df[col].nunique}")
    if df[col].dtype == "O":
        print(df[col].unique)

PoolQC
No of unique values <bound method IndexOpsMixin.nunique of 0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
10      NaN
11      NaN
12      NaN
13      NaN
14      NaN
15      NaN
16      NaN
17      NaN
18      NaN
19      NaN
20      NaN
21      NaN
22      NaN
23      NaN
24      NaN
25      NaN
26      NaN
27      NaN
28      NaN
29      NaN
30      NaN
31      NaN
32      NaN
33      NaN
34      NaN
35      NaN
36      NaN
37      NaN
38      NaN
39      NaN
40      NaN
41      NaN
42      NaN
43      NaN
44      NaN
45      NaN
46      NaN
47      NaN
48      NaN
49      NaN
50      NaN
51      NaN
52      NaN
53      NaN
54      NaN
55      NaN
56      NaN
57      NaN
58      NaN
59      NaN
60      NaN
61      NaN
62      NaN
63      NaN
64      NaN
65      NaN
66      NaN
67      NaN
68      NaN
69      NaN
70      NaN
71      NaN
72      NaN
73      NaN
74      NaN
75      NaN
76      NaN
77      Na

In [31]:
df[drop_features].dtypes

PoolQC          object
MiscFeature     object
Alley           object
Fence           object
FireplaceQu     object
LotFrontage    float64
dtype: object

In [28]:
df["PoolQC"].dtype

dtype('O')

In [17]:
features = [col for col in df.columns if col not in ['SalePrice']]
X = df[features]
y = df['SalePrice']


In [19]:
X

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


In [32]:
# Columns

numerical_cols = [cname for cname in X.columns if 
                X[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in X.columns if
                    X[cname].nunique() < 13 and 
                    X[cname].dtype == "object"]


numerical_transformer = SimpleImputer(strategy='mean')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
])

X = preprocessor.fit_transform(X)

In [14]:
def rmse(true,pred):
    res =  mean_squared_error(np.log(true), np.log(pred), squared=False)
    return res

def train(model, X, y, grid, metric, metric_module, greater):
    
    # Preprocess

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    
    grid_search = GridSearchCV(model, 
                            param_grid=grid,
                            scoring=make_scorer(metric_module, greater_is_better=greater),
                            cv=5)
    grid_search.fit(X_train,y_train)
    
    valid_score = grid_search.best_score_
    test_score = metric_module(y_test, grid_search.best_estimator_.predict(X_test))
    print(f'validation_{metric}:{valid_score }' )
    print(f'test_{metric}:{test_score}' )
    
    return grid_search, valid_score, test_score

In [17]:
# Decision Tree

tree_model = DecisionTreeRegressor()

grid = dict(max_features = ['auto','sqrt','log2'],
            min_samples_split = [float(x) for x in np.linspace(0.1, 1.0, 10, endpoint=True)],
            min_samples_leaf = [float(x) for x in np.linspace(0.1, 0.5, 5, endpoint=True)])
decision_tree_clf = train(tree_model, X, y, grid, "rmse", rmse, greater=False)


validation_rmse:-0.22927548820894567
test_rmse:0.228471647644574


In [19]:
# Random Forest Regressor
random_model = RandomForestRegressor(random_state=0)
grid2 = dict(n_estimators = [100],
             max_features = ['auto','sqrt','log2'],
             min_samples_split = [float(x) for x in np.linspace(0.1, 1.0, 10, endpoint=True)],
             min_samples_leaf = [float(x) for x in np.linspace(0.1, 0.5, 5, endpoint=True)])
random_tree_clf = train(random_model, X, y, grid2, "rmse", rmse, greater=False)

In [15]:
# XGBoost Model

xgb_model = XGBRegressor()
params = {
        'n_estimators': [500, 1000]
        }
xgb_model_clf, valid_score, test_score= train(xgb_model, X, y, params, "rmse", rmse, greater=False) 
    

validation_rmse:-0.1525117740687361
test_rmse:0.14655285390965905


In [17]:
report= xgb_model_clf.best_params_

In [18]:
metric  = {
    'valid_score': valid_score,
    'test_score':test_score
}
report.update(metric)
report

{'n_estimators': 500,
 'valid_score': -0.1525117740687361,
 'test_score': 0.14655285390965905}