In [225]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from datetime import datetime
import joblib


# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [226]:
# Read the raw apartment-sales table and preview its first five rows.
main_df = pd.read_csv(filepath_or_buffer='../data/Daegu_Real_Estate_data.csv')
main_df.head(n=5)

Unnamed: 0,SalePrice,YearBuilt,YrSold,MonthSold,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_APT,N_manager,N_elevators,SubwayStation,N_FacilitiesNearBy(PublicOffice),N_FacilitiesNearBy(Hospital),N_FacilitiesNearBy(Dpartmentstore),N_FacilitiesNearBy(Mall),N_FacilitiesNearBy(ETC),N_FacilitiesNearBy(Park),N_SchoolNearBy(Elementary),N_SchoolNearBy(Middle),N_SchoolNearBy(High),N_SchoolNearBy(University),N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,2007,8,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,3.0,0.0,Kyungbuk_uni_hospital,2.0,1,1.0,1.0,1.0,0.0,3.0,2.0,2.0,2.0,5,6.0,9.0
1,51327,1985,2007,8,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,1.0,2.0,2.0,Daegu,5.0,1,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,3,12.0,4.0
2,48672,1985,2007,8,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,1.0,2.0,2.0,Daegu,5.0,1,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,3,12.0,4.0
3,380530,2006,2007,8,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,6.0,5.0,11.0,Sin-nam,1.0,1,0.0,1.0,0.0,0.0,2.0,2.0,1.0,2.0,5,3.0,7.0
4,221238,1993,2007,8,1761,3,mixed,individual_heating,management_in_trust,523.0,536.0,0~5min,15min~20min,8.0,8.0,20.0,Myung-duk,6.0,2,0.0,1.0,5.0,0.0,4.0,3.0,5.0,5.0,4,14.0,17.0


In [227]:
# Work on an independent (deep) copy so `main_df` keeps the data as loaded.
df = main_df.copy(deep=True)

In [228]:
# Missing-value count per column.
df.isnull().sum()

SalePrice                             0
YearBuilt                             0
YrSold                                0
MonthSold                             0
Size(sqf)                             0
Floor                                 0
HallwayType                           0
HeatingType                           0
AptManageType                         0
N_Parkinglot(Ground)                  0
N_Parkinglot(Basement)                0
TimeToBusStop                         0
TimeToSubway                          0
N_APT                                 0
N_manager                             0
N_elevators                           0
SubwayStation                         0
N_FacilitiesNearBy(PublicOffice)      0
N_FacilitiesNearBy(Hospital)          0
N_FacilitiesNearBy(Dpartmentstore)    0
N_FacilitiesNearBy(Mall)              0
N_FacilitiesNearBy(ETC)               0
N_FacilitiesNearBy(Park)              0
N_SchoolNearBy(Elementary)            0
N_SchoolNearBy(Middle)                0


In [240]:
# Separate the regression target from the feature columns.
y = df['SalePrice']
X = df.drop(columns='SalePrice')

In [230]:
# List the feature names that remain after dropping the target.
X.columns

Index(['YearBuilt', 'YrSold', 'MonthSold', 'Size(sqf)', 'Floor', 'HallwayType',
       'HeatingType', 'AptManageType', 'N_Parkinglot(Ground)',
       'N_Parkinglot(Basement)', 'TimeToBusStop', 'TimeToSubway', 'N_APT',
       'N_manager', 'N_elevators', 'SubwayStation',
       'N_FacilitiesNearBy(PublicOffice)', 'N_FacilitiesNearBy(Hospital)',
       'N_FacilitiesNearBy(Dpartmentstore)', 'N_FacilitiesNearBy(Mall)',
       'N_FacilitiesNearBy(ETC)', 'N_FacilitiesNearBy(Park)',
       'N_SchoolNearBy(Elementary)', 'N_SchoolNearBy(Middle)',
       'N_SchoolNearBy(High)', 'N_SchoolNearBy(University)',
       'N_FacilitiesInApt', 'N_FacilitiesNearBy(Total)',
       'N_SchoolNearBy(Total)'],
      dtype='object')

In [241]:
# Numeric features that are scaled rather than encoded.
num_cols = ['Size(sqf)', 'N_Parkinglot(Basement)']

# Object-dtype columns are categorical; YearBuilt and Floor are numeric in the
# frame but are deliberately routed through the categorical branch as well.
forced_categorical = ['YearBuilt', 'Floor']
cat_cols = [
    name
    for name, dtype in X.dtypes.items()
    if dtype == "O" or name in forced_categorical
]


In [242]:
# Print the number of distinct values of every categorical column.
for name, n_unique in X[cat_cols].nunique().items():
    print(name, n_unique)

YearBuilt 16
Floor 43
HallwayType 3
HeatingType 2
AptManageType 2
TimeToBusStop 3
TimeToSubway 5
SubwayStation 8


In [243]:
# Keep only the modelled features: categorical columns first, then numeric ones.
X = X[cat_cols + num_cols]

In [244]:
# Display the reduced feature frame (pandas truncates the middle rows).
X

Unnamed: 0,YearBuilt,Floor,HallwayType,HeatingType,AptManageType,TimeToBusStop,TimeToSubway,SubwayStation,Size(sqf),N_Parkinglot(Basement)
0,2006,3,terraced,individual_heating,management_in_trust,5min~10min,10min~15min,Kyungbuk_uni_hospital,814,184.0
1,1985,8,corridor,individual_heating,self_management,0~5min,5min~10min,Daegu,587,76.0
2,1985,6,corridor,individual_heating,self_management,0~5min,5min~10min,Daegu,587,76.0
3,2006,8,terraced,individual_heating,management_in_trust,0~5min,0-5min,Sin-nam,2056,536.0
4,1993,3,mixed,individual_heating,management_in_trust,0~5min,15min~20min,Myung-duk,1761,536.0
...,...,...,...,...,...,...,...,...,...,...
5886,2007,19,terraced,individual_heating,management_in_trust,0~5min,0-5min,Kyungbuk_uni_hospital,1643,1270.0
5887,2006,13,terraced,individual_heating,management_in_trust,5min~10min,0-5min,Myung-duk,903,181.0
5888,2007,20,terraced,individual_heating,management_in_trust,0~5min,0-5min,Kyungbuk_uni_hospital,868,1270.0
5889,1978,1,corridor,individual_heating,self_management,0~5min,0-5min,Kyungbuk_uni_hospital,1327,0.0


In [119]:
# bin_cols = ['N_FacilitiesInApt']

# class make_bins_ct(BaseEstimator, TransformerMixin):

    
#     def __init__(self, cols):
#         self.cols = cols
#         self.labels = ['1-3','3-5','5-9','>=10']
        
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X, y=None):
#         for col in bin_cols:
#             X[col] = pd.cut(X[col], [-1,3,5,9,X[col].max()], labels=self.labels)
#         return X

# bin_pipe = make_pipeline(make_bins_ct(cols=bin_cols),OneHotEncoder(handle_unknown='ignore'),StandardScaler(with_mean=False) )

# X = pd.concat([X[cat_cols],X[bin_cols],X[num_cols]],axis=1)

In [245]:
# Categorical branch: one-hot encode (categories unseen at fit time are ignored),
# then scale each indicator column; with_mean=False skips centering.
cat_pipe = Pipeline(steps=[
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ('standardscaler', StandardScaler(with_mean=False)),
])

# Numeric branch: min-max scaling only.
num_pipe = Pipeline(steps=[
    ('minmaxscaler', MinMaxScaler()),
])


In [252]:
# Route each column group through its own pipeline; any column not listed is
# passed through unchanged.
main_ct = ColumnTransformer(
    transformers=[
        ('pipeline-1', cat_pipe, cat_cols),
        ('pipeline-2', num_pipe, num_cols),
    ],
    remainder="passthrough",
)

In [237]:
# model_LR = Pipeline([
#     ("preprocess", main_ct),
#     ("model_LR", LinearRegression())
# ])

# model_RF = Pipeline([
#     ("preprocess", main_ct),
#     ("model_LR", RandomForestRegressor())
# ])

In [238]:

# Baseline regressors with default hyper-parameters, keyed by their class name.
estimators = {
    regressor_cls.__name__: regressor_cls()
    for regressor_cls in (
        SVR,
        DecisionTreeRegressor,
        RandomForestRegressor,
        GradientBoostingRegressor,
    )
}

In [27]:
def modeler(estimators, X, y, params=None, random_state=25):
    """Fit every estimator behind the shared preprocessor and tabulate hold-out metrics.

    The data is split 80/20; each estimator is wrapped in a pipeline whose first
    step is the module-level ``main_ct`` column transformer, fitted on the
    training part and scored on the test part.

    Parameters
    ----------
    estimators : dict
        Maps a display name to an unfitted regressor.
    X, y
        Feature table and target passed to ``train_test_split``.
    params
        Accepted for backward compatibility; not used by this function.
    random_state : int, default 25
        Seed for ``train_test_split``. Earlier versions ignored this argument
        and always split with seed 22.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with the columns Model, MAE, MAPE, MSE, RMSE and
        R2. MAPE is reported as a fraction, not a percentage.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    columns = ['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2']
    rows = []

    for name, estimator in estimators.items():
        model = Pipeline([
            ("preprocess", main_ct),
            ("model", estimator)
        ])
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rows.append({
            'Model': name,
            'MAE': mean_absolute_error(y_test, y_pred),
            'MAPE': np.mean(np.abs((y_test - y_pred) / y_test)),
            'MSE': mse,
            # Take the root explicitly: the `squared=False` flag of
            # mean_squared_error is deprecated/removed in recent scikit-learn.
            'RMSE': np.sqrt(mse),
            'R2': r2_score(y_test, y_pred),
        })

    # Passing `columns` keeps the header stable even when `estimators` is empty.
    return pd.DataFrame(rows, columns=columns)

<h1>BASELINE ESTIMATORS</h1>

In [28]:
# Pass the seed by keyword: the fourth positional parameter of `modeler` is
# `params`, so a bare 22 there never reaches `random_state`.
modeler(estimators, X, y, random_state=22)

Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2
0,SVR,88393.662458,0.55676,11900420000.0,109089.050776,-0.022786
1,DecisionTreeRegressor,34760.537781,0.182045,1979396000.0,44490.406137,0.82988
2,RandomForestRegressor,34759.45921,0.182146,1979523000.0,44491.828123,0.829869
3,GradientBoostingRegressor,35258.102062,0.185993,2007893000.0,44809.518535,0.827431


<h1>GRID SEARCH FOR BEST MODEL AND PARAMS</h1>

In [38]:
# Hyper-parameter grids for the grid search, keyed by the same names as `estimators`.
params = dict(
    SVR=dict(
        C=[0.1, 1, 10, 100],
        gamma=[1, 0.1, 0.01, 0.001],
        kernel=['rbf', 'poly', 'sigmoid'],
    ),
    RandomForestRegressor=dict(
        bootstrap=[True, False],
        # 10, 20, ..., 100 plus None (no depth limit)
        max_depth=list(range(10, 101, 10)) + [None],
        max_features=[1, 'sqrt'],
        min_samples_leaf=[1, 2, 4],
        min_samples_split=[2, 5, 10],
        n_estimators=[100, 250, 500],
    ),
    DecisionTreeRegressor=dict(
        splitter=['best', 'random'],
        max_depth=[1, 3, 5, 7, 12],
        min_samples_leaf=[1, 3, 5, 7, 10],
        max_features=['log2', 'sqrt', 1],
        max_leaf_nodes=[None, 10, 30, 50, 90],
    ),
    GradientBoostingRegressor=dict(
        learning_rate=[0.01, 0.02, 0.03, 0.04],
        subsample=[0.9, 0.5, 0.2, 0.1],
        n_estimators=[100, 200, 500],
        max_depth=[4, 6, 8, 10],
    ),
)

In [35]:
# 80/20 hold-out split with a fixed seed, then fit the preprocessor on the
# training part and keep the transformed training matrix.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=22)
X_train, X_test, y_train, y_test = holdout_split
prepped_X_train = main_ct.fit_transform(X_train)



def find_best_model(estimators):
    """Grid-search every estimator and collect its best hyper-parameters.

    Reads the module-level ``params`` (grid per estimator name), ``main_ct``
    (preprocessor), ``X_train`` and ``y_train``.

    The preprocessor is searched together with the estimator inside one
    pipeline, so the encoder and scalers are re-fitted on the training folds of
    each CV split instead of being fitted once on all training rows (which would
    let the validation folds influence the preprocessing). ``GridSearchCV``
    clones the pipeline for every fit, so the global ``main_ct`` is not modified.

    Parameters
    ----------
    estimators : dict
        Maps a name to an unfitted regressor; each name must be a key of ``params``.

    Returns
    -------
    list of dict
        Best parameter set per estimator, in the iteration order of
        ``estimators``, keyed by the plain estimator parameter names.
    """
    step = "model"
    prefix = step + "__"

    best_params = []
    for name, estimator in estimators.items():
        search = GridSearchCV(
            Pipeline([("preprocess", main_ct), (step, estimator)]),
            # Address the grid to the estimator step of the pipeline.
            param_grid={prefix + key: values for key, values in params[name].items()},
            cv=3,
            verbose=3,
        )
        search.fit(X_train, y_train)
        # Strip the step prefix so callers see the estimator's own parameter names.
        best_params.append({
            key[len(prefix):]: value
            for key, value in search.best_params_.items()
        })

    return best_params

In [None]:
# Run the grid search for every estimator; one best-parameter dict per estimator is returned.
best_params = find_best_model(estimators)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Random forest behind the shared preprocessor, trained on the 80 % split.
model_RF = Pipeline([
    ("preprocess", main_ct),
    ("model_RF", RandomForestRegressor())
])

model_RF.fit(X_train, y_train)
# Predict with the pipeline fitted above (the former `model_RFb` name was
# undefined and raised NameError).
y_pred = model_RF.predict(X_test)

In [254]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# NOTE(review): this grid is not referenced by any visible cell; the forest
# below is trained with default hyper-parameters.
grid = { 
    'n_estimators': [50,100,150,200],
    'max_features': ['sqrt','log2'],
    'max_depth' : [3,4,5,6,7]
}

# Seed the forest so the fitted (and later persisted) model is reproducible.
model_RF = RandomForestRegressor(random_state=22)

# Time preprocessing and training together.
start_time = datetime.now()
prepped_X_train = main_ct.fit_transform(X_train)
model_RF.fit(prepped_X_train, y_train)

end_time = datetime.now()
print(end_time - start_time)

0:00:32.482510


In [251]:
import pickle

# Persist the preprocessor and the model as two separate pickle files
# (the joblib.dump variant of this cell was left disabled).
for target_path, artefact in (('prep_pipe.pkl', main_ct), ('model.pkl', model_RF)):
    with open(target_path, 'wb') as handle:
        pickle.dump(artefact, handle)

In [40]:
# def print_metrics(y_test,models,y_pred):
    
#     MAE,MAPE,MSE,RMSE,R2 = ([] for i in range(5))
    
#     for i in range(len(models)):
#         MAE.append(mean_absolute_error(y_test, y_pred[i] ))
#         MAPE.append(np.mean(np.abs((y_test - y_pred[i])/ y_test)))
#         MSE.append(mean_squared_error(y_test,y_pred[i]))
#         RMSE.append(mean_squared_error(y_test,y_pred[i],squared=False))
#         R2.append(r2_score(y_test,y_pred[i]))
        
#     data = {
#         'Model':models,
#         'MAE': MAE,
#         'MAPE': MAPE,
#         'MSE':MSE,
#         'RMSE': RMSE,
#         'R2': R2
#     }

#     return pd.DataFrame(data)

# print_metrics(y_test,models=['Linear Regression','Random Forest'],y_pred=[y_pred_lr, y_pred_rf])