In [129]:
import sys
import os
# Adds the parent directory of 'notebook' (the project root) to the search path
sys.path.append(os.path.abspath(os.path.join('..')))

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from src.features.custom_transformer import ColumnDropper, ColumnBasedMedianImputer
from src.features.build_features import build_preprocessor
from sklearn.metrics import root_mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [130]:
train_df = pd.read_csv("../data/raw/train.csv")
test_df = pd.read_csv("../data/raw/test.csv")
# create new model pipeline
X = train_df.drop('SalePrice',axis=1)

##as the y label is right skewed , we will log transform as regression expects normal distributed data
y = np.log1p(train_df['SalePrice'])

In [131]:
preprocessor = build_preprocessor()

In [132]:
drop_cols = ['Id','Fence','Alley','MiscFeature','PoolQC']
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include='object').columns

updated_num_cols = [c for c in num_cols if c not in drop_cols]
updated_cat_cols = [c for c in cat_cols if c not in drop_cols]

from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector

fill_LotFrontage_imputer = Pipeline([('imputer',ColumnBasedMedianImputer(fill_column='LotFrontage',
                                                                         group_by_columns='Neighborhood'))])

scaler_encoder = ColumnTransformer([('scaler',StandardScaler(),updated_num_cols),
                                    ('encoder',OneHotEncoder(handle_unknown='ignore',sparse_output=False,),updated_cat_cols)])


pipeline = Pipeline([('drop_column',ColumnDropper(drop_cols)),
                     ('fill_LotFrontage',ColumnBasedMedianImputer(fill_column='LotFrontage',group_by_columns='Neighborhood')),
                     ('preprocessor',preprocessor),
                     ('scale_and_encode',scaler_encoder)])

In [133]:
##create train test split

scaled_X = pipeline.fit_transform(X,y)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( scaled_X, y, test_size=0.33, random_state=42)



##Ridge model

In [134]:
##train ridge model

from sklearn.linear_model import RidgeCV

ridge = RidgeCV(alphas=(0.1,0.5,1.0,5,10,50,100),scoring='neg_mean_absolute_error',cv=5)

ridge.fit(X_train,y_train)


In [135]:
## test ridge model

pred = ridge.predict(X_test)


In [136]:
mae = mean_absolute_error(y_test,pred)
mae_dict = {'ridge':mae}
mae

0.09791553011587037

In [137]:
rmse = root_mean_squared_error(y_test,pred)
rmse_dict = {'ridge':rmse}
rmse

0.13890092302524282

In [138]:
ridge.alpha_

np.float64(50.0)

In [139]:
##Lasso model

from sklearn.linear_model import LassoCV

lasso = LassoCV(eps=0.001,n_alphas=10,cv=5,max_iter=10000)

lasso.fit(X_train,y_train)

In [140]:
pred = lasso.predict(X_test)

In [141]:
mae = mean_absolute_error(y_test,pred)
mae_dict.update({'lasso':mae})
mae

0.08535941065919676

In [142]:
rmse = root_mean_squared_error(y_test,pred)
rmse_dict.update({'lasso':rmse})
rmse

0.12096169812989112

In [143]:
X_train.shape

(978, 286)

In [144]:
##Random Forrest

from sklearn.ensemble import RandomForestRegressor

n_estimators = [64,100,128,200,250]
max_features = ['sqrt', 'log2', None]


param_grid = {'n_estimators':n_estimators,
              'max_features':max_features}

rfr = RandomForestRegressor()

grid = GridSearchCV(rfr,param_grid)

grid.fit(X_train,y_train)

In [145]:
grid.best_estimator_

In [146]:
pred = grid.best_estimator_.predict(X_test)

In [147]:
mae = mean_absolute_error(y_test,pred)
mae_dict.update({'Random Forrest':mae})
mae

0.09601022907253848

In [148]:
rmse = root_mean_squared_error(y_test,pred)
rmse_dict.update({'RandomForrest':rmse})
rmse

0.14544092429473607

In [149]:
##XGBoost

from sklearn.ensemble import GradientBoostingRegressor

param_grid = {'n_estimators':[50,100,150],
              'learning_rate': [0.1,0.005,0.2]}

gradient_boost = GradientBoostingRegressor()

grid = GridSearchCV(gradient_boost,param_grid)

grid.fit(X_train,y_train)

In [150]:
grid.best_estimator_

In [151]:
pred = grid.best_estimator_.predict(X_test)

In [152]:
mae = mean_absolute_error(y_test,pred)
mae_dict.update({'Gradient Boost':mae})
mae

0.08752284565579153

In [153]:
rmse = root_mean_squared_error(y_test,pred)
rmse_dict.update({'Gradient Boost':rmse})
rmse

0.13613063570947495

In [154]:
##support vector

from sklearn.svm import SVR

param_grid = {'C': [0.0001,0.01,0.1,0.5],
              'kernel':['linear','rbf','poly'],
              'gamma': ['scale','auto'],
              'degree':[2,3,4],
              'epsilon':[0,0.01,0.1,0.5,1,2]}

svr = SVR()

grid = GridSearchCV(svr,param_grid)

grid.fit(X_train,y_train)

In [155]:
grid.best_estimator_

In [156]:
grid.best_estimator_.get_params()

{'C': 0.5,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 2,
 'epsilon': 0.01,
 'gamma': 'auto',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [157]:
pred = grid.best_estimator_.predict(X_test)

In [158]:
mae = mean_absolute_error(y_test,pred)
mae_dict.update({'SVR':mae})
mae

0.08067614027349317

In [159]:
rmse = root_mean_squared_error(y_test,pred)
rmse_dict.update({'SVR':rmse})
rmse

0.12740854152742517

In [160]:
##KNN

from sklearn.neighbors import KNeighborsRegressor

param_grid = {'n_neighbors':list(range(1,20))}

knn = KNeighborsRegressor()

grid = GridSearchCV(knn,param_grid)


grid.fit(X_train,y_train)

In [161]:
grid.best_estimator_

In [162]:
pred = grid.best_estimator_.predict(X_test)

In [163]:
mae = mean_absolute_error(y_test,pred)
mae_dict.update({'knn' : mae})
mae

0.11862068816777291

In [164]:
rmse = root_mean_squared_error(y_test,pred)
rmse_dict.update({'knn':rmse})
rmse

0.17432423386054197

In [165]:
mae_dict

{'ridge': 0.09791553011587037,
 'lasso': 0.08535941065919676,
 'Random Forrest': 0.09601022907253848,
 'Gradient Boost': 0.08752284565579153,
 'SVR': 0.08067614027349317,
 'knn': 0.11862068816777291}

In [166]:
rmse_dict

{'ridge': 0.13890092302524282,
 'lasso': 0.12096169812989112,
 'RandomForrest': 0.14544092429473607,
 'Gradient Boost': 0.13613063570947495,
 'SVR': 0.12740854152742517,
 'knn': 0.17432423386054197}

best model: SVR
{'C': 0.5
 'degree': 2,
 'epsilon': 0.01,
 'gamma': 'auto',
 'kernel': 'rbf'}

 rmse : 0.12740854152742517