In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set()
%matplotlib inline

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error as mae

from sklearn.model_selection import KFold, cross_validate, cross_val_score
import scipy.stats as st

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import warnings 
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
# Load and view data
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

pd.set_option('display.max_columns', None)

In [3]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1168, 117)
(1168, 1)
(292, 117)
(292, 1)


# 1. First Linear Regression Model

In this model, I ran a straight forward Linear Regression Model.

In [4]:
# Linear Regression Model
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
    
y_pred = lr.predict(X_test) 

mse = mean_squared_error(y_test, y_pred)
rmse1 = np.sqrt(mse)

round(rmse1, 2)

34044.04

In [5]:
# RMSE range
y_test_mean = np.mean(y_test)

lower = y_test_mean - rmse1
upper = y_test_mean + rmse1

print(str(round(lower[0], 2)) +'\n'+ str(round(upper[0], 2)))

144795.77
212883.86


In [6]:
# Root Mean Square Percent Error
rmspe1 = (np.sqrt(np.mean(np.square((y_test - y_pred) / y_test)))) * 100
rmspe1[0]

17.091033580513024

In [7]:
# Calculate Mean Absolute Error
error = mae(y_test, y_pred)
  
# display
print("Mean absolute error : " + str(error))

Mean absolute error : 21260.00684931507


In [8]:
# Calculate Mean Absolute Percent Error
def mape(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    calc = np.mean(np.abs((actual - pred) / actual)) * 100
    return calc

mape1 = mape(y_test,y_pred)

print("Mean absolute percent error : " + str(mape1))

Mean absolute percent error : 12.295704942988314


# 2. Second Linear Regression Model

I seem to be having some trouble applying Cross validation to my Linear Regression Model. My RMSE value keeps coming out very very large.

In [9]:
k_folds = KFold(n_splits=5, random_state=42, shuffle=True)

scores = cross_val_score(lr, X_train, y_train, 
                         scoring="neg_mean_squared_error", cv = k_folds)

print("Cross Validation Scores: ", scores)
print("RMSE: ", np.sqrt(np.mean(np.absolute(scores))))
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [-9.27782702e+08 -3.55600702e+31 -9.29906876e+30 -8.05444206e+08
 -4.05403132e+29]
RMSE:  3008805149668394.5
Number of CV Scores used in Average:  5


# 3. Third Linear Regression Model

In this model, I observed the correlation between each column and the "Saleprice". I then tested removing lower correlation number to see how the RMSE changed. 

In [10]:
corr_dict={}
for col in X_train.columns:
    corr = X_train[col].corr(y_train['SalePrice'])
    corr_dict.update({col : np.abs(corr)})
    
corr_df = pd.DataFrame(corr_dict.items(),columns=['Column', 'Correlation'])\
                        .sort_values(by='Correlation',ascending=False)

corr_df

Unnamed: 0,Column,Correlation
2,OverallQual,0.785555
12,GrLivArea,0.695652
21,GarageCars,0.640991
22,GarageArea,0.624139
8,TotalBsmtSF,0.597766
...,...,...
45,RoofStyle_Mansard,0.006321
6,BsmtFinSF2,0.005731
111,Foundation_Wood,0.002776
110,Foundation_Stone,0.002416


In [11]:
corr_30 = corr_df[corr_df['Correlation'] > 0.3]
print(f'\033[1mNumber of remaining features:\033[0m {corr_30.shape[0]}')


[1mNumber of remaining features:[0m 25


In [12]:
new_cols = list(corr_30['Column'])

In [13]:
X_train2 = X_train[new_cols]
X_test2 = X_test[new_cols]

print(X_train.shape)
print(X_test.shape)
print(X_train2.shape)
print(X_test2.shape)

(1168, 117)
(292, 117)
(1168, 25)
(292, 25)


In [14]:
lr = linear_model.LinearRegression()
lr.fit(X_train2, y_train)
    
y_pred2 = lr.predict(X_test2) 

mse2 = mean_squared_error(y_test, y_pred2)
rmse2 = np.sqrt(mse2)

round(rmse2, 2)

34754.13

In [15]:
# Root Mean Square Percent Error
rmspe2 = (np.sqrt(np.mean(np.square((y_test - y_pred2) / y_test)))) * 100
rmspe2[0]

18.646905087342773

In [16]:
# calculate Mean Absolute Error
error2 = mae(y_test, y_pred2)
  
# display
print("Mean absolute error : " + str(error2))

Mean absolute error : 22400.1063978875


In [17]:
mape2 = mape(y_test,y_pred2)

print("Mean absolute percent error : " + str(mape2))

Mean absolute percent error : 13.388847987452207


# 4. Random Forest Model

In [18]:
rf = RandomForestRegressor(random_state=42)

rf.fit(X_train, y_train.values.ravel())  

y_pred3 = rf.predict(X_test) 

mse3 = mean_squared_error(y_test, y_pred3)
rmse3 = np.sqrt(mse3)

round(rmse3, 0)

27763.0

In [19]:
y_pred3 = y_pred3.reshape(292,1)

# Root Mean Square Percent Error
rmspe3 = (np.sqrt(np.mean(np.square((y_test - y_pred3) / y_test)))) * 100
rmspe3[0]

18.18019480599643

In [20]:
# calculate Mean Absolute Error
error3 = mae(y_test, y_pred3)
  
# display
print("Mean absolute error : " + str(error3))

Mean absolute error : 17505.4601826484


In [21]:
mape3 = mape(y_test,y_pred3)

print("Mean absolute percent error : " + str(mape3))

Mean absolute percent error : 10.665555030964459


# 5. Random Forest Model Hyperparameter Optimization
 
Using RandomizedSearchCV searching for the best hyperparameters.

In [22]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [23]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train.values.ravel())

# Best params
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [24]:
#Apply best params
  
rf2 = RandomForestRegressor(n_estimators= 400, min_samples_split= 2,
                            min_samples_leaf= 1, max_features= 'sqrt',
                            max_depth= None, bootstrap= False, random_state=42)


rf2.fit(X_train, y_train.values.ravel())  

y_pred4 = rf2.predict(X_test) 

mse4 = mean_squared_error(y_test, y_pred4)
rmse4 = np.sqrt(mse4)

round(rmse4, 0)

28930.0

In [25]:
y_pred4 = y_pred4.reshape(292,1)

# Root Mean Square Percent Error
rmspe4 = (np.sqrt(np.mean(np.square((y_test - y_pred4) / y_test)))) * 100
rmspe4[0]

18.932930258350016

In [26]:
# calculate Mean Absolute Error
error4 = mae(y_test, y_pred4)
  
# display
print("Mean absolute error : " + str(error4))

Mean absolute error : 16803.014999999996


In [27]:
mape4 = mape(y_test,y_pred4)

print("Mean absolute percent error : " + str(mape4))

Mean absolute percent error : 10.49535137104386


# 6. Random Forest Model Hyperparameter Optimization
 
Using GridSearchCV searching for the best hyperparameters.

In [28]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
    
# Create a based model
rf3 = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf3, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)


In [29]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train.values.ravel())
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


{'bootstrap': True,
 'max_depth': 90,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 200}

In [30]:
rf3 = RandomForestRegressor(n_estimators= 100, min_samples_split= 8,
                            min_samples_leaf= 3, max_features= 3,
                            max_depth= 100, bootstrap= True, random_state=42)


rf3.fit(X_train, y_train.values.ravel())  

y_pred5 = rf3.predict(X_test) 

mse5 = mean_squared_error(y_test, y_pred5)
rmse5 = np.sqrt(mse5)

round(rmse5, 0)

42705.0

In [31]:
y_pred5 = y_pred5.reshape(292,1)

# Root Mean Square Percent Error
rmspe5 = (np.sqrt(np.mean(np.square((y_test - y_pred5) / y_test)))) * 100
rmspe5[0]

27.31570437630738

In [32]:
# calculate Mean Absolute Error
error5 = mae(y_test, y_pred5)
  
# display
print("Mean absolute error : " + str(error5))

Mean absolute error : 24293.047791938076


In [33]:
mape5 = mape(y_test,y_pred5)

print("Mean absolute percent error : " + str(mape5))

Mean absolute percent error : 15.583474793001056


# 7. Results Dataframe

In [34]:
error_df = pd.DataFrame([('1st LR Model', rmse1, rmspe1[0], error, mape1),
                  ('LR CORR Model', rmse2, rmspe2[0], error2, mape2),
                  ('1st RF Model', rmse3, rmspe3[0], error3, mape3), 
                  ('RF RandCV Model', rmse4, rmspe4[0], error4, mape4),
                  ('RF GridCV Model',rmse5, rmspe5[0], error5, mape5)], 
                 columns = ['Name','RMSE', 'RMSPE','MAE',"MAPE"])

error_df

Unnamed: 0,Name,RMSE,RMSPE,MAE,MAPE
0,1st LR Model,34044.044801,17.091034,21260.006849,12.295705
1,LR CORR Model,34754.126151,18.646905,22400.106398,13.388848
2,1st RF Model,27763.130613,18.180195,17505.460183,10.665555
3,RF RandCV Model,28930.042529,18.93293,16803.015,10.495351
4,RF GridCV Model,42705.141467,27.315704,24293.047792,15.583475


[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=  19.1s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1800; total time=  17.7s
[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=600; total time=   5.2s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1800; total time=   3.6s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1800; total time=   3.7s
[CV] END bootstrap=True, max_depth=80, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   2.5s
[CV] END bootstrap=False, max_depth=8

[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.6s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   2.0s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  15.2s
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   2.5s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=

[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.6s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.4s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   2.0s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=600; total time=   1.3s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1800; total time=  17.1s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   2.9s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   7.4s
[CV] END bootstrap=False, max_depth=1