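This notebook takes the best model from the previous checkpoint and compares its OLS fit against Lasso, Ridge, and ElasticNet regularized versions, each tuned with 5-fold cross-validation.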

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the `houseprices` table from PostgreSQL into `df`.
#
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited in (or committed with) the notebook.
# The defaults reproduce the values previously hard-coded in this cell, so the
# notebook still runs unchanged when no variables are set.
import os
from urllib.parse import quote

postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

# URL-escape the user and password: characters such as '@', '/' or ':' in a
# credential would otherwise be parsed as URL delimiters.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# no need for an open connection, as we're only doing a single query;
# dispose in `finally` so the pool is released even if the query fails
try:
    df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

In [4]:
# One-hot encode the two categorical columns (dropping the first level of each),
# append the indicator columns to `df`, and remember their names so they can be
# added to the feature list later.
mszoning_dummies = pd.get_dummies(df.mszoning, prefix="mszoning", drop_first=True)
df = pd.concat([df, mszoning_dummies], axis=1)

street_dummies = pd.get_dummies(df.street, prefix="street", drop_first=True)
df = pd.concat([df, street_dummies], axis=1)

dummy_col_names = list(mszoning_dummies.columns) + list(street_dummies.columns)

In [11]:
# Engineered features: total square footage (basement + first + second floor)
# and its interaction with the overall quality rating.
df['totalsf'] = df['totalbsmtsf'] + df['firstflrsf'] + df['secondflrsf']
df['int_over_sf'] = df['totalsf'] * df['overallqual']

# Feature matrix: numeric predictors followed by the one-hot indicator columns.
feature_cols = [
    'overallqual',
    'grlivarea',
    'garagecars',
    'garagearea',
    'totalsf',
    'int_over_sf',
] + dummy_col_names

# NOTE(review): scikit-learn linear models fit an intercept by default, so this
# constant column looks redundant for the estimators fitted below - it is kept
# so the feature matrix stays exactly as before.
X = sm.add_constant(df[feature_cols])

# Target: log(1 + saleprice), so every metric downstream is on the log scale.
Y = np.log1p(df['saleprice'])

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=465
)

In [12]:
# Baseline: ordinary least squares fitted on the training split.
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# Predictions for both splits (the target is log1p(saleprice), so all the
# error metrics below are measured on that log scale).
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# Absolute error relative to the true (log-scale) target, per test row.
test_abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

print('R-squared of the model in training set is: {}'.format(lrm.score(X_train, y_train)))
print('-----Test set statistics-----')
for template, value in (
    ('R-squared of the model in test set is: {}', lrm.score(X_test, y_test)),
    ('Mean absolute error of the prediction is: {}', mean_absolute_error(y_test, y_preds_test)),
    ('Mean squared error of the prediction is: {}', mse(y_test, y_preds_test)),
    ('Root mean squared error of the prediction is: {}', rmse(y_test, y_preds_test)),
    ('Mean absolute percentage error of the prediction is: {}', np.mean(test_abs_pct_error) * 100),
):
    print(template.format(value))

R-squared of the model in training set is: 0.8321322553132751
-----Test set statistics-----
R-squared of the model in test set is: 0.824930233091685
Mean absolute error of the prediction is: 0.12570372872861377
Mean squared error of the prediction is: 0.02919212187135
Root mean squared error of the prediction is: 0.1708570217209407
Mean absolute percentage error of the prediction is: 1.0503577667823585


In [7]:
# Candidate regularization strengths: powers of ten from 1e-10 up to 1e39.
# The same grid is reused by the other cross-validated models.
alphas = [np.power(10.0, exponent) for exponent in np.arange(-10, 40, 1)]

# Lasso (L1 penalty) with alpha chosen by 5-fold cross-validation.
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)

# Predictions for both splits (log1p(saleprice) scale).
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

# Absolute error relative to the true (log-scale) target, per test row.
test_abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

print('Best alpha value is: {}'.format(lasso_cv.alpha_))
print('R-squared of the model in training set is: {}'.format(lasso_cv.score(X_train, y_train)))
print('-----Test set statistics-----')
for template, value in (
    ('R-squared of the model in test set is: {}', lasso_cv.score(X_test, y_test)),
    ('Mean absolute error of the prediction is: {}', mean_absolute_error(y_test, y_preds_test)),
    ('Mean squared error of the prediction is: {}', mse(y_test, y_preds_test)),
    ('Root mean squared error of the prediction is: {}', rmse(y_test, y_preds_test)),
    ('Mean absolute percentage error of the prediction is: {}', np.mean(test_abs_pct_error) * 100),
):
    print(template.format(value))

Best alpha value is: 0.0001
R-squared of the model in training set is: 0.8319394287042422
-----Test set statistics-----
R-squared of the model in test set is: 0.8226434437869413
Mean absolute error of the prediction is: 0.12624310826908405
Mean squared error of the prediction is: 0.02957343403767703
Root mean squared error of the prediction is: 0.17196928225028163
Mean absolute percentage error of the prediction is: 1.0552354946577738


In [8]:
# Ridge (L2 penalty) with alpha chosen from the shared `alphas` grid by
# 5-fold cross-validation.
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

# Predictions for both splits (log1p(saleprice) scale).
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

# Absolute error relative to the true (log-scale) target, per test row.
test_abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

print('Best alpha value is: {}'.format(ridge_cv.alpha_))
print('R-squared of the model in training set is: {}'.format(ridge_cv.score(X_train, y_train)))
print('-----Test set statistics-----')
for template, value in (
    ('R-squared of the model in test set is: {}', ridge_cv.score(X_test, y_test)),
    ('Mean absolute error of the prediction is: {}', mean_absolute_error(y_test, y_preds_test)),
    ('Mean squared error of the prediction is: {}', mse(y_test, y_preds_test)),
    ('Root mean squared error of the prediction is: {}', rmse(y_test, y_preds_test)),
    ('Mean absolute percentage error of the prediction is: {}', np.mean(test_abs_pct_error) * 100),
):
    print(template.format(value))

Best alpha value is: 1.0
R-squared of the model in training set is: 0.8316364867222636
-----Test set statistics-----
R-squared of the model in test set is: 0.8203050076234277
Mean absolute error of the prediction is: 0.12673637339741065
Mean squared error of the prediction is: 0.029963358092979
Root mean squared error of the prediction is: 0.1730992723640946
Mean absolute percentage error of the prediction is: 1.059694123031067


In [10]:
# ElasticNet (combined L1/L2 penalty) with alpha chosen from the shared
# `alphas` grid by 5-fold cross-validation.
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=5)
elasticnet_cv.fit(X_train, y_train)

# Predictions for both splits (log1p(saleprice) scale).
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

# Absolute error relative to the true (log-scale) target, per test row.
test_abs_pct_error = np.abs((y_test - y_preds_test) / y_test)

print('Best alpha value is: {}'.format(elasticnet_cv.alpha_))
print('R-squared of the model in training set is: {}'.format(elasticnet_cv.score(X_train, y_train)))
print('-----Test set statistics-----')
for template, value in (
    ('R-squared of the model in test set is: {}', elasticnet_cv.score(X_test, y_test)),
    ('Mean absolute error of the prediction is: {}', mean_absolute_error(y_test, y_preds_test)),
    ('Mean squared error of the prediction is: {}', mse(y_test, y_preds_test)),
    ('Root mean squared error of the prediction is: {}', rmse(y_test, y_preds_test)),
    ('Mean absolute percentage error of the prediction is: {}', np.mean(test_abs_pct_error) * 100),
):
    print(template.format(value))

Best alpha value is: 0.001
R-squared of the model in training set is: 0.8299654806803801
-----Test set statistics-----
R-squared of the model in test set is: 0.8149185869526185
Mean absolute error of the prediction is: 0.12770726087011364
Mean squared error of the prediction is: 0.03086152030253383
Root mean squared error of the prediction is: 0.1756744725409296
Mean absolute percentage error of the prediction is: 1.068544489730312


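OLS looks to be the best model according to the test-set results: it has the highest R-squared (about 0.825, versus 0.823 for Lasso, 0.820 for Ridge, and 0.815 for ElasticNet) and the lowest MAE, MSE, RMSE, and MAPE of the four models. The differences are small, so regularization neither helps nor hurts much for this feature set.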