In [12]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse

warnings.filterwarnings('ignore')

# Connection settings for the PostgreSQL database queried in the next cell.
# Every value can be overridden through an environment variable, so real
# credentials never have to be typed into (or committed with) the notebook.
# NOTE(review): the fallbacks are the values that used to be hardcoded here;
# remove them once the environment variables are set wherever this runs.
postgres_user = os.environ.get('HOUSEPRICES_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_PG_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_PG_HOST', '142.93.121.174')
# Kept as a string: it is only ever interpolated into the connection URL.
postgres_port = os.environ.get('HOUSEPRICES_PG_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_PG_DB', 'houseprices')

In [13]:
# Load the whole `houseprices` table into a DataFrame.
# The connection URL is assembled from the postgres_* settings defined in the
# configuration cell.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
try:
    house_prices_df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # Only this one query is needed, so release the engine's connections
    # straight away -- in `finally` so they are released even if the query fails.
    engine.dispose()

# Quick look at the first rows to confirm the load worked.
print(house_prices_df.head(10))

   id  mssubclass mszoning  lotfrontage  lotarea street alley lotshape  \
0   1          60       RL         65.0     8450   Pave  None      Reg   
1   2          20       RL         80.0     9600   Pave  None      Reg   
2   3          60       RL         68.0    11250   Pave  None      IR1   
3   4          70       RL         60.0     9550   Pave  None      IR1   
4   5          60       RL         84.0    14260   Pave  None      IR1   
5   6          50       RL         85.0    14115   Pave  None      IR1   
6   7          20       RL         75.0    10084   Pave  None      Reg   
7   8          60       RL          NaN    10382   Pave  None      IR1   
8   9          50       RM         51.0     6120   Pave  None      Reg   
9  10         190       RL         50.0     7420   Pave  None      Reg   

  landcontour utilities  ... poolarea poolqc  fence miscfeature miscval  \
0         Lvl    AllPub  ...        0   None   None        None       0   
1         Lvl    AllPub  ...       

In [14]:
# One-hot encode the categorical columns `mszoning` and `street`.
# drop_first=True omits the first level of each variable so the remaining
# indicator columns are not perfectly collinear with an intercept.
mszoning_dummies = pd.get_dummies(house_prices_df['mszoning'], prefix="mszoning", drop_first=True)
street_dummies = pd.get_dummies(house_prices_df['street'], prefix="street", drop_first=True)

# Names of every indicator column, in the order they are appended below;
# later cells use this list to select the model features.
dummy_column_names = list(mszoning_dummies.columns) + list(street_dummies.columns)

# Drop indicator columns left over from a previous run of this cell before
# appending the fresh ones; otherwise re-running the cell would duplicate them.
house_prices_df = pd.concat(
    [house_prices_df.drop(columns=dummy_column_names, errors='ignore'),
     mszoning_dummies,
     street_dummies],
    axis=1)

In [21]:
# Target variable: the sale price of each house.
Y = house_prices_df['saleprice']

# Feature matrix: five numeric predictors plus the one-hot columns created in
# the dummy-encoding cell.
numeric_feature_names = ['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalbsmtsf']
X = house_prices_df[numeric_feature_names + dummy_column_names]

# Add an explicit constant column. This needs `import statsmodels.api as sm`
# in the notebook's import cell; without it this line raises NameError on a
# fresh kernel.
# NOTE(review): the scikit-learn estimators fitted below fit their own
# intercept by default, so this column is probably redundant -- confirm
# before removing it.
X = sm.add_constant(X)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=465)

In [22]:
# Baseline: ordinary least squares on the training split.
lrm = LinearRegression().fit(X_train, y_train)

# Predictions for both splits (only the test predictions feed the error metrics).
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# Absolute error of each test prediction, relative to the true price.
abs_pct_errors = np.abs((y_test - y_preds_test) / y_test)

# (message template, value) pairs, listed in the order they are printed.
test_report = [
    ("R-squared of the model in test set is: {}", lrm.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_errors) * 100),
]

print("R-squared of the model in training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
for template, value in test_report:
    print(template.format(value))

R-squared of the model in training set is: 0.7680258684645617
-----Test set statistics-----
R-squared of the model in test set is: 0.7677759097496346
Mean absolute error of the prediction is: 25546.803828589873
Mean squared error of the prediction is: 1559083821.972992
Root mean squared error of the prediction is: 39485.23549344732
Mean absolute percentage error of the prediction is: 15.551420347538441


In [30]:
# Candidate regularisation strengths: powers of ten from 1e-10 to 1e39.
# This must be defined here -- the LassoCV call below and the RidgeCV and
# ElasticNetCV cells further down all read `alphas`, and with the definition
# commented out they raise NameError on a fresh kernel.
alphas = [np.power(10.0, p) for p in np.arange(-10, 40, 1)]

# Lasso (L1-penalised) regression; alpha is chosen from `alphas` by
# 5-fold cross-validation on the training split.
lasso = LassoCV(alphas=alphas, cv=5)

lasso.fit(X_train, y_train)

# Predictions for both splits (only the test predictions feed the error metrics).
y_preds_train = lasso.predict(X_train)
y_preds_test = lasso.predict(X_test)

# Uncomment to report the alpha selected by cross-validation:
# print("Best alpha value is: {}".format(lasso.alpha_))
print("R-squared of the model in training set is: {}".format(lasso.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lasso.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
# Mean absolute percentage error: mean of |error| / true price, as a percentage.
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in training set is: 0.7678047359412336
-----Test set statistics-----
R-squared of the model in test set is: 0.7678676963052431
Mean absolute error of the prediction is: 25550.27903500316
Mean squared error of the prediction is: 1558467594.2001991
Root mean squared error of the prediction is: 39477.431453935795
Mean absolute percentage error of the prediction is: 15.613447980127408


In [31]:
# Ridge (L2-penalised) regression; alpha is chosen from the shared `alphas`
# grid by 5-fold cross-validation on the training split.
ridge = RidgeCV(alphas=alphas, cv=5).fit(X_train, y_train)

# Predictions for both splits (only the test predictions feed the error metrics).
y_preds_train = ridge.predict(X_train)
y_preds_test = ridge.predict(X_test)

# Absolute error of each test prediction, relative to the true price.
abs_pct_errors = np.abs((y_test - y_preds_test) / y_test)

# (message template, value) pairs, listed in the order they are printed.
test_report = [
    ("R-squared of the model in test set is: {}", ridge.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_errors) * 100),
]

# Uncomment to report the alpha selected by cross-validation:
# print("Best alpha value is: {}".format(ridge.alpha_))
print("R-squared of the model in training set is: {}".format(ridge.score(X_train, y_train)))
print("-----Test set statistics-----")
for template, value in test_report:
    print(template.format(value))

R-squared of the model in training set is: 0.7678338103945402
-----Test set statistics-----
R-squared of the model in test set is: 0.7678244205564295
Mean absolute error of the prediction is: 25539.73254414962
Mean squared error of the prediction is: 1558758134.7715333
Root mean squared error of the prediction is: 39481.11111368996
Mean absolute percentage error of the prediction is: 15.627026388844916


In [32]:
# Elastic net (combined L1/L2 penalty); alpha is chosen from the shared
# `alphas` grid by 5-fold cross-validation on the training split.
elasticnet = ElasticNetCV(alphas=alphas, cv=5).fit(X_train, y_train)

# Predictions for both splits (only the test predictions feed the error metrics).
y_preds_train = elasticnet.predict(X_train)
y_preds_test = elasticnet.predict(X_test)

# Absolute error of each test prediction, relative to the true price.
abs_pct_errors = np.abs((y_test - y_preds_test) / y_test)

# (message template, value) pairs, listed in the order they are printed.
test_report = [
    ("R-squared of the model in test set is: {}", elasticnet.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_errors) * 100),
]

# Uncomment to report the alpha selected by cross-validation:
# print("Best alpha value is: {}".format(elasticnet.alpha_))
print("R-squared of the model in training set is: {}".format(elasticnet.score(X_train, y_train)))
print("-----Test set statistics-----")
for template, value in test_report:
    print(template.format(value))

R-squared of the model in training set is: 0.7678925153819995
-----Test set statistics-----
R-squared of the model in test set is: 0.7676453805577793
Mean absolute error of the prediction is: 25567.300532077265
Mean squared error of the prediction is: 1559960156.3407888
Root mean squared error of the prediction is: 39496.33092251467
Mean absolute percentage error of the prediction is: 15.636674449979168
