In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

* Load the houseprices data from Thinkful's database.
* Reimplement your model from the previous checkpoint.
* Try OLS, lasso, ridge, and elastic net regression using the same model specification. This time, you need to do k-fold cross-validation to choose the best hyperparameter values for your models. Scikit-learn has RidgeCV, LassoCV, and ElasticNetCV that you can utilize to do this. Which model is the best? Why?

In [8]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

import warnings

warnings.filterwarnings("ignore")
import config

<IPython.core.display.Javascript object>

In [21]:
# Connection settings are read from the local `config` module rather than
# being written into the notebook.
postgres_user = config.user
postgres_pw = config.password
postgres_host = config.host
postgres_port = config.port
postgres_db = "houseprices"

engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)

try:
    df = pd.read_sql_query("select * from houseprices", con=engine)
finally:
    # Only a single query is needed, so the engine is disposed right away.
    # The finally clause makes sure that also happens when the query raises.
    engine.dispose()

<IPython.core.display.Javascript object>

In [22]:
# One-hot encode condition1 and condition2. Each dummy frame is built once
# and reused for both the column-name list and the concat below.
# drop_first=True leaves out the first level of each column.
condition1_dummies = pd.get_dummies(
    df["condition1"], prefix="condition1", drop_first=True
)
condition2_dummies = pd.get_dummies(
    df["condition2"], prefix="condition2", drop_first=True
)
dummy_cols = list(condition1_dummies.columns) + list(condition2_dummies.columns)

# Drop any dummy columns left over from an earlier execution of this cell so
# that re-running it does not append duplicate columns to df. On a first run
# nothing matches and errors="ignore" turns the drop into a no-op.
df = pd.concat(
    [
        df.drop(columns=dummy_cols, errors="ignore"),
        condition1_dummies,
        condition2_dummies,
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

In [23]:
# Initial feature matrix: four base predictors followed by every dummy
# column named in dummy_cols.
X = df.loc[
    :,
    ["overallqual", "grlivarea", "garagecars", "totalbsmtsf", *dummy_cols],
]

<IPython.core.display.Javascript object>

In [24]:
# Dummy columns that are removed from the feature matrix later on,
# grouped by the column they were encoded from.
droplist = [
    # condition1 dummies
    "condition1_PosN",
    "condition1_RRAn",
    "condition1_Feedr",
    "condition1_PosA",
    "condition1_RRAe",
    "condition1_RRNe",
    "condition1_RRNn",
] + [
    # condition2 dummies
    "condition2_Feedr",
    "condition2_Norm",
    "condition2_PosA",
    "condition2_RRAe",
    "condition2_RRAn",
    "condition2_RRNn",
]

<IPython.core.display.Javascript object>

In [25]:
# Engineered features: combined square footage (basement + first floor +
# second floor) and its product with overallqual.
df["totalsf"] = df["totalbsmtsf"] + df["firstflrsf"] + df["secondflrsf"]
df["inter_qual_sf"] = df["totalsf"] * df["overallqual"]
engineered_cols = ["totalsf", "inter_qual_sf"]

# The target is log1p(saleprice), so every metric computed from y downstream
# is on the log scale, not in the original price units.
y = np.log1p(df["saleprice"])

# Rebuild X so that this cell can be executed more than once:
#   * the unwanted dummies and totalbsmtsf (already included in totalsf)
#     are removed,
#   * engineered columns added by an earlier execution are removed before
#     being attached again, so they are never duplicated.
# errors="ignore" is what makes the re-run safe; on a first run every
# droplist column and totalbsmtsf are expected to be present.
X = pd.concat(
    [
        X.drop(
            columns=droplist + ["totalbsmtsf"] + engineered_cols,
            errors="ignore",
        ),
        df[engineered_cols],
    ],
    axis=1,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

<IPython.core.display.Javascript object>

In [26]:
# Ordinary least squares baseline, fitted on the training split only.
model = LinearRegression().fit(X_train, y_train)

# Predictions for both splits; the next cell reports metrics from them.
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

<IPython.core.display.Javascript object>

In [27]:
# Collect the OLS report lines first, then emit them in one print call.
# All error metrics compare y_test with the OLS test predictions.
report_lines = [
    "R-squared for training: {}".format(model.score(X_train, y_train)),
    "-------Test Set Stats-------",
    "R-squared for testing: {}".format(model.score(X_test, y_test)),
    "MAE: {}".format(mean_absolute_error(y_test, y_pred_test)),
    "MSE: {}".format(mse(y_test, y_pred_test)),
    "RMSE: {}".format(rmse(y_test, y_pred_test)),
    # Mean absolute percentage error, expressed in percent.
    "MAPE: {}".format(np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100),
]
print("\n".join(report_lines))

R-squared for training: 0.8055560580981167
-------Test Set Stats-------
R-squared for testing: 0.8263515481931782
MAE: 0.12830411924832696
MSE: 0.031795580798379124
RMSE: 0.1783131537446947
MAPE: 1.0823455824107318


<IPython.core.display.Javascript object>

In [35]:
# Candidate regularization strengths: powers of ten from 1e-10 up to 1e39.
# The ridge and elastic net cells further down reuse this same list.
alphas = [np.power(10.0, exponent) for exponent in np.arange(-10, 40, 1)]

# Lasso with alpha chosen by 5-fold cross-validation on the training split.
model_lasso = LassoCV(alphas=alphas, cv=5).fit(X_train, y_train)
y_pred = model_lasso.predict(X_test)

# Build the report lines first, then emit them in one print call.
report_lines = [
    "Best alpha: {}".format(model_lasso.alpha_),
    "R-squared for training: {}".format(model_lasso.score(X_train, y_train)),
    "-------Test Set Stats-------",
    "R-squared for testing: {}".format(model_lasso.score(X_test, y_test)),
    "MAE: {}".format(mean_absolute_error(y_test, y_pred)),
    "MSE: {}".format(mse(y_test, y_pred)),
    "RMSE: {}".format(rmse(y_test, y_pred)),
    # Mean absolute percentage error, expressed in percent.
    "MAPE: {}".format(np.mean(np.abs((y_test - y_pred) / y_test)) * 100),
]
print("\n".join(report_lines))

Best alpha: 0.001
R-squared for training: 0.8031161829794807
-------Test Set Stats-------
R-squared for testing: 0.8247308238237335
MAE: 0.12880626346603025
MSE: 0.032092340556985614
RMSE: 0.17914335197540995
MAPE: 1.0860708717647505


<IPython.core.display.Javascript object>

In [36]:
# Ridge with alpha chosen by 5-fold cross-validation on the training split.
# `alphas` is the candidate grid defined in the lasso cell above.
model_ridge = RidgeCV(alphas=alphas, cv=5).fit(X_train, y_train)
y_pred = model_ridge.predict(X_test)

# Build the report lines first, then emit them in one print call.
report_lines = [
    "Best alpha: {}".format(model_ridge.alpha_),
    "R-squared for training: {}".format(model_ridge.score(X_train, y_train)),
    "-------Test Set Stats-------",
    "R-squared for testing: {}".format(model_ridge.score(X_test, y_test)),
    "MAE: {}".format(mean_absolute_error(y_test, y_pred)),
    "MSE: {}".format(mse(y_test, y_pred)),
    "RMSE: {}".format(rmse(y_test, y_pred)),
    # Mean absolute percentage error, expressed in percent.
    "MAPE: {}".format(np.mean(np.abs((y_test - y_pred) / y_test)) * 100),
]
print("\n".join(report_lines))

Best alpha: 10.0
R-squared for training: 0.8038168452724259
-------Test Set Stats-------
R-squared for testing: 0.8255006993749661
MAE: 0.1284405133762482
MSE: 0.03195137390833882
RMSE: 0.1787494724700994
MAPE: 1.0831546205326443


<IPython.core.display.Javascript object>

In [37]:
# Elastic net with alpha chosen by 5-fold cross-validation on the training
# split. `alphas` is the candidate grid defined in the lasso cell above.
# l1_ratio is not passed, so only alpha is searched here and the L1/L2 mix
# stays at the estimator's default.
model_elastic = ElasticNetCV(alphas=alphas, cv=5).fit(X_train, y_train)
y_pred = model_elastic.predict(X_test)

# Build the report lines first, then emit them in one print call.
report_lines = [
    "Best alpha: {}".format(model_elastic.alpha_),
    "R-squared for training: {}".format(model_elastic.score(X_train, y_train)),
    "-------Test Set Stats-------",
    "R-squared for testing: {}".format(model_elastic.score(X_test, y_test)),
    "MAE: {}".format(mean_absolute_error(y_test, y_pred)),
    "MSE: {}".format(mse(y_test, y_pred)),
    "RMSE: {}".format(rmse(y_test, y_pred)),
    # Mean absolute percentage error, expressed in percent.
    "MAPE: {}".format(np.mean(np.abs((y_test - y_pred) / y_test)) * 100),
]
print("\n".join(report_lines))

Best alpha: 0.001
R-squared for training: 0.804256841850514
-------Test Set Stats-------
R-squared for testing: 0.8252211838955797
MAE: 0.12865483959782825
MSE: 0.032002554076758144
RMSE: 0.17889257691910568
MAPE: 1.084926413100304


<IPython.core.display.Javascript object>

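OLS linear regression is the best model according to these results: on the test set it has the highest R-squared (0.8264, versus 0.8255 for ridge, 0.8252 for elastic net, and 0.8247 for lasso) and the lowest MAE, MSE, RMSE, and MAPE. The differences are very small, so regularization neither helps nor noticeably hurts this specification. Note that all error metrics are computed on the log-transformed sale price.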