In [210]:
# Packages
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
import pandas as pd
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from house_prices.adhoc_functions.model import vis_all
from sklearn.linear_model import LassoCV, Lasso, RidgeCV, Ridge, ElasticNetCV, BayesianRidge
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
%matplotlib inline

In [195]:
# Options
# Plot styling: seaborn's "whitegrid" theme for every figure in the notebook.
sns.set_style("whitegrid")
# Let pandas show up to 100 rows / 100 columns before truncating a displayed frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# NOTE(review): -1 is not a documented value for `display.width`; the pandas docs
# describe None as the way to auto-detect terminal width — confirm -1 is intended.
pd.set_option('display.width', -1)

In [196]:
# Input / output CSV locations, relative to the notebook's working directory.
explore_in = './data/transformed_data/explore.csv'  # not referenced in the cells visible here
train_in = './data/transformed_data/model_train_linear.csv'  # read into df_train_in
test_in = './data/transformed_data/model_test_linear.csv'  # transformed test-set CSV
model_out = './data/transformed_data/linear_models.csv'  # not referenced in the cells visible here

In [197]:
# Columns set aside from modelling (identifier plus raw categorical columns).
drop_vars = [
    'Id',
    'Neighborhood',
    'MSZoning',
    'BldgType',
    'Functional',
    'MSSubClass',
    'Condition1',
    'LotConfig',
    'MasVnrType',
    'SaleType',
    'SaleCondition',
    'Stories',
]

# Target followed by the selected predictors. The trailing number on each
# predictor is the author's annotation (presumably its correlation with the
# target — TODO confirm against the exploration notebook).
keep_vars = [
    'SalePrice',              # Target
    'OverallQual',            # 0.8
    'ExterQual',              # 0.7
    'YearBuilt',              # 0.5
    'YearRemodAdd',           # 0.5
    'MasVnrArea',             # 0.5
    'TotalBsmtSF',            # 0.6
    '1stFlrSF',               # 0.6
    'GrLivArea',              # 0.7
    'FullBath',               # 0.6
    'KitchenQual',            # 0.7
    'TotRmsAbvGrd',           # 0.5
    'Fireplaces',             # 0.5
    'GarageCars',             # 0.6
    'GarageArea',             # 0.6
    'PropertyAge',            # -0.5
    'TotalBath',              # 0.6
    'HasPorch',               # 0.4
    'HeatingQC',              # 0.4
    'TotalArea',              # 0.8
    'CoreArea',               # 0.8
    'Neighborhood_ordinal',   # 0.7
    'MSSubClass_ordinal',     # 0.5
    'MasVnrType_ordinal',     # 0.4
    'Foundation_ordinal',     # 0.5
    'SaleType_ordinal',       # 0.4
    'SaleCondition_ordinal',  # 0.3
]

# Predictor-only view of keep_vars (a new list; keep_vars itself is untouched).
test_vars = [column for column in keep_vars if column != 'SalePrice']

#### Train Data

In [203]:
# Read the transformed training set and split it into predictors and response.
df_train_in = pd.read_csv(train_in)

# Predictors: every kept column except the target itself.
features = df_train_in[keep_vars].drop(columns=['SalePrice'])

# Response: log(1 + SalePrice); predictions are mapped back with expm1 later on.
sale_price = df_train_in['SalePrice']
target = np.log1p(sale_price)

features.head()

Unnamed: 0,OverallQual,ExterQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,PropertyAge,TotalBath,HasPorch,HeatingQC,TotalArea,CoreArea,Neighborhood_ordinal,MSSubClass_ordinal,MasVnrType_ordinal,Foundation_ordinal,SaleType_ordinal,SaleCondition_ordinal
0,0.510826,0.510826,0.949275,0.883333,0.109355,0.131114,0.113132,0.230501,0.5,0.666667,0.5,0.0,0.5,0.326754,0.036765,0.5,1,1.0,0.19533,0.178548,0.666667,1.0,1.0,1.0,0.333333,0.5
1,0.441833,0.287682,0.753623,0.433333,0.0,0.187762,0.193049,0.161124,0.333333,0.333333,0.333333,0.333333,0.5,0.28096,0.227941,0.3,0,1.0,0.170937,0.175467,0.791667,0.714286,0.5,0.666667,0.333333,0.5
2,0.510826,0.510826,0.934783,0.866667,0.092074,0.14026,0.126161,0.241808,0.5,0.666667,0.333333,0.333333,0.5,0.356816,0.051471,0.5,1,1.0,0.205897,0.188753,0.666667,1.0,1.0,1.0,0.333333,0.5
3,0.510826,0.287682,0.311594,0.333333,0.0,0.116655,0.13442,0.231548,0.333333,0.666667,0.416667,0.333333,0.75,0.373459,0.669118,0.2,1,0.75,0.191092,0.171712,0.75,0.642857,0.5,0.333333,0.333333,0.0
4,0.575364,0.510826,0.927536,0.833333,0.180508,0.171764,0.170666,0.300969,0.5,0.666667,0.583333,0.333333,0.75,0.463459,0.058824,0.5,1,1.0,0.255683,0.23391,0.958333,1.0,1.0,1.0,0.333333,0.5


In [204]:
def rmse(pred, true):
    """Return the root-mean-squared error between predictions and ground truth.

    Parameters
    ----------
    pred : array-like
        Predicted values (list, NumPy array, pandas Series, ...).
    true : array-like
        Observed values, same shape as ``pred``. Values are paired by
        position, not by any pandas index.

    Returns
    -------
    numpy.float64 (or an array with one entry per column for 2-D input)
        ``sqrt(mean((pred - true) ** 2))`` taken over the first axis.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ. The previous
        element-wise loop silently truncated to the shorter input and failed
        with a bare ZeroDivisionError on empty input.
    """
    pred_arr = np.asarray(pred, dtype=float)
    true_arr = np.asarray(true, dtype=float)

    if pred_arr.shape != true_arr.shape:
        raise ValueError(
            "pred and true must have the same shape, got "
            "{} and {}".format(pred_arr.shape, true_arr.shape)
        )
    if pred_arr.size == 0:
        raise ValueError("rmse is undefined for empty input")

    # Vectorised equivalent of summing squared residuals and dividing by n.
    return np.sqrt(np.mean((pred_arr - true_arr) ** 2, axis=0))


def validate_model(model, train_features, train_target, test_features, test_target):
    """Fit ``model`` on the training split and print its RMSE on both splits.

    The estimator is fitted in place, so the caller can inspect its fitted
    attributes afterwards. Nothing is returned; the two errors are printed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    train_features, train_target : data the model is fitted and scored on
    test_features, test_target : hold-out data the fitted model is scored on
    """
    model.fit(train_features, train_target)

    splits = (
        ("Training Error: ", train_features, train_target),
        ("Test Error: ", test_features, test_target),
    )
    for label, split_features, split_target in splits:
        print(label, rmse(model.predict(split_features), split_target))

In [205]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible across runs.
train_features, test_features, train_target, test_target = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Convert each split to a plain NumPy array. `.as_matrix()` is deprecated (and
# removed in pandas 1.0); `.values` is the drop-in replacement and works on
# both DataFrame and Series.
train_features = train_features.values
test_features = test_features.values
train_target = train_target.values
test_target = test_target.values

  """
  
  import sys
  


#### Submission Data

In [206]:
# Read the transformed test set via the `test_in` constant (same path the
# literal used to duplicate) so the location is defined in one place only.
df_sub_in = pd.read_csv(test_in)

# Keep exactly the predictor columns used for training, in the same order.
sub_features = df_sub_in[test_vars]
sub_features.head()

Unnamed: 0,OverallQual,ExterQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,PropertyAge,TotalBath,HasPorch,HeatingQC,TotalArea,CoreArea,Neighborhood_ordinal,MSSubClass_ordinal,MasVnrType_ordinal,Foundation_ordinal,SaleType_ordinal,SaleCondition_ordinal
0,0.367725,0.287682,0.625954,0.183333,0.0,0.159659,0.09922,0.09922,0.0,0.6,0.166667,0.0,0.2,0.399173,0.384615,0.0,0,0.5,0.098318,0.114879,0.375,0.067114,0.5,0.666667,0.333333,0.5
1,0.441833,0.287682,0.603053,0.133333,0.077332,0.231781,0.179545,0.179545,0.0,0.8,0.25,0.0,0.2,0.190354,0.407692,0.083333,1,0.5,0.179545,0.193603,0.375,0.067114,1.0,0.666667,0.333333,0.5
2,0.367725,0.287682,0.900763,0.8,0.0,0.167326,0.105382,0.23164,0.2,0.6,0.25,0.25,0.4,0.280601,0.107692,0.25,1,0.75,0.191299,0.184876,0.541667,0.09396,0.5,1.0,0.333333,0.5
3,0.441833,0.287682,0.908397,0.8,0.015268,0.166994,0.104998,0.227401,0.2,0.8,0.333333,0.25,0.4,0.274491,0.1,0.25,1,1.0,0.188241,0.18253,0.541667,0.09396,1.0,1.0,0.333333,0.5
4,0.575364,0.510826,0.862595,0.7,0.0,0.224124,0.170772,0.170772,0.2,0.8,0.166667,0.0,0.4,0.29271,0.146154,0.166667,1,1.0,0.170772,0.185137,0.916667,0.087248,0.5,1.0,0.333333,0.5


#### Ridge

In [229]:
# CV Model: choose the penalty by 20-fold cross-validation on the training
# split and report train / hold-out RMSE.
alphas = [0.0001, 0.001, 0.01, 0.03, 0.06, 0.1, 0.2]
ridge = RidgeCV(alphas=alphas, cv=20)
validate_model(ridge, train_features, train_target, test_features, test_target)

# Final Model: refit on all training rows with the selected penalty.
alpha = ridge.alpha_
ridge = Ridge(alpha=alpha)
ridge.fit(features, target)

# Submission: predictions are on the log1p scale, so map them back with expm1.
sub_log_pred = ridge.predict(sub_features)
df_sub = pd.DataFrame({'Id': df_sub_in['Id']})
df_sub['SalePrice'] = np.round(np.expm1(sub_log_pred), 0)
# Truncate each price down to a whole multiple of 100.
df_sub['SalePrice'] = (df_sub['SalePrice'] / 100).astype(int) * 100
df_sub.to_csv('./data/output_data/ridge_submission.csv', index=False)

Training Error:  0.1439547835491929
Test Error:  0.14437790112204138


#### Lasso

In [181]:
# Lasso with the penalty strength picked by 20-fold CV over `alphas`, scored on
# the same train / hold-out split as the other models.
# NOTE(review): the errors recorded under this cell are noticeably worse than
# Ridge's and the grid bottoms out at 0.01 — check whether lasso.alpha_ sits on
# the lower edge of the grid and extend it downwards if so.
alphas = [0.01, 0.03, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5, 10, 50]
lasso = LassoCV(alphas=alphas, cv=20)
validate_model(lasso, train_features, train_target, test_features, test_target)

Training Error:  0.19142274869642642
Test Error:  0.21148287170096364


#### Elastic

In [182]:
# Elastic net with l1_ratio=0, i.e. the penalty is pure L2.
l1_ratio = 0
# The stray trailing comma after this list used to turn `alphas` into a 1-tuple
# wrapping the list, so ElasticNetCV was not handed a flat grid of penalties.
alphas = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 10]
elastic = ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas, max_iter=5000, cv=20)
validate_model(elastic, train_features, train_target, test_features, test_target)

Training Error:  0.3068905367629894
Test Error:  0.34400260787606846


#### Bayesian Ridge

In [183]:
# Bayesian ridge with the estimator's default settings (no alpha grid is
# supplied), scored on the same train / hold-out split as the other models.
bayes = BayesianRidge()
validate_model(bayes, train_features, train_target, test_features, test_target)

Training Error:  0.1439982618278771
Test Error:  0.14454876004072245


#### Ridge Prediction Error

In [None]:
# Wrap the ridge model in a prediction-error visualizer, fit it on the full
# training data and render the plot.
# NOTE(review): `PredictionError` is not imported in any cell visible here —
# presumably yellowbrick.regressor.PredictionError; confirm and add the import
# to the top cell, otherwise this fails on a fresh kernel.
visualizer = PredictionError(ridge)
visualizer.fit(features, target)  
# NOTE(review): presumably the render call; newer yellowbrick releases name it
# `show()` — verify against the installed version.
visualizer.poof()             

#### Ridge Submission

In [None]:
# Predict on the submission rows. The cell previously referenced the undefined
# `df_test_in` and predicted on `test_features` (the 20% hold-out split), whose
# rows do not correspond to the submission Ids.
predictions = ridge.predict(sub_features)

df_sub = pd.DataFrame()
df_sub['Id'] = df_sub_in['Id']
# The model is trained on log1p(SalePrice); expm1 maps back to the price scale.
df_sub['SalePrice'] = np.expm1(predictions)
# NOTE: same output path as the submission written in the Ridge cell above,
# but without that cell's rounding to hundreds.
df_sub.to_csv('./data/output_data/ridge_submission.csv', index=False)

#### Lasso

In [None]:
# Score a LassoCV estimator with 20-fold cross-validation on the full training
# data and report the mean per-fold RMSE.
scoring = make_scorer(mean_squared_error)
alphas = [0.01, 0.03, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5, 10, 50]
lasso = LassoCV(alphas=alphas)
# Fit once on all rows so lasso.coef_ is available to the cells below.
lasso.fit(features, target)
mse = cross_val_score(lasso, features, target, cv=20, scoring=scoring)
# Named `rmse_per_fold` rather than `rmse`: rebinding `rmse` here would replace
# the rmse() helper that validate_model() calls, breaking any later use of it.
rmse_per_fold = np.sqrt(mse)
score = np.mean(rmse_per_fold)
score

#### Lasso Coefficients

In [None]:
# Horizontal bar chart of the fitted lasso weights, one bar per feature column.
coefs = lasso.coef_
bar_positions = np.arange(coefs.size)
feature_names = list(features)  # iterating a DataFrame yields its column labels

plt.figure(figsize=(6, 20))
plt.barh(bar_positions, coefs)
plt.yticks(bar_positions, feature_names)
plt.title("Coefficients")
plt.tight_layout()

#### Lasso Submission

In [None]:
# Predict on the submission rows. The cell previously referenced the undefined
# `df_test_in` and predicted on `test_features` (the 20% hold-out split), whose
# rows do not correspond to the submission Ids.
predictions = lasso.predict(sub_features)

df_sub = pd.DataFrame()
df_sub['Id'] = df_sub_in['Id']
# The model is trained on log1p(SalePrice); expm1 maps back to the price scale.
df_sub['SalePrice'] = np.expm1(predictions)
df_sub.to_csv('./data/output_data/lasso_submission.csv', index=False)

#### ElasticNet

In [None]:
# Score an ElasticNetCV estimator with 20-fold cross-validation on the full
# training data and report the mean per-fold RMSE.
scoring = make_scorer(mean_squared_error)
# Stray trailing commas used to turn both of these into 1-tuples: `l1_ratio`
# became (0.5,) and `alphas` became a tuple wrapping the list instead of a flat
# grid of penalties.
l1_ratio = 0.5
alphas = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 10]
elastic = ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas, max_iter=5000)
# Fit once on all rows so elastic.coef_ is available to the cell below.
elastic.fit(features, target)
elastic_score = np.mean(np.sqrt(cross_val_score(elastic, features, target, cv=20, scoring=scoring)))
elastic_score

In [None]:
# Horizontal bar chart of the fitted elastic-net weights, one bar per feature column.
coefs = elastic.coef_
bar_positions = np.arange(coefs.size)
feature_names = list(features)  # iterating a DataFrame yields its column labels

plt.figure(figsize=(6, 20))
plt.barh(bar_positions, coefs)
plt.yticks(bar_positions, feature_names)
plt.title("Coefficients")
plt.tight_layout()