In [17]:
# Packages
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, LinearRegression, ElasticNet, ElasticNetCV
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
%matplotlib inline

In [18]:
%run './ml/ml/model_features.py'
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Options
sns.set_style("whitegrid")
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', -1)

In [45]:
explore_in = './data/transformed_data/explore.csv'
train_in = './data/transformed_data/model_train.csv'
test_in = './data/transformed_data/model_test.csv'
model_out = './data/transformed_data/linear_models.csv'
drop_features_train = [
    'Neighborhood', 'MSZoning', 'BldgType', 'Functional', 'MSSubClass', 
    'Condition1', 'LotConfig', 'MasVnrType', 'SaleType', 
    'SaleCondition', 'Id', 'SalePrice'
]
drop_features_test = [
    'Neighborhood', 'MSZoning', 'BldgType', 'Functional', 'MSSubClass', 
    'Condition1', 'LotConfig', 'MasVnrType', 'SaleType', 
    'SaleCondition', 'Id', 'MSSubClass_150'
]

In [46]:
# Import Files
df_train_in = pd.read_csv(train_in)
df_test_in = pd.read_csv(test_in)
df_results = pd.read_csv(explore_in)
df_results = df_results[(df_results['TotalArea'] <= 10000)]
df_results['SalePriceLog'] = np.log1p(df_results['SalePrice'])

# Results Dataset
end_col = ['SalePrice', 'SalePriceLog']
df_results = df_results[[col for col in df_results if col not in end_col] + end_col]

# Define Features & Target
master_target = df_train_in[['SalePrice']]
master_features = df_train_in.drop(drop_features_train, axis=1)

#### Creating the Base Datasets

In [47]:
def append_results(df, model, name, features):
    df[name] = np.expm1(model.predict(features)).astype(int)
    df['{}Error'.format(name)] = df[name] - df['SalePrice']
    df['{}RMSE'.format(name)] = np.sqrt(df['{}Error'.format(name)]**2)
    
    return df

def get_score(features, target):
    # Train
    r2 = r2_score(features, target)
    mse = mean_squared_error(features, target)
    rmse = np.sqrt(mean_squared_error(features, target))
    print('R2: {}'.format(round(r2, 4)))
    print('MSE: {}'.format(round(mse, 4)))
    print('RMSE: {}'.format(round(rmse, 4)))    
    
def execute_model(model, features, target, test_features, test_target):
    print(model)
    train_prediction = model.predict(features)
    get_score(train_prediction, target)
    test_prediction = model.predict(test_features)
    get_score(test_prediction, test_target) 
    
def execute_final_model(model, features, target):
    master_prediction = model.predict(features)
    get_score(master_prediction, target)
    
def show_coefficients(model, features):
    coef_df = pd.DataFrame()
    coef_df['coef'] = model.coef_.ravel()
    coef_df['cols'] = ridge_features.columns.tolist()
    matplotlib.rcParams['figure.figsize'] = (10.0, 30.0)
    coef_df.plot(
        kind='barh',
        x='cols',
        y='coef'
    )
    plt.title("Coefficients in the Ridge Model")
    
scoring = make_scorer(mean_squared_error)

#### Ridge

In [48]:
# Train/Test split
features, test_features, target, test_target = train_test_split(
    master_features.drop(linear_drop_features, axis=1), 
    master_target,
    test_size=0.1,
    random_state=1
)

In [49]:
# CV Model

alphas=[0.01, 0.03, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 5, 10, 50]

for alpha in alphas:
    cv_model = Ridge(
        alpha=alpha,
    )
    
    cv_model.fit(features, target)
    cv_scores = cross_val_score(cv_model, features, target, cv=20, scoring=scoring)
    rmse_scores = np.sqrt(cv_scores)
    rmse = round(rmse_scores.mean(), 4)
    std = round(rmse_scores.std(), 4)
    
    print('Alpha:', alpha)
    print('RMSE scores mean:', rmse)
    print('RMSE scores STD:', std)
    print('')

Alpha: 0.01
RMSE scores mean: 0.1216
RMSE scores STD: 0.0191

Alpha: 0.03
RMSE scores mean: 0.1214
RMSE scores STD: 0.0191

Alpha: 0.06
RMSE scores mean: 0.1212
RMSE scores STD: 0.019

Alpha: 0.1
RMSE scores mean: 0.1211
RMSE scores STD: 0.019

Alpha: 0.2
RMSE scores mean: 0.1208
RMSE scores STD: 0.0189

Alpha: 0.3
RMSE scores mean: 0.1206
RMSE scores STD: 0.0188

Alpha: 0.4
RMSE scores mean: 0.1206
RMSE scores STD: 0.0188

Alpha: 0.5
RMSE scores mean: 0.1205
RMSE scores STD: 0.0188

Alpha: 0.6
RMSE scores mean: 0.1205
RMSE scores STD: 0.0188

Alpha: 0.7
RMSE scores mean: 0.1206
RMSE scores STD: 0.0188

Alpha: 0.8
RMSE scores mean: 0.1206
RMSE scores STD: 0.0188

Alpha: 0.9
RMSE scores mean: 0.1207
RMSE scores STD: 0.0189

Alpha: 1
RMSE scores mean: 0.1207
RMSE scores STD: 0.0189

Alpha: 2
RMSE scores mean: 0.1216
RMSE scores STD: 0.0193

Alpha: 3
RMSE scores mean: 0.1225
RMSE scores STD: 0.0198

Alpha: 5
RMSE scores mean: 0.1243
RMSE scores STD: 0.0206

Alpha: 10
RMSE scores mean: 0.1

In [50]:
# Master Model
model = Ridge(alpha=0.8)
final_features = master_features.drop(linear_drop_features, axis=1)
final_target = master_target
model.fit(final_features, final_target)
execute_final_model(model,final_features, final_target)
df_results = append_results(df_results, model, 'Ridge', final_features)

R2: 0.9128
MSE: 0.0124
RMSE: 0.1112


In [52]:
# Ridge Submission
test_features = df_test_in.drop(linear_drop_features + drop_features_test, axis=1)
df_sub = pd.DataFrame()
df_sub['Id'] = df_test_in['Id']
df_sub['SalePrice'] = np.expm1(model.predict(test_features))
df_sub.to_csv('./data/output_data/my_submission.csv', index=False)

#### Lasso

In [None]:
model_drop_features = [
    'OverallQual',
#     'PropertyAge',
#     'OverallGrade',
#     'ExterGrade',
    'CoreArea',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
#     'TotalArea',
#     'LotArea',
#     'FullBath',
#     'HalfBath',
    'TotalBath',
#     'TotRmsAbvGrd',
#     'BedroomAbvGr',
#     'KitchenAbvGr',
#     'SimpleOverallQual',
#     'SimpleOverallCond',
    'IsRemodelled',
    'IsNew',
#     'Neighborhood_Blmngtn',
#     'Neighborhood_Blueste',
#     'Neighborhood_BrDale',
#     'Neighborhood_BrkSide',
#     'Neighborhood_ClearCr',
#     'Neighborhood_CollgCr',
#     'Neighborhood_Crawfor',
#     'Neighborhood_Edwards',
#     'Neighborhood_Gilbert',
#     'Neighborhood_IDOTRR',
#     'Neighborhood_MeadowV',
#     'Neighborhood_Mitchel',
#     'Neighborhood_NAmes',
#     'Neighborhood_NPkVill',
#     'Neighborhood_NWAmes',
#     'Neighborhood_NoRidge',
#     'Neighborhood_NridgHt',
#     'Neighborhood_OldTown',
#     'Neighborhood_SWISU',
#     'Neighborhood_Sawyer',
#     'Neighborhood_SawyerW',
#     'Neighborhood_Somerst',
#     'Neighborhood_StoneBr',
#     'Neighborhood_Timber',
#     'Neighborhood_Veenker',
#     'MSZoning_C (all)',
#     'MSZoning_FV',
#     'MSZoning_RH',
#     'MSZoning_RL',
#     'MSZoning_RM',
#     'BldgType_1Fam',
#     'BldgType_2fmCon',
#     'BldgType_Duplex',
#     'BldgType_Twnhs',
#     'BldgType_TwnhsE',
#     'Functional_Maj1',
#     'Functional_Maj2',
#     'Functional_Min1',
#     'Functional_Min2',
#     'Functional_Mod',
#     'Functional_Sev',
#     'Functional_Typ',
#     'MSSubClass_120',
#     'MSSubClass_160',
#     'MSSubClass_180',
#     'MSSubClass_190',
#     'MSSubClass_20',
#     'MSSubClass_30',
#     'MSSubClass_40',
#     'MSSubClass_45',
#     'MSSubClass_50',
#     'MSSubClass_60',
#     'MSSubClass_70',
#     'MSSubClass_75',
#     'MSSubClass_80',
#     'MSSubClass_85',
#     'MSSubClass_90',
    'Condition1_Artery',
    'Condition1_Feedr',
    'Condition1_Norm',
    'Condition1_PosA',
    'Condition1_PosN',
    'Condition1_RRAe',
    'Condition1_RRAn',
    'Condition1_RRNe',
    'Condition1_RRNn',
#     'LotConfig_Corner',
#     'LotConfig_CulDSac',
#     'LotConfig_FR2',
#     'LotConfig_FR3',
#     'LotConfig_Inside',
#     'Exterior1st_AsbShng',
#     'Exterior1st_AsphShn',
#     'Exterior1st_BrkComm',
#     'Exterior1st_BrkFace',
#     'Exterior1st_CBlock',
#     'Exterior1st_CemntBd',
#     'Exterior1st_HdBoard',
    'Exterior1st_ImStucc',
#     'Exterior1st_MetalSd',
#     'Exterior1st_Plywood',
    'Exterior1st_Stone',
#     'Exterior1st_Stucco',
#     'Exterior1st_VinylSd',
#     'Exterior1st_Wd Sdng',
#     'Exterior1st_WdShing',
#     'MasVnrType_BrkCmn',
#     'MasVnrType_BrkFace',
#     'MasVnrType_None',
#     'MasVnrType_Stone',
#     'SaleType_COD',
#     'SaleType_CWD',
#     'SaleType_Con',
#     'SaleType_ConLD',
#     'SaleType_ConLI',
#     'SaleType_ConLw',
#     'SaleType_New',
#     'SaleType_Oth',
#     'SaleType_WD',
#     'SaleCondition_Abnorml',
#     'SaleCondition_AdjLand',
#     'SaleCondition_Alloca',
#     'SaleCondition_Family',
#     'SaleCondition_Normal',
#     'SaleCondition_Partial'
]

In [None]:
lasso_features = features.drop(drop_features, axis=1)

# CV Model
cv_model = LassoCV(
    # Returns R^2 score which can be negative
    alphas=[0.0001,0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 10],
    cv=10
)
cv_model.fit(lasso_features, target)
alpha = cv_model.alpha_

# Individual Model
model = Lasso(alpha=cv_model.alpha_)
model.fit(lasso_features, target)
print('Alpha:', alpha)
print('Score:', model.score(lasso_features, target))

df_results = append_results(df_results, model, 'Lasso', lasso_features)

coef_df = pd.DataFrame()
coef_df['coef'] = model.coef_.ravel()
coef_df['cols'] = lasso_features.columns.tolist()
matplotlib.rcParams['figure.figsize'] = (10.0, 20.0)
coef_df.plot(
    kind='barh',
    x='cols',
    y='coef'
)
plt.title("Coefficients in the Lasso Model")

#### ElasticNet

In [None]:
drop_features = [
    'OverallQual',
#     'PropertyAge',
#     'OverallGrade',
#     'ExterGrade',
    'CoreArea',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
#     'TotalArea',
#     'LotArea',
#     'FullBath',
#     'HalfBath',
    'TotalBath',
#     'TotRmsAbvGrd',
#     'BedroomAbvGr',
#     'KitchenAbvGr',
#     'SimpleOverallQual',
#     'SimpleOverallCond',
    'IsRemodelled',
    'IsNew',
#     'Neighborhood_Blmngtn',
#     'Neighborhood_Blueste',
#     'Neighborhood_BrDale',
#     'Neighborhood_BrkSide',
#     'Neighborhood_ClearCr',
#     'Neighborhood_CollgCr',
#     'Neighborhood_Crawfor',
#     'Neighborhood_Edwards',
#     'Neighborhood_Gilbert',
#     'Neighborhood_IDOTRR',
#     'Neighborhood_MeadowV',
#     'Neighborhood_Mitchel',
#     'Neighborhood_NAmes',
#     'Neighborhood_NPkVill',
#     'Neighborhood_NWAmes',
#     'Neighborhood_NoRidge',
#     'Neighborhood_NridgHt',
#     'Neighborhood_OldTown',
#     'Neighborhood_SWISU',
#     'Neighborhood_Sawyer',
#     'Neighborhood_SawyerW',
#     'Neighborhood_Somerst',
#     'Neighborhood_StoneBr',
#     'Neighborhood_Timber',
#     'Neighborhood_Veenker',
#     'MSZoning_C (all)',
#     'MSZoning_FV',
#     'MSZoning_RH',
#     'MSZoning_RL',
#     'MSZoning_RM',
#     'BldgType_1Fam',
#     'BldgType_2fmCon',
#     'BldgType_Duplex',
#     'BldgType_Twnhs',
#     'BldgType_TwnhsE',
#     'Functional_Maj1',
#     'Functional_Maj2',
#     'Functional_Min1',
#     'Functional_Min2',
#     'Functional_Mod',
#     'Functional_Sev',
#     'Functional_Typ',
#     'MSSubClass_120',
#     'MSSubClass_160',
#     'MSSubClass_180',
#     'MSSubClass_190',
#     'MSSubClass_20',
#     'MSSubClass_30',
#     'MSSubClass_40',
#     'MSSubClass_45',
#     'MSSubClass_50',
#     'MSSubClass_60',
#     'MSSubClass_70',
#     'MSSubClass_75',
#     'MSSubClass_80',
#     'MSSubClass_85',
#     'MSSubClass_90',
    'Condition1_Artery',
    'Condition1_Feedr',
    'Condition1_Norm',
    'Condition1_PosA',
    'Condition1_PosN',
    'Condition1_RRAe',
    'Condition1_RRAn',
    'Condition1_RRNe',
    'Condition1_RRNn',
#     'LotConfig_Corner',
#     'LotConfig_CulDSac',
#     'LotConfig_FR2',
#     'LotConfig_FR3',
#     'LotConfig_Inside',
#     'Exterior1st_AsbShng',
#     'Exterior1st_AsphShn',
#     'Exterior1st_BrkComm',
#     'Exterior1st_BrkFace',
#     'Exterior1st_CBlock',
#     'Exterior1st_CemntBd',
#     'Exterior1st_HdBoard',
#     'Exterior1st_ImStucc',
#     'Exterior1st_MetalSd',
#     'Exterior1st_Plywood',
#     'Exterior1st_Stone',
#     'Exterior1st_Stucco',
#     'Exterior1st_VinylSd',
#     'Exterior1st_Wd Sdng',
#     'Exterior1st_WdShing',
#     'MasVnrType_BrkCmn',
#     'MasVnrType_BrkFace',
#     'MasVnrType_None',
#     'MasVnrType_Stone',
#     'SaleType_COD',
#     'SaleType_CWD',
#     'SaleType_Con',
#     'SaleType_ConLD',
#     'SaleType_ConLI',
#     'SaleType_ConLw',
#     'SaleType_New',
#     'SaleType_Oth',
#     'SaleType_WD',
#     'SaleCondition_Abnorml',
#     'SaleCondition_AdjLand',
#     'SaleCondition_Alloca',
#     'SaleCondition_Family',
#     'SaleCondition_Normal',
#     'SaleCondition_Partial'
]

In [None]:
elastic_features = features.drop(drop_features, axis=1)

# CV Model
cv_model = ElasticNetCV(
    l1_ratio=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    alphas=[0.0001,0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 10],
    cv=20
)
cv_model.fit(elastic_features, target)

# Individual Model
model = ElasticNet(
    l1_ratio=cv_model.l1_ratio_,
    alpha=cv_model.alpha_
)
model.fit(elastic_features, target)
print('Alpha:', cv_model.alpha_)
print('l1:', cv_model.l1_ratio_)
print('Score:', model.score(elastic_features, target))
df_results = append_results(df_results, model, 'ElasticNet', elastic_features)
coef_df = pd.DataFrame()
coef_df['coef'] = model.coef_.ravel()
coef_df['cols'] = elastic_features.columns.tolist()
matplotlib.rcParams['figure.figsize'] = (10.0, 20.0)
coef_df.plot(
    kind='barh',
    x='cols',
    y='coef'
)
plt.title("Coefficients in the ElasticNet Model")

#### Exploration

In [None]:
end_cols = [
    'SalePrice',
    'Linear',    
    'Ridge',    
    'Lasso', 
    'ElasticNet',
    'LinearError',
    'RidgeError',    
    'LassoError',
    'ElasticNetError',    
    'LinearRMSE',
    'RidgeRMSE',
    'LassoRMSE',    
    'ElasticNetRMSE',
    'SalePriceLog',
    'LinearRMSLE',
    'LinearRMSLEDiff',
    'RidgeRMSLE',
    'RidgeRMSLEDiff',
    'LassoRMSLE',
    'LassoRMSLEDiff',
    'ElasticNetRMSLE',
    'ElasticNetRMSLEDiff'
]
df_results = df_results[[col for col in df_results if col not in end_cols] + end_cols].sort_values('SalePrice')
df_results.to_csv(model_out, index=False)

In [None]:
# Ridge Regression
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
cv = [
    rmse_cv(Ridge(alpha = alpha)).mean() for alpha in alphas
]
cv = pd.Series(cv, index = alphas)
matplotlib.rcParams['figure.figsize'] = (5.0, 5.0)
cv.plot(title = "Validation")
plt.xlabel("alpha")
plt.ylabel("rmse")

In [None]:
model = Ridge(alpha=0.4).fit(features, target)
coef_df = pd.DataFrame()
coef_df['coef'] = model.coef_.ravel()
coef_df['cols'] = features.columns.tolist()
matplotlib.rcParams['figure.figsize'] = (10.0, 20.0)
coef_df.plot(
    kind='barh',
    x='cols',
    y='coef'
)
plt.title("Coefficients in the Ridge Model")

In [None]:
# Lasso Regression
alphas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
cv = [
    rmse_cv(Lasso(alpha = alpha)).mean() for alpha in alphas
]
cv = pd.Series(cv, index = alphas)
matplotlib.rcParams['figure.figsize'] = (5.0, 5.0)
cv.plot(title = "Validation")
plt.xlabel("alpha")
plt.ylabel("rmse")

In [None]:
model = Lasso(alpha=0.001).fit(features, target)
coef_df = pd.DataFrame()
coef_df['coef'] = model.coef_.ravel()
coef_df['cols'] = features.columns.tolist()
matplotlib.rcParams['figure.figsize'] = (10.0, 20.0)
coef_df.plot(
    kind='barh',
    x='cols',
    y='coef'
)
plt.title("Coefficients in the Ridge Model")

#### Submission

In [None]:
test_features = df_test_in[lasso_features.columns.tolist()]
df_sub = pd.DataFrame()
df_sub['Id'] = df_test_in['Id']
df_sub['SalePrice'] = np.expm1(model.predict(test_features))
df_sub.to_csv('./data/output_data/my_submission.csv', index=False)

#### Sources & Reference

Tutorials & Papers:  
* [Cardinality Reduction](https://pkghosh.wordpress.com/2017/10/09/combating-high-cardinality-features-in-supervised-machine-learning/)  
* [FA & PCA](https://www.dummies.com/programming/big-data/data-science/data-science-using-python-to-perform-factor-and-principal-component-analysis/)  
* [Factor Analysis for Decomposition](https://www.packtpub.com/mapt/book/big_data_and_business_intelligence/9781783989485/1/ch01lvl1sec19/using-factor-analysis-for-decomposition)  
* [SKL Decomposition](http://scikit-learn.org/stable/modules/decomposition.html)  
* [Clustering Mixed Data](https://datascience.stackexchange.com/questions/8681/clustering-for-mixed-numeric-and-nominal-discrete-data)  
* [Log Transforms](https://stats.stackexchange.com/questions/18844/when-and-why-should-you-take-the-log-of-a-distribution-of-numbers)
* [Box Cox Transformation](https://www.statisticshowto.datasciencecentral.com/box-cox-transformation/)  
* [Log Transforms](http://onlinestatbook.com/2/transformations/log.html)   
* [SKL ensembling](http://scikit-learn.org/stable/modules/ensemble.html)  


Kaggle Kernels & Notebooks: 
* [Data Exploration Kernel](https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python)
* [Good Overall Kaggle Kernel](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard)  
* [Applied Regression](https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset)  
* [Regularized Linear Models](https://www.kaggle.com/apapiu/regularized-linear-models)  