In [124]:
import warnings
warnings.filterwarnings('ignore')

In [125]:
import os
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [126]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', -1)

In [127]:
infile = './transformed_data/train_engineered.csv'
log = './logs/log.csv'

In [128]:
df = pd.read_csv(infile)
df.head(5)

Unnamed: 0,OverallQual,TotalArea,GarageCars,PropertyAge,MSSubClass,MSZoning,Neighborhood,HasPorch,HasDeck,LotConfig,HouseStyle,SaleType,SaleCondition,SalePrice,TotalArea_scaled,PropertyAge_scaled,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Neighborhood_1,Neighborhood_2,Neighborhood_3,Neighborhood_4,Neighborhood_5,Neighborhood_6,Neighborhood_7,Neighborhood_8,Neighborhood_9,Neighborhood_10,Neighborhood_11,Neighborhood_12,Neighborhood_13,Neighborhood_14,Neighborhood_15,Neighborhood_16,Neighborhood_17,Neighborhood_18,Neighborhood_19,Neighborhood_20,Neighborhood_21,Neighborhood_22,Neighborhood_23,Neighborhood_24,Neighborhood_25,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,7,4276,2,5,60,RL,1,1,0,Inside,2Story,WD,Normal,208500,0.386627,0.036765,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,6,3786,2,31,20,RL,2,0,1,FR2,1Story,WD,Normal,181500,0.334119,0.227941,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,7,4492,2,7,60,RL,1,1,0,Inside,2Story,WD,Normal,223500,0.409773,0.051471,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,7,4190,3,91,70,RL,3,1,0,Corner,2Story,WD,Abnorml,140000,0.377411,0.669118,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,8,5541,3,8,60,RL,4,1,1,FR2,2Story,WD,Normal,250000,0.522182,0.058824,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [129]:
# Model Parameters

feature_names = df.drop(['TotalArea', 'PropertyAge', 'MSSubClass', 'MSZoning', 'HouseStyle', 'SaleType', 'SaleCondition', 'LotConfig'], axis=1).columns.tolist()

features  = df[feature_names]
label = df[['SalePrice']]
cv=5

models = [{
    'name': 'Linear Regression',
    'model': LinearRegression()
}, {
    'name': 'Ridge Regression',
    'model': Ridge()
}, {
    'name': 'Lasso Regressor',
    'model': Lasso()
}]

In [130]:
def test_models(models, feature_names, features, label, cv, log):
    '''
    Runs a series of simple models to test Feature Engineering performance
    '''

    # Importing the log or creating one if it doesn't exist
    if os.path.isfile(log):
        df_log = pd.read_csv(log)
    else:
        df_log = pd.DataFrame(
            columns=['model', 'features', 'performance', 'cv']
        )

    # Running the models
    for item in models:

        name = item['name']
        scores = cross_val_score(item['model'], features, label, cv=cv, scoring='mean_squared_error')
        performance = sqrt(round(scores.mean(),5))
        performance_std = round(scores.std(),5)
        performance_min = round(scores.min(),5)
        performance_max = round(scores.max(),5)
        
        # Output Performance
        print('{} Mean: {}'.format(name, performance))
        print('{} STD: {}'.format(name, performance_std))
        print('{} Min: {}'.format(name, performance_min))
        print('{} Max: {}'.format(name, performance_max))  

        # Log performance
        performance_dict = {
            'model': [item['name']],
            'features': [feature_names],
            'performance': [performance],
            'cv': [cv]
        }

        df_perf = pd.DataFrame.from_dict(performance_dict)
        df_log = pd.concat([df_log, df_perf])

    df_log = df_log.sort_values(by='performance', ascending=False)
    df_log.to_csv(log, index=False)

In [131]:
test_models(models, feature_names, features, label, cv, log)

Linear Regression Mean: -0.0
Linear Regression STD: 0.0
Linear Regression Min: -0.0
Linear Regression Max: -0.0
Ridge Regression Mean: -0.0
Ridge Regression STD: 0.0
Ridge Regression Min: -0.0
Ridge Regression Max: -0.0
Lasso Regressor Mean: -0.0
Lasso Regressor STD: 0.0
Lasso Regressor Min: -0.0
Lasso Regressor Max: -0.0
