In [59]:
import warnings
warnings.filterwarnings('ignore')

In [60]:
import os
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [61]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', -1)

In [62]:
infile = './transformed_data/train_engineered.csv'
log = './logs/log.csv'

In [63]:
df = pd.read_csv(infile)
df.head(5)

Unnamed: 0,OverallQual,TotalArea,GarageCars,PropertyAge,MSSubClass,MSZoning,Neighborhood,HasPorch,HasDeck,LotConfig,HouseStyle,SaleType,SaleCondition,SalePrice,TotalArea_scaled,PropertyAge_scaled,OverallQual_scaled,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Neighborhood_High,Neighborhood_Low,Neighborhood_Mid,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_Normal,SaleCondition_Partial
0,7,4276,2,5,60,RL,Mid,1,0,Inside,2Story,WD,Normal,208500,0.386627,0.036765,0.666667,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,6,3786,2,31,20,RL,Mid,0,1,FR2,1Story,WD,Normal,181500,0.334119,0.227941,0.555556,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,7,4492,2,7,60,RL,Mid,1,0,Inside,2Story,WD,Normal,223500,0.409773,0.051471,0.666667,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
3,7,4190,3,91,70,RL,Mid,1,0,Corner,2Story,WD,Abnorml,140000,0.377411,0.669118,0.666667,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,8,5541,3,8,60,RL,High,1,1,FR2,2Story,WD,Normal,250000,0.522182,0.058824,0.777778,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [80]:
# Model Parameters
feature_names = [
    'OverallQual_scaled',
    'TotalArea_scaled',
    'GarageCars',
    'PropertyAge_scaled',
    'Neighborhood_High',
    'Neighborhood_Low',
    'Neighborhood_Mid',
    'HasPorch',
    'HasDeck',
    'SaleCondition_Abnorml',
    'SaleCondition_Normal',
    'SaleCondition_Partial'    
]

features  = df[feature_names]
label = np.log1p(df[['SalePrice']])
cv=5

models = [{
    'name': 'Linear Regression',
    'model': LinearRegression()
}, {
    'name': 'Ridge Regression',
    'model': Ridge()
}, {
    'name': 'Lasso Regressor',
    'model': Lasso()
}]

In [81]:
model = LinearRegression()
out = cross_val_score(model, features, label, cv=10)
out.mean()

0.839892786333551

In [66]:
def test_models(models, feature_names, features, label, cv, log):
    '''
    Runs a series of simple models to test Feature Engineering performance
    '''

    # Importing the log or creating one if it doesn't exist
    if os.path.isfile(log):
        df_log = pd.read_csv(log)
    else:
        df_log = pd.DataFrame(
            columns=['model', 'features', 'performance', 'cv']
        )

    # Running the models
    for item in models:

        name = item['name']
        scores = cross_val_score(item['model'], features, label, cv=cv)
        performance = sqrt(round(scores.mean(),5))
        performance_std = round(scores.std(),5)
        performance_min = round(scores.min(),5)
        performance_max = round(scores.max(),5)
        
        # Output Performance
        print('{} Mean: {}'.format(name, performance))
        print('{} STD: {}'.format(name, performance_std))
        print('{} Min: {}'.format(name, performance_min))
        print('{} Max: {}'.format(name, performance_max))  

        # Log performance
        performance_dict = {
            'model': [item['name']],
            'features': [feature_names],
            'performance': [performance],
            'cv': [cv]
        }

        df_perf = pd.DataFrame.from_dict(performance_dict)
        df_log = pd.concat([df_log, df_perf])

    df_log = df_log.sort_values(by='performance', ascending=False)
    df_log.to_csv(log, index=False)

In [67]:
test_models(models, feature_names, features, label, cv, log)

Linear Regression Mean: 0.8931517228332485
Linear Regression STD: 0.02407
Linear Regression Min: 0.75598
Linear Regression Max: 0.8214
Ridge Regression Mean: 0.893481952811583
Ridge Regression STD: 0.02348
Ridge Regression Min: 0.75819
Ridge Regression Max: 0.8216
Lasso Regressor Mean: 0.8935658901278629
Lasso Regressor STD: 0.02405
Lasso Regressor Min: 0.7559
Lasso Regressor Max: 0.82146
