In [1]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
import warnings
import sklearn
import pandas as pd


from sklearn.model_selection import train_test_split

In [2]:
test = pd.read_csv('pipeline_data/test.csv')
train = pd.read_csv('pipeline_data/train.csv')

In [3]:
X = train.drop(columns = 'SalePrice')
y = train.SalePrice

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                random_state=42)


In [4]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [5]:
X.isnull().sum()[X.isnull().sum() > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [6]:
num_features = X_train.select_dtypes(include='number').columns.tolist()
cat_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

In [8]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [9]:
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(transformers=[
    ('number', num_pipeline, num_features),
    ('category', cat_pipeline, cat_features)
])

In [10]:
from column_transformer_features import get_feature_names

arr = transformer.fit_transform(X_train)
pd.DataFrame(arr, columns =get_feature_names(transformer))



Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,x41_ConLw,x41_New,x41_Oth,x41_WD,x42_Abnorml,x42_AdjLand,x42_Alloca,x42_Family,x42_Normal,x42_Partial
0,0.174092,0.000000,0.167808,0.033186,0.444444,0.625,0.615942,0.116667,0.000000,0.163359,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.730637,0.235294,0.130137,0.030555,0.555556,0.750,0.876812,0.733333,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.437286,0.058824,0.157534,0.034948,0.444444,0.750,0.275362,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.547635,0.176471,0.133562,0.027577,0.444444,0.750,0.471014,0.000000,0.182874,0.100815,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.260452,0.176471,0.099315,0.017294,0.444444,0.625,0.376812,0.000000,0.000000,0.038625,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0.750514,0.000000,0.195205,0.037472,0.555556,0.500,0.971014,0.933333,0.000000,0.004252,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1164,0.774503,0.176471,0.150685,0.030400,0.333333,0.250,0.405797,0.000000,0.000000,0.110206,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1165,0.886909,0.000000,0.133562,0.032120,0.444444,0.750,0.601449,0.666667,0.000000,0.029589,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1166,0.589445,0.176471,0.116438,0.029643,0.666667,0.875,0.333333,0.800000,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

In [12]:
lasso = Lasso(alpha=0.1)

lasso_pipeline = Pipeline(steps=[
    ('preprocess', transformer),
    ('model', lasso)
])

In [13]:
lasso_pipeline.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(


Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('number',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   MinMaxScaler())]),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                         

In [14]:
preds = lasso_pipeline.predict(X_test)
mean_absolute_error(y_test, preds)

18076.767565943333

In [15]:
from sklearn.model_selection import GridSearchCV

In [17]:
param_dict = {'model__alpha': [0.5]}

lasso_search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error')

lasso_search.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('number',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer()),
                                                                                         ('scale',
                                                                                          MinMaxScaler())]),
                                                                         ['Id',
                                                                          'MSSubClass',
                                                                          'LotFrontage',
                                                                          'LotArea',
                                                                          'OverallQua

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
rf = RandomForestRegressor(random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocess', transformer),
    ('model', rf)
])

In [95]:
param_dicta = {'model__n_estimators': [10,150], 'model__max_depth':[3,30,50,100,500, None]}

rf_search = GridSearchCV(rf_pipeline, param_dicta, 
                      cv=10, 
                      scoring='neg_mean_absolute_error')

rf_search.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('number',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer()),
                                                                                         ('scale',
                                                                                          MinMaxScaler())]),
                                                                         ['Id',
                                                                          'MSSubClass',
                                                                          'LotFrontage',
                                                                          'LotArea',
                                                                          'OverallQua

In [96]:
rf_search.best_params_

{'model__max_depth': 30, 'model__n_estimators': 150}

In [97]:
-rf_search.best_score_

17858.182575351217

In [98]:
rf_search.cv_results_

{'mean_fit_time': array([0.09343646, 0.84041312, 0.21751783, 2.88596718, 0.21902485,
        2.9356061 , 0.22681742, 2.97321517, 0.22535009, 3.39482372,
        0.2684958 , 3.24721911]),
 'std_fit_time': array([0.01202293, 0.01077071, 0.0042035 , 0.02175597, 0.00590935,
        0.07703271, 0.00282434, 0.02345325, 0.00250343, 0.3596496 ,
        0.00980904, 0.19342788]),
 'mean_score_time': array([0.00958023, 0.01595175, 0.00897648, 0.02003725, 0.0090348 ,
        0.01964436, 0.00908213, 0.01934474, 0.00897171, 0.02225437,
        0.01048172, 0.02217536]),
 'std_score_time': array([0.00157662, 0.00044677, 0.00044622, 0.0016733 , 0.00065531,
        0.00100757, 0.00082811, 0.00111461, 0.00107626, 0.0034818 ,
        0.00101373, 0.00225786]),
 'param_model__max_depth': masked_array(data=[3, 3, 30, 30, 50, 50, 100, 100, 500, 500, None, None],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',


In [99]:
dat = rf_search.best_estimator_['model'].feature_importances_
fn = get_feature_names(transformer)

prio_list = sorted(zip(dat,fn), reverse = True)



In [100]:
for x,y in prio_list:
    print(x,y)

0.5545758699707554 OverallQual
0.1232549041853066 GrLivArea
0.03344446743320289 TotalBsmtSF
0.03258429435414961 2ndFlrSF
0.029205756691776587 BsmtFinSF1
0.02545772809137748 1stFlrSF
0.01662930621594092 LotArea
0.015498795449692666 GarageArea
0.013216270721568836 GarageCars
0.01167797840527534 YearBuilt
0.008597461601294782 LotFrontage
0.006791617258196842 x34_Unf
0.006617262522899944 YearRemodAdd
0.006128645079104858 TotRmsAbvGrd
0.005900541278797024 GarageYrBlt
0.005588367846217423 x21_Ex
0.00557780555636611 OpenPorchSF
0.005489156648532317 FullBath
0.004644823816748272 BsmtUnfSF
0.004090893174139889 Id
0.004025978863776462 WoodDeckSF
0.0035696310913644388 OverallCond
0.003292262603934207 Fireplaces
0.003249281162749926 MasVnrArea
0.0029965146104172693 ScreenPorch
0.0029351866732711266 x30_Gd
0.002913135423365668 x21_Gd
0.002875675494361785 MoSold
0.002239569296948643 MSSubClass
0.0020062551947595485 x28_Y
0.0017747635760200049 x33_Detchd
0.0016483603149879135 x3_Reg
0.001618904618986

In [101]:
import matplotlib.pyplot as plt
rf_search.cv_results_

{'mean_fit_time': array([0.09343646, 0.84041312, 0.21751783, 2.88596718, 0.21902485,
        2.9356061 , 0.22681742, 2.97321517, 0.22535009, 3.39482372,
        0.2684958 , 3.24721911]),
 'std_fit_time': array([0.01202293, 0.01077071, 0.0042035 , 0.02175597, 0.00590935,
        0.07703271, 0.00282434, 0.02345325, 0.00250343, 0.3596496 ,
        0.00980904, 0.19342788]),
 'mean_score_time': array([0.00958023, 0.01595175, 0.00897648, 0.02003725, 0.0090348 ,
        0.01964436, 0.00908213, 0.01934474, 0.00897171, 0.02225437,
        0.01048172, 0.02217536]),
 'std_score_time': array([0.00157662, 0.00044677, 0.00044622, 0.0016733 , 0.00065531,
        0.00100757, 0.00082811, 0.00111461, 0.00107626, 0.0034818 ,
        0.00101373, 0.00225786]),
 'param_model__max_depth': masked_array(data=[3, 3, 30, 30, 50, 50, 100, 100, 500, 500, None, None],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
