In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import GridSearchCV

In [20]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
sample = pd.read_csv('sample_submission.csv')

In [8]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [10]:
X = train.iloc[:,:-1].copy()
y = train.SalePrice

In [11]:
# According to the documentation, some NA values actually contain information on the category and are not just missing entries.

correct_null = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

missing_values = X.isnull().sum()[(X.isnull().sum() > 0)&(~X.isnull().sum().index.isin(correct_null))]

object_columns = X.select_dtypes('object').columns
numeric_columns = X.select_dtypes(exclude='object').columns

In [12]:
# filling "correct" NA values with the string 'not'
X[correct_null] = X[correct_null].fillna(value = 'not')

# these are the actual null values according to the documentation
X.isna().sum()[X.isna().sum() > 0]

LotFrontage    259
MasVnrType       8
MasVnrArea       8
Electrical       1
GarageYrBlt     81
dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [124]:
rfr = RandomForestRegressor()
abr = AdaBoostRegressor()
gbr = GradientBoostingRegressor()

numeric_pipeline = Pipeline([('numeric imputer', SimpleImputer(strategy='mean')),
                             ('scaler', StandardScaler())])

categorical_pipeline = Pipeline([('one hot encoding', OneHotEncoder(handle_unknown='ignore'))])

transformer = ColumnTransformer([('numeric pipeline', numeric_pipeline, numeric_columns),
                                 ('categorical pipeline', categorical_pipeline, object_columns)],
                                remainder = 'passthrough', sparse_threshold = 0)

model_pipeline = Pipeline([('transformer', transformer),
                           ('model', rfr)])

In [125]:
rfr_params = {'model__n_estimators': [10, 50, 100, 500, 1000],
              'model__max_depth': [3, None],
              'model': [rfr]}
abr_params = {'model__n_estimators': [10, 50, 100, 500, 1000],
              'model__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
              'model': [abr]}
gbr_params = {'model__n_estimators': [10, 50, 100, 500, 1000],
              'model__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
              'model__max_depth': [3, None],
              'model': [gbr]}
param_grid = [rfr_params, abr_params, gbr_params]

In [128]:
gs = GridSearchCV(estimator = model_pipeline,
                  param_grid = param_grid,
                  cv = 5)

In [129]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          sparse_threshold=0,
                                                          transformers=[('numeric '
                                                                         'pipeline',
                                                                         Pipeline(steps=[('numeric '
                                                                                          'imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         In

In [130]:
# import sklearn
# sklearn.metrics.SCORERS.keys()

In [131]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle

y_pred = gs.predict(X_test)
rmse = mse(y_test, y_pred, squared = False)
rmsle = msle(y_test, y_pred, squared = False)
print(f'rmse: {rmse}, rmsle: {rmsle}')

rmse: 27292.923885451368, rmsle: 0.13989637518523373


In [132]:
gs.best_params_

{'model': GradientBoostingRegressor(learning_rate=0.01, n_estimators=1000),
 'model__learning_rate': 0.01,
 'model__max_depth': 3,
 'model__n_estimators': 1000}

In [69]:
predictions = gs.predict(test)
pred_df = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
pred_csv = pred_df.to_csv('submission_1.csv', index = False)
# rmse: 29364.96317211984

In [105]:
# predictions = gs.predict(test)
# pred_df = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
# pred_csv = pred_df.to_csv('submission_2.csv', index = False)
# rmse: 26303.82350858063