In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Data preparation

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
dataset = pd.read_csv('data/train.csv').fillna(method='ffill')
dataset.shape

(1460, 81)

In [4]:
def encode_column(encoded_dataset):
    encoded_dataset['MSZoning'] = encoded_dataset['MSZoning'].replace({'A': 0, 'C': 1, 'C (all)': 1, 'FV': 2, 'I': 3, 'RH': 4, 'RL': 5, 'RP': 6, 'RM': 7 }).astype(int)
    encoded_dataset['Street'] = encoded_dataset['Street'].replace({'Grvl': 0, 'Pave': 1 }).astype(int)
    encoded_dataset['Alley'] = encoded_dataset['Alley'].replace({'Grvl': 0, 'Pave': 1, 'NA': 2 }).astype(int)
    encoded_dataset['LotShape'] = encoded_dataset['LotShape'].replace({'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 3 }).astype(int)
    encoded_dataset['LandContour'] = encoded_dataset['LandContour'].replace({'Lvl': 0, 'Bnk': 1, 'HLS': 2, 'Low': 3 }).astype(int)
    encoded_dataset['Utilities'] = encoded_dataset['Utilities'].replace({'AllPub': 0, 'NoSewr': 1, 'NoSeWa': 2, 'ELO': 3 }).astype(int)
    encoded_dataset['LotConfig'] = encoded_dataset['LotConfig'].replace({'Inside': 0, 'Corner': 1, 'CulDSac': 2, 'FR2': 3, 'FR3': 4 }).astype(int)
    encoded_dataset['LandSlope'] = encoded_dataset['LandSlope'].replace({'Gtl': 0, 'Mod': 1, 'Sev': 2 }).astype(int)
    encoded_dataset['Neighborhood'] = encoded_dataset['Neighborhood'].replace({'Blmngtn': 0, 'Blueste': 1, 'BrDale': 2, 'BrkSide': 3, 'ClearCr': 4, 'CollgCr': 5, 'Crawfor': 6, 'Edwards': 7, 'Gilbert': 8, 'IDOTRR': 9, 'MeadowV': 10, 'Mitchel': 11, 'Names': 12, 'NAmes': 12, 'NoRidge': 13, 'NPkVill': 14, 'NridgHt': 15, 'NWAmes': 16, 'OldTown': 17, 'SWISU': 18, 'Sawyer': 19, 'SawyerW': 20, 'Somerst': 21, 'StoneBr': 22, 'Timber': 23, 'Veenker': 24 }).astype(int)
    encoded_dataset['Condition1'] = encoded_dataset['Condition1'].replace({'Artery': 0, 'Feedr': 1, 'Norm': 2, 'RRNn': 3, 'RRAn': 4, 'PosN': 5, 'PosA': 6, 'RRNe': 7, 'RRAe': 8 }).astype(int)
    encoded_dataset['Condition2'] = encoded_dataset['Condition2'].replace({'Artery': 0, 'Feedr': 1, 'Norm': 2, 'RRNn': 3, 'RRAn': 4, 'PosN': 5, 'PosA': 6, 'RRNe': 7, 'RRAe': 8 }).astype(int)
    encoded_dataset['BldgType'] = encoded_dataset['BldgType'].replace({'1Fam': 0, '2FmCon': 1, '2fmCon': 1, 'Duplx': 2, 'Duplex': 2, 'TwnhsE': 3, 'Twnhs': 3, 'TwnhsI': 4 }).astype(int)
    encoded_dataset['HouseStyle'] = encoded_dataset['HouseStyle'].replace({'1Story': 0, '1.5Fin': 1, '1.5Unf': 2, '2Story': 3, '2.5Fin': 4, '2.5Unf': 5, 'SFoyer': 6, 'SLvl': 7 }).astype(int)
    encoded_dataset['RoofStyle'] = encoded_dataset['RoofStyle'].replace({'Flat': 0, 'Gable': 1, 'Gambrel': 2, 'Hip': 3, 'Mansard': 4, 'Shed': 5 }).astype(int)
    encoded_dataset['RoofMatl'] = encoded_dataset['RoofMatl'].replace({'ClyTile': 0, 'CompShg': 1, 'Membran': 2, 'Metal': 3, 'Roll': 4, 'Tar&Grv': 5, 'WdShake': 6, 'WdShngl': 7 }).astype(int)
    encoded_dataset['Exterior1st'] = encoded_dataset['Exterior1st'].replace({'AsbShng': 0, 'AsphShn': 1, 'BrkComm': 2, 'Brk Cmn': 2, 'BrkFace': 3, 'CBlock': 4, 'CemntBd': 5, 'CmentBd': 5, 'HdBoard': 6, 'ImStucc': 7, 'MetalSd': 8, 'Other': 9, 'Plywood': 10, 'PreCase': 11, 'Stone': 12, 'Stucco': 13, 'VinylSd': 14, 'Wd Sdng': 15, 'WdShing': 16, 'Wd Shng': 17 }).astype(int)
    encoded_dataset['Exterior2nd'] = encoded_dataset['Exterior2nd'].replace({'AsbShng': 0, 'AsphShn': 1, 'BrkComm': 2, 'Brk Cmn': 2, 'BrkFace': 3, 'CBlock': 4, 'CemntBd': 5, 'CmentBd': 5, 'HdBoard': 6, 'ImStucc': 7, 'MetalSd': 8, 'Other': 9, 'Plywood': 10, 'PreCase': 11, 'Stone': 12, 'Stucco': 13, 'VinylSd': 14, 'Wd Sdng': 15, 'WdShing': 16, 'Wd Shng': 17 }).astype(int)
    encoded_dataset['MasVnrType'] = encoded_dataset['MasVnrType'].replace({'BrkCmn': 0, 'BrkFace': 1, 'CBlock': 2, 'None': 3, 'Stone': 4 }).astype(int)
    encoded_dataset['ExterQual'] = encoded_dataset['ExterQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['ExterCond'] = encoded_dataset['ExterCond'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['Foundation'] = encoded_dataset['Foundation'].replace({'BrkTil': 0, 'CBlock': 1, 'PConc': 2, 'Slab': 3, 'Stone': 4, 'Wood': 5 }).astype(int)
    encoded_dataset['BsmtQual'] = encoded_dataset['BsmtQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['BsmtCond'] = encoded_dataset['BsmtCond'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['BsmtExposure'] = encoded_dataset['BsmtExposure'].replace({'Gd': 0, 'Av': 1, 'Mn': 2, 'No': 3, 'NA': 5 }).astype(int)
    encoded_dataset['BsmtFinType1'] = encoded_dataset['BsmtFinType1'].replace({'GLQ': 0, 'ALQ': 1, 'BLQ': 2, 'Rec': 3, 'LwQ': 4, 'Unf': 5, 'NA': 6 }).astype(int)
    encoded_dataset['BsmtFinType2'] = encoded_dataset['BsmtFinType2'].replace({'GLQ': 0, 'ALQ': 1, 'BLQ': 2, 'Rec': 3, 'LwQ': 4, 'Unf': 5, 'NA': 6 }).astype(int)
    encoded_dataset['Heating'] = encoded_dataset['Heating'].replace({'Floor': 0, 'GasA': 1, 'GasW': 2, 'Grav': 3, 'OthW': 4, 'Wall': 5 }).astype(int)
    encoded_dataset['HeatingQC'] = encoded_dataset['HeatingQC'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['CentralAir'] = encoded_dataset['CentralAir'].replace({'N': 0, 'Y': 1 }).astype(int)
    encoded_dataset['Electrical'] = encoded_dataset['Electrical'].replace({'SBrkr': 0, 'FuseA': 1, 'FuseF': 2, 'FuseP': 3, 'Mix': 4 }).astype(int)
    encoded_dataset['KitchenQual'] = encoded_dataset['KitchenQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['Functional'] = encoded_dataset['Functional'].replace({'Typ': 0, 'Min1': 1, 'Min2': 2, 'Mod': 3, 'Maj1': 4, 'Maj2': 5, 'Sev': 6, 'Sal': 7 }).astype(int) 
    encoded_dataset['FireplaceQu'] = encoded_dataset['FireplaceQu'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['GarageType'] = encoded_dataset['GarageType'].replace({'2Types': 0, 'Attchd': 1, 'Basment': 2, 'BuiltIn': 3, 'CarPort': 4, 'Detchd': 5, 'NA': 6 }).astype(int)
    encoded_dataset['GarageFinish'] = encoded_dataset['GarageFinish'].replace({'Fin': 0, 'RFn': 1, 'Unf': 2, 'NA': 3 }).astype(int)
    encoded_dataset['GarageQual'] = encoded_dataset['GarageQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['GarageCond'] = encoded_dataset['GarageCond'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['PavedDrive'] = encoded_dataset['PavedDrive'].replace({'Y': 0, 'P': 1, 'N': 2 }).astype(int)
    encoded_dataset['PoolQC'] = encoded_dataset['PoolQC'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['Fence'] = encoded_dataset['Fence'].replace({'GdPrv': 0, 'MnPrv': 1, 'GdWo': 2, 'MnWw': 3, 'NA': 4 }).astype(int)
    encoded_dataset['MiscFeature'] = encoded_dataset['MiscFeature'].replace({'Elev': 0, 'Gar2': 1, 'Othr': 2, 'Shed': 3, 'TenC': 4, 'NA': 5 }).astype(int)
    encoded_dataset['SaleType'] = encoded_dataset['SaleType'].replace({'WD': 0, 'CWD': 1, 'VWD': 2, 'New': 3, 'COD': 4, 'Con': 5, 'ConLw': 6, 'ConLI': 7, 'ConLD': 8, 'Oth': 9 }).astype(int)
    encoded_dataset['SaleCondition'] = encoded_dataset['SaleCondition'].replace({'Normal': 0, 'Abnorml': 1, 'AdjLand': 2, 'Alloca': 3, 'Family': 4, 'Partial': 5 }).astype(int)
    return encoded_dataset

In [7]:
dataset = encode_column(dataset.fillna(0))
dataset.shape

(1460, 81)

In [8]:
X = dataset.drop(['Id', 'SalePrice'], axis=1).values
y = dataset['SalePrice'].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Test Dataset

In [36]:
test_dataset = encode_column(pd.read_csv('data/test.csv').fillna(0).drop(['Id'], axis=1))
test_dataset.shape

test_id = pd.read_csv('data/test.csv').fillna(0)['Id']

# Modeling

## Random Forest

In [45]:
from sklearn.ensemble import RandomForestRegressor
rf_params = {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

random_forest_regressor = RandomForestRegressor(**rf_params)
random_forest_regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_features='sqrt', min_samples_split=5)

## XGBoost

In [46]:
import xgboost as xgb
xg_params = {'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8}

xgboost = xgb.XGBRegressor(**xg_params)
xgboost.fit(X_train, y_train, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

## Gradient Boosting

In [47]:
from sklearn.ensemble import GradientBoostingRegressor
gb_params = {'learning_rate': 0.01, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 1000, 'subsample': 0.8}

gb_regressor = GradientBoostingRegressor(**params)
gb_regressor.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=5, max_features='sqrt',
                          min_samples_split=10, n_estimators=1000,
                          subsample=0.8)

## Blending

In [48]:
from sklearn.ensemble import VotingRegressor

models = list()
models.append(('gradient_boosting', GradientBoostingRegressor(**gb_params)))
models.append(('xgboost', xgb.XGBRegressor(**xg_params)))
models.append(('random_forest', RandomForestRegressor(**rf_params)))

ensemble = VotingRegressor(estimators=models)

ensemble.fit(X_train, y_train)

y_pred = ensemble.predict(X_test)

In [49]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 16129.159692814941
Mean Squared Error: 929528834.5934461
Root Mean Squared Error: 30488.17532410633


In [50]:
test_pred = ensemble.predict(test_dataset.values)

In [51]:
df = pd.DataFrame({'Id': test_id, 'SalePrice': test_pred})
df.to_csv('voting-all-encoded-column-tuned.csv', index=False)  

# Grid Searching

In [38]:
from sklearn.model_selection import GridSearchCV

## Random Forest

In [39]:
parameters = { 'bootstrap': [True, False],
              'max_depth': [50, 100, None],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2],
              'min_samples_split': [2, 5],
              'n_estimators': [100, 500, 10000]}

clf = GridSearchCV(RandomForestRegressor(), parameters, cv=3, n_jobs=8, verbose=10)

clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    7.8s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:   18.8s
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:  2.9min
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:  3.4min
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed:  3.9min
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed:  4.1min
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed:  6.0min
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:  6.2min
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed:  7.3min
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  7.9min
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed:  9.8min
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed: 10.1min
[Parallel(

0.9960451180624581
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


## XGBoost

In [40]:
parameters = { 'min_child_weight': [1, 5, 10],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'max_depth': [3, 4, 5]}

clf = GridSearchCV(xgb.XGBRegressor(), parameters, cv=3, n_jobs=8, verbose=10)

clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)

Fitting 3 folds for each of 405 candidates, totalling 1215 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    5.6s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    6.3s
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    6.7s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:    7.5s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    7.9s
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed:    8.3s
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed:    8.6s
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed:    8.9s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    9.2s
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed:    9.6s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   10.1s
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed:   10.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   10.8s
[Parallel(

0.9989969806114242
{'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8}


## Gradient Boosting

In [None]:
parameters = {
    "learning_rate": [0.001, 0.01, 0.05, 0.1],
    "min_samples_split": [5, 10, 30, 50],
    "max_depth": [3, 5, 7],
    "max_features": ["log2", "sqrt"],
    "subsample": [0.8, 0.9, 0.95, 1.0],
    "n_estimators": [500, 1000, 5000]
    }

clf = GridSearchCV(GradientBoostingRegressor(), parameters, cv=3, n_jobs=8, verbose=10)

clf.fit(X_encoded_train, y_encoded_train)
print(clf.score(X_encoded_train, y_encoded_train))
print(clf.best_params_)

# Elastic Net

In [52]:
from sklearn.linear_model import ElasticNet

In [54]:
regr = ElasticNet(random_state=0, max_iter=100000, alpha=0.1, l1_ratio=0.5)
regr.fit(X_train, y_train)

  positive)


ElasticNet(alpha=0.1, max_iter=100000, random_state=0)

In [55]:
y_pred = regr.predict(X_test)

In [56]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 21552.193426189027
Mean Squared Error: 2287130520.622509
Root Mean Squared Error: 47823.95341899819


In [57]:
test_pred = regr.predict(test_dataset.values)

In [58]:
df = pd.DataFrame({'Id': test_id, 'SalePrice': test_pred})
df.to_csv('elastic-net-all-encoded-column-tuned.csv', index=False)  

## Elastic Net Grid Searching

In [53]:
parameters = {'alpha': [0, 0.5, 0.1, 0.01, 0.001],
                                'l1_ratio': [0, 0.25, 0.5, 0.75, 1]}

clf = GridSearchCV(ElasticNet(random_state=0, max_iter=100000), parameters, cv=3, n_jobs=8, verbose=10)

clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    7.7s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:   14.0s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:   19.0s
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   21.8s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   32.8s
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:   40.3s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   47.6s
[Parallel(n_jobs=8)]: Done  68 out of  75 | elapsed:   55.1s remaining:    5.7s
[Parallel(n_jobs=8)]: Done  75 out of  75 | elapsed:   59.6s finished


0.8777144413375937
{'alpha': 0.1, 'l1_ratio': 0.5}


  positive)
