<h1>House Prices Project - Part 2: Regression Techniques</h1>

This code got me into the top 12% on this competition.

<h2>Loads the libraries and the datasets</h2>

In [1]:
import numpy as np
import pandas as pd
from multiprocessing import cpu_count

from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from vecstack import stacking
from xgboost import XGBRegressor

In [2]:
train = pd.read_csv('train_mod.csv')
test = pd.read_csv('test_mod.csv')
train['dataset'] = 'train'
test['dataset'] = 'test'
dataset = pd.concat([train, test], ignore_index = True)
dataset['MSSubClass'] = dataset['MSSubClass'].astype(str)
dataset['MoSold'] = dataset['MoSold'].astype(str)
dataset['OverallQual'] = dataset['OverallQual'].astype(int)
dataset = pd.get_dummies(dataset)
dataset.shape

(2919, 349)

In [3]:
train = dataset[dataset['dataset_train'] == 1].drop(['dataset_train', 'dataset_test'], axis = 1)
test = dataset[dataset['dataset_test'] == 1].drop(['dataset_train', 'dataset_test'], axis = 1)

In [4]:
X_train, y_train = train.drop(['SalePrice', 'SalePriceLog'], axis = 1), train['SalePriceLog']
X_test = test.drop(['SalePrice', 'SalePriceLog'], axis = 1)

In [5]:
scaler = preprocessing.RobustScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

In [6]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=2099)

<h2>Ridge Regression</h2>

In [7]:
# define the grid search parameters
parameters = {
    'alpha': [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5],
    'random_state': [2099]
}

Ridge_reg = GridSearchCV(Ridge(), parameters, scoring='neg_root_mean_squared_error',cv = kfolds, verbose = 2, n_jobs = 3)
Ridge_reg.fit(X_train_transformed, y_train)
print(Ridge_reg.best_params_)
print(Ridge_reg.best_score_)

Fitting 10 folds for each of 11 candidates, totalling 110 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  47 tasks      | elapsed:    3.3s


{'alpha': 14.5, 'random_state': 2099}
-0.11719575585556738


[Parallel(n_jobs=3)]: Done 110 out of 110 | elapsed:    4.3s finished


In [8]:
ridge = Ridge(alpha = 14.5, max_iter = 1e7, random_state = 2099)

<h2>Lasso</h2>

In [9]:
# define the grid search parameters
parameters = {
    'max_iter': [2099],
    'alpha': [1e-4, 2e-4, 3e-4, 4e-4, 5e-4, 6e-4, 7e-4, 8e-4],
    'selection': ['random'],
    'random_state': [2099]
}

Lasso_reg = GridSearchCV(Lasso(), parameters, scoring='neg_root_mean_squared_error',cv = kfolds, verbose = 2, n_jobs = 3)
Lasso_reg.fit(X_train_transformed, y_train)
print(Lasso_reg.best_params_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   19.3s
[Parallel(n_jobs=3)]: Done  80 out of  80 | elapsed:   32.7s finished


{'alpha': 0.0004, 'max_iter': 2099, 'random_state': 2099, 'selection': 'random'}


In [10]:
lasso = Lasso(alpha = 4e-4, max_iter = 1e7, random_state = 2099, selection = 'random')

<h2>Elastic Net K-fold Cross-Validation</h2>

In [11]:
# define the grid search parameters
parameters = {
    'alpha': [4e-4, 5e-4, 6e-4, 7e-4, 8e-4],
    'l1_ratio': [0.55, 0.57, 0.58, 0.59, 0.6],
    'max_iter': [1e7],
    'selection': ['random'],
    'random_state': [2099]
}

ElasticNet_reg = GridSearchCV(ElasticNet(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
ElasticNet_reg.fit(X_train_transformed, y_train)
print(ElasticNet_reg.best_params_)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   23.8s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 250 out of 250 | elapsed:  2.0min finished


{'alpha': 0.0006, 'l1_ratio': 0.58, 'max_iter': 10000000.0, 'random_state': 2099, 'selection': 'random'}


In [12]:
eNet = ElasticNet(alpha = 6e-4, max_iter=1e7, l1_ratio = 0.58, random_state = 2099, selection = 'random')

<h2>Support Vector Regression</h2>

In [13]:
# define the grid search parameters
parameters = {
    'C': [20],
    'epsilon': [5e-3, 8e-3, 1e-2],
    'gamma': [1e-4, 3e-4, 5e-4]
}

svr_reg = GridSearchCV(SVR(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
svr_reg.fit(X_train_transformed, y_train)
print(svr_reg.best_params_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   19.0s
[Parallel(n_jobs=3)]: Done  90 out of  90 | elapsed:   47.1s finished


{'C': 20, 'epsilon': 0.01, 'gamma': 0.0003}


In [14]:
svr = SVR(C = 20, epsilon = 0.01, gamma = 3e-4)

<h2>Bayesian Ridge</h2>

In [15]:
# define the grid search parameters
parameters = {
    'alpha_1': [7e-10, 9e-10, 2e-9, 3e-9],
    'lambda_1': [5.9, 6, 6.1, 6.2],
    'alpha_2': [4.8, 5, 5.2],
    'lambda_2': [7e-2, 1e-1, 5e-1]
}

bayesianridge_reg = GridSearchCV(BayesianRidge(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
bayesianridge_reg.fit(X_train_transformed, y_train)
print(bayesianridge_reg.best_params_)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    3.9s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   16.2s
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed:   37.0s
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 1007 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 1440 out of 1440 | elapsed:  2.4min finished


{'alpha_1': 3e-09, 'alpha_2': 5, 'lambda_1': 5.9, 'lambda_2': 0.1}


In [16]:
bayesianridge = BayesianRidge(alpha_1 = 3e-9, alpha_2 = 5, lambda_1 = 5.9, lambda_2 = 0.1)

<h2>Tree</h2>

In [17]:
# define the grid search parameters
parameters = {
    'splitter': ['best', 'random'],
    'max_depth': [16, 17, 18, 19, 20],
    'min_samples_split': [30, 31, 32, 33, 34],
    'max_features': ['sqrt', 'log2', 'auto'],
    'random_state': [2099]
}

tree_reg = GridSearchCV(DecisionTreeRegressor(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
tree_reg.fit(X_train_transformed, y_train)
print(tree_reg.best_params_)

Fitting 10 folds for each of 150 candidates, totalling 1500 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 178 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done 1146 tasks      | elapsed:   15.8s


{'max_depth': 18, 'max_features': 'auto', 'min_samples_split': 33, 'random_state': 2099, 'splitter': 'best'}


[Parallel(n_jobs=3)]: Done 1500 out of 1500 | elapsed:   21.4s finished


In [18]:
tree = DecisionTreeRegressor(max_depth = 18, max_features = 'auto', min_samples_split = 33, random_state = 2099, splitter = 'best')

<h2>Random Forest</h2>

In [19]:
# define the grid search parameters
parameters = {
    'n_estimators': [200, 400],
    'max_depth': [14, 15, 16, 17, 18],
    'min_samples_split': [2, 4, 8],
    'max_features': ['sqrt'],
    'random_state': [2099]
}

rforest_reg = GridSearchCV(RandomForestRegressor(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
rforest_reg.fit(X_train_transformed, y_train)
print(rforest_reg.best_params_)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   30.0s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:  4.1min finished


{'max_depth': 16, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200, 'random_state': 2099}


In [20]:
rforest = RandomForestRegressor(max_depth = 16, max_features = 'sqrt', min_samples_split = 2, n_estimators = 200, random_state = 2099)

<h2>Extra Trees Regressor</h2>

In [21]:
# define the grid search parameters
parameters = {
    'n_estimators': [200],
    'max_depth': [17, 18, 19],
    'min_samples_split': [6, 7, 8],
    'max_features': ['auto'],
    'random_state': [2099]
}

extratrees_reg = GridSearchCV(ExtraTreesRegressor(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
extratrees_reg.fit(X_train_transformed, y_train)
print(extratrees_reg.best_params_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.6min
[Parallel(n_jobs=3)]: Done  90 out of  90 | elapsed:  6.3min finished


{'max_depth': 18, 'max_features': 'auto', 'min_samples_split': 7, 'n_estimators': 200, 'random_state': 2099}


In [22]:
extratrees = ExtraTreesRegressor(max_depth = 18, max_features = 'auto', min_samples_split = 7, n_estimators = 200, random_state = 2099)

<h2>Gradient Boosting Regressor</h2>

In [23]:
# define the grid search parameters
parameters = {
    'loss': ['huber'],
    'learning_rate': [0.05],
    'n_estimators': [3000],
    'max_depth': [2, 4, 8],
    'min_samples_split': [6, 8, 10],
    'max_features': ['sqrt'],
    'random_state': [2099]
}

gradientboosting_reg = GridSearchCV(GradientBoostingRegressor(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
gradientboosting_reg.fit(X_train_transformed, y_train)
print(gradientboosting_reg.best_params_)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done  90 out of  90 | elapsed:  9.9min finished


{'learning_rate': 0.05, 'loss': 'huber', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_split': 6, 'n_estimators': 3000, 'random_state': 2099}


In [24]:
gradientboosting = GradientBoostingRegressor(learning_rate = 0.05, loss = 'huber', max_depth = 2, max_features = 'sqrt', min_samples_split = 6, n_estimators = 3000, random_state = 2099)

<h2>Lightgbm</h2>

In [25]:
# define the grid search parameters
parameters = {
    'objective': ['regression'],
    'num_leaves': [7, 8, 9],
    'learning_rate': [0.1],
    'min_data_in_leaf': [5, 6, 7],
    'max_depth': [5, 6, 7],
    'random_state': [2099]
}

lightgbm_reg = GridSearchCV(LGBMRegressor(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
lightgbm_reg.fit(X_train_transformed, y_train)
print(lightgbm_reg.best_params_)

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    7.4s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   32.6s
[Parallel(n_jobs=3)]: Done 270 out of 270 | elapsed:   56.8s finished


{'learning_rate': 0.1, 'max_depth': 6, 'min_data_in_leaf': 6, 'num_leaves': 8, 'objective': 'regression', 'random_state': 2099}


In [26]:
lightgbm = LGBMRegressor(objective = 'regression', num_leaves = 8, learning_rate = 0.1, 
                         max_depth = 6, min_data_in_leaf = 6, random_state = 2099)

<h2>XGBoost</h2>

In [27]:
# define the grid search parameters
parameters = {
    'objective': ['reg:squarederror'],
    'learning_rate': [0.1],
    'min_child_weight': [5, 6],
    'subsample': [0.75],
    'colsample_bytree': [0.75],
    'min_data_in_leaf': [2, 3, 4],
    'max_depth': [15, 16, 17],
    'random_state': [2099]
}

xgboost_reg = GridSearchCV(XGBRegressor(), parameters, scoring = 'neg_root_mean_squared_error', cv = kfolds, verbose = 2, n_jobs = 3)
xgboost_reg.fit(X_train_transformed, y_train)
print(xgboost_reg.best_params_)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 180 out of 180 | elapsed:  5.4min finished


{'colsample_bytree': 0.75, 'learning_rate': 0.1, 'max_depth': 16, 'min_child_weight': 5, 'min_data_in_leaf': 2, 'objective': 'reg:squarederror', 'random_state': 2099, 'subsample': 0.75}


In [28]:
xgboost = XGBRegressor(learning_rate = 0.1, max_depth = 16, min_data_in_leaf = 2, min_child_weight = 5, 
                       subsample = 0.75, colsample_bytree = 0.75, objective = 'reg:squarederror')

<h2>Predictions</h2>

In [29]:
ridge = Ridge(alpha = 14.5, max_iter = 1e7, random_state = 2099)
lasso = Lasso(alpha = 4e-4, max_iter = 1e7, random_state = 2099, selection = 'random')
eNet = ElasticNet(alpha = 6e-4, max_iter=1e7, l1_ratio = 0.58, random_state = 2099, selection = 'random')
svr = SVR(C = 20, epsilon = 0.01, gamma = 3e-4)
bayesianridge = BayesianRidge(alpha_1 = 3e-9, alpha_2 = 5, lambda_1 = 5.9, lambda_2 = 0.1)
tree = DecisionTreeRegressor(max_depth = 18, max_features = 'auto', min_samples_split = 33, random_state = 2099, splitter = 'best')
rforest = RandomForestRegressor(max_depth = 16, max_features = 'sqrt', min_samples_split = 2, n_estimators = 200, random_state = 2099)
extratrees = ExtraTreesRegressor(max_depth = 18, max_features = 'auto', min_samples_split = 7, n_estimators = 200, random_state = 2099)
gradientboosting = GradientBoostingRegressor(learning_rate = 0.05, loss = 'huber', max_depth = 2, max_features = 'sqrt', min_samples_split = 8, min_samples_leaf = 15, n_estimators = 5000, random_state = 2099)
lightgbm = LGBMRegressor(bagging_fraction = 0.75, bagging_freq = 5, bagging_seed = 7, feature_fraction = 0.2, feature_fraction_seed = 7, learning_rate = 0.01, max_bin = 200, max_depth = 4, min_data_in_leaf = 4, n_estimators = 5000, num_leaves = 8, objective = 'regression', random_state = 2099)
xgboost = XGBRegressor(learning_rate = 0.1, max_depth = 16, min_data_in_leaf = 2, min_child_weight = 5, subsample = 0.75, colsample_bytree = 0.75, objective = 'reg:squarederror')

In [30]:
models = [ridge, lasso, eNet, svr, bayesianridge, 
          tree, rforest, extratrees, gradientboosting, lightgbm, xgboost]

S_train, S_test = stacking(models, X_train, y_train, X_test, regression = True, mode = 'oof_pred_bag', 
                           needs_proba = False, save_dir = None, metric = mean_squared_error, 
                           n_folds = 10, stratified = False, shuffle = True, random_state = 2099, verbose=2)

task:         [regression]
metric:       [mean_squared_error]
mode:         [oof_pred_bag]
n_models:     [11]

model  0:     [Ridge]
    fold  0:  [0.01895957]
    fold  1:  [0.01696827]


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


    fold  2:  [0.01615838]
    fold  3:  [0.01254398]
    fold  4:  [0.01337560]
    fold  5:  [0.00761028]
    fold  6:  [0.01320632]
    fold  7:  [0.01836430]
    fold  8:  [0.01209864]
    fold  9:  [0.01462718]
    ----
    MEAN:     [0.01439125] + [0.00321807]
    FULL:     [0.01439125]

model  1:     [Lasso]

  overwrite_a=True).T
  overwrite_a=True).T



    fold  0:  [0.01966990]
    fold  1:  [0.01740386]
    fold  2:  [0.01505695]
    fold  3:  [0.01081076]
    fold  4:  [0.01321318]
    fold  5:  [0.00663181]
    fold  6:  [0.01171878]
    fold  7:  [0.01831340]
    fold  8:  [0.01096353]
    fold  9:  [0.01371758]
    ----
    MEAN:     [0.01374997] + [0.00377380]
    FULL:     [0.01374997]

model  2:     [ElasticNet]
    fold  0:  [0.01957326]
    fold  1:  [0.01735307]
    fold  2:  [0.01507354]
    fold  3:  [0.01096000]
    fold  4:  [0.01309205]
    fold  5:  [0.00667082]
    fold  6:  [0.01178047]
    fold  7:  [0.01812224]
    fold  8:  [0.01113508]
    fold  9:  [0.01348942]
    ----
    MEAN:     [0.01372500] + [0.00369961]
    FULL:     [0.01372500]

model  3:     [SVR]
    fold  0:  [0.15020624]
    fold  1:  [0.17637557]
    fold  2:  [0.17651356]
    fold  3:  [0.15688831]
    fold  4:  [0.13417871]
    fold  5:  [0.10765849]
    fold  6:  [0.15888198]
    fold  7:  [0.19113056]
    fold  8:  [0.15058870]
    fold  9

In [31]:
# define the grid search parameters
parameters = {
    'alpha': [4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9],
    'random_state': [2099]
}

ridge = GridSearchCV(Ridge(), parameters, scoring='neg_root_mean_squared_error',cv = kfolds, verbose = 2, n_jobs = 3)
ridge.fit(S_train, y_train)
print(ridge.best_params_)

Fitting 10 folds for each of 19 candidates, totalling 190 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  56 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done 190 out of 190 | elapsed:    2.5s finished


{'alpha': 5.4, 'random_state': 2099}


In [32]:
model = Ridge(alpha = 5.4, random_state = 2099)
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)

In [33]:
idlist = pd.Series(range(1461, 2920))

In [34]:
SalePrice = pd.Series(np.floor(np.exp(y_pred)))

In [35]:
submission = pd.DataFrame({'Id': idlist, 'SalePrice': SalePrice})

In [36]:
submission

Unnamed: 0,Id,SalePrice
0,1461,122430.0
1,1462,154822.0
2,1463,182752.0
3,1464,197522.0
4,1465,190063.0
...,...,...
1454,2915,82612.0
1455,2916,80127.0
1456,2917,166121.0
1457,2918,115564.0


In [37]:
submission.to_csv('Submission.csv', index = False)