In [None]:
from google.colab import files

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet, Lasso,  Ridge, LinearRegression, BayesianRidge, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

# Colab-only step: call the google.colab upload helper and keep its return value.
# NOTE(review): presumably this is how the CSV files read in the next cell reach
# the runtime -- confirm, and skip this line when running outside Colab.
uploaded = files.upload()

In [None]:
# Load the four input tables in one pass: the cleaned train/test split first,
# then the polynomial-feature variants of the same split.
train, test, train_poly, test_poly = map(
    pd.read_csv,
    ('train_clean.csv', 'test_clean.csv', 'train_poly.csv', 'test_poly.csv'),
)

In [None]:
### Data Preprocessing

# One-hot-encode data
# Train and test are encoded together so both halves end up with the same dummy
# columns. The split point is taken from the actual train length instead of a
# hard-coded row count, so the cell keeps working if the input files change.
n_train = len(train)
combined = pd.get_dummies(pd.concat([train, test]))
train = combined.iloc[:n_train]
test = combined.iloc[n_train:]

n_train_poly = len(train_poly)
combined = pd.get_dummies(pd.concat([train_poly, test_poly]))
train_poly = combined.iloc[:n_train_poly]
test_poly = combined.iloc[n_train_poly:]

# Train-test split
# The test rows still carry a SalePrice column after the concat; it is dropped
# here so the feature matrices have matching columns.
X_train = train.drop(['SalePrice'], axis=1)
y_train = train['SalePrice']
X_test = test.drop(['SalePrice'], axis=1)

X_train_poly = train_poly.drop(['SalePrice'], axis=1)
y_train_poly = train_poly['SalePrice']
X_test_poly = test_poly.drop(['SalePrice'], axis=1)

# Scale data
# Each scaler is fitted on the training features only and then applied to the
# matching test features. Note that X_* become NumPy arrays from here on.
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)

std_scaler = StandardScaler()
X_train_poly = std_scaler.fit_transform(X_train_poly)
X_test_poly = std_scaler.transform(X_test_poly)

In [None]:
# Candidate regressors compared in the baseline tables.
# A single seed is passed to every estimator that draws random numbers so the
# reported scores are reproducible across kernel restarts.
SEED = 42

lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
en = ElasticNet()
bay = BayesianRidge()
rfr = RandomForestRegressor(random_state=SEED)
gbr = GradientBoostingRegressor(random_state=SEED)
ada = AdaBoostRegressor(random_state=SEED)
xgb = XGBRegressor(random_state=SEED)

models = [lr, ridge, lasso, en, bay, rfr, gbr, ada, xgb]

def rmse_cv(model, X=None, y=None, cv=3):
    """Return the mean cross-validated RMSE of ``model`` as a positive number.

    Parameters
    ----------
    model : estimator
        Anything accepted by ``cross_val_score``.
    X, y : array-like, optional
        Features and target to score on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which matches the original
        single-argument behaviour.
    cv : int, default 3
        Number of cross-validation folds passed to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values (the sign of the
        ``neg_root_mean_squared_error`` scores is flipped).
    """
    features = X_train if X is None else X
    target = y_train if y is None else y
    fold_scores = cross_val_score(model, features, target,
                                  scoring="neg_root_mean_squared_error", cv=cv)
    return -np.mean(fold_scores)

In [None]:
# Baseline model performances

baseline_rows = []

for estimator in models:
    # Fit on the full training matrix, then record the cross-validated RMSE.
    # The fit is kept on purpose: later cells call predict() on these objects.
    fitted = estimator.fit(X_train, y_train)
    cv_rmse = rmse_cv(fitted)
    baseline_rows.append({'Model': type(estimator).__name__, 'RMSE': cv_rmse})

model_results_df = pd.DataFrame(baseline_rows)

In [None]:
# Baseline model performances with poly scaling

table = []

for m in models:
    # Score directly on the polynomial features. Calling rmse_cv(model) here
    # would cross-validate on the non-polynomial X_train / y_train and simply
    # reproduce the plain baseline table.
    # cross_val_score fits its own clone per fold, so the estimator objects in
    # `models` are deliberately left untouched by this cell.
    fold_scores = cross_val_score(m, X_train_poly, y_train_poly,
                                  scoring="neg_root_mean_squared_error", cv=3)
    table.append({'Model': m.__class__.__name__,
                  'RMSE': -np.mean(fold_scores)})

model_poly_results_df = pd.DataFrame(table)

In [None]:
# Display the baseline table: one row per estimator in `models`, with its
# cross-validated RMSE on the non-polynomial features.
model_results_df

Unnamed: 0,Model,RMSE
0,LinearRegression,68613790000.0
1,Ridge,0.1348264
2,Lasso,0.399314
3,ElasticNet,0.399314
4,BayesianRidge,0.1317248
5,RandomForestRegressor,0.1457755
6,GradientBoostingRegressor,0.1326038
7,AdaBoostRegressor,0.1777415
8,XGBRegressor,0.1409736


In [None]:
# model_poly_results_df

Non-polynomial best models: `bay`, `ridge`, `rfr`, `gbr`, `xgb`

Polynomial best models: `bay`, `gbr`, `ridge`, `lr`

In [None]:
# Ridge optimisation

# Pass 1: coarse sweep over several orders of magnitude.
coarse_alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60]
ridge = RidgeCV(alphas=coarse_alphas, cv=10)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_

# Pass 2: fine sweep from 60% to 115% of the coarse winner.
scale_factors = (.6, .65, .7, .75, .8, .85, .9, .95, 1, 1.05, 1.1, 1.15)
ridge = RidgeCV(alphas=[alpha * factor for factor in scale_factors], cv=10)

ridge_tuned = ridge.fit(X_train, y_train)

# coefs = pd.Series(ridge.coef_, index = X_train.columns)
# print("Ridge picked " + str(sum(coefs != 0)) + " features and eliminated the other " +  \
#       str(sum(coefs == 0)) + " features")
# imp_coefs = pd.concat([coefs.sort_values().head(10),
#                      coefs.sort_values().tail(10)])
# imp_coefs.plot(kind = "barh")

In [None]:
# BayesianRidge optimisation

# The alpha_* and lambda_* parameters all draw from the same candidate values.
prior_candidates = [1e-08, 1e-07, 1e-06, 1e-05, 1e-04]

param_grid = {
    'max_iter': list(range(200, 701, 100)),
    'tol': [0.003, 0.001, 0.01, 0.1, 0.2, 0.5, 0.8, 1, 1.5, 2, 4, 5],
    'alpha_1': list(prior_candidates),
    'alpha_2': list(prior_candidates),
    'lambda_1': list(prior_candidates),
    'lambda_2': list(prior_candidates),
}

bay_search = RandomizedSearchCV(
    bay,
    param_grid,
    n_iter=100,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
bay_tuned = bay_search.fit(X_train, y_train)

# Cell output: cross-validated RMSE of the whole search object.
rmse_cv(bay_tuned)

0.13172472260185272

In [None]:
# Random forest optimisation

param_grid = {
    'n_estimators': list(range(100, 1001, 100)),
    'max_depth': [None, 1, 2, 5, 10, 20],
    'min_samples_split': list(range(2, 11, 2)),
    'min_samples_leaf': list(range(1, 6)),
}

# Only 10 random draws from the grid are evaluated, each with 3-fold CV.
rfr_search = RandomizedSearchCV(
    rfr,
    param_grid,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
rfr_tuned = rfr_search.fit(X_train, y_train)

In [None]:
# GradientBoosted optimisation

# learning_rate=0 was dropped from the candidates: it scales every tree's
# contribution to zero, so any draw using it spends a full fit on a model that
# never moves away from its initial prediction.
param_grid = {'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
              'learning_rate': [0.01, 0.1, 0.2, 1, 2],
              'n_estimators': [50, 100, 200, 500, 1000],
              'min_samples_split': [2, 4, 6, 8, 10],
              'min_samples_leaf': [1, 2, 3, 4, 5],
              'max_depth': [1, 2, 3, 4, 10, 20]}

# 50 random draws from the grid, each scored with 3-fold CV on RMSE.
gbr_search = RandomizedSearchCV(gbr, param_grid, n_iter=50, cv=3, scoring='neg_root_mean_squared_error')
gbr_tuned = gbr_search.fit(X_train, y_train)

# Cell output: cross-validated RMSE of the whole search object.
rmse_cv(gbr_tuned)

In [None]:
# XGBoost optimisation

# n_estimators runs 100..700 in steps of 100; the last candidate was previously
# written as 70, which breaks the sequence and looks like a dropped zero.
param_grid = {'n_estimators': [100, 200, 300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.1, 0.001],
              'max_depth': [0, 1, 2, 3, 4, 5, 6, 7]}

# 50 random draws from the grid, each scored with 3-fold CV on RMSE.
xgb_search = RandomizedSearchCV(xgb, param_grid, n_iter=50, cv=3, scoring='neg_root_mean_squared_error')
xgb_tuned = xgb_search.fit(X_train, y_train)

# Cell output: cross-validated RMSE of the whole search object.
rmse_cv(xgb_tuned)

In [None]:
# Hand-set boosted models used for the final ensemble and the stack.
# `silent` is no longer a recognised XGBoost parameter (it only triggers a
# "Parameters: { "silent" } are not used." warning), so quiet logging is
# requested through `verbosity=0`. `n_jobs` replaces the older `nthread` alias.
model_xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                         learning_rate=0.05, max_depth=3,
                         min_child_weight=1.7817, n_estimators=2200,
                         reg_alpha=0.4640, reg_lambda=0.8571,
                         subsample=0.5213, verbosity=0,
                         random_state=7, n_jobs=-1)

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)

model_xgb.fit(X_train, y_train)
GBoost.fit(X_train, y_train)

Parameters: { "silent" } are not used.



In [None]:
# Model Ensembling

# Predict with the tuned search results rather than the raw `bay` / `rfr`
# objects: those are only ever fitted by the baseline comparison loops, so
# which feature matrix they were last fitted on depends on which loop ran last.
ridge_pred = ridge_tuned.predict(X_test)
bay_pred = bay_tuned.predict(X_test)
rfr_pred = rfr_tuned.predict(X_test)  # kept for inspection; not part of the blend
gbr_pred = GBoost.predict(X_test)
xgb_pred = model_xgb.predict(X_test)

# Equal-weight blend. np.mean divides by the number of blended models, so the
# divisor can no longer drift out of sync with the list (it used to be 5 for
# four summed predictions).
blended_preds = [ridge_pred, bay_pred, gbr_pred, xgb_pred]
ensemble_pred = np.mean(blended_preds, axis=0)

# Map the blended prediction back to the price scale.
# NOTE(review): assumes SalePrice was log1p-transformed upstream -- confirm.
predictions = np.expm1(ensemble_pred)

In [None]:
# Fill the sample submission with the blended predictions and write it out.
submission = pd.read_csv('sample_submission.csv').assign(SalePrice=predictions)
submission.to_csv('submission.csv', index=False)

In [None]:
# Stacked ensemble over the same four base learners used in the simple blend.
# StackingRegressor is imported in the notebook's top import cell rather than
# mid-notebook, so a fresh Run All shows every dependency up front.
estimators = [('ridge', ridge_tuned), ('bay', bay), ('gbr', GBoost), ('xgb', model_xgb)]

stregr = StackingRegressor(estimators=estimators)

streg_fit = stregr.fit(X_train, y_train)

# Cell output: stacked predictions for the test set, on the training-target scale.
streg_fit.predict(X_test)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



array([11.70701539, 12.02344659, 12.10672205, ..., 12.01701797,
       11.71896643, 12.28540481])

In [None]:
submission = pd.read_csv('sample_submission.csv')
# The stacked model predicts on the training-target scale (values around 12),
# not in prices. Apply the same np.expm1 back-transform the blended submission
# uses so that SalePrice is written in price units.
# NOTE(review): assumes SalePrice was log1p-transformed upstream -- confirm.
submission['SalePrice'] = np.expm1(streg_fit.predict(X_test))
submission.to_csv('submission.csv', index=False)