In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Training features and targets; the first CSV column becomes the index of each frame.
# kfold_mse applies np.exp to the 'sales' column before scoring, so the target is
# presumably stored on a log scale (the file name suggests the same) -- TODO confirm.
X_train = pd.read_csv('data/input/non_standardized_X_train.csv', index_col=0)
y_train = pd.read_csv('data/input/y_log_train.csv', index_col=0)

In [3]:
# Features read from the test CSV, using the first column as the index like the training frames.
# NOTE(review): X_test is not referenced by any other cell visible here.
X_test = pd.read_csv('data/input/non_standardized_X_test.csv', index_col=0)

In [4]:
# Seed NumPy's global random state. The regressor built in the grid search is
# additionally given random_state=42 explicitly.
np.random.seed(42)

### 10-fold Cross-Validation

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [6]:
def kfold_mse(X, y, model, k=10):
    """Return the per-fold validation MSE of ``model`` under k-fold cross-validation.

    The model is fitted on the ``'sales'`` column of ``y``. Both the held-out
    targets and the predictions are passed through ``np.exp`` before scoring,
    so the reported errors are on the exponentiated scale rather than the
    scale the model was trained on.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame
        Target frame with a ``'sales'`` column, row-aligned with ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same object is
        refitted on every fold, so on return it holds the last fold's fit.
    k : int, default 10
        Number of folds.

    Returns
    -------
    list
        One MSE value per fold, in the order the folds were produced.
    """
    # KFold is built with its default settings apart from the fold count.
    kf = KFold(n_splits=k)
    mses = []
    for train_idx, val_idx in kf.split(X, y):
        # Fold-local names, kept distinct from the notebook-level X_train / y_train.
        X_fit, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # Pass the 1-D 'sales' Series rather than the one-column frame
        # (original note: this avoids a deprecation warning).
        model.fit(X_fit, y_fit['sales'])
        y_pred = model.predict(X_val)
        # Arguments follow the documented (y_true, y_pred) order.
        mses.append(mean_squared_error(np.exp(y_val['sales']), np.exp(y_pred)))
    return mses

Gradient boosting has even more tunable parameters than random forest, so in this project we tune only `learning_rate`, `max_depth` and `min_samples_leaf`. `n_estimators`, `max_features` and `min_samples_split` are fixed to 1000, 'sqrt' and 0.01, respectively. Since boosting is relatively robust to overfitting, we can use as many estimators as is computationally feasible.

In [7]:
# Candidate values for the grid search; the search cell loops over every
# combination of the three lists (4 x 4 x 4 = 64 settings).
learning_rates = [0.01, 0.1, 0.5, 1]
max_depths = [None, 3, 5, 10]  # each entry, including None, is passed as max_depth
min_samples_leafs = [1, 3, 5, 10]

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

In [9]:
# Exhaustive grid search over learning rate, tree depth and minimum leaf size.
# Every combination is scored with kfold_mse; the one with the lowest mean
# fold MSE is remembered together with its per-fold scores.
best_mses = []
# Start from +inf so that any finite average MSE is accepted as the first best;
# a finite sentinel could be exceeded because errors are measured after np.exp.
best_avg_score = np.inf
best_lr = None
best_depth = None
best_msl = None
for lr in learning_rates:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = GradientBoostingRegressor(
                min_samples_leaf=min_samples_leaf,
                min_samples_split=0.01,
                n_estimators=1000,
                max_features='sqrt',
                random_state=42,
                learning_rate=lr,
                max_depth=depth,
            )
            mses = kfold_mse(X_train, y_train, model)
            # Average once and reuse it for the comparison, bookkeeping and log line.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_lr = lr
                best_depth = depth
                best_msl = min_samples_leaf
                best_mses = mses
            print(f'lr: {lr}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

lr: 0.01, max_depth: None, min_samples_leaf: 1, avg_mse: 323.1870080543094
lr: 0.01, max_depth: None, min_samples_leaf: 3, avg_mse: 298.96337252306313
lr: 0.01, max_depth: None, min_samples_leaf: 5, avg_mse: 297.39655719089626
lr: 0.01, max_depth: None, min_samples_leaf: 10, avg_mse: 285.84276754681474
lr: 0.01, max_depth: 3, min_samples_leaf: 1, avg_mse: 398.45692375512573
lr: 0.01, max_depth: 3, min_samples_leaf: 3, avg_mse: 394.4754014580629
lr: 0.01, max_depth: 3, min_samples_leaf: 5, avg_mse: 393.21799456122886
lr: 0.01, max_depth: 3, min_samples_leaf: 10, avg_mse: 394.5661960561368
lr: 0.01, max_depth: 5, min_samples_leaf: 1, avg_mse: 331.7722639292242
lr: 0.01, max_depth: 5, min_samples_leaf: 3, avg_mse: 325.2723113538419
lr: 0.01, max_depth: 5, min_samples_leaf: 5, avg_mse: 326.3387935166943
lr: 0.01, max_depth: 5, min_samples_leaf: 10, avg_mse: 328.90009861828065
lr: 0.01, max_depth: 10, min_samples_leaf: 1, avg_mse: 294.0796158180413
lr: 0.01, max_depth: 10, min_samples_leaf:

### In-sample analysis

In [10]:
# Report the best grid point found by the search and the mean of its per-fold MSEs.
print(f'best lr: {best_lr}, best max_depth: {best_depth}, best min_samples_leaf: {best_msl}, best avg_mse: {np.mean(best_mses)}')

best lr: 0.1, best max_depth: 5, best min_samples_leaf: 10, best avg_mse: 245.06471090277887


In [11]:
# Tabulate the per-fold MSEs of the best configuration, one row per fold,
# and leave the frame as the cell's last expression so it is displayed.
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,223.259709
1,79.61775
2,341.156212
3,179.810244
4,206.721286
5,90.953113
6,272.386957
7,185.19526
8,785.322327
9,86.224252


In [12]:
# Write the per-fold MSE table of the best configuration to disk.
cv_df.to_csv('data/output/cv/gbr_log.csv')