In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the training features and the training target from CSV; in both files
# the first column is used as the row index.
X_train = pd.read_csv(
    'data/input/standardized_train_log_sales.csv',
    index_col=0,
)
y_train = pd.read_csv(
    'data/input/y_log_train.csv',
    index_col=0,
)


In [2]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

def kfold_mse(X, y, model, k=10, target_col='sales'):
    """Score ``model`` with k-fold cross-validation and return per-fold MSEs.

    Both the predictions and the held-out targets are passed through
    ``np.exp`` before the error is computed, so every returned value is an
    MSE on the exponentiated scale rather than on the scale the model was
    fitted on.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame
        Target frame aligned row-for-row with ``X``. Only ``target_col`` is
        read from it.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. The same object
        is refitted on every fold, so the caller's instance is modified.
    k : int, default 10
        Number of folds handed to ``KFold(n_splits=k)``.
    target_col : str, default 'sales'
        Name of the column of ``y`` that holds the target values.

    Returns
    -------
    list
        One mean-squared-error value per fold, in fold order.
    """
    kf = KFold(n_splits=k)
    mses = []
    for train_idx, val_idx in kf.split(X, y):
        # Fold-local names deliberately differ from the notebook-level
        # X_train / y_train so the globals are not shadowed in here.
        X_fit, X_val = X.iloc[train_idx], X.iloc[val_idx]
        # Select the single target column so the estimator receives a 1-D
        # target instead of a one-column frame.
        y_fit = y.iloc[train_idx][target_col]
        y_val = y.iloc[val_idx][target_col]
        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_val)
        # sklearn's signature is (y_true, y_pred): truth first, prediction second.
        mses.append(mean_squared_error(np.exp(y_val), np.exp(y_pred)))
    return mses

# Exhaustive grid search over random-forest hyperparameters. Every combination
# is scored with kfold_mse and the one with the lowest mean fold MSE is kept.
ccp_alphas = [0.0, 0.01, 0.05]
min_samples_leafs = [1, 3, 5, 10]
max_depths = [None, 3, 5, 10]

best_mses = []
# Start from +inf so the first evaluated configuration is always accepted,
# however large its error is; a finite sentinel could leave best_* unset.
best_avg_score = np.inf
best_ccp_alpha = None
best_depth = None
best_min_samples_leaf = None
for ccp_alpha in ccp_alphas:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = RandomForestRegressor(
                ccp_alpha=ccp_alpha,
                max_depth=depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                min_samples_split=0.01,
                max_features='sqrt',
                n_estimators=1000,
            )
            mses = kfold_mse(X_train, y_train, model)
            # Average once and reuse it for the comparison, the record and the log line.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_ccp_alpha = ccp_alpha
                best_depth = depth
                best_min_samples_leaf = min_samples_leaf
                best_mses = mses
            print(f'ccp_alpha: {ccp_alpha}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 1, avg_mse: 347.31035645652463
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 3, avg_mse: 368.4939603422497
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 5, avg_mse: 384.29243118796387
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 10, avg_mse: 419.4745978181683
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 1, avg_mse: 658.046826012769
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 3, avg_mse: 658.183045406982
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 5, avg_mse: 658.1633581137626
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 10, avg_mse: 658.979642720413
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 1, avg_mse: 575.4074459878099
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 3, avg_mse: 575.7950135840152
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 5, avg_mse: 576.7481991862041
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 10, avg_mse: 578.6768140501455
ccp_alpha: 0.0, max_depth: 10, min_samples_leaf: 1

In [3]:
# One row per fold: the fold MSEs recorded for the best configuration found
# by the grid search above.
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,355.575399
1,140.71225
2,390.811499
3,243.544602
4,338.483561
5,104.000829
6,568.613525
7,314.707569
8,953.899597
9,62.754733


In [4]:
# Report the selected hyperparameters together with their mean fold MSE.
print(
    f'best ccp_alpha: {best_ccp_alpha}, '
    f'best max_depth: {best_depth}, '
    f'best min_samples_leaf: {best_min_samples_leaf}, '
    f'best avg_mse: {np.mean(best_mses)}'
)

best ccp_alpha: 0.0, best max_depth: None, best min_samples_leaf: 1, best avg_mse: 347.31035645652463


In [5]:
from sklearn.ensemble import GradientBoostingRegressor

# Exhaustive grid search over gradient-boosting hyperparameters. Every
# combination is scored with kfold_mse and the one with the lowest mean fold
# MSE is kept.
learning_rates = [0.01, 0.1, 0.5, 1]
max_depths = [None, 3, 5, 10]
min_samples_leafs = [1, 3, 5, 10]

best_mses = []
# Start from +inf so the first evaluated configuration is always accepted,
# however large its error is; a finite sentinel could leave best_* unset.
best_avg_score = np.inf
best_lr = None
best_depth = None
best_msl = None
for lr in learning_rates:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = GradientBoostingRegressor(
                min_samples_leaf=min_samples_leaf,
                min_samples_split=0.01,
                n_estimators=1000,
                max_features='sqrt',
                random_state=42,
                learning_rate=lr,
                max_depth=depth,
            )
            mses = kfold_mse(X_train, y_train, model)
            # Average once and reuse it for the comparison, the record and the log line.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_lr = lr
                best_depth = depth
                best_msl = min_samples_leaf
                best_mses = mses
            print(f'lr: {lr}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

lr: 0.01, max_depth: None, min_samples_leaf: 1, avg_mse: 197.96634481108896
lr: 0.01, max_depth: None, min_samples_leaf: 3, avg_mse: 158.33171445540404
lr: 0.01, max_depth: None, min_samples_leaf: 5, avg_mse: 151.55345697124227
lr: 0.01, max_depth: None, min_samples_leaf: 10, avg_mse: 141.9555845009017
lr: 0.01, max_depth: 3, min_samples_leaf: 1, avg_mse: 254.49131159522267
lr: 0.01, max_depth: 3, min_samples_leaf: 3, avg_mse: 248.23425961801323
lr: 0.01, max_depth: 3, min_samples_leaf: 5, avg_mse: 246.81220939062638
lr: 0.01, max_depth: 3, min_samples_leaf: 10, avg_mse: 246.7766027692732
lr: 0.01, max_depth: 5, min_samples_leaf: 1, avg_mse: 191.69421068833256
lr: 0.01, max_depth: 5, min_samples_leaf: 3, avg_mse: 185.6358720539944
lr: 0.01, max_depth: 5, min_samples_leaf: 5, avg_mse: 186.76093194962598
lr: 0.01, max_depth: 5, min_samples_leaf: 10, avg_mse: 183.51405435738042
lr: 0.01, max_depth: 10, min_samples_leaf: 1, avg_mse: 180.73526365136405
lr: 0.01, max_depth: 10, min_samples_l

In [6]:
# One row per fold: the fold MSEs recorded for the best configuration found
# by the grid search above.
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,18.928878
1,22.303529
2,220.840145
3,52.82204
4,45.455427
5,23.653805
6,95.550959
7,32.560332
8,468.983601
9,19.542616


In [7]:
# Report the selected hyperparameters together with their mean fold MSE.
print(
    f'best lr: {best_lr}, '
    f'best max_depth: {best_depth}, '
    f'best min_samples_leaf: {best_msl}, '
    f'best avg_mse: {np.mean(best_mses)}'
)

best lr: 0.1, best max_depth: 3, best min_samples_leaf: 10, best avg_mse: 100.06413322871329


In [8]:
from sklearn.tree import DecisionTreeRegressor

# Exhaustive grid search over decision-tree hyperparameters. Every combination
# is scored with kfold_mse and the one with the lowest mean fold MSE is kept.
ccp_alphas = [0.0, 0.01, 0.02, 0.05, 0.1]
min_samples_leafs = [1, 3, 5, 10]
max_depths = [None, 3, 5, 10]

best_mses = []
# Start from +inf so the first evaluated configuration is always accepted,
# however large its error is; a finite sentinel could leave best_* unset.
best_avg_score = np.inf
best_ccp_alpha = None
best_depth = None
best_min_samples_leaf = None
for ccp_alpha in ccp_alphas:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = DecisionTreeRegressor(
                ccp_alpha=ccp_alpha,
                max_depth=depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                min_samples_split=0.01,
                max_features='sqrt',
            )
            mses = kfold_mse(X_train, y_train, model)
            # Average once and reuse it for the comparison, the record and the log line.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_ccp_alpha = ccp_alpha
                best_depth = depth
                best_min_samples_leaf = min_samples_leaf
                best_mses = mses
            print(f'ccp_alpha: {ccp_alpha}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 1, avg_mse: 444.91878364511183
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 3, avg_mse: 428.5485559805209
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 5, avg_mse: 366.08494364672686
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 10, avg_mse: 435.2059374410425
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 1, avg_mse: 690.1374170432966
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 3, avg_mse: 690.1278760927732
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 5, avg_mse: 671.796141106811
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 10, avg_mse: 669.4903366563216
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 1, avg_mse: 619.326419895111
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 3, avg_mse: 640.4536096720785
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 5, avg_mse: 554.8433781152224
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 10, avg_mse: 638.0310534964899
ccp_alpha: 0.0, max_depth: 10, min_samples_leaf: 

In [9]:
# One row per fold: the fold MSEs recorded for the best configuration found
# by the grid search above.
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,450.858102
1,138.288108
2,587.941556
3,135.070283
4,460.580141
5,180.019215
6,552.358138
7,274.216715
8,830.404742
9,51.112435


In [10]:
# Report the selected hyperparameters together with their mean fold MSE.
print(
    f'best ccp_alpha: {best_ccp_alpha}, '
    f'best max_depth: {best_depth}, '
    f'best min_samples_leaf: {best_min_samples_leaf}, '
    f'best avg_mse: {np.mean(best_mses)}'
)

best ccp_alpha: 0.0, best max_depth: None, best min_samples_leaf: 5, best avg_mse: 366.08494364672686
