In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the training features and the training target from CSV.
# The first CSV column of each file is used as the row index.
# NOTE(review): file names suggest standardized features and a log-scaled
# target -- confirm against the preprocessing notebook.
X_train, y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/input/standardized_X_train.csv',
        'data/input/y_log_train.csv',
    )
)


In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def kfold_mse(X, y, model, k=10, target_col='sales'):
    """Return the per-fold MSE of ``model`` under k-fold cross-validation.

    Both the predictions and the held-out targets are passed through
    ``np.exp`` before the error is computed, so the reported MSE is on the
    exponentiated scale of the target.

    Parameters
    ----------
    X : pandas DataFrame-like
        Feature rows; folds are selected positionally with ``.iloc``.
    y : pandas DataFrame-like
        Targets aligned row-for-row with ``X``; must contain ``target_col``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same object is
        re-fitted on every fold.
    k : int, default 10
        Number of folds passed to ``KFold`` (no other options are set, so
        ``KFold``'s defaults apply).
    target_col : str, default 'sales'
        Name of the column in ``y`` holding the target values.

    Returns
    -------
    list
        One MSE value per fold, in fold order.
    """
    splitter = KFold(n_splits=k)
    fold_mses = []
    for fit_idx, val_idx in splitter.split(X, y):
        # Distinct local names so the notebook-level X_train / y_train are
        # not shadowed inside the function.
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]
        # Fit on the single target column rather than the whole frame
        # (original note: avoids a deprecation warning).
        model.fit(X_fit, y_fit[target_col])
        y_pred = model.predict(X_val)
        # mean_squared_error's signature is (y_true, y_pred).
        fold_mses.append(
            mean_squared_error(np.exp(y_val[target_col]), np.exp(y_pred))
        )
    return fold_mses

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Exhaustive grid search over random-forest hyper-parameters.
# Each combination is scored by the mean of the per-fold MSEs returned by
# kfold_mse (called with its default number of folds).
ccp_alphas = [0.0, 0.01, 0.05]
min_samples_leafs = [1, 3, 5, 10]
max_depths = [None, 3, 5, 10]

# Running best. np.inf (not a magic 1e9) so that any finite score replaces
# the initial value regardless of the scale of the MSE.
best_mses = []
best_avg_score = np.inf
best_ccp_alpha = None
best_depth = None
best_min_samples_leaf = None
for ccp_alpha in ccp_alphas:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = RandomForestRegressor(
                ccp_alpha=ccp_alpha,
                max_depth=depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                min_samples_split=0.01,
                max_features='sqrt',
                n_estimators=1000,
            )
            mses = kfold_mse(X_train, y_train, model)
            # Compute the mean once and reuse it for comparison and logging.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_ccp_alpha = ccp_alpha
                best_depth = depth
                best_min_samples_leaf = min_samples_leaf
                best_mses = mses
            print(f'ccp_alpha: {ccp_alpha}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 1, avg_mse: 449.795205911368
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 3, avg_mse: 466.5833851594368
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 5, avg_mse: 481.0847154287885
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 10, avg_mse: 511.99738949145586
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 1, avg_mse: 700.035323276167
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 3, avg_mse: 699.8218274392399
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 5, avg_mse: 699.7900774932552
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 10, avg_mse: 700.2282731589246
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 1, avg_mse: 635.4192802973595
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 3, avg_mse: 635.2475965855176
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 5, avg_mse: 635.2435440509715
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 10, avg_mse: 636.9351024270994
ccp_alpha: 0.0, max_depth: 10, min_samples_leaf: 1

In [7]:
# Tabulate the per-fold MSEs of the best random-forest configuration found
# by the grid search (one row per fold).
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,474.549089
1,234.055917
2,482.996077
3,327.428166
4,447.355815
5,182.82912
6,720.636845
7,424.239794
8,1085.242167
9,118.619068


In [8]:
# Report the winning random-forest hyper-parameters and their mean CV MSE.
print(
    f'best ccp_alpha: {best_ccp_alpha}, '
    f'best max_depth: {best_depth}, '
    f'best min_samples_leaf: {best_min_samples_leaf}, '
    f'best avg_mse: {np.mean(best_mses)}'
)

best ccp_alpha: 0.0, best max_depth: None, best min_samples_leaf: 1, best avg_mse: 449.795205911368


In [9]:
# Persist the per-fold MSE table for the random forest (index included).
cv_df.to_csv(path_or_buf='data/output/cv/rf_sta_log.csv')

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Exhaustive grid search over gradient-boosting hyper-parameters.
# Each combination is scored by the mean of the per-fold MSEs returned by
# kfold_mse (called with its default number of folds).
learning_rates = [0.01, 0.1, 0.5, 1]
max_depths = [None, 3, 5, 10]
min_samples_leafs = [1, 3, 5, 10]

# Running best. np.inf (not a magic 1e9) so that any finite score replaces
# the initial value regardless of the scale of the MSE.
best_mses = []
best_avg_score = np.inf
best_lr = None
best_depth = None
best_msl = None
for lr in learning_rates:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = GradientBoostingRegressor(
                min_samples_leaf=min_samples_leaf,
                min_samples_split=0.01,
                n_estimators=1000,
                max_features='sqrt',
                random_state=42,
                learning_rate=lr,
                max_depth=depth,
            )
            mses = kfold_mse(X_train, y_train, model)
            # Compute the mean once and reuse it for comparison and logging.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_lr = lr
                best_depth = depth
                best_msl = min_samples_leaf
                best_mses = mses
            print(f'lr: {lr}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

lr: 0.01, max_depth: None, min_samples_leaf: 1, avg_mse: 323.2214177878385
lr: 0.01, max_depth: None, min_samples_leaf: 3, avg_mse: 298.9954062970762
lr: 0.01, max_depth: None, min_samples_leaf: 5, avg_mse: 297.40998943637226
lr: 0.01, max_depth: None, min_samples_leaf: 10, avg_mse: 285.74703175057914
lr: 0.01, max_depth: 3, min_samples_leaf: 1, avg_mse: 398.4550496201102
lr: 0.01, max_depth: 3, min_samples_leaf: 3, avg_mse: 394.47427734960235
lr: 0.01, max_depth: 3, min_samples_leaf: 5, avg_mse: 393.21148459321023
lr: 0.01, max_depth: 3, min_samples_leaf: 10, avg_mse: 394.56382914852117
lr: 0.01, max_depth: 5, min_samples_leaf: 1, avg_mse: 331.7697252264397
lr: 0.01, max_depth: 5, min_samples_leaf: 3, avg_mse: 325.2805902442218
lr: 0.01, max_depth: 5, min_samples_leaf: 5, avg_mse: 326.3353851635758
lr: 0.01, max_depth: 5, min_samples_leaf: 10, avg_mse: 328.91592193336885
lr: 0.01, max_depth: 10, min_samples_leaf: 1, avg_mse: 294.0695274172244
lr: 0.01, max_depth: 10, min_samples_leaf:

In [11]:
# Tabulate the per-fold MSEs of the best gradient-boosting configuration
# found by the grid search (one row per fold).
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,223.179765
1,79.630942
2,341.164208
3,179.8097
4,206.72221
5,90.952385
6,272.385424
7,185.198464
8,785.044739
9,86.222171


In [12]:
# Report the winning gradient-boosting hyper-parameters and their mean CV MSE.
print(
    f'best lr: {best_lr}, '
    f'best max_depth: {best_depth}, '
    f'best min_samples_leaf: {best_msl}, '
    f'best avg_mse: {np.mean(best_mses)}'
)

best lr: 0.1, best max_depth: 5, best min_samples_leaf: 10, best avg_mse: 245.0310008487545


In [13]:
# Persist the per-fold MSE table for gradient boosting (index included).
cv_df.to_csv(path_or_buf='data/output/cv/gbr_sta_log.csv')

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Exhaustive grid search over decision-tree hyper-parameters.
# Each combination is scored by the mean of the per-fold MSEs returned by
# kfold_mse (called with its default number of folds).
ccp_alphas = [0.0, 0.01, 0.02, 0.05, 0.1]
min_samples_leafs = [1, 3, 5, 10]
max_depths = [None, 3, 5, 10]

# Running best. np.inf (not a magic 1e9) so that any finite score replaces
# the initial value regardless of the scale of the MSE.
best_mses = []
best_avg_score = np.inf
best_ccp_alpha = None
best_depth = None
best_min_samples_leaf = None
for ccp_alpha in ccp_alphas:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = DecisionTreeRegressor(
                ccp_alpha=ccp_alpha,
                max_depth=depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                min_samples_split=0.01,
                max_features='sqrt',
            )
            mses = kfold_mse(X_train, y_train, model)
            # Compute the mean once and reuse it for comparison and logging.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_ccp_alpha = ccp_alpha
                best_depth = depth
                best_min_samples_leaf = min_samples_leaf
                best_mses = mses
            print(f'ccp_alpha: {ccp_alpha}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 1, avg_mse: 582.6597958501792
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 3, avg_mse: 529.448812077285
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 5, avg_mse: 532.9028842162293
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 10, avg_mse: 536.2032041629008
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 1, avg_mse: 760.5995828163129
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 3, avg_mse: 760.8167050572296
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 5, avg_mse: 760.8219292251181
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 10, avg_mse: 759.0577385978467
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 1, avg_mse: 675.1867704176565
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 3, avg_mse: 689.9640106158282
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 5, avg_mse: 694.1699808907703
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 10, avg_mse: 715.4753494015393
ccp_alpha: 0.0, max_depth: 10, min_samples_leaf: 1

In [15]:
# Tabulate the per-fold MSEs of the best decision-tree configuration found
# by the grid search (one row per fold).
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,461.495418
1,408.030876
2,635.322117
3,444.929879
4,515.511622
5,212.891228
6,752.387121
7,535.975563
8,1108.258555
9,219.68574


In [16]:
# Report the winning decision-tree hyper-parameters and their mean CV MSE.
print(
    f'best ccp_alpha: {best_ccp_alpha}, '
    f'best max_depth: {best_depth}, '
    f'best min_samples_leaf: {best_min_samples_leaf}, '
    f'best avg_mse: {np.mean(best_mses)}'
)

best ccp_alpha: 0.0, best max_depth: None, best min_samples_leaf: 3, best avg_mse: 529.448812077285


In [17]:
# Persist the per-fold MSE table for the decision tree (index included).
# NOTE(review): the file prefix is 'gt' although the model is a decision
# tree -- confirm the name is intentional before downstream code relies on it.
cv_df.to_csv(path_or_buf='data/output/cv/gt_sta_log.csv')