In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training features (standardized, per the file name) and the targets.
# The first CSV column is used as the index of each frame.
X_train = pd.read_csv('data/input/standardized_X_train.csv', index_col=0)
# Kept as a DataFrame: kfold_mse selects its 'sales' column when fitting and scoring.
y_train = pd.read_csv('data/input/y_train.csv', index_col=0)

In [3]:
# Test-set features, read the same way as X_train (first CSV column as index).
# NOTE(review): not referenced by any cell visible in this part of the notebook.
X_test = pd.read_csv('data/input/standardized_X_test.csv', index_col=0)

In [4]:
# Seed NumPy's global RNG for reproducibility. The random forest built in the
# grid search is additionally seeded through its own random_state=42.
np.random.seed(42)

### 10-fold Cross-Validation

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [6]:
def kfold_mse(X, y, model, k=10):
    """Score `model` with k-fold cross-validation and return the per-fold MSEs.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : pandas.DataFrame
        Target frame, row-aligned with `X`. Must contain a 'sales' column,
        which is the series actually fitted and scored.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`. The same object is
        refit on every fold, so after the call it holds the last fold's fit.
    k : int, default 10
        Number of folds, forwarded to `KFold(n_splits=k)`.

    Returns
    -------
    list
        One validation mean squared error per fold, in fold order.
    """
    # KFold is used with its defaults (no shuffling requested here).
    kf = KFold(n_splits=k)
    fold_mses = []
    # Fold-local names are kept distinct from the notebook-level
    # X_train / y_train so the body never appears to touch those globals.
    for fit_idx, val_idx in kf.split(X, y):
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]
        model.fit(X_fit, y_fit['sales'])  # Avoid deprecation warning
        y_pred = model.predict(X_val)
        # mean_squared_error is documented as (y_true, y_pred); pass them in
        # that order (the value is unchanged because MSE is symmetric).
        fold_mses.append(mean_squared_error(y_val['sales'], y_pred))
    return fold_mses

We tune only `ccp_alpha`, `max_depth` and `min_samples_leaf` in this project. `n_estimators`, `max_features` and `min_samples_split` are fixed to 1000, `sqrt` and 0.01, respectively. In principle, using more estimators is better, since adding trees to a random forest does not cause overfitting; 1000 is a good number to stay within computational time limits.

In [7]:
# Candidate values for the grid search: 3 * 4 * 4 = 48 combinations in total.
ccp_alphas = [0.0, 0.01, 0.05]
min_samples_leafs = [1, 3, 5, 10]
# None is passed through unchanged as max_depth (scikit-learn: no depth limit).
max_depths = [None, 3, 5, 10]

In [8]:
from sklearn.ensemble import RandomForestRegressor

In [9]:
# Exhaustive grid search over (ccp_alpha, max_depth, min_samples_leaf).
# Each combination is scored by kfold_mse on the training data, and the one
# with the lowest mean fold MSE is kept together with its per-fold scores.
best_mses = []
# Start from +inf so the first evaluated combination always becomes the
# incumbent; a finite sentinel such as 1e9 would leave every best_* unset
# if all mean MSEs happened to exceed it.
best_avg_score = np.inf
best_ccp_alpha = None
best_depth = None
best_min_samples_leaf = None
for ccp_alpha in ccp_alphas:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            # Only the three looped parameters vary; the rest are held fixed.
            model = RandomForestRegressor(
                ccp_alpha=ccp_alpha,
                max_depth=depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                min_samples_split=0.01,
                max_features='sqrt',
                n_estimators=1000,
            )
            mses = kfold_mse(X_train, y_train, model)
            # Compute the mean once and reuse it for comparison and logging.
            avg_mse = np.mean(mses)
            # Strict '<' keeps the earliest combination on exact ties.
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_ccp_alpha = ccp_alpha
                best_depth = depth
                best_min_samples_leaf = min_samples_leaf
                best_mses = mses
            print(f'ccp_alpha: {ccp_alpha}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 1, avg_mse: 365.2329653790077
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 3, avg_mse: 374.8186598227851
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 5, avg_mse: 384.86975410428727
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 10, avg_mse: 405.8773045096028
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 1, avg_mse: 571.4343714397394
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 3, avg_mse: 564.7067592168867
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 5, avg_mse: 562.9080473056408
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 10, avg_mse: 562.1968424078129
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 1, avg_mse: 504.5884867096414
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 3, avg_mse: 495.8447190511608
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 5, avg_mse: 493.057937778591
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 10, avg_mse: 493.5251392898041
ccp_alpha: 0.0, max_depth: 10, min_samples_leaf: 

### In-sample analysis

In [10]:
# One row per fold: the MSEs of the best configuration found by the grid search.
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,303.500274
1,151.985123
2,422.033843
3,273.992864
4,342.346263
5,154.503412
6,571.606804
7,341.983707
8,963.666237
9,126.343554


In [11]:
# Save the per-fold MSEs of the selected configuration to disk.
cv_df.to_csv('data/output/cv/rf_sta.csv')

In [12]:
# Report the selected hyperparameters and the mean of their per-fold MSEs.
best_avg_mse = np.mean(best_mses)
print(f'best ccp_alpha: {best_ccp_alpha}, best max_depth: {best_depth}, best min_samples_leaf: {best_min_samples_leaf}, best avg_mse: {best_avg_mse}')

best ccp_alpha: 0.05, best max_depth: None, best min_samples_leaf: 1, best avg_mse: 365.1962080458769
