In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Training inputs: the feature table and the target table. In both files the
# first CSV column is used as the row index.
# NOTE(review): going by the file names, features are not standardised and the
# target is log-transformed -- confirm against the preprocessing notebook.
X_train = pd.read_csv(
    'data/input/non_standardized_X_train.csv',
    index_col=0,
)
y_train = pd.read_csv(
    'data/input/y_log_train.csv',
    index_col=0,
)

In [3]:
# Test-set features, read the same way as the training features (first CSV
# column becomes the index).
X_test = pd.read_csv(
    'data/input/non_standardized_X_test.csv',
    index_col=0,
)

In [4]:
# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
np.random.seed(seed=42)

### 10-fold Cross-Validation

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [6]:
def kfold_mse(X, y, model, k=10, target_col='sales'):
    """Return the per-fold MSE of ``model`` under k-fold cross-validation.

    Rows are split with ``KFold(n_splits=k)`` (no shuffling is requested), so
    the folds are the same on every call for a given input. On each fold the
    model is fitted on the remaining rows and scored on the held-out rows.
    Both the predictions and the held-out targets are passed through
    ``np.exp`` before the error is computed, i.e. the target is assumed to be
    stored on a natural-log scale and the MSE is reported on the original
    scale.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame
        Target table aligned row-for-row with ``X``; must contain
        ``target_col``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called
        on this same object once per fold, so it is left fitted on the last
        training split when the function returns.
    k : int, default 10
        Number of folds.
    target_col : str, default 'sales'
        Name of the column of ``y`` that holds the target.

    Returns
    -------
    list
        One MSE value per fold, in fold order.
    """
    # Work with the target as a 1-D Series: fitting on a one-column frame is
    # what the original code avoided because it triggers a warning.
    target = y[target_col]

    fold_mses = []
    # Local names deliberately differ from the notebook-level X_train/y_train
    # so the globals are not shadowed inside the function.
    for fit_idx, val_idx in KFold(n_splits=k).split(X, y):
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = target.iloc[fit_idx], target.iloc[val_idx]

        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_val)

        # Documented argument order is (y_true, y_pred); undo the log
        # transform on both sides before scoring.
        fold_mses.append(mean_squared_error(np.exp(y_val), np.exp(y_pred)))
    return fold_mses

For the decision tree, we tune `ccp_alpha`, `max_depth` and `min_samples_leaf` in this project. `max_features` and `min_samples_split` are fixed to `'sqrt'` and `0.01`, respectively.

In [11]:
# Candidate values for each tuned hyperparameter; the grid search tries every
# combination of the three lists.
ccp_alphas = [
    0.0,
    0.01,
    0.02,
    0.05,
    0.1,
]
max_depths = [None, 3, 5, 10]  # None: no explicit depth cap is passed to the tree
min_samples_leafs = [1, 3, 5, 10]

In [8]:
from sklearn.tree import DecisionTreeRegressor

In [12]:
# Exhaustive grid search over (ccp_alpha, max_depth, min_samples_leaf).
# Each candidate tree is scored by the mean of its per-fold MSEs from
# kfold_mse; the configuration with the lowest mean is remembered together
# with its per-fold scores.
best_mses = []
# Start from +inf rather than a finite sentinel: the MSE is computed on the
# exponentiated scale and could exceed any fixed bound, which would leave
# every "best_*" variable unset.
best_avg_score = np.inf
best_ccp_alpha = None
best_depth = None
best_min_samples_leaf = None
for ccp_alpha in ccp_alphas:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            # min_samples_split, max_features and random_state are held fixed
            # for every candidate; only the three looped values vary.
            model = DecisionTreeRegressor(
                ccp_alpha=ccp_alpha,
                max_depth=depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                min_samples_split=0.01,
                max_features='sqrt',
            )
            mses = kfold_mse(X_train, y_train, model)
            avg_mse = np.mean(mses)  # computed once, reused for compare/store/print
            # Strict '<' keeps the first configuration encountered on ties.
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_ccp_alpha = ccp_alpha
                best_depth = depth
                best_min_samples_leaf = min_samples_leaf
                best_mses = mses
            print(f'ccp_alpha: {ccp_alpha}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 1, avg_mse: 582.641392076291
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 3, avg_mse: 529.4488506022003
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 5, avg_mse: 532.8449508858971
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 10, avg_mse: 536.2330264799557
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 1, avg_mse: 760.5995828163129
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 3, avg_mse: 760.8167050572296
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 5, avg_mse: 760.8219292251181
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 10, avg_mse: 759.0577385978467
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 1, avg_mse: 675.1867704176565
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 3, avg_mse: 689.9640106158282
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 5, avg_mse: 694.1699808907703
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 10, avg_mse: 715.4753494015393
ccp_alpha: 0.0, max_depth: 10, min_samples_leaf: 1

### In-sample analysis

In [13]:
# Summarise the winning hyperparameter combination and its mean
# cross-validation MSE (mean of the per-fold scores kept in best_mses).
print(
    f'best ccp_alpha: {best_ccp_alpha}, best max_depth: {best_depth}, best min_samples_leaf: {best_min_samples_leaf}, best avg_mse: {np.mean(best_mses)}'
)

best ccp_alpha: 0.0, best max_depth: None, best min_samples_leaf: 3, best avg_mse: 529.4488506022003


In [14]:
# One row per fold: the per-fold MSEs of the best configuration, in fold order.
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,461.495418
1,408.030876
2,635.322117
3,444.929879
4,515.511622
5,212.891228
6,752.387121
7,535.975563
8,1108.258941
9,219.68574


In [15]:
# Persist the per-fold scores; the default index column (fold number) is written too.
cv_df.to_csv(path_or_buf='data/output/cv/dt_log.csv')