In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
# Training features and targets; in both files the first CSV column is used as
# the row index rather than as a data column.
X_train = pd.read_csv(
    filepath_or_buffer='data/input/standardized_X_train.csv',
    index_col=0,
)
y_train = pd.read_csv(
    filepath_or_buffer='data/input/y_train.csv',
    index_col=0,
)

In [12]:
# Test features; the first CSV column is used as the row index.
X_test = pd.read_csv(
    filepath_or_buffer='data/input/standardized_X_test.csv',
    index_col=0,
)

In [13]:
# Seed NumPy's legacy global RNG so any code drawing from np.random is repeatable.
np.random.seed(seed=42)

### 10-fold Cross-Validation

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [15]:
def kfold_mse(X, y, model, k=10, target_col='sales'):
    """Return the per-fold validation MSE of `model` under k-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally via `.iloc`.
    y : pandas.DataFrame
        Target frame aligned row-for-row with `X`; must contain `target_col`.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`. It is refit in place on
        every fold, so after the call it holds the fit from the last fold.
    k : int, default 10
        Number of folds, passed to `KFold(n_splits=k)` (folds are not shuffled).
    target_col : str, default 'sales'
        Name of the column of `y` used as the regression target.

    Returns
    -------
    list
        One mean squared error per fold, in fold order.
    """
    # Pull the target out as a 1-D Series up front; fitting on a Series rather
    # than a one-column frame is what the original code did to avoid a warning.
    target = y[target_col]
    fold_mses = []
    for fit_idx, val_idx in KFold(n_splits=k).split(X, y):
        model.fit(X.iloc[fit_idx], target.iloc[fit_idx])
        val_pred = model.predict(X.iloc[val_idx])
        # scikit-learn's documented argument order is (y_true, y_pred).
        fold_mses.append(mean_squared_error(target.iloc[val_idx], val_pred))
    return fold_mses

For the decision tree, we tune `ccp_alpha`, `max_depth` and `min_samples_leaf` in this project. `max_features` and `min_samples_split` are fixed to 'sqrt' and 0.01, respectively.

In [16]:
# Candidate values for the three hyperparameters explored by the grid search.
ccp_alphas = [
    0.0,
    0.01,
    0.02,
    0.05,
    0.1,
]
min_samples_leafs = [
    1,
    3,
    5,
    10,
]
# None is included as a candidate alongside the explicit depth limits.
max_depths = [
    None,
    3,
    5,
    10,
]

In [17]:
from sklearn.tree import DecisionTreeRegressor

In [18]:
# Exhaustive grid search over (ccp_alpha, max_depth, min_samples_leaf).
# Each combination is scored by its mean 10-fold CV MSE; the lowest mean wins,
# and on ties the combination seen first is kept (strict `<` comparison).
best_mses = []
# Start from +inf so that any finite score is accepted as the first "best";
# a fixed sentinel such as 1e9 would silently select nothing on large-scale targets.
best_avg_score = np.inf
best_ccp_alpha = None
best_depth = None
best_min_samples_leaf = None
for ccp_alpha in ccp_alphas:
    for depth in max_depths:
        for min_samples_leaf in min_samples_leafs:
            model = DecisionTreeRegressor(
                ccp_alpha=ccp_alpha,
                max_depth=depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                min_samples_split=0.01,
                max_features='sqrt',
            )
            mses = kfold_mse(X_train, y_train, model)
            # Compute the fold average once and reuse it for comparison and logging.
            avg_mse = np.mean(mses)
            if avg_mse < best_avg_score:
                best_avg_score = avg_mse
                best_ccp_alpha = ccp_alpha
                best_depth = depth
                best_min_samples_leaf = min_samples_leaf
                best_mses = mses
            print(f'ccp_alpha: {ccp_alpha}, max_depth: {depth}, min_samples_leaf: {min_samples_leaf}, avg_mse: {avg_mse}')

ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 1, avg_mse: 593.5616231624102
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 3, avg_mse: 557.0525186664853
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 5, avg_mse: 575.5615947857348
ccp_alpha: 0.0, max_depth: None, min_samples_leaf: 10, avg_mse: 588.8558827593738
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 1, avg_mse: 646.640310358137
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 3, avg_mse: 645.8752332181648
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 5, avg_mse: 645.9347106622994
ccp_alpha: 0.0, max_depth: 3, min_samples_leaf: 10, avg_mse: 645.8274129824874
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 1, avg_mse: 597.0682083350323
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 3, avg_mse: 610.3702848013156
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 5, avg_mse: 602.8548875955396
ccp_alpha: 0.0, max_depth: 5, min_samples_leaf: 10, avg_mse: 608.3120131310989
ccp_alpha: 0.0, max_depth: 10, min_samples_leaf: 1

### In-sample analysis

In [19]:
# Report the winning hyperparameter combination and its mean cross-validation MSE.
print(
    f'best ccp_alpha: {best_ccp_alpha}, '
    f'best max_depth: {best_depth}, '
    f'best min_samples_leaf: {best_min_samples_leaf}, '
    f'best avg_mse: {np.mean(best_mses)}'
)

best ccp_alpha: 0.1, best max_depth: 10, best min_samples_leaf: 3, best avg_mse: 556.9297735401863


In [20]:
# One row per fold: the per-fold MSEs recorded for the best configuration.
cv_df = pd.DataFrame({'Cross-Validation MSE': best_mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,403.20049
1,342.490907
2,587.428174
3,431.444526
4,542.614023
5,323.315097
6,797.677927
7,508.218422
8,1185.250506
9,447.657664


In [21]:
# Persist the per-fold MSE table (index included) to CSV.
cv_df.to_csv(path_or_buf='data/output/cv/dt_sta.csv')