In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
X_train = pd.read_csv('data/input/standardized_X_train.csv', index_col=0)
y_train = pd.read_csv('data/input/y_log_train.csv', index_col=0)

In [4]:
X_test = pd.read_csv('data/input/standardized_X_test.csv', index_col=0)

In [5]:
np.random.seed(42)

### 10-fold Cross-Validation

In [6]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [7]:
def kfold_mse(X, y, model, k=10):
    kf = KFold(n_splits=k)
    mses = []
    for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train['sales']) # Avoid deprecation warning
        y_pred = model.predict(X_val)
        mses.append(mean_squared_error(np.exp(y_pred), np.exp(y_val['sales'])))
    return mses

For catboost, we shall not tune any parameters as the default parameters often perform well.

In [8]:
from catboost import CatBoostRegressor

In [9]:
mses = []
model = CatBoostRegressor()
mses = kfold_mse(X_train, y_train, model)

Learning rate set to 0.045642
0:	learn: 1.1097674	total: 150ms	remaining: 2m 29s
1:	learn: 1.0797005	total: 154ms	remaining: 1m 17s
2:	learn: 1.0493685	total: 159ms	remaining: 52.8s
3:	learn: 1.0245079	total: 164ms	remaining: 41s
4:	learn: 0.9971233	total: 169ms	remaining: 33.7s
5:	learn: 0.9729844	total: 174ms	remaining: 28.8s
6:	learn: 0.9466217	total: 179ms	remaining: 25.3s
7:	learn: 0.9233848	total: 183ms	remaining: 22.7s
8:	learn: 0.9005831	total: 187ms	remaining: 20.6s
9:	learn: 0.8782728	total: 191ms	remaining: 18.9s
10:	learn: 0.8564285	total: 195ms	remaining: 17.6s
11:	learn: 0.8375617	total: 199ms	remaining: 16.4s
12:	learn: 0.8206611	total: 203ms	remaining: 15.4s
13:	learn: 0.8037611	total: 208ms	remaining: 14.7s
14:	learn: 0.7876603	total: 213ms	remaining: 14s
15:	learn: 0.7700891	total: 218ms	remaining: 13.4s
16:	learn: 0.7545594	total: 223ms	remaining: 12.9s
17:	learn: 0.7419564	total: 230ms	remaining: 12.5s
18:	learn: 0.7274199	total: 236ms	remaining: 12.2s
19:	learn: 0.

### In-sample analysis

In [10]:
print(f'avg_mse: {np.mean(mses)}')

avg_mse: 241.24796912904495


In [11]:
cv_df = pd.DataFrame(mses, columns=['Cross-Validation MSE'])
cv_df

Unnamed: 0,Cross-Validation MSE
0,159.765188
1,60.839203
2,336.693035
3,184.558159
4,178.937266
5,103.475004
6,341.379996
7,174.998832
8,784.115356
9,87.717652


In [12]:
cv_df.to_csv('data/output/cv/cb_sta_log.csv')