In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
X_train = pd.read_csv('data/input/non_standardized_X_train.csv', index_col=0)
y_train = pd.read_csv('data/input/y_log_train.csv', index_col=0)

In [3]:
X_test = pd.read_csv('data/input/non_standardized_X_test.csv', index_col=0)

In [4]:
np.random.seed(42)

### 10-fold Cross-Validation

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [6]:
def kfold_mse(X, y, model, k=10):
    kf = KFold(n_splits=k)
    mses = []
    for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train['sales']) # Avoid deprecation warning
        y_pred = model.predict(X_val)
        mses.append(mean_squared_error(np.exp(y_pred), np.exp(y_val['sales'])))
    return mses

For catboost, we shall not tune any parameters as the default parameters often perform well.

In [7]:
from catboost import CatBoostRegressor

In [8]:
mses = []
model = CatBoostRegressor()
mses = kfold_mse(X_train, y_train, model)

Learning rate set to 0.045642
0:	learn: 1.1097674	total: 140ms	remaining: 2m 19s
1:	learn: 1.0797005	total: 144ms	remaining: 1m 11s
2:	learn: 1.0493685	total: 148ms	remaining: 49.1s
3:	learn: 1.0245079	total: 151ms	remaining: 37.7s
4:	learn: 0.9971233	total: 156ms	remaining: 31.1s
5:	learn: 0.9729844	total: 161ms	remaining: 26.6s
6:	learn: 0.9466217	total: 165ms	remaining: 23.4s
7:	learn: 0.9233848	total: 170ms	remaining: 21.1s
8:	learn: 0.9005831	total: 174ms	remaining: 19.2s
9:	learn: 0.8782728	total: 178ms	remaining: 17.6s
10:	learn: 0.8564285	total: 183ms	remaining: 16.4s
11:	learn: 0.8375617	total: 187ms	remaining: 15.4s
12:	learn: 0.8206611	total: 191ms	remaining: 14.5s
13:	learn: 0.8037611	total: 195ms	remaining: 13.8s
14:	learn: 0.7876603	total: 200ms	remaining: 13.1s
15:	learn: 0.7700891	total: 206ms	remaining: 12.6s
16:	learn: 0.7545594	total: 210ms	remaining: 12.1s
17:	learn: 0.7419564	total: 214ms	remaining: 11.7s
18:	learn: 0.7274199	total: 218ms	remaining: 11.3s
19:	learn

### In-sample analysis

In [9]:
print(f'avg_mse: {np.mean(mses)}')

avg_mse: 241.24703181517665


In [11]:
cv_df = pd.DataFrame(mses, columns=['Cross-Validation MSE'])
cv_df

Unnamed: 0,Cross-Validation MSE
0,159.765188
1,60.839203
2,336.692991
3,184.558159
4,178.935643
5,103.475004
6,341.378145
7,174.994251
8,784.11569
9,87.716044


In [12]:
cv_df.to_csv('data/output/cv/cb_log.csv')