In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the training inputs and the training targets from CSV.
# index_col=0 makes the first column of each file the DataFrame index.
X_train = pd.read_csv('data/input/non_standardized_X_train.csv', index_col=0)
y_train = pd.read_csv('data/input/y_train.csv', index_col=0)

In [3]:
# Read the test inputs (first CSV column becomes the index).
# NOTE(review): X_test is not referenced by any of the cells shown here — confirm it is still needed.
X_test = pd.read_csv('data/input/non_standardized_X_test.csv', index_col=0)

In [4]:
# Seed NumPy's legacy global random number generator.
# NOTE(review): CatBoost has its own `random_seed` parameter and KFold is used
# without shuffling below — confirm whether this seed influences any result here.
np.random.seed(42)

### 10-fold Cross-Validation

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [6]:
def kfold_mse(X, y, model, k=10):
    """Compute the validation mean squared error of ``model`` on each of ``k`` folds.

    The rows are partitioned with ``KFold(n_splits=k)`` (no shuffling). For
    every fold, ``model.fit`` is called on the remaining rows and the model is
    scored on the held-out rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame
        Targets aligned row-for-row with ``X``. Must contain a ``'sales'``
        column, which is the only column used.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called
        on this same instance for every fold, so on return it has last been
        fitted on the final fold's training rows.
    k : int, default 10
        Number of folds.

    Returns
    -------
    list
        One MSE value per fold, in fold order.
    """
    # Use the 1-D target column rather than the one-column DataFrame
    # (the original code notes this avoids a deprecation warning).
    target = y['sales']
    fold_mses = []
    for fit_idx, val_idx in KFold(n_splits=k).split(X, y):
        model.fit(X.iloc[fit_idx], target.iloc[fit_idx])
        y_pred = model.predict(X.iloc[val_idx])
        # scikit-learn's documented argument order is (y_true, y_pred).
        fold_mses.append(mean_squared_error(target.iloc[val_idx], y_pred))
    return fold_mses

For CatBoost, we do not tune any hyperparameters, as the default settings often perform well.

In [7]:
from catboost import CatBoostRegressor

In [8]:
# Cross-validate a CatBoost regressor built with its default parameters.
# kfold_mse returns one validation MSE per fold.
# NOTE(review): the default constructor logs every boosting iteration for every
# fold; consider CatBoostRegressor(verbose=False) to keep the notebook output short.
model = CatBoostRegressor()
mses = kfold_mse(X_train, y_train, model)

Learning rate set to 0.045642
0:	learn: 26.8581839	total: 143ms	remaining: 2m 22s
1:	learn: 26.3751642	total: 147ms	remaining: 1m 13s
2:	learn: 25.8395827	total: 151ms	remaining: 50.3s
3:	learn: 25.3404704	total: 157ms	remaining: 39s
4:	learn: 24.8745442	total: 161ms	remaining: 32s
5:	learn: 24.4781805	total: 165ms	remaining: 27.3s
6:	learn: 24.0826293	total: 169ms	remaining: 23.9s
7:	learn: 23.6622351	total: 174ms	remaining: 21.5s
8:	learn: 23.2431249	total: 178ms	remaining: 19.6s
9:	learn: 22.8667382	total: 181ms	remaining: 17.9s
10:	learn: 22.5168678	total: 186ms	remaining: 16.7s
11:	learn: 22.1995611	total: 190ms	remaining: 15.6s
12:	learn: 21.8485618	total: 194ms	remaining: 14.7s
13:	learn: 21.5590396	total: 197ms	remaining: 13.9s
14:	learn: 21.2565277	total: 205ms	remaining: 13.4s
15:	learn: 20.9667433	total: 208ms	remaining: 12.8s
16:	learn: 20.6916904	total: 212ms	remaining: 12.2s
17:	learn: 20.4599410	total: 216ms	remaining: 11.8s
18:	learn: 20.1945511	total: 221ms	remaining: 

### In-sample analysis

In [9]:
# Average the per-fold validation MSEs and report the result.
avg_mse = np.mean(mses)
print(f'avg_mse: {avg_mse}')

avg_mse: 243.61750054756476


In [10]:
# Tabulate the per-fold MSEs: one row per fold, in fold order, in a single
# named column. The bare name on the last line displays the table.
cv_df = pd.DataFrame({'Cross-Validation MSE': mses})
cv_df

Unnamed: 0,Cross-Validation MSE
0,112.028908
1,126.108521
2,277.564768
3,184.995589
4,154.126929
5,116.222199
6,381.079689
7,148.524615
8,769.344884
9,166.178902


In [12]:
# Save the per-fold MSE table to CSV. With to_csv's defaults the index is
# written as the first column.
# NOTE(review): assumes the directory data/output/cv already exists — confirm.
cv_df.to_csv('data/output/cv/cb.csv')