# Naive approach: CatBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the raw CSV files.
data_path = "./data/"
train_csv = data_path + "train.csv"
test_csv = data_path + "test.csv"

# Load the training set without its 'id' column and parse 'date' into
# datetimes so the .dt accessor can be used later on.
df_train = pd.read_csv(train_csv).drop(columns='id')
df_train['date'] = pd.to_datetime(df_train['date'])

# The test set is loaded untouched.
df_test = pd.read_csv(test_csv)

df_train.head(10)

In [None]:
# Expand the parsed date into separate numeric calendar columns.
for part in ('year', 'month', 'day'):
    df_train[part] = getattr(df_train['date'].dt, part)

# The modelling frame keeps the calendar columns but not the raw date.
df_train_0 = df_train.drop(columns=['date'])

# Cast the string columns to pandas 'category' dtype instead of one-hot
# encoding them; they are handed to CatBoost as categorical features later.
categorical_cols = ['country', 'store', 'product']
df_train_0 = df_train_0.astype({col: 'category' for col in categorical_cols})

df_train_0.head(10)

In [None]:
# Print column dtypes, non-null counts and memory usage of the modelling frame
# (useful to confirm the three categorical columns were cast to 'category').
df_train_0.info()

In [None]:
# Cross-validate a CatBoost regressor with expanding-window time-series splits.
#
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of df_train_0 are already in chronological order -- TODO confirm.
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, median_absolute_error

from catboost import Pool, CatBoostRegressor

results_mse = []
results_mae = []

# Separate the target from the features up front. Slicing df_train_0 directly
# would leave 'num_sold' among the model inputs (target leakage) and make the
# validation scores meaningless.
target_col = 'num_sold'
features = df_train_0.drop(columns=target_col)
target = df_train_0[target_col]

# Categorical columns, referenced by name so the list stays valid regardless
# of column order. Also reused by the full-data fit further down.
cat_features = ['country', 'store', 'product']

for train_index, val_index in TimeSeriesSplit(n_splits=5).split(features):
    X_train, y_train = features.iloc[train_index], target.iloc[train_index]
    X_val, y_val = features.iloc[val_index], target.iloc[val_index]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    # The validation pool carries its labels so it can be used as eval_set.
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    model = CatBoostRegressor(random_state=42, objective='MAE', task_type='GPU')
    # Early stopping only takes effect when an eval_set is supplied.
    # Caveat: the same fold is then used for stopping and for scoring, so the
    # reported errors are slightly optimistic.
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, silent=False)

    # evaluate on validation set
    y_pred = model.predict(val_pool)
    mse = mean_squared_error(y_val, y_pred)
    mae = median_absolute_error(y_val, y_pred)

    results_mse.append(f"{mse:.2f}")
    results_mae.append(f"{mae:.2f}")
    # Release the fitted model before the next fold is trained.
    del model

print(f"Mean squared error for each split: | {' | '.join(results_mse)} |")
print(f"Median absolute error for each split: | {' | '.join(results_mae)} |")

In [None]:
# Refit on every available training row: the target is 'num_sold', the
# features are all remaining columns of the modelling frame.
y_train = df_train_0['num_sold']
X_train = df_train_0.drop(columns=['num_sold'])

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)

# Same objective and seed as the cross-validation runs, with logging silenced.
model = CatBoostRegressor(objective='MAE', random_state=42, silent=True)
model.fit(train_pool)

In [None]:
# Plot 'num_sold' over time for the training rows and for the validation rows
# of the last cross-validation fold.
#
# Dates and targets are put in one frame *before* sorting so every y value
# stays paired with its own date; sorting only the feature frame and plotting
# it against the unsorted target pairs them by position and scrambles the
# curve. Building separate frames also leaves X_train / X_val unmodified.
train_plot = pd.DataFrame({
    # to_datetime assembles datetimes from year/month/day columns directly
    'date': pd.to_datetime(X_train[['year', 'month', 'day']]),
    'num_sold': y_train,
}).sort_values('date', kind='stable')

val_plot = pd.DataFrame({
    'date': pd.to_datetime(X_val[['year', 'month', 'day']]),
    'num_sold': y_val,
}).sort_values('date', kind='stable')

# plot 'num_sold' vs 'date'
fig, ax = plt.subplots(figsize=(15, 5))

ax.plot(train_plot['date'], train_plot['num_sold'], label='train', color='lightblue')
ax.plot(val_plot['date'], val_plot['num_sold'], label='validation', color='red')

ax.set_xlabel('date')
ax.set_ylabel('num_sold')

ax.legend()
plt.show()

In [None]:
# Predictions for the rows of the last cross-validation fold.
# NOTE: `model` was refit on the full training data above, so these rows were
# seen during training -- this is a sanity check, not an out-of-sample score.

# Feature columns in the same order as the frame the model was fitted on.
feature_cols = ['country', 'store', 'product', 'year', 'month', 'day']

# df_val was never defined; rebuild it from the last fold's row positions.
df_val = df_train_0.iloc[val_index].copy()

# Round to the nearest integer; a bare int cast would truncate toward zero and
# bias every prediction downwards.
df_val['prediction'] = model.predict(df_val[feature_cols]).round().astype(int)
df_val

In [None]:
# Build the test features exactly like the training features and predict.

df_test = pd.read_csv(data_path + "test.csv")
df_test_0 = df_test.drop(columns='id')
df_test_0['date'] = pd.to_datetime(df_test_0['date'])

# Same calendar features as the training frame.
df_test_0['year'] = df_test_0['date'].dt.year
df_test_0['month'] = df_test_0['date'].dt.month
df_test_0['day'] = df_test_0['date'].dt.day

# Same categorical dtype as the training frame.
for col in ('country', 'store', 'product'):
    df_test_0[col] = df_test_0[col].astype('category')

# predict for test set
# The columns must be passed in the order the model was trained on
# (categoricals first, then the calendar features).
feature_cols = ['country', 'store', 'product', 'year', 'month', 'day']

# Round to the nearest integer; a bare int cast would truncate toward zero.
df_test_0['prediction'] = model.predict(df_test_0[feature_cols]).round().astype(int)
df_test_0

In [None]:
# Attach the derived columns and predictions back onto the raw test rows.
#
# df_test_0 was derived from df_test row by row and still shares its index, so
# an index-aligned join is exact: one output row per test id. An outer merge
# on stringified (date, country, store, product) keys would duplicate rows if
# a key repeats and would produce unmatched rows if the date strings differ
# in format.
df_test_1 = df_test.join(df_test_0[['year', 'month', 'day', 'prediction']])
df_test_1

In [None]:
# Expose the predictions under the target column name, then write only the
# id / num_sold pair to the submission file (no index column).
submission_cols = ['id', 'num_sold']
df_test_1['num_sold'] = df_test_1['prediction']
df_test_1.to_csv('first_submission.csv', columns=submission_cols, index=False)