In [1]:
import numpy as np
import pandas as pd
import holidays
import optuna

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold 
from sklearn.metrics import mean_absolute_percentage_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Load the three competition files: the training set, the set to predict on,
# and the sample submission file.
df, df_pred, sample = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

## Знакомство с данными
- *num_sold* - 8871 nan значений (целевой признак)
- *country* - 'Canada', 'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore'
- *store* - 'Discount Stickers', 'Stickers for Less', 'Premium Sticker Mart'
- *product* - 'Holographic Goose', 'Kaggle', 'Kaggle Tiers', 'Kerneler', 'Kerneler Dark Mode'
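- *date* - дата наблюдения в формате ГГГГ-ММ-ДД; ниже из неё извлекаются признаки *day*, *month* и *year*, после чего сам столбец удаляется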

In [3]:
# Display the training frame as loaded, before any preprocessing is applied.
df

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...,...
230125,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [4]:
# Convert the `date` strings (YYYY-MM-DD) into pandas datetimes in both frames.
for frame in (df, df_pred):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')

In [5]:
def add_date_features(frame):
    """Add calendar features derived from the datetime column ``date``.

    Writes the columns ``day``, ``month`` and ``year`` into ``frame`` in place,
    so the training and prediction frames always receive the same feature set.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding a datetime-typed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object that was passed in.
    """
    dates = frame['date'].dt
    frame['day'] = dates.day
    frame['month'] = dates.month
    frame['year'] = dates.year
    # Candidate features, currently disabled:
    # frame['quarter'] = dates.quarter
    # frame['dayofyear'] = dates.dayofyear
    # frame['weekday'] = dates.weekday
    return frame


# New features from the date, applied identically to both frames.
add_date_features(df)
add_date_features(df_pred)

In [6]:
# Drop the `id` column from both frames. Reassignment is used instead of
# `inplace=True`, which pandas discourages and which offers no speed benefit.
df_pred = df_pred.drop(columns='id')

# For the training frame, additionally discard rows whose target `num_sold`
# is missing.
df = df.drop(columns='id').dropna(subset=['num_sold'])

In [7]:
# NOTE: this whole cell is disabled (every line is commented out).
# It sketches two boolean features computed row by row from `date`:
# `holiday` (lookup in a per-country calendar from the `holidays` package)
# and `weekend` (weekday index >= 5). If it is re-enabled, it has to run
# before the `date` column is dropped from the frames.
# holiday_dict = {
#     'Canada': holidays.CA(),
#     'Finland': holidays.FI(),
#     'Italy': holidays.IT(),
#     'Kenya': holidays.KE(),
#     'Norway': holidays.NO(),
#     'Singapore': holidays.SG()
# }


# def is_holiday(row):
#     country = row['country']
#     date = row['date']
    
#     if date in holiday_dict[country]:
#         return True
#     else:
#         return False


# def is_weekend(row):
#     date = row['date']

#     if date.weekday() >= 5:
#         return True
#     else:
#         return False


# df['weekend'] = df.apply(is_weekend, axis=1)
# df['holiday'] = df.apply(is_holiday, axis=1)

# df_pred['weekend'] = df_pred.apply(is_weekend, axis=1)
# df_pred['holiday'] = df_pred.apply(is_holiday, axis=1)

In [8]:
# Remove the raw `date` column from both frames; the model's feature list
# uses `day`, `month` and `year` instead. Reassignment is used rather than
# `inplace=True`, which pandas discourages.
df = df.drop(columns='date')
df_pred = df_pred.drop(columns='date')

In [9]:
# Names of the columns that are passed to CatBoost as categorical features.
# NOTE(review): `year` is treated as a category; if the prediction period
# contains years that never occur in training, the model has no statistics
# for them — confirm this is intended.
lst_features = [
    'country',
    'store',
    'product',
    'day',
    'month',
    'year',
]

In [10]:
# Separate the target (`num_sold`) from the feature columns.
y = df['num_sold']
X = df.drop(columns='num_sold')

# Hold out 20% of the rows for evaluation, with a fixed seed.
# NOTE(review): the split is random with respect to the date features —
# confirm that a random (rather than time-based) hold-out is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# NOTE: this whole cell is disabled (every line is commented out).
# It sketches a 5-fold cross-validated MAPE estimate for a CatBoost model
# without tuned hyperparameters, followed by a fit on the training split and
# a MAPE evaluation on the hold-out split.
# cat_model = CatBoostRegressor(
#     cat_features=lst_features, verbose=0, random_seed=42,
#     loss_function='MAPE', eval_metric='MAPE'
# )

# # Build a custom scorer for MAPE
# scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# # Set up cross-validation
# kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation

# # Run cross-validation
# cv_scores = cross_val_score(cat_model, X_train, y_train, cv=kf, scoring=scorer)

# # Flip the sign, because cross_val_score returns MAPE with a minus sign
# cv_scores = -cv_scores

# # Print the mean and the standard deviation of the MAPE metric
# print(f"Средняя MAPE: {np.mean(cv_scores):.4f}")
# print(f"Стандартное отклонение MAPE: {np.std(cv_scores):.4f}")

# # If the model needs to be trained on the whole training set:
# cat_model.fit(X_train, y_train)

# # Use the model for predictions
# y_pred = cat_model.predict(X_test)
# mae_test = mean_absolute_percentage_error(y_test, y_pred)
# print(f"Тестовая MAPE: {mae_test:.4f}")

In [12]:
# NOTE: this whole cell is disabled (every line is commented out).
# It sketches an Optuna search (30 trials, minimising MAPE) over
# `iterations`, `learning_rate`, `depth` and `l2_leaf_reg`.
# NOTE(review): each trial is scored on (X_test, y_test), the same hold-out
# that is used for the final reported score, so that score is likely
# optimistic for hyperparameters chosen this way — confirm before relying on it.
# def objective(trial: optuna.Trial):
#     '''
#     Standard signature of an objective function for optuna.
#     '''
#     params = {
#         'random_seed': 42,
#         'verbose': 0,
#         'eval_metric': 'MAPE',
#         'iterations': trial.suggest_int('iterations', 1000, 2100),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#         'depth': trial.suggest_int('depth', 3, 12),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
#         'cat_features': lst_features,
#         # 'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
#         # 'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True)
#     }
    
#     model = CatBoostRegressor(**params)
#     model.fit(X_train, y_train, eval_set=(X_test, y_test))
#     predictions = model.predict(X_test)
#     return mean_absolute_percentage_error(y_test, predictions)
    
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

In [13]:
# Fixed CatBoost hyperparameters that are unpacked into the final model.
# NOTE(review): presumably the best values found by a hyperparameter search —
# confirm their origin.
params = {
    'iterations': 1344,
    'learning_rate': 0.050046525553752046,
    'depth': 12,
    'l2_leaf_reg': 0.001428611939083863,
}

In [14]:
# Build the final regressor: tuned hyperparameters from `params`, MAPE as both
# the training loss and the evaluation metric, fixed seed, silent training.
cat_model = CatBoostRegressor(
    **params,
    cat_features=lst_features,
    loss_function='MAPE',
    eval_metric='MAPE',
    random_seed=42,
    verbose=0,
)

# NOTE(review): the hold-out split is passed as `eval_set` and is also used
# for the score below; if CatBoost's best-iteration selection is active, the
# score is measured on data that influenced the model — confirm.
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))

# Predictions for the hold-out rows and for the frame to be submitted.
y_pred = cat_model.predict(X_test)
pred = cat_model.predict(df_pred)

# Despite its name, `mae` holds the mean absolute *percentage* error (MAPE).
mae = mean_absolute_percentage_error(y_test, y_pred)
mae

0.1451575830196869

In [15]:
# `id` was dropped from `df_pred` before modelling, so it is recovered from
# the test file. Only that one column is needed, so only that column is read.
df_pred_duble = pd.read_csv('data/test.csv', usecols=['id'])
df_pred_Id = df_pred_duble['id']

# Pair every test id with its predicted `num_sold` and write the submission
# file without the index column.
output = pd.DataFrame({'id': df_pred_Id, 'num_sold': pred})
output.to_csv('data/cat_model.csv', index=False)