In [53]:
import pandas as pd
import numpy as np
from datetime import datetime
import catboost
from sklearn.model_selection import train_test_split

In [83]:
# Read the training table and normalise the timestamp column name to
# 'datetime', which is the name create_many_time_features() looks up.
df = pd.read_csv(r'train/train.csv').rename(columns={'Date': 'datetime'})

In [84]:
# Show the loaded training frame (bare last expression -> rich display).
df

Unnamed: 0,datetime,store,product,number_sold
0,2010-01-01,0,0,801
1,2010-01-02,0,0,810
2,2010-01-03,0,0,818
3,2010-01-04,0,0,796
4,2010-01-05,0,0,808
...,...,...,...,...
230085,2018-12-27,6,9,890
230086,2018-12-28,6,9,892
230087,2018-12-29,6,9,895
230088,2018-12-30,6,9,899


In [103]:
def create_many_time_features(df, dop_col, holidays=None):
    """Build calendar, cyclical and rolling features from a timestamp column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'datetime'`` column (anything ``pd.to_datetime``
        accepts) and the column named by ``dop_col``. The caller's frame is
        not modified; the function works on a copy of those two columns.
    dop_col : str
        Name of the numeric column the rolling statistics are computed on.
    holidays : iterable of date-like, optional
        Calendar days flagged by ``is_holiday``. When omitted, the original
        placeholder list (``'2023-01-01'``, ``'2023-02-05'``) is used; pass
        the real holidays for the period covered by the data.

    Returns
    -------
    pandas.DataFrame
        Only the generated feature columns (``'datetime'`` and ``dop_col`` are
        dropped), on the same index as ``df``, with missing values replaced
        by 0 so the result can be passed straight to ``DataFrame.join``.

    Notes
    -----
    ``diff`` and all rolling windows run over the rows in the order given and
    are not grouped; the rolling windows include the current row.
    """
    if holidays is None:
        holidays = ['2023-01-01', '2023-02-05']  # placeholder holiday list
    # Normalise every holiday to 'YYYY-MM-DD' so it can be matched by day.
    holiday_keys = [pd.Timestamp(day).strftime('%Y-%m-%d') for day in holidays]

    df = df[['datetime', dop_col]].copy()
    df['datetime'] = pd.to_datetime(df['datetime'])

    # 1. Basic date and time components
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['day_of_week'] = df['datetime'].dt.dayofweek  # 0 = Monday, 6 = Sunday
    df['hour'] = df['datetime'].dt.hour
    df['minute'] = df['datetime'].dt.minute
    df['second'] = df['datetime'].dt.second

    # 2. Binary flags
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)  # 1 on Saturday/Sunday
    df['is_holiday'] = df['datetime'].dt.strftime('%Y-%m-%d').isin(holiday_keys).astype(int)
    # Morning and evening rush hours
    df['is_rush_hour'] = df['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)

    # 3. Seasonal features
    # winter = 12, 1, 2; spring = 3, 4, 5; summer = 6, 7, 8; autumn = 9, 10, 11
    df['season'] = df['month'] % 12 // 3 + 1  # 1=winter, 2=spring, 3=summer, 4=autumn
    df['quarter'] = df['datetime'].dt.quarter

    # 4. Cyclical encodings of hour, day of week and month
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # 5. Time-interval features
    # Whole days elapsed since 1 January of the same year (0 on 1 January)
    df['days_since_start_of_year'] = df['datetime'].dt.dayofyear - 1
    # Seconds since the previous row; the first row has no predecessor -> 0
    df['time_since_last_event'] = df['datetime'].diff().dt.total_seconds().fillna(0)

    # 6. Rolling statistics over the current and the previous row
    df['rolling_mean_2'] = df[dop_col].rolling(window=2).mean()
    df['rolling_sum_2'] = df[dop_col].rolling(window=2).sum()
    df['rolling_max_2'] = df[dop_col].rolling(window=2).max()
    df['rolling_min_2'] = df[dop_col].rolling(window=2).min()

    # 7. Trend and seasonality
    # Decomposition (e.g. STL) needs an extra library such as statsmodels and
    # is not implemented here.

    # 8. Activity indicators
    # Number of rows with a valid timestamp among the last 7 rows. Rolling
    # aggregations are not available on a datetime column, so a 0/1 validity
    # mask is summed instead.
    df['events_last_week'] = (
        df['datetime'].notna().astype(int).rolling(window=7, min_periods=1).sum()
    )
    # Mean gap between events over the last two rows
    df['mean_time_between_events'] = df['time_since_last_event'].rolling(window=2).mean()

    df = df.drop(columns=['datetime', dop_col])
    df = df.fillna(0)
    return df


In [86]:
# Derive the time/rolling features from 'number_sold' and attach them to the
# training frame by index.
train_time_features = create_many_time_features(df, 'number_sold')
df = df.join(train_time_features)

In [87]:
# Show the training frame after the generated feature columns were joined.
df

Unnamed: 0,datetime,store,product,number_sold,year,month,day,day_of_week,hour,minute,...,day_of_week_sin,day_of_week_cos,month_sin,month_cos,days_since_start_of_year,time_since_last_event,rolling_mean_2,rolling_sum_2,rolling_max_2,rolling_min_2
0,2010-01-01,0,0,801,2010,1,1,4,0,0,...,-0.433884,-0.900969,5.000000e-01,0.866025,0,0.0,0.0,0.0,0.0,0.0
1,2010-01-02,0,0,810,2010,1,2,5,0,0,...,-0.974928,-0.222521,5.000000e-01,0.866025,1,86400.0,805.5,1611.0,810.0,801.0
2,2010-01-03,0,0,818,2010,1,3,6,0,0,...,-0.781831,0.623490,5.000000e-01,0.866025,2,86400.0,814.0,1628.0,818.0,810.0
3,2010-01-04,0,0,796,2010,1,4,0,0,0,...,0.000000,1.000000,5.000000e-01,0.866025,3,86400.0,807.0,1614.0,818.0,796.0
4,2010-01-05,0,0,808,2010,1,5,1,0,0,...,0.781831,0.623490,5.000000e-01,0.866025,4,86400.0,802.0,1604.0,808.0,796.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230085,2018-12-27,6,9,890,2018,12,27,3,0,0,...,0.433884,-0.900969,-2.449294e-16,1.000000,360,86400.0,893.0,1786.0,896.0,890.0
230086,2018-12-28,6,9,892,2018,12,28,4,0,0,...,-0.433884,-0.900969,-2.449294e-16,1.000000,361,86400.0,891.0,1782.0,892.0,890.0
230087,2018-12-29,6,9,895,2018,12,29,5,0,0,...,-0.974928,-0.222521,-2.449294e-16,1.000000,362,86400.0,893.5,1787.0,895.0,892.0
230088,2018-12-30,6,9,899,2018,12,30,6,0,0,...,-0.781831,0.623490,-2.449294e-16,1.000000,363,86400.0,897.0,1794.0,899.0,895.0


In [88]:
# Target is the store id; the features are every remaining column except the
# raw timestamp.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column that stays
# in X here — confirm it is meant to be a model feature.
y = df['store']
X = df.drop(['store', 'datetime'], axis='columns')

In [89]:
# Keyword arguments unpacked into catboost.CatBoostClassifier further down.
params = dict(
    n_estimators=1500,
    learning_rate=0.03,
    depth=3,
    use_best_model=True,
    border_count=64,
    l2_leaf_reg=1,
    bagging_temperature=2,
    rsm=0.5,
    loss_function="MultiClass",
    auto_class_weights='Balanced',
    random_state=22,
    custom_metric=["Precision", "Recall", "F1"],
)

In [90]:
# Hold out a third of the rows for evaluation. The split is seeded so that
# Restart Kernel -> Run All reproduces the same partition; without a
# random_state every run would shuffle differently and the reported metrics
# would not be comparable between runs.
SPLIT_SEED = 22  # same value as params["random_state"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SPLIT_SEED
)

# Wrap both partitions in CatBoost pools for fit() / eval_set.
train_dataset = catboost.Pool(X_train, y_train)
test_dataset = catboost.Pool(X_test, y_test)

In [91]:
# Build the classifier from the hyper-parameters collected in `params`.
model_class = catboost.CatBoostClassifier(**params)

In [92]:
# Train on the training pool; the held-out pool is passed as eval_set, which
# is what the "test:" / "best:" columns of the training log refer to.
model_class.fit(train_dataset, eval_set=test_dataset)

0:	learn: 1.8972523	test: 1.8974995	best: 1.8974995 (0)	total: 49.7ms	remaining: 1m 14s
1:	learn: 1.8550394	test: 1.8554133	best: 1.8554133 (1)	total: 89.5ms	remaining: 1m 7s
2:	learn: 1.8230376	test: 1.8236740	best: 1.8236740 (2)	total: 132ms	remaining: 1m 5s
3:	learn: 1.7895312	test: 1.7902483	best: 1.7902483 (3)	total: 176ms	remaining: 1m 5s
4:	learn: 1.7591349	test: 1.7599130	best: 1.7599130 (4)	total: 220ms	remaining: 1m 5s
5:	learn: 1.7315574	test: 1.7323985	best: 1.7323985 (5)	total: 263ms	remaining: 1m 5s
6:	learn: 1.7063341	test: 1.7072010	best: 1.7072010 (6)	total: 303ms	remaining: 1m 4s
7:	learn: 1.6831126	test: 1.6840037	best: 1.6840037 (7)	total: 343ms	remaining: 1m 3s
8:	learn: 1.6617140	test: 1.6626312	best: 1.6626312 (8)	total: 383ms	remaining: 1m 3s
9:	learn: 1.6417111	test: 1.6426607	best: 1.6426607 (9)	total: 423ms	remaining: 1m 3s
10:	learn: 1.6226659	test: 1.6235928	best: 1.6235928 (10)	total: 462ms	remaining: 1m 2s
11:	learn: 1.6078587	test: 1.6087974	best: 1.6087

<catboost.core.CatBoostClassifier at 0x1d4cd475a10>

In [98]:
# Load the separate test file and keep its 'store' labels aside as the ground
# truth before that column is dropped from the feature frame.
test = pd.read_csv(r'train/test.csv')
y_true = test['store']

In [99]:
# Apply the same preparation as for the training frame: rename the timestamp
# column, attach the generated features, then drop the label and the raw
# timestamp.
test = test.rename(columns={'Date': 'datetime'})
test_time_features = create_many_time_features(test, 'number_sold')
test = test.join(test_time_features).drop(columns=['store', 'datetime'])

In [100]:
# Predict a store label for every row of the prepared test frame.
pred = model_class.predict(test)

In [101]:
from sklearn.metrics import f1_score

In [102]:
# Micro-averaged F1 of the predictions against the labels saved in y_true.
f1_score(y_pred=pred, y_true=y_true, average='micro')

0.6399608610567514