In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Switch the working directory to the parent folder.
# NOTE(review): presumably so that the project-local packages imported below
# (baseline, m5, utils) resolve — confirm. The path is relative, so re-running
# this cell moves up one more level each time.
os.chdir('..')
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
import gc

from tqdm.notebook import tqdm
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline

from baseline import build_base_dataset
from m5.metric import WRMSSE
# NOTE(review): star import — names such as ID_COLUMNS used later presumably come from here.
from m5.constants import *
from m5.funcs import only_days_columns
from utils.dtype import downcast
from baseline import window_stats

In [None]:
import logging
# Configure the root logger: timestamped "[time] LEVEL: message" lines.
logging.basicConfig(format="[%(asctime)s] %(levelname)s: %(message)s")
# getLogger() without a name returns the root logger; DEBUG lets every record through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Build the base dataset with the project helper and keep only rows with d > 1380.
# NOTE(review): presumably `d` is the integer day index of the M5 calendar and the
# frame is long-format (one row per id and day) — confirm in build_base_dataset.
ds = build_base_dataset()
ds = ds[ds['d'] > 1380]

In [None]:
# Scorer built from the d < 1914 rows and the d >= 1914 rows while `sold` still
# holds the real values (it is masked for d >= 1914 in a later cell).
metric = WRMSSE(ds[ds['d'] < 1914], ds[ds['d'] >= 1914])

In [None]:
# Blank out the real sales in the forecast window (d >= 1914) so they cannot leak
# into the features; the untouched values are preserved in `target_sold`.
# NOTE(review): the original comment also mentioned wiping the price, but
# `sell_price` is only cast to float32 here.
ds['sold'] = ds['sold'].astype(np.float32)
ds.loc[:, 'target_sold'] = ds.loc[:, 'sold']
ds.loc[ds['d'] >= 1914, 'sold'] = np.nan
ds['sell_price'] = ds['sell_price'].astype(np.float32)

# Подготовка набора данных

## Исключение данных "до старта продаж"

In [None]:
# Start of sales = the first day on which at least one unit of the series was sold.
# It is computed as a day number on the same scale as `d`. The previous
# np.argmax(x > 0) produced a position inside the group, which was then compared
# against absolute day numbers (all > 1380), so the filter never removed a row.
first_sale_day = ds['d'].where(ds['sold'] > 0).groupby(ds['id']).transform('min')
# Series with no positive sale in the visible history keep all of their rows,
# which is also what the previous code did for them.
ds['start_of_sales'] = first_sale_day.fillna(ds['d'].min())
# Drop the leading "not on sale yet" rows of every series.
# NOTE(review): the original comment talks about moving the start by 28 days to
# match the aggregate shift, but no such offset was ever applied; none is added here.
ds = ds[ds['d'] >= ds['start_of_sales']].reset_index(drop=True)

In [None]:
# Turn "no event" into an explicit 'NaN' category on all four calendar event columns.
# The category is added only when missing, so the cell can be re-run safely
# (add_categories raises if the category already exists).
for event_col in ('event_type_1', 'event_name_1', 'event_type_2', 'event_name_2'):
    event_values = ds[event_col]
    if 'NaN' not in event_values.cat.categories:
        event_values = event_values.cat.add_categories('NaN')
    ds[event_col] = event_values.fillna('NaN')

In [None]:
ds['revenue'] = ds['sold'] * ds['sell_price']

# 1.0 on days that carry at least one calendar event ('NaN' is the "no event" category).
is_event_day = ((ds['event_type_1'] != 'NaN') | (ds['event_type_2'] != 'NaN')).astype('float32')
# Look one row ahead *within each series*. A plain shift(-1) over the whole long
# frame takes the flag from whatever row happens to follow, which can belong to a
# different id (at series boundaries, or on every row if the frame is ordered by day).
# Assumes rows of each id are in ascending day order, as the lag features also do.
# The last day of each series has no "tomorrow" and gets 0.
ds['holliday_tommorow'] = is_event_day.groupby(ds['id']).shift(-1).fillna(0).astype('int')

# Calendar parts derived from the `date` column.
ds['wday'] = ds.date.dt.weekday.astype('int16')
# NOTE(review): dt.weekofyear is deprecated in newer pandas (isocalendar().week replaces it);
# kept as-is to stay compatible with the environment this notebook was written for.
ds['week'] = ds.date.dt.weekofyear.astype('int16')
ds['month'] = ds.date.dt.month.astype('int16')
ds['quarter'] = ds.date.dt.quarter.astype('int16')
ds['year'] = ds.date.dt.year.astype('int16')
ds['mday'] = ds.date.dt.day.astype('int16')

In [None]:
# Split at day 1914: everything before it is training history, the rest is the forecast window.
train_ds = ds.loc[ds['d'].lt(1914)]
val_ds = ds.loc[ds['d'].ge(1914)]

In [None]:
def calculate_features(ds: pd.DataFrame, shifts=(1, 7), windows=(7, 28)) -> pd.DataFrame:
    """Build per-series lag, price-change and rolling-mean features.

    All operations are grouped by ``id`` and rely on the row order inside each
    group, so rows of one id must already be in chronological order.

    Parameters
    ----------
    ds : pd.DataFrame
        Long-format frame with at least the columns ``id``, ``sold`` and
        ``sell_price``.
    shifts : iterable of int, default (1, 7)
        Lags (in rows) used for ``sold_shift_<s>`` and ``price_diff_<s>``.
    windows : iterable of int, default (7, 28)
        Rolling window lengths applied on top of every shifted sales column.

    Returns
    -------
    pd.DataFrame
        Frame aligned with ``ds.index`` holding, for the defaults:
        ``sold_shift_1``, ``sold_shift_7``, ``price_diff_1``, ``price_diff_7``,
        ``rmean_s1_w7``, ``rmean_s1_w28``, ``rmean_s7_w7``, ``rmean_s7_w28``.
        Rows without enough history contain NaN.
    """
    series_id = ds['id']
    by_series = ds.groupby('id')

    base_columns = {}
    for lag in shifts:
        base_columns[f'sold_shift_{lag}'] = by_series['sold'].shift(lag)
    for lag in shifts:
        base_columns[f'price_diff_{lag}'] = by_series['sell_price'].diff(lag)
    features = pd.DataFrame(base_columns)

    # One rolling mean per (shift, window) pair, each under its own name.
    # Previously two pairs of assignments shared a column name, so the 7-day
    # windows were silently overwritten by the 28-day ones.
    for lag in shifts:
        lagged_by_series = features[f'sold_shift_{lag}'].groupby(series_id)
        for window in windows:
            features[f'rmean_s{lag}_w{window}'] = lagged_by_series.transform(
                lambda x: x.rolling(window).mean()
            )

    return features

In [None]:
%%time
# Lag / rolling features for the training rows; the resulting column names are
# remembered so they can be used as the numerical feature list.
features = calculate_features(train_ds)
new_features_col = features.columns.tolist()

In [None]:
# Append the feature columns side by side (aligned on the row index).
# NOTE(review): re-running this cell appends the same columns a second time.
train_ds = pd.concat([train_ds, features], axis=1)

# Обучение модели

In [None]:
# Columns handed to LightGBM as categorical, grouped by where they come from.
_HIERARCHY_COLS = ['item_id', 'store_id', 'cat_id', 'state_id', 'dept_id']
_EVENT_COLS = ['event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']
_SNAP_COLS = ['snap_CA', 'snap_TX', 'snap_WI']
_CALENDAR_COLS = ['week', 'wday', 'month', 'mday']
CATEGORICAL_FEATURES = [
    *_HIERARCHY_COLS,
    *_EVENT_COLS,
    *_SNAP_COLS,
    *_CALENDAR_COLS,
    'holliday_tommorow',
]

# Numerical inputs are exactly the columns returned by calculate_features.
NUMERICAL_FEATURES = new_features_col

# Full model input (categorical first, then numerical) and the training target.
XS = CATEGORICAL_FEATURES + NUMERICAL_FEATURES
TARGET = 'target_sold'

In [None]:
# LightGBM training parameters (passed unchanged to lgb.train).
params = dict(
    objective='poisson',
    num_iterations=2000,
    learning_rate=0.075,
    verbose=20,
    bagging_fraction=0.5,
    feature_fraction=0.7,
    metric=['rmse'],
    min_data_in_leaf=50,
    max_depth=7,
)

In [None]:
from sklearn.model_selection import train_test_split

# Keep only rows where every model input and the target are present
# (the first rows of each series have no lag history yet).
required_columns = XS + [TARGET]
train_ds = train_ds.dropna(subset=required_columns)

# Random 90/10 hold-out with a fixed seed; it serves as the early-stopping set.
train, val = train_test_split(train_ds, random_state=42, test_size=0.1)
(train.shape, val.shape)

In [None]:
# Train one LightGBM model per distinct value of `field` and keep each model
# together with its evaluation history in `models`.
# NOTE(review): 'constant_id' is presumably a single-valued column produced by
# build_base_dataset, i.e. this trains one global model — confirm.
# `field`, and the `model` / `evals_result` of the last iteration, stay in scope
# and are used by later cells.
models = {}
field = 'constant_id'
for idx_ in ds[field].unique().tolist():
    logging.info(f'Train with {field} = {idx_}')
    
    # Restrict both splits to the current group and to the model columns.
    cat_train = train[train[field] == idx_]
    cat_train = cat_train[XS + [TARGET]].dropna()
    cat_val = val[val[field] == idx_]
    cat_val = cat_val[XS + [TARGET]].dropna()
    logging.debug(f'Shape of train set: {cat_train.shape}')
    logging.debug(f'Shape of valid set: {cat_val.shape}')

    cat_train_set = lgb.Dataset(cat_train[XS], cat_train[TARGET])
    cat_valid_set = lgb.Dataset(cat_val[XS], cat_val[TARGET])

    logging.info('Starting model train')
    # Filled in by lgb.train with the metric history of the evaluated sets.
    evals_result = {}
    # NOTE(review): early_stopping_rounds / evals_result / verbose_eval are keyword
    # arguments of older LightGBM releases; newer ones expect callbacks instead.
    model = lgb.train(
        params,
        train_set=cat_train_set,
        valid_sets=[cat_train_set, cat_valid_set],
        early_stopping_rounds=10,
        categorical_feature=CATEGORICAL_FEATURES,
        evals_result=evals_result,
        verbose_eval=20 
    )
    models[idx_] = {
        'model': model,
        'evals_result': evals_result
    }
    logging.info('The model is trained')

In [None]:
import seaborn as sns

In [None]:
# Gain importance of the most recently trained model, as (gain, feature name) pairs.
gain_name_pairs = sorted(
    zip(model.feature_importance(importance_type='gain'), model.feature_name())
)
feature_importances = pd.DataFrame(
    gain_name_pairs, columns=['values', 'names']
).sort_values(by='values', ascending=False)

# Horizontal bar chart of the 50 strongest features.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x='values', y='names', data=feature_importances.head(50), ax=ax)
ax.set_title('Общий график значимости ТОП-50 признаков (LightGBM gain)')
ax.set_ylabel('Название признака')
ax.set_xlabel('Важность (Gain)')
plt.show()

In [None]:
# RMSE learning curves recorded for the most recently trained model.
lgb.plot_metric(evals_result, metric='rmse')

In [None]:
# model.save_model('recurent.lgb')

In [None]:
# History kept before each forecast day: it has to cover the deepest feature,
# a 28-row rolling mean on top of a 7-row shift.
max_lag = 36
# Forecast horizon: days 1914..1941 inclusive.
days = list(range(1914, 1941 + 1))

# Working frame for the recursive forecast: recent history plus the horizon,
# with sales inside the horizon blanked so predictions can be written there.
val_pred = ds.loc[ds['d'] >= (1914 - max_lag)].reset_index(drop=True)
val_pred.loc[val_pred['d'] >= 1914, 'sold'] = np.nan

In [None]:
# Recursive day-by-day forecast: every predicted day is written back into
# val_pred['sold'], so the following days build their lag features on it.
for day in tqdm(days):
    # Only the last `max_lag` days are needed to compute the features of `day`.
    local_ds = val_pred[val_pred['d'].between(day - max_lag, day)]
    features = calculate_features(local_ds)
    ds_for_predict = pd.concat([local_ds, features], axis=1)
    for_predict = ds_for_predict.loc[ds_for_predict['d'] == day, :]
    for idx_, model_dict in models.items():
        model = model_dict['model']
        for_predict_cat = for_predict.loc[for_predict[field] == idx_]
        # Columns are selected in the order the model was trained on.
        predict = model.predict(for_predict_cat[model.feature_name()])
        # The prediction array is assigned positionally to the matching rows of val_pred.
        val_pred.loc[(val_pred['d'] == day) & (val_pred[field] == idx_), 'sold'] = predict
        

In [None]:
# Reshape the forecast window from long (id, d, sold) to wide: one row per id,
# one 'd_<day>' column per forecast day, with the id columns in front.
horizon_rows = val_pred[val_pred['d'] >= 1914]
wide_forecast = (
    horizon_rows
    .assign(d=horizon_rows['d'].apply(lambda day: f'd_{day}'))
    .pivot(index='id', columns='d', values='sold')
)
predict = horizon_rows[ID_COLUMNS].drop_duplicates().merge(wide_forecast, on='id')

In [None]:
# Score the wide forecast table with the WRMSSE scorer built earlier.
metric.score(predict)

# Отправка сабмита

In [None]:
# Build the submission from the wide forecast table `predict` (one row per id,
# one d_<day> column per forecast day). The long-format `val_pred` used before
# has a single `d` column and no per-day columns to select or rename.
# NOTE(review): assumes only_days_columns returns the 'd_<day>' column names — confirm.
submit_days = only_days_columns(predict)
submit_evaluation = predict[['id'] + submit_days]
# Rename the day columns to F1..Fn in their existing order.
submit_rename_dict = {
    col: f'F{i}'
    for i, col in enumerate(only_days_columns(submit_evaluation), start=1)
}
submit_evaluation = submit_evaluation.rename(columns=submit_rename_dict)

# The same forecasts are submitted a second time under the matching "validation" ids.
submit_validation = submit_evaluation.copy()
submit_validation['id'] = submit_validation.id.str.replace('evaluation', 'validation')

submit = pd.concat([submit_evaluation, submit_validation])

In [None]:
# Write the submission file into the current working directory, without the row index.
submit.to_csv('submit.csv', index=False)

In [None]:
# Upload submit.csv to the m5-forecasting-accuracy competition through the Kaggle CLI.
!kaggle competitions submit -c m5-forecasting-accuracy -f submit.csv -m "Daily predict"