In [1]:
import os
os.chdir('..')

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline

from baseline import build_base_dataset
from utils.read import read_prices_dataset
from m5.metric import WRMSSE
from m5.constants import *
from utils.funcs import day_to_week_dict, select_tail_days, only_days_columns

# Подготовка набора данных

In [3]:
ds = build_base_dataset()

In [59]:
ds = ds.drop(['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'], axis=1)

In [4]:
ds['sold_r7s28mean'] = ds['sold'].rolling(7).mean().shift(28)
ds['sold_r28s28mean'] = ds['sold'].rolling(28).mean().shift(28)
ds['sold_r183s28mean'] = ds['sold'].rolling(183).mean().shift(28)
ds['sold_r365s28mean'] = ds['sold'].rolling(365).mean().shift(28)
ds['sold_r365s183mean'] = ds['sold'].rolling(365).mean().shift(183)
ds['sold_r365s365mean'] = ds['sold'].rolling(365).mean().shift(365)

In [5]:
ds['price_r7s28mean'] = ds['sell_price'].rolling(7).mean().shift(28)
ds['price_r28s28mean'] = ds['sell_price'].rolling(28).mean().shift(28)
ds['price_r183s28mean'] = ds['sell_price'].rolling(183).mean().shift(28)
ds['price_r365s28mean'] = ds['sell_price'].rolling(365).mean().shift(28)
ds['price_r365s183mean'] = ds['sell_price'].rolling(365).mean().shift(183)
ds['price_r365s365mean'] = ds['sell_price'].rolling(365).mean().shift(365)

# Обучение модели

In [None]:
CATEGORICAL_FEATURES = [
    'dept_id', 'cat_id', 'store_id', 'state_id',
    'weekday', 'month', 'year'
]
NUMERICAL_FEATURES = [
    'sell_price', 
    'sold_r7s28mean', 'sold_r28s28mean', 'sold_r183s28mean', 'sold_r365s28mean', 'sold_r365s183mean', 'sold_r365s365mean',
    'price_r7s28mean', 'price_r28s28mean', 'price_r183s28mean', 'price_r365s28mean', 'price_r365s183mean', 'price_r365s365mean',
    'price_dept_r7s28mean', 'price_dept_r28s28mean', 'price_state_r7s28mean', 'price_state_r28s28mean', 'price_store_r7s28mean', 'price_store_r28s28mean'
]

XS = CATEGORICAL_FEATURES + NUMERICAL_FEATURES
TARGET = 'sold'

In [11]:
train_ds = ds[ds['d'] < 1914]
val_ds = ds[ds['d'] >= 1914]
# объект для подсчета метрик
metric = WRMSSE(train_ds, val_ds)

# объекты для обучения
train_set = lgb.Dataset(train_ds[XS], train_ds[TARGET], categorical_feature=CATEGORICAL_FEATURES)
valid_set = lgb.Dataset(val_ds[XS], val_ds[TARGET], categorical_feature=CATEGORICAL_FEATURES)

In [None]:
params = {
    'objective': 'rmse',
    'num_iterations': 5000,
    'learning_rate': 0.1,
    'verbose': 20,
'    bagging_fraction': 0.5,
    'feature_fraction': 0.7,
    'max_depth': 8,
    'num_leaves': 32
}

In [None]:
model = lgb.train(
    params,
    train_set=train_set,
    valid_sets=[train_set, valid_set],
    early_stopping_rounds=10,
    categorical_feature=CATEGORICAL_FEATURES
)

In [None]:
plt.figure(figsize=(15, 5))
lgb.plot_importance(model, importance_type='gain')

# Подготовка "станартного" вида валидации

In [None]:
val_pred = val_ds.copy()
val_pred['sold_predict'] = model.predict(val_ds[XS])
val_pred['d'] = val_pred.d.apply(lambda x: f'd_{x}')
val_pred = val_pred.pivot(index='id', columns='d', values='sold_predict')
val_pred = val_ds[ID_COLUMNS].drop_duplicates().merge(val_pred, on='id')

In [61]:
print('WRMSSE: ', metric.score(val_pred))

WRMSSE:  0.6200938982182481


# Отправка сабмита

In [107]:
submit_days = only_days_columns(val_pred)
submit_evaluation = val_pred[['id'] + submit_days]
submit_rename_dict = {
    col: f'F{i}' 
    for i, col in enumerate(only_days_columns(submit_evaluation), start=1)
}
submit_evaluation = submit_evaluation.rename(columns=submit_rename_dict)

submit_validation = submit_evaluation.copy()
submit_validation['id'] = submit_validation.id.str.replace('evaluation', 'validation')

submit = pd.concat([submit_evaluation, submit_validation])

In [108]:
submit.to_csv('submit.csv', index=False)

In [20]:
!kaggle competitions submit -c m5-forecasting-accuracy -f submit.csv -m "Test pipeline"

100%|██████████████████████████████████████| 32.5M/32.5M [00:03<00:00, 9.07MB/s]
Successfully submitted to M5 Forecasting - Accuracy