In [2]:
# Data exploration

import pandas as pd

In [3]:
# Sales table; judging by the merged output below it holds item_id, store_id, date_id and the sold quantity `cnt`.
shop_sales = pd.read_csv('shop_sales.csv')

In [4]:
# Calendar table keyed by date_id (date, wm_yr_wk, weekday, events, CASHBACK_* flags — see the merged output below).
shop_sales_dates = pd.read_csv('shop_sales_dates.csv')

In [5]:
# Price table; it is later filtered by item_id and joined on wm_yr_wk to obtain `sell_price`.
shop_sales_prices = pd.read_csv('shop_sales_prices.csv')

In [6]:
# Build the combined dataset for one item: keep only the sales rows of item
# STORE_1_727 and attach the calendar attributes by an inner join on date_id.
STORE_1_727 = shop_sales.loc[shop_sales['item_id'] == 'STORE_1_727'].merge(
    shop_sales_dates,
    on='date_id',
)

In [11]:
# Preview the merged sales + calendar frame for item STORE_1_727.
STORE_1_727

Unnamed: 0,item_id,store_id,date_id,cnt,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3
0,STORE_1_727,STORE_1,1,6,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
1,STORE_1_727,STORE_1,2,8,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
2,STORE_1_727,STORE_1,3,6,2011-01-31,11101,Monday,3,1,2011,,,,,0,0,0
3,STORE_1_727,STORE_1,4,0,2011-02-01,11101,Tuesday,4,2,2011,,,,,0,1,1
4,STORE_1_727,STORE_1,5,2,2011-02-02,11101,Wednesday,5,2,2011,,,,,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,STORE_1_727,STORE_1,1815,2,2016-01-17,11551,Sunday,2,1,2016,,,,,0,0,0
1815,STORE_1_727,STORE_1,1816,3,2016-01-18,11551,Monday,3,1,2016,MartinLutherKingDay,National,,,0,0,0
1816,STORE_1_727,STORE_1,1817,1,2016-01-19,11551,Tuesday,4,1,2016,,,,,0,0,0
1817,STORE_1_727,STORE_1,1818,4,2016-01-20,11551,Wednesday,5,1,2016,,,,,0,0,0


In [8]:
def create_lag_features(df, feature_list, min_lag, max_lag):
    """
    Build lagged copies of the selected columns.

    Every requested column is shifted with ``DataFrame.shift`` by each lag in
    the inclusive range ``[min_lag, max_lag]``, so row ``i`` of
    ``<col>_lag_<k>`` holds the value of ``<col>`` from ``k`` rows earlier.
    The shift is positional, so callers should pass ``df`` already sorted in
    time order.

    Parameters:
    - df: DataFrame with the source data.
    - feature_list: a column name or a list of column names to create lags for.
    - min_lag: smallest lag (inclusive).
    - max_lag: largest lag (inclusive).

    Returns:
    - DataFrame with the same index as ``df`` that contains only the lag
      columns, named ``<col>_lag_<k>``. The leading rows of each lag column
      are NaN because no earlier value exists for them.

    Raises:
    - ValueError: if ``min_lag`` is greater than ``max_lag`` (empty lag range).
    """
    # A bare column name would select a Series, and Series.add_suffix renames
    # the *index labels* rather than the column — always select a DataFrame.
    if isinstance(feature_list, str):
        feature_list = [feature_list]
    else:
        feature_list = list(feature_list)

    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    if min_lag > max_lag:
        raise ValueError(
            f'min_lag ({min_lag}) must not be greater than max_lag ({max_lag})'
        )

    source = df[feature_list]
    lagged_df = pd.concat(
        [source.shift(lag).add_suffix(f'_lag_{lag}') for lag in range(min_lag, max_lag + 1)],
        axis=1,
    )

    return lagged_df

In [13]:
# Append lags 1..5 of `cnt` to the item frame. create_lag_features takes an explicit
# (min_lag, max_lag) range and returns only the lag columns, so they are concatenated
# back onto the source frame for display.
pd.concat([STORE_1_727, create_lag_features(STORE_1_727, ['cnt'], 1, 5)], axis=1)

Unnamed: 0,item_id,store_id,date_id,cnt,date,wm_yr_wk,weekday,wday,month,year,...,event_name_2,event_type_2,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3,cnt_lag_1,cnt_lag_2,cnt_lag_3,cnt_lag_4,cnt_lag_5
0,STORE_1_727,STORE_1,1,6,2011-01-29,11101,Saturday,1,1,2011,...,,,0,0,0,,,,,
1,STORE_1_727,STORE_1,2,8,2011-01-30,11101,Sunday,2,1,2011,...,,,0,0,0,6.0,,,,
2,STORE_1_727,STORE_1,3,6,2011-01-31,11101,Monday,3,1,2011,...,,,0,0,0,8.0,6.0,,,
3,STORE_1_727,STORE_1,4,0,2011-02-01,11101,Tuesday,4,2,2011,...,,,0,1,1,6.0,8.0,6.0,,
4,STORE_1_727,STORE_1,5,2,2011-02-02,11101,Wednesday,5,2,2011,...,,,1,1,0,0.0,6.0,8.0,6.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,STORE_1_727,STORE_1,1815,2,2016-01-17,11551,Sunday,2,1,2016,...,,,0,0,0,4.0,2.0,4.0,5.0,2.0
1815,STORE_1_727,STORE_1,1816,3,2016-01-18,11551,Monday,3,1,2016,...,,,0,0,0,2.0,4.0,2.0,4.0,5.0
1816,STORE_1_727,STORE_1,1817,1,2016-01-19,11551,Tuesday,4,1,2016,...,,,0,0,0,3.0,2.0,4.0,2.0,4.0
1817,STORE_1_727,STORE_1,1818,4,2016-01-20,11551,Wednesday,5,1,2016,...,,,0,0,0,1.0,3.0,2.0,4.0,2.0


In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_simple_time_series(df, date_col, value_col, title="Time Series", xlabel="Date", ylabel="Value", figsize=(10, 5)):
    """
    Draw a simple line plot of a time series and show it.

    Parameters:
    - df: DataFrame with the data.
    - date_col: name of the column used for the X axis (dates).
    - value_col: name of the column with the values to plot.
    - title: plot title.
    - xlabel: label of the X axis.
    - ylabel: label of the Y axis.
    - figsize: figure size passed to matplotlib.

    Returns:
    - None; the figure is displayed with ``plt.show()``.
    """
    # Use an explicit Axes instead of pyplot's implicit "current axes", so every
    # styling call below is guaranteed to target the plot created here.
    _, ax = plt.subplots(figsize=figsize)
    sns.lineplot(data=df, x=date_col, y=value_col, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # Slanted tick labels keep dense date labels readable.
    ax.tick_params(axis="x", labelrotation=45)
    ax.grid(True)
    plt.show()

# Example usage:
# plot_simple_time_series(df, 'date', 'sales')


In [24]:
# Display the merged item frame again (same object as shown earlier in the notebook).
STORE_1_727

Unnamed: 0,item_id,store_id,date_id,cnt,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3
0,STORE_1_727,STORE_1,1,6,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
1,STORE_1_727,STORE_1,2,8,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
2,STORE_1_727,STORE_1,3,6,2011-01-31,11101,Monday,3,1,2011,,,,,0,0,0
3,STORE_1_727,STORE_1,4,0,2011-02-01,11101,Tuesday,4,2,2011,,,,,0,1,1
4,STORE_1_727,STORE_1,5,2,2011-02-02,11101,Wednesday,5,2,2011,,,,,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,STORE_1_727,STORE_1,1815,2,2016-01-17,11551,Sunday,2,1,2016,,,,,0,0,0
1815,STORE_1_727,STORE_1,1816,3,2016-01-18,11551,Monday,3,1,2016,MartinLutherKingDay,National,,,0,0,0
1816,STORE_1_727,STORE_1,1817,1,2016-01-19,11551,Tuesday,4,1,2016,,,,,0,0,0
1817,STORE_1_727,STORE_1,1818,4,2016-01-20,11551,Wednesday,5,1,2016,,,,,0,0,0


In [47]:
# Let pandas render up to 300 rows before truncating a displayed frame.
pd.options.display.max_rows = 300

In [10]:
# Item analysed in the cells below.
# NOTE(review): this name shadows the built-in `id()` for the rest of the notebook;
# consider renaming it (e.g. to `item_id`) together with every cell that uses it.
id = 'STORE_3_114'

In [11]:
# Sales rows of the selected item with the calendar attributes attached (inner join on date_id).
shop_sales_with_dates = shop_sales.loc[shop_sales['item_id'] == id].merge(
    shop_sales_dates,
    on='date_id',
)

In [12]:
# Attach the price rows of the selected item by week id. A left join keeps every sales
# row even when no price row exists for its week; columns present in both frames
# (item_id, store_id) get the default _x/_y suffixes.
data_p = shop_sales_with_dates.merge(
    shop_sales_prices.loc[shop_sales_prices['item_id'] == id],
    how='left',
    on='wm_yr_wk',
)

In [14]:
# Fill weeks that have no price row: carry the last known price forward, then back-fill
# whatever is still missing at the start of the series with the first known price.
# NOTE(review): the back-fill assigns a later price to earlier dates — confirm this look-ahead is acceptable.
data_p['sell_price'] = data_p['sell_price'].ffill().bfill()

In [15]:
# Preview the item frame after the price join and price imputation.
data_p

Unnamed: 0,item_id_x,store_id_x,date_id,cnt,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3,store_id_y,item_id_y,sell_price
0,STORE_3_114,STORE_3,1,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0,,,2.77
1,STORE_3_114,STORE_3,2,0,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0,,,2.77
2,STORE_3_114,STORE_3,3,0,2011-01-31,11101,Monday,3,1,2011,,,,,0,0,0,,,2.77
3,STORE_3_114,STORE_3,4,0,2011-02-01,11101,Tuesday,4,2,2011,,,,,0,1,1,,,2.77
4,STORE_3_114,STORE_3,5,0,2011-02-02,11101,Wednesday,5,2,2011,,,,,1,1,0,,,2.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,STORE_3_114,STORE_3,1815,0,2016-01-17,11551,Sunday,2,1,2016,,,,,0,0,0,STORE_3,STORE_3_114,2.73
1815,STORE_3_114,STORE_3,1816,0,2016-01-18,11551,Monday,3,1,2016,MartinLutherKingDay,National,,,0,0,0,STORE_3,STORE_3_114,2.73
1816,STORE_3_114,STORE_3,1817,0,2016-01-19,11551,Tuesday,4,1,2016,,,,,0,0,0,STORE_3,STORE_3_114,2.73
1817,STORE_3_114,STORE_3,1818,0,2016-01-20,11551,Wednesday,5,1,2016,,,,,0,0,0,STORE_3,STORE_3_114,2.73


In [16]:
# NOTE(review): star import from the project-local `Tools` module. `DateTimeSeriesSplit` and
# `Kraken`, used below without any other import, presumably come from here — prefer importing
# the needed names explicitly so their origin is visible.
from Tools import *

In [17]:
# Fix the seed of NumPy's global random number generator for reproducibility.
# numpy is imported explicitly here: no other cell imports it, so `np` would
# otherwise exist only if the star import from Tools happens to leak it.
import numpy as np

seed_value = 23
np.random.seed(seed_value)

In [18]:
# Convert the `date` column to pandas datetimes; it is used below as the grouping key of the CV splitter.
data_p['date'] = pd.to_datetime(data_p['date'])

In [20]:
# Design frame (an independent copy of the item frame) and the target series.
# Note that X still contains the target column `cnt`.
X, y = data_p.copy(deep=True), data_p['cnt']

In [21]:
# Time-series cross-validator from the project-local Tools module; folds are built
# from the `date` column passed as `groups`.
# NOTE(review): judging by the printed folds, window=300 and test_size=30 yield 300 train
# rows and 30 test rows per fold — confirm the exact semantics in DateTimeSeriesSplit.
cv_datetime = DateTimeSeriesSplit(window=300, n_splits=3, test_size=30, margin=0)
group_dt = X['date']

# For every fold print its number, then the date span and shape of the train and test parts.
for fold, (train_idx, val_idx) in enumerate(cv_datetime.split(X, groups=group_dt), start=1):
    print(fold)
    train = X.iloc[train_idx]
    test = X.iloc[val_idx]
    print(f'треин мин {train.date.min()} треин макс {train.date.max()} shape {train.shape}')
    print(f'тест мин {test.date.min()} тест макс {test.date.max()} shape {test.shape}')

1
треин мин 2014-12-28 00:00:00 треин макс 2015-10-23 00:00:00 shape (300, 21)
тест мин 2015-10-24 00:00:00 тест макс 2015-11-22 00:00:00 shape (30, 21)
2
треин мин 2015-01-27 00:00:00 треин макс 2015-11-22 00:00:00 shape (300, 21)
тест мин 2015-11-23 00:00:00 тест макс 2015-12-22 00:00:00 shape (30, 21)
3
треин мин 2015-02-26 00:00:00 треин макс 2015-12-22 00:00:00 shape (300, 21)
тест мин 2015-12-23 00:00:00 тест макс 2016-01-21 00:00:00 shape (30, 21)


In [22]:
# Preview the design frame.
# NOTE(review): the output shows an extra `index_time` column that `data_p` did not have —
# presumably added in place by the CV splitter; confirm in DateTimeSeriesSplit.
X

Unnamed: 0,item_id_x,store_id_x,date_id,cnt,date,wm_yr_wk,weekday,wday,month,year,...,event_type_1,event_name_2,event_type_2,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3,store_id_y,item_id_y,sell_price,index_time
0,STORE_3_114,STORE_3,1,0,2011-01-29,11101,Saturday,1,1,2011,...,,,,0,0,0,,,2.77,0
1,STORE_3_114,STORE_3,2,0,2011-01-30,11101,Sunday,2,1,2011,...,,,,0,0,0,,,2.77,1
2,STORE_3_114,STORE_3,3,0,2011-01-31,11101,Monday,3,1,2011,...,,,,0,0,0,,,2.77,2
3,STORE_3_114,STORE_3,4,0,2011-02-01,11101,Tuesday,4,2,2011,...,,,,0,1,1,,,2.77,3
4,STORE_3_114,STORE_3,5,0,2011-02-02,11101,Wednesday,5,2,2011,...,,,,1,1,0,,,2.77,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,STORE_3_114,STORE_3,1815,0,2016-01-17,11551,Sunday,2,1,2016,...,,,,0,0,0,STORE_3,STORE_3_114,2.73,1814
1815,STORE_3_114,STORE_3,1816,0,2016-01-18,11551,Monday,3,1,2016,...,National,,,0,0,0,STORE_3,STORE_3_114,2.73,1815
1816,STORE_3_114,STORE_3,1817,0,2016-01-19,11551,Tuesday,4,1,2016,...,,,,0,0,0,STORE_3,STORE_3_114,2.73,1816
1817,STORE_3_114,STORE_3,1818,0,2016-01-20,11551,Wednesday,5,1,2016,...,,,,0,0,0,STORE_3,STORE_3_114,2.73,1817


In [24]:
# Candidate predictors: price, per-store cashback flags and calendar fields.
features = [
    'sell_price',
    'CASHBACK_STORE_1',
    'CASHBACK_STORE_2',
    'CASHBACK_STORE_3',
    'wday',
    'month',
]

In [25]:
# Base estimator handed to the feature selector: shallow gradient-boosted trees
# trained with the MAPE objective and with LightGBM logging silenced.
from lightgbm import LGBMRegressor

model = LGBMRegressor(
    objective='MAPE',
    max_depth=3,
    verbosity=-1,
)

# Candidate variables the selector chooses from (the same list object as `features`, not a copy).
list_of_vars = features

Minimal version of pyarrow will soon be increased to 14.0.1. You are using 11.0.0. Please consider upgrading.

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [26]:
# Metric passed to the selector for variable selection.
# NOTE(review): the displayed rows of `cnt` are zeros; MAPE is poorly behaved for zero targets — confirm the metric choice.
from sklearn.metrics import mean_absolute_percentage_error as MAPE

# Build the selector from the estimator, the cross-validator, the metric and the label 'exp1'.
# NOTE(review): the meaning of the last positional argument is not visible here — presumably an experiment name; confirm in Tools.
selector1 = Kraken(model, cv_datetime, MAPE, 'exp1')

In [27]:
# Rank the candidate variables; the ranking is read back from the selector as `rank_dict` below.
selector1.get_rank_dict(X, y, list_of_vars, group_dt)

# Display the ranking (variable name -> rank).
# NOTE(review): the original note says the order follows descending absolute SHAP value; this
# cannot be verified from the notebook — confirm in the selector's implementation.
selector1.rank_dict

{'sell_price': 1,
 'CASHBACK_STORE_1': 2,
 'CASHBACK_STORE_2': 3,
 'CASHBACK_STORE_3': 4,
 'wday': 5,
 'month': 6}

In [28]:
# Run the variable selection on X/y with the same date groups that were used for ranking.
# NOTE(review): the semantics of early_stopping_rounds=10 are defined by the selector and not visible here — confirm.
selector1.get_vars(X, y, early_stopping_rounds = 10, group_dt = group_dt)

запуск первого шага
new var_for_add ! sell_price
едем дальше
в итоге получили список ['sell_price']
запуск первого шага
мы сошлись
['sell_price']
0.028


['sell_price']