In [1]:
# Data exploration
import pandas as pd
import numpy as np

In [2]:
def simple_moving_average(df, column, window):
    """Append a simple (equally weighted) rolling mean of ``column`` to ``df``.

    The result is stored in a new column named ``f'{column}_SMA_{window}'``.
    ``df`` is modified in place and the very same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``; it receives the new column.
    column : str
        Name of the column to average.
    window : int
        Rolling window size, forwarded to ``Series.rolling(window=...)``.

    Returns
    -------
    pandas.DataFrame
        The input ``df`` (same object) with the SMA column added.

    Example
    -------
    df = simple_moving_average(df, 'sales', window=3)
    """
    sma_name = f'{column}_SMA_{window}'
    rolling_mean = df[column].rolling(window=window).mean()
    df[sma_name] = rolling_mean
    return df


def weighted_moving_average(df, column, window):
    """Append a linearly weighted rolling mean of ``column`` to ``df``.

    Inside each window the observations are weighted 1, 2, ..., ``window``
    in row order, so the last row of the window gets the largest weight.
    The result is stored in ``f'{column}_WMA_{window}'``; ``df`` is modified
    in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``; it receives the new column.
    column : str
        Name of the column to average.
    window : int
        Rolling window size; also the number of weights.

    Returns
    -------
    pandas.DataFrame
        The input ``df`` (same object) with the WMA column added.

    Example
    -------
    df = weighted_moving_average(df, 'sales', window=3)
    """
    weights = np.arange(1, window + 1)  # weights from 1 up to the window size
    weight_total = weights.sum()

    def _weighted_mean(values):
        # raw=True below means `values` arrives as a plain ndarray
        return np.dot(values, weights) / weight_total

    df[f'{column}_WMA_{window}'] = df[column].rolling(window).apply(_weighted_mean, raw=True)
    return df

def exponential_moving_average(df, column, span):
    """Append an exponentially weighted moving average of ``column`` to ``df``.

    Computed with ``Series.ewm(span=span, adjust=False).mean()`` and stored in
    ``f'{column}_EMA_{span}'``. ``df`` is modified in place and the same
    object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``; it receives the new column.
    column : str
        Name of the column to smooth.
    span : int or float
        Span forwarded to ``Series.ewm``.

    Returns
    -------
    pandas.DataFrame
        The input ``df`` (same object) with the EMA column added.

    Example
    -------
    df = exponential_moving_average(df, 'sales', span=3)
    """
    ema_name = f'{column}_EMA_{span}'
    ema_values = df[column].ewm(span=span, adjust=False).mean()
    df[ema_name] = ema_values
    return df

In [3]:
# Load the three source tables from CSV files in the working directory:
# sales records, the date/calendar table and the price table.
shop_sales, shop_sales_dates, shop_sales_prices = map(
    pd.read_csv,
    ('shop_sales.csv', 'shop_sales_dates.csv', 'shop_sales_prices.csv'),
)

In [4]:
def create_lag_features(df, feature_list, min_lag, max_lag):
    """Build lagged copies of the selected columns.

    For every lag in ``min_lag..max_lag`` (both inclusive) the selected
    columns are shifted down by ``lag`` rows and renamed with the suffix
    ``_lag_{lag}``. The shifted pieces are concatenated side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    feature_list : list
        Columns for which lags are created.
    min_lag : int
        Smallest lag.
    max_lag : int
        Largest lag.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the lagged columns, grouped by lag
        (all columns for ``min_lag`` first, then the next lag, and so on).
        ``df`` itself is not modified.
    """
    selected = df[feature_list]

    shifted_frames = []
    for lag in range(min_lag, max_lag + 1):
        shifted = selected.shift(lag)
        shifted_frames.append(shifted.add_suffix(f'_lag_{lag}'))

    return pd.concat(shifted_frames, axis=1)

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_simple_time_series(df, date_col, value_col, title="Time Series", xlabel="Date", ylabel="Value", figsize=(10, 5)):
    """Draw a single line plot of ``value_col`` against ``date_col`` and show it.

    A new figure of size ``figsize`` is created, the line is drawn with
    seaborn, the title and axis labels are set, the x tick labels are rotated
    by 45 degrees, a grid is enabled and the figure is displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    date_col : str
        Column used for the x axis.
    value_col : str
        Column used for the y axis.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    figsize : tuple
        Figure size passed to matplotlib.

    Returns
    -------
    None

    Example
    -------
    plot_simple_time_series(df, 'date', 'sales')
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.lineplot(data=df, x=date_col, y=value_col, ax=ax)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Tilt the x tick labels so they do not overlap.
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)

    ax.grid(True)
    plt.show()

In [6]:
# Identifier of the single item analysed below; used to filter both the sales
# and the price tables by their 'item_id' column.
# NOTE(review): the name `id` shadows Python's built-in id(). Renaming it
# (e.g. to ITEM_ID) requires updating every cell that references it in the
# same change.
id = 'STORE_3_114'

In [7]:
# Keep only the selected item's sales rows and attach the calendar columns
# (inner join on 'date_id').
shop_sales_with_dates = (
    shop_sales
    .loc[shop_sales['item_id'] == id]
    .merge(shop_sales_dates, on='date_id')
)

In [8]:
# Attach the selected item's weekly prices. The left join keeps every sales
# row, so weeks without a price record get NaN in the price columns.
# The join key is 'wm_yr_wk' only, which is why the columns shared by both
# tables show up twice with _x / _y suffixes (item_id_x, item_id_y, ...).
data_p = shop_sales_with_dates.merge(
    shop_sales_prices.loc[shop_sales_prices['item_id'] == id],
    on=['wm_yr_wk'],
    how='left',
)

In [9]:
# Fill missing prices, if there are any.
data_p['sell_price'] = (
    data_p['sell_price']
    .ffill()  # carry the last known price forward
    .bfill()  # a gap at the very start is filled from the first known (later) price
)

In [10]:
# Add window-3 moving averages of `cnt` (simple, weighted, exponential).
# They are lagged (lags 1 to 3) in a later cell together with the other
# features, because the forecast is made one step ahead.
#
# The helpers are chained so that each one receives the frame returned by the
# previous one, instead of relying on all three silently mutating `data_p`.
# With the current in-place helpers `df` is the same object as `data_p`.
df = (
    data_p
    .pipe(simple_moving_average, 'cnt', 3)
    .pipe(weighted_moving_average, 'cnt', 3)
    .pipe(exponential_moving_average, 'cnt', 3)
)

In [11]:
# Inspect the working frame; the three moving-average columns
# (cnt_SMA_3, cnt_WMA_3, cnt_EMA_3) are the last ones.
df

Unnamed: 0,item_id_x,store_id_x,date_id,cnt,date,wm_yr_wk,weekday,wday,month,year,...,event_type_2,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3,store_id_y,item_id_y,sell_price,cnt_SMA_3,cnt_WMA_3,cnt_EMA_3
0,STORE_3_114,STORE_3,1,0,2011-01-29,11101,Saturday,1,1,2011,...,,0,0,0,,,2.77,,,0.000000e+00
1,STORE_3_114,STORE_3,2,0,2011-01-30,11101,Sunday,2,1,2011,...,,0,0,0,,,2.77,,,0.000000e+00
2,STORE_3_114,STORE_3,3,0,2011-01-31,11101,Monday,3,1,2011,...,,0,0,0,,,2.77,0.000000,0.0,0.000000e+00
3,STORE_3_114,STORE_3,4,0,2011-02-01,11101,Tuesday,4,2,2011,...,,0,1,1,,,2.77,0.000000,0.0,0.000000e+00
4,STORE_3_114,STORE_3,5,0,2011-02-02,11101,Wednesday,5,2,2011,...,,1,1,0,,,2.77,0.000000,0.0,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,STORE_3_114,STORE_3,1815,0,2016-01-17,11551,Sunday,2,1,2016,...,,0,0,0,STORE_3,STORE_3_114,2.73,0.000000,0.0,9.094947e-12
1815,STORE_3_114,STORE_3,1816,0,2016-01-18,11551,Monday,3,1,2016,...,,0,0,0,STORE_3,STORE_3_114,2.73,0.000000,0.0,4.547474e-12
1816,STORE_3_114,STORE_3,1817,0,2016-01-19,11551,Tuesday,4,1,2016,...,,0,0,0,STORE_3,STORE_3_114,2.73,0.000000,0.0,2.273737e-12
1817,STORE_3_114,STORE_3,1818,0,2016-01-20,11551,Wednesday,5,1,2016,...,,0,0,0,STORE_3,STORE_3_114,2.73,0.000000,0.0,1.136868e-12


In [12]:
# Columns that are turned into lag features.
list_for_lags = [
    # calendar
    'wday',
    'month',
    # cashback flags, one per store
    'CASHBACK_STORE_1',
    'CASHBACK_STORE_2',
    'CASHBACK_STORE_3',
    # price
    'sell_price',
    # moving averages of cnt (window / span 3)
    'cnt_SMA_3',
    'cnt_WMA_3',
    'cnt_EMA_3',
]

In [13]:
# Preview only the columns that will be lagged.
df[list_for_lags]

Unnamed: 0,wday,month,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3,sell_price,cnt_SMA_3,cnt_WMA_3,cnt_EMA_3
0,1,1,0,0,0,2.77,,,0.000000e+00
1,2,1,0,0,0,2.77,,,0.000000e+00
2,3,1,0,0,0,2.77,0.000000,0.0,0.000000e+00
3,4,2,0,1,1,2.77,0.000000,0.0,0.000000e+00
4,5,2,1,1,0,2.77,0.000000,0.0,0.000000e+00
...,...,...,...,...,...,...,...,...,...
1814,2,1,0,0,0,2.73,0.000000,0.0,9.094947e-12
1815,3,1,0,0,0,2.73,0.000000,0.0,4.547474e-12
1816,4,1,0,0,0,2.73,0.000000,0.0,2.273737e-12
1817,5,1,0,0,0,2.73,0.000000,0.0,1.136868e-12


In [14]:
# Feature matrix: every column of list_for_lags shifted by 1, 2 and 3 rows.
# The leading rows contain NaN because there is nothing to shift in yet.
X = create_lag_features(df, list_for_lags, min_lag=1, max_lag=3)

Unnamed: 0,wday_lag_1,month_lag_1,CASHBACK_STORE_1_lag_1,CASHBACK_STORE_2_lag_1,CASHBACK_STORE_3_lag_1,sell_price_lag_1,cnt_SMA_3_lag_1,cnt_WMA_3_lag_1,cnt_EMA_3_lag_1,wday_lag_2,...,cnt_EMA_3_lag_2,wday_lag_3,month_lag_3,CASHBACK_STORE_1_lag_3,CASHBACK_STORE_2_lag_3,CASHBACK_STORE_3_lag_3,sell_price_lag_3,cnt_SMA_3_lag_3,cnt_WMA_3_lag_3,cnt_EMA_3_lag_3
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,1.0,0.0,0.0,0.0,2.77,,,0.000000e+00,,...,,,,,,,,,,
2,2.0,1.0,0.0,0.0,0.0,2.77,,,0.000000e+00,1.0,...,0.000000e+00,,,,,,,,,
3,3.0,1.0,0.0,0.0,0.0,2.77,0.0,0.0,0.000000e+00,2.0,...,0.000000e+00,1.0,1.0,0.0,0.0,0.0,2.77,,,0.000000e+00
4,4.0,2.0,0.0,1.0,1.0,2.77,0.0,0.0,0.000000e+00,3.0,...,0.000000e+00,2.0,1.0,0.0,0.0,0.0,2.77,,,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,1.0,1.0,0.0,0.0,0.0,2.73,0.0,0.0,1.818989e-11,7.0,...,3.637979e-11,6.0,1.0,1.0,0.0,0.0,2.73,0.0,0.0,7.275958e-11
1815,2.0,1.0,0.0,0.0,0.0,2.73,0.0,0.0,9.094947e-12,1.0,...,1.818989e-11,7.0,1.0,1.0,0.0,1.0,2.73,0.0,0.0,3.637979e-11
1816,3.0,1.0,0.0,0.0,0.0,2.73,0.0,0.0,4.547474e-12,2.0,...,9.094947e-12,1.0,1.0,0.0,0.0,0.0,2.73,0.0,0.0,1.818989e-11
1817,4.0,1.0,0.0,0.0,0.0,2.73,0.0,0.0,2.273737e-12,3.0,...,4.547474e-12,2.0,1.0,0.0,0.0,0.0,2.73,0.0,0.0,9.094947e-12
