# Задание 1.1: Модель базового спроса

In [1]:
from typing import List, Dict, Any, Tuple, Union, Optional

from datetime import datetime, timedelta
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings

# Suppress every warning for the rest of the session (no category or module
# filter is given), so library deprecation notices will not be shown.
warnings.filterwarnings("ignore")

import logging

# Set up root logging with the standard-library defaults.
logging.basicConfig()
# Dedicated logger named "model"; it reports INFO-level messages and above.
logger = logging.getLogger("model")
logger.setLevel(logging.INFO)

## Данные

In [2]:
# Load the sales history and force the date column `ds` to plain strings
# (the preview below shows values such as 20230601).
sales_df = pd.read_parquet("./hm/sales.parquet").astype({"ds": str})
sales_df.head()

Unnamed: 0,sku_id,gmv,orders_num,price,ds
283,284,121161.0,116,1044.49,20230601
302,303,10413.0,186,55.98,20230601
235,236,550034.0,2569,214.1,20230601
206,207,18895.0,250,75.58,20230601
245,246,264666.0,3232,81.89,20230601


## Модель базового спроса

In [3]:
def calc_rolling_window(
    df: pd.DataFrame,
    window: int,
    col: str,
    lvl: str,
    shift: int,
) -> pd.DataFrame:
    """Append a lagged rolling mean of ``col`` computed separately per ``lvl``.

    For every ``lvl`` group the values of ``col`` are laid out on a daily
    calendar (missing days become NaN), averaged over the ``window`` days
    *preceding* each day (``closed="left"`` excludes the day itself), and the
    resulting series is then moved ``shift`` days forward.

    Args:
        df: Input frame. Must contain ``ds`` (dates parseable by
            ``pd.to_datetime`` after a cast to ``str``), ``lvl`` and ``col``.
            The daily re-indexing needs unique dates inside each group, so
            duplicate ``(lvl, ds)`` pairs are not supported.
        window: Number of daily observations in the rolling window.
        col: Name of the column to average.
        lvl: Name of the grouping column.
        shift: Number of days by which the rolling mean is lagged.

    Returns:
        A new frame sorted by ``["ds", lvl]`` with a fresh ``RangeIndex``,
        holding every input column plus ``rolling_{col}_w_{window}``. The
        caller's frame is not modified.
    """
    feature_name = f"rolling_{col}_w_{window}"

    # sort_values returns a new object, so the caller's frame stays untouched.
    df = df.sort_values(["ds", lvl])

    if df.empty:
        # There are no groups to aggregate; return the same schema the
        # regular path produces instead of failing inside the merge.
        df[feature_name] = float("nan")
        return df.reset_index(drop=True)

    # Use a scratch timestamp column whose name cannot collide with an
    # existing column, so user columns called "ts" or "col" are never
    # overwritten or dropped.
    ts_key = "ts"
    while ts_key in df.columns:
        ts_key = f"_{ts_key}"
    df[ts_key] = pd.to_datetime(df["ds"].astype(str))

    rolling_df = (
        df.set_index(ts_key)
        .groupby([lvl], group_keys=True)[col]
        # The shift is applied because the forecast is made N days ahead:
        # to keep the dataset homogeneous the feature is lagged by the number
        # of dates in the forecast horizon.
        .apply(
            lambda x: x.asfreq("1D")
            .rolling(window=window, closed="left", min_periods=0)
            .mean()
            .shift(shift)
        )
        .reset_index()
        .rename(columns={col: feature_name})
    )

    # Left join keeps every input row (and the sorted order) and attaches the
    # feature by (group, date).
    df = df.merge(rolling_df, how="left", on=[lvl, ts_key])
    return df.drop(columns=[ts_key])

In [4]:
def generate_date_list(start_date_str: str, end_date_str: str) -> List[str]:
    """Return every calendar day from start to end (both inclusive).

    Both arguments and all returned items use the ``YYYYMMDD`` format. An
    empty list is returned when the end date precedes the start date.
    """
    day_format = "%Y%m%d"
    first_ordinal = datetime.strptime(start_date_str, day_format).toordinal()
    last_ordinal = datetime.strptime(end_date_str, day_format).toordinal()

    # Walk the consecutive day numbers and format each one back to a string.
    days: List[str] = []
    ordinal = first_ordinal
    while ordinal <= last_ordinal:
        days.append(datetime.fromordinal(ordinal).strftime(day_format))
        ordinal += 1
    return days

In [5]:
# Forecast window (inclusive), as YYYYMMDD strings.
START_DS, END_DS = "20240101", "20240107"

# Take the unique SKUs. Selecting the single column first avoids copying the
# whole sales frame.
df = sales_df[["sku_id"]].drop_duplicates()

# Expand to one row per (SKU, forecast date).
ds_list = generate_date_list(start_date_str=START_DS, end_date_str=END_DS)
ds_df = pd.DataFrame({"ds": ds_list})
df = df.merge(ds_df, how="cross")

# Targets are unknown for the forecast dates. NaN (rather than None) keeps
# the columns numeric after the concat below.
df["gmv"] = np.nan
df["orders_num"] = np.nan
cols = df.columns.tolist()

# Stack the history on top of the forecast rows. Only dates strictly before
# the window are taken: an overlapping date would duplicate a (sku, date)
# pair and break the daily re-indexing inside calc_rolling_window.
history_df = sales_df.loc[sales_df["ds"] < START_DS, cols].sort_values(
    by=["sku_id", "ds"]
)
df = pd.concat([history_df, df[cols]])

# Compute the baseline demand: a 14-day rolling mean lagged by the number of
# forecast dates, so every forecast day only relies on data from before the
# window starts.
for target_col in ("gmv", "orders_num"):
    df = calc_rolling_window(
        df=df,
        window=14,
        col=target_col,
        lvl="sku_id",
        shift=len(ds_list),
    )

# Keep only the forecast rows and expose the features under the target names.
df = df[df["ds"].between(START_DS, END_DS)]
df = df[["sku_id", "ds", "rolling_gmv_w_14", "rolling_orders_num_w_14"]]
df = df.rename(
    columns={
        "rolling_gmv_w_14": "gmv",
        "rolling_orders_num_w_14": "orders_num",
    }
)
df.head()

Unnamed: 0,sku_id,ds,gmv,orders_num
41209,1,20240101,31787.54,2289.615385
41210,3,20240101,14897.77,1097.846154
41211,4,20240101,225048.6,3871.461538
41212,7,20240101,1166771.0,5239.0
41213,8,20240101,613595.3,5803.142857


In [6]:
# Write the forecast frame to CSV without the row index.
df.to_csv("./to_karp_5_1.csv", index=False)