# Задание 2: Применение алгоритма

Предскажите для каждого алгоритма количество заказов на каждый день с помощью BackTest'а

In [1]:
from typing import List, Dict, Any, Tuple, Union, Optional

import pandas as pd

import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

import logging
# Apply the default logging configuration, then let the "back_test" logger
# emit records of level INFO and above.
logging.basicConfig()
logger = logging.getLogger("back_test")
logger.setLevel(logging.INFO)

In [2]:
from enum import Enum
from typing import List


class BackTestAlgo(str, Enum):
    """Identifiers of the BackTest algorithms."""

    epsilon_greedy_sum = "epsilon_greedy_sum"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members in definition order."""
        return [member.value for member in cls]


class BackTestLevel(str, Enum):
    """Names of the columns a BackTest can be grouped by."""

    group_1 = "group_1"
    item_id = "item_id"
    sku_id = "sku_id"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members in definition order."""
        return [member.value for member in cls]


class BackTestMetric(str, Enum):
    """Names of the metric columns a BackTest can estimate."""

    revenue = "revenue"
    margin = "margin"
    orders_num = "orders_num"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members in definition order."""
        return [member.value for member in cls]


In [3]:
class EpsilonGreedySum:
    """
    BackTest that estimates metrics from historical rows with a close markup.

    For every algorithm, the historical rows whose ``markup`` lies within
    ``epsilon`` of the markup proposed by that algorithm are averaged per
    ``(ds, lvl, proposed markup)``; the estimates are then summed per day.
    """

    def __init__(
        self,
        epsilon: float = 0.005,
        do_show_intersection: bool = True,
    ):
        """
        epsilon - markup difference that is considered insignificant
        do_show_intersection - whether to report the intersection of historical
        and predicted markups:
        - a high percentage means the BackTest can be trusted
        - a low percentage means there is not enough data to run the BackTest
        """
        self.epsilon = epsilon
        self.do_show_intersection = do_show_intersection

    @staticmethod
    def _label(name: Any) -> Any:
        """
        Return the plain value of an Enum member; pass anything else through.

        Since Python 3.11 an f-string renders a ``(str, Enum)`` member as
        ``ClassName.member`` rather than its value, so column names are always
        built from the plain value.
        """
        return name.value if isinstance(name, Enum) else name

    @staticmethod
    def _log_intersection(
        df: pd.DataFrame,
        stats_df: pd.DataFrame,
        lvl: str,
        algo_name: str,
    ) -> None:
        """
        Log for how many (ds, lvl) pairs a historical markup within epsilon was found.
        """
        keys = ["ds", lvl]
        total = len(df[keys].drop_duplicates())
        matched = len(stats_df[keys].drop_duplicates())
        share = 100.0 * matched / total if total else 0.0
        logging.getLogger("back_test").info(
            "%s: historical markups within epsilon found for %d of %d (ds, %s) pairs (%.1f%%)",
            algo_name,
            matched,
            total,
            lvl,
            share,
        )

    def calculate_group_metrics(
        self,
        df: pd.DataFrame,
        lvl: str,
        prefix: str,
        metrics: "List[BackTestMetric]",
    ) -> pd.DataFrame:
        """
        Compute the metric predictions for one algorithm, taking epsilon into account.

        Keeps the rows whose ``markup`` is within ``epsilon`` (inclusive) of
        ``{prefix}_markup``, averages every metric per
        ``(ds, lvl, {prefix}_markup)`` and returns the result rounded to two
        decimals with the metric columns renamed to ``{prefix}_{metric}``.
        """
        lvl = self._label(lvl)
        labels = [self._label(metric) for metric in metrics]
        markup_col = f"{prefix}_markup"
        df_filtered = df[
            df["markup"].between(
                df[markup_col] - self.epsilon,
                df[markup_col] + self.epsilon,
            )
        ]
        stats_df = (
            df_filtered.groupby(["ds", lvl, markup_col])
            .agg({label: "mean" for label in labels})
            .reset_index()
        )
        stats_df = stats_df.rename(
            columns={label: f"{prefix}_{label}" for label in labels}
        )
        return stats_df.round(2)

    def calculate_groups_metrics(
        self,
        df: pd.DataFrame,
        lvl: str,
        metrics: "List[BackTestMetric]",
        algo_names: List[str],
    ) -> pd.DataFrame:
        """
        1) Computes the metric predictions for every algorithm, taking epsilon into account
        2) Reports the intersection of historical and predicted markups
           (only when ``do_show_intersection`` is set)

        The per-algorithm tables are inner-joined on their shared columns.

        Raises ValueError when ``algo_names`` is empty.
        """
        if not algo_names:
            raise ValueError("`algo_names` must contain at least one algorithm name")
        lvl = self._label(lvl)
        stats_df = None
        for algo_name in algo_names:
            algo_stats_df = self.calculate_group_metrics(
                df=df,
                lvl=lvl,
                prefix=algo_name,
                metrics=metrics,
            )
            if self.do_show_intersection:
                self._log_intersection(
                    df=df, stats_df=algo_stats_df, lvl=lvl, algo_name=algo_name
                )
            if stats_df is None:
                stats_df = algo_stats_df
                continue
            # Keep the column order of the accumulated table so that the merge
            # keys are deterministic (a set intersection is not).
            common_columns = [
                column for column in stats_df.columns if column in algo_stats_df.columns
            ]
            stats_df = stats_df.merge(algo_stats_df, how="inner", on=common_columns)
        return stats_df

    @staticmethod
    def calculate_statistics(
        df: pd.DataFrame,
        metrics: "List[BackTestMetric]",
        algo_names: List[str],
    ) -> pd.DataFrame:
        """
        Sum the ``{algo}_{metric}`` columns of ``df`` per day for every algorithm.

        Returns a long table with the columns ``algo``, ``ds`` and one column
        per metric (labelled with the plain metric name); the rows of each
        algorithm follow each other in the order of ``algo_names``.
        """
        labels = [EpsilonGreedySum._label(metric) for metric in metrics]
        agg_functions = {
            f"{algo_name}_{label}": "sum"
            for algo_name in algo_names
            for label in labels
        }
        stats_df = df.groupby("ds").agg(agg_functions).reset_index()
        days = stats_df["ds"].tolist()

        result: Dict[str, Any] = {"algo": [], "ds": []}
        result.update({label: [] for label in labels})
        for algo_name in algo_names:
            result["algo"] += [algo_name] * len(days)
            result["ds"] += days
            for label in labels:
                result[label] += stats_df[f"{algo_name}_{label}"].tolist()
        return pd.DataFrame(data=result)

    def run(
        self,
        df: pd.DataFrame,
        lvl: str,
        metrics: "List[BackTestMetric]",
        algo_names: List[str],
    ) -> pd.DataFrame:
        """
        Entry point of the epsilon-based BackTest algorithm.

        Raises ValueError when ``algo_names`` is empty.
        """
        stats_df = self.calculate_groups_metrics(
            df=df, lvl=lvl, metrics=metrics, algo_names=algo_names
        )
        return self.calculate_statistics(
            df=stats_df, metrics=metrics, algo_names=algo_names
        )


# Maps an algorithm identifier to the class that implements it;
# run_algo looks the requested algorithm up here.
register = {
    "epsilon_greedy_sum": EpsilonGreedySum,
}


def run_algo(
    df: pd.DataFrame,
    lvl: BackTestLevel,
    algo: BackTestAlgo,
    metrics: List[BackTestMetric],
    algo_names: List[str],
    algo_params: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Entry point of the BackTest.

    Looks ``algo`` up in ``register``, instantiates the registered class with
    ``algo_params`` (no arguments when omitted or empty) and returns the result
    of its ``run`` method.

    Raises ValueError when ``algo`` is not registered.
    """
    if algo not in register:
        raise ValueError(
            f"You should provide `algo` from the list: {BackTestAlgo.to_list()}"
        )
    init_kwargs = algo_params if algo_params else {}
    backtest = register[algo](**init_kwargs)
    return backtest.run(df=df, lvl=lvl, metrics=metrics, algo_names=algo_names)

In [5]:
# Read the parquet file of each of the five algorithms, keyed by algorithm
# name, with the "ds" column converted to strings.
test_dfs = {}
for algo_id in range(1, 6):
    algo_name = f"algo_{algo_id}"
    algo_df = pd.read_parquet(f"./hm/algo/{algo_name}.parquet")
    algo_df = algo_df.astype({"ds": str})
    test_dfs[algo_name] = algo_df

In [6]:
# Preview the first rows of the frame loaded for algo_1.
test_dfs["algo_1"].head()

Unnamed: 0,group_1,markup,ds
0,group_1000,0.06,20231008
1,group_1001,0.04,20231008
2,group_1002,0.06,20231008
3,group_1003,0.05,20231008
4,group_1004,0.06,20231008


In [7]:
# BackTest settings passed to run_algo in a later cell.
ALGO = BackTestAlgo.epsilon_greedy_sum
LVL = BackTestLevel.group_1
# Keyword arguments forwarded to the constructor of the selected algorithm.
ALGO_PARAMS = {
    "epsilon": 0.01,
    "do_show_intersection": True,
}
# NOTE(review): DS_RANGES is not referenced by any visible cell — presumably
# used by a cell that is not shown here; confirm.
DS_RANGES = [("20231008", "20231021")]
METRICS = [BackTestMetric.orders_num]

In [11]:
# Load the table that is fed to the BackTest as `df` and preview its first rows.
result_1_df = pd.read_csv("./to_karp.csv")
result_1_df.head()

Unnamed: 0,ds,group_1,markup,orders_num,algo_1_markup,algo_2_markup,algo_3_markup,algo_4_markup,algo_5_markup
0,20231008,group_1000,0.01,37050.0,0.06,0.02,0.01,0.02,0.05
1,20231008,group_1000,0.03,34168.89,0.06,0.02,0.01,0.02,0.05
2,20231008,group_1000,0.04,38100.0,0.06,0.02,0.01,0.02,0.05
3,20231008,group_1000,0.05,32920.0,0.06,0.02,0.01,0.02,0.05
4,20231008,group_1001,0.01,29882.86,0.04,0.06,0.04,0.02,0.0


In [12]:
# Run the configured BackTest for every algorithm loaded into test_dfs.
result_2_df = run_algo(
    df=result_1_df,
    algo=ALGO,
    lvl=LVL,
    algo_params=ALGO_PARAMS,
    metrics=METRICS,
    algo_names=list(test_dfs.keys()),
)
# Make sure the metric column is labelled with the plain string "orders_num"
# rather than with the BackTestMetric member.
result_2_df = result_2_df.rename(columns={BackTestMetric.orders_num: "orders_num"})
result_2_df.head()

Unnamed: 0,algo,ds,orders_num
0,algo_1,20231008,279686.6
1,algo_1,20231009,322938.31
2,algo_1,20231010,349101.83
3,algo_1,20231011,223993.36
4,algo_1,20231012,265142.84


In [19]:
# Write the BackTest result to CSV without the index column.
result_2_df.to_csv("./to_karp_2.csv", index=False)