# Скрипт обучения и настройка гиперпараметров

### Загрузка данных

In [None]:
import pandas as pd
import numpy as np

# Load the raw observations; the "Дата" column is parsed into datetimes on read.
dataset = pd.read_excel("../api/data/data_v1.xlsx", parse_dates = ["Дата"], engine = "openpyxl")
# Preview the first rows of the loaded table.
dataset.head()

Unnamed: 0,Дата,Склад,Товар,Количество товара
0,2021-01-01,1,Кирпич,123
1,2021-01-02,2,Кирпич,43
2,2021-01-03,3,Кирпич,164
3,2021-01-04,1,Кирпич,252
4,2021-01-05,2,Кирпич,132


### Конструирование признаков и преобразование исходных данных

Месяц-sin

In [22]:
import math

# Map the month number onto a 12-step circle and keep the sine coordinate.
# Computed as one vectorized expression instead of a per-element math.sin call.
dataset["month-sin"] = np.sin(dataset["Дата"].dt.month * (2 * np.pi / 12))

Месяц-cos

In [23]:
# Cosine coordinate of the month on the same 12-step circle; together with
# "month-sin" it identifies the month unambiguously.
# Computed as one vectorized expression instead of a per-element math.cos call.
dataset["month-cos"] = np.cos(dataset["Дата"].dt.month * (2 * np.pi / 12))

Бинарные признаки, описывающие треть месяца:
- 1 - 10 - первая треть
- 11 - 20 - вторая треть
- 21 - конец месяца (28–31) - третья треть

In [24]:
# 0/1 flags telling which third of the month the date falls into.
# Series.between is inclusive on both ends, matching the original bounds.
dataset["first-third"] = dataset["Дата"].dt.day.between(0, 10).astype("int64")
dataset["second-third"] = dataset["Дата"].dt.day.between(11, 20).astype("int64")
dataset["third-third"] = dataset["Дата"].dt.day.between(21, 31).astype("int64")

День-недели-sin

In [25]:
# Weekday numbered 1..7 (day_of_week is 0-based), placed on a 7-step circle;
# this is the sine coordinate, computed in one vectorized expression.
dataset["day-sin"] = np.sin((dataset["Дата"].dt.day_of_week + 1) * (2 * np.pi / 7))

День-недели-cos

In [26]:
# Cosine coordinate of the weekday (numbered 1..7) on the same 7-step circle,
# computed in one vectorized expression.
dataset["day-cos"] = np.cos((dataset["Дата"].dt.day_of_week + 1) * (2 * np.pi / 7))

Усреднённое количество товара по 2k соседним записям (k ближайших предыдущих и k ближайших последующих по дате) для того же товара на том же складе.

В качестве демо (без настройки гиперпараметра k) k = 1.

In [27]:
def two_mean_product(row):
    """Average the amounts of the nearest earlier and nearest later record.

    Only records of the module-level ``dataset`` with the same "Товар" and
    "Склад" as ``row`` are considered; "nearest" is measured by the "Дата"
    difference.

    Args:
        row: A record with "Дата", "Товар", "Склад" and "Количество товара".

    Returns:
        The mean of the two neighbouring amounts; the single neighbour's
        amount when only one side exists; the row's own amount when the
        record has no neighbours at all.
    """
    same_item = (dataset["Товар"] == row["Товар"]) & (dataset["Склад"] == row["Склад"])
    # Explicit copy: the date column is overwritten below, and writing into a
    # filtered slice of `dataset` raises pandas' SettingWithCopyWarning.
    differ = dataset.loc[same_item, ["Дата", "Количество товара"]].copy()
    # Replace absolute dates with the signed offset from the current record.
    differ["Дата"] = differ["Дата"] - row["Дата"]

    past = differ[differ["Дата"].dt.days < 0]
    future = differ[differ["Дата"].dt.days > 0]

    if future.empty and past.empty:
        return row["Количество товара"]

    # Sort only the side(s) actually needed; the closest record comes first.
    if future.empty:
        return past.sort_values(by="Дата", ascending=False)["Количество товара"].iloc[0]
    if past.empty:
        return future.sort_values(by="Дата")["Количество товара"].iloc[0]

    nearest_past = past.sort_values(by="Дата", ascending=False)["Количество товара"].iloc[0]
    nearest_future = future.sort_values(by="Дата")["Количество товара"].iloc[0]
    return (nearest_future + nearest_past) / 2

# Row-wise pass: every record gets the neighbour-based amount feature.
dataset["2k-mean-product"] = dataset.apply(two_mean_product, axis=1)

Удаление ненужных столбцов

In [28]:
# The raw date and product name are no longer needed once the features exist.
dataset = dataset.drop(columns=["Дата", "Товар"])

Переименование столбца

In [29]:
# Give the remaining original columns English names.
dataset = dataset.rename(columns = {"Склад": "N_warehouse", "Количество товара": "amount"})

Итоговый набор данных

In [30]:
# Preview the first rows of the engineered feature table.
dataset.head()

Unnamed: 0,N_warehouse,amount,month-sin,month-cos,first-third,second-third,third-third,day-sin,day-cos,2k-mean-product
0,1,123,0.5,0.866025,1,0,0,-0.9749279,-0.222521,252.0
1,2,43,0.5,0.866025,1,0,0,-0.7818315,0.62349,132.0
2,3,164,0.5,0.866025,1,0,0,-2.449294e-16,1.0,211.0
3,1,252,0.5,0.866025,1,0,0,0.7818315,0.62349,173.0
4,2,132,0.5,0.866025,1,0,0,0.9749279,-0.222521,104.0


### Обучение модели

Создание трансформера предобработки

In [31]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessor(BaseEstimator, TransformerMixin):
    """Pipeline step that turns raw stock records into model features.

    ``fit`` stores the training rows together with their targets.
    ``transform`` derives cyclical month / weekday encodings, third-of-month
    flags and a neighbour-based amount feature, drops the raw "Дата" and
    "Товар" columns and renames "Склад" to "N_warehouse".
    """

    def __init__(self):
        pass

    def fit(self, df, y, **fit_params):
        """Store a copy of the training frame with the target attached.

        Args:
            df: Frame with the "Дата", "Склад" and "Товар" columns.
            y: Target amounts for the rows of ``df``.
            **fit_params: Accepted and ignored.

        Returns:
            The fitted instance itself.
        """
        self.dataset = df.copy()
        self.dataset["Количество товара"] = y
        return self

    def transform(self, df):
        """Return the engineered feature frame for ``df``.

        The input frame is not modified. Feature columns are appended in a
        fixed order: month-sin, month-cos, first-third, second-third,
        third-third, day-sin, day-cos, 2k-mean-product.
        """
        # Copy first: assigning the feature columns below would otherwise
        # add them to the caller's frame in place.
        df = df.copy()

        df["month-sin"], df["month-cos"] = self._cyclical(df["Дата"].dt.month, 12)

        # 0/1 flags for the third of the month (bounds are inclusive).
        day_of_month = df["Дата"].dt.day
        df["first-third"] = day_of_month.between(0, 10).astype("int64")
        df["second-third"] = day_of_month.between(11, 20).astype("int64")
        df["third-third"] = day_of_month.between(21, 31).astype("int64")

        # day_of_week is 0-based; shift to 1..7 before the encoding.
        df["day-sin"], df["day-cos"] = self._cyclical(df["Дата"].dt.day_of_week + 1, 7)

        df["2k-mean-product"] = df.apply(self.two_mean_product, axis=1)

        df = df.drop(columns=["Дата", "Товар"])
        return df.rename(columns={"Склад": "N_warehouse"})

    @staticmethod
    def _cyclical(values, period):
        """Return the (sin, cos) coordinates of ``values`` on a ``period``-step circle."""
        angle = values * (2 * np.pi / period)
        return np.sin(angle), np.cos(angle)

    def two_mean_product(self, row):
        """Average the nearest earlier and nearest later training amounts.

        Only stored training records with the same "Товар" and "Склад" as
        ``row`` are considered; "nearest" is measured by the "Дата" difference.

        Returns:
            The mean of the two neighbouring amounts, the single neighbour's
            amount when only one side exists, or 0 when the training data
            holds no neighbouring record.
        """
        same_item = (self.dataset["Товар"] == row["Товар"]) & (self.dataset["Склад"] == row["Склад"])
        # Explicit copy: the date column is overwritten below, and writing into
        # a filtered slice raises pandas' SettingWithCopyWarning.
        differ = self.dataset.loc[same_item, ["Дата", "Количество товара"]].copy()
        # Replace absolute dates with the signed offset from the current record.
        differ["Дата"] = differ["Дата"] - row["Дата"]

        past = differ[differ["Дата"].dt.days < 0]
        future = differ[differ["Дата"].dt.days > 0]

        if future.empty and past.empty:
            return 0

        # Sort only the side(s) actually needed; the closest record comes first.
        if future.empty:
            return past.sort_values(by="Дата", ascending=False)["Количество товара"].iloc[0]
        if past.empty:
            return future.sort_values(by="Дата")["Количество товара"].iloc[0]

        nearest_past = past.sort_values(by="Дата", ascending=False)["Количество товара"].iloc[0]
        nearest_future = future.sort_values(by="Дата")["Количество товара"].iloc[0]
        return (nearest_future + nearest_past) / 2

Создание преобразователя, округляющего предсказания вверх до целого

In [32]:
class Ceil(BaseEstimator, TransformerMixin):
    """Transformer whose forward pass is the identity and whose inverse rounds up.

    The pipeline in this file passes it as the ``transformer`` of a
    TransformedTargetRegressor.
    """

    def __init__(self):
        """Nothing to configure."""

    def fit(self, X, y=None):
        """Learn nothing; return the instance itself."""
        return self

    def inverse_transform(self, y):
        """Round every value of ``y`` up via ``np.ceil``."""
        rounded_up = np.ceil(y)
        return rounded_up

    def transform(self, y):
        """Pass ``y`` through unchanged."""
        return y

Разделение выборки на обучающую и тестовую

In [33]:
from sklearn import model_selection

# Reload the raw table: the feature-engineering demo above dropped and renamed columns.
# NOTE(review): this reads "data.xlsx" while the first cell reads
# "../api/data/data_v1.xlsx" — confirm both point at the same data.
dataset = pd.read_excel("data.xlsx", parse_dates=["Дата"], engine="openpyxl")

# The stock amount is the target; every other column is an input.
y = dataset["Количество товара"]
X = dataset.drop(columns="Количество товара")

# Fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, random_state=777
)

Создание пайплайна

In [34]:
from sklearn.linear_model import QuantileRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor

# Quantile regressor wrapped with Ceil as its target transformer.
rounded_regressor = TransformedTargetRegressor(
    regressor=QuantileRegressor(),
    transformer=Ceil(),
)
# Feature engineering first, then the wrapped regressor.
pipe = make_pipeline(PreProcessor(), rounded_regressor)

In [35]:
# Display the assembled pipeline as the cell output.
pipe

Поиск гиперпараметров по сетке

In [36]:
from sklearn.model_selection import GridSearchCV

# Keys follow make_pipeline's auto-generated step name ("transformedtargetregressor")
# and then address the wrapped regressor's own parameters.
param_grid = {
    "transformedtargetregressor__regressor__quantile": [0.3, 0.4, 0.5, 0.6, 0.7],
    "transformedtargetregressor__regressor__alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5]
}

# Every quantile/alpha combination is cross-validated and ranked by R^2.
grid = GridSearchCV(pipe, param_grid = param_grid, scoring = "r2")
grid.fit(X_train, y_train)

Выбранные гиперпараметры

In [37]:
# Report the parameter combination selected by the grid search.
print(f"Best parameters: {grid.best_params_}")

Best parameters: {'transformedtargetregressor__regressor__alpha': 0.2, 'transformedtargetregressor__regressor__quantile': 0.5}


### Оценивание модели

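Коэффициент детерминации R² в процентах — доля дисперсии целевой переменной, объясняемая моделью (отрицательное значение означает, что модель предсказывает хуже, чем константное среднее):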

In [38]:
from sklearn import metrics

# Predict on the held-out split with the fitted grid search.
y_pred = grid.predict(X_test)

# R^2 on the test split, expressed as a percentage.
percent = 100 * metrics.r2_score(y_test, y_pred)
print(f"{percent}%")

-7.877191666240679%


### Сериализация модели

In [None]:
import dill as pickle

# Persist the whole fitted grid search with dill (imported here as `pickle`).
# The file is written to the current working directory.
filename = "model_v1.pk"
with open(filename, "wb") as file:
    # recurse = True is passed through to dill's dump.
    pickle.dump(grid, file, recurse = True)

Проверка десериализации

In [None]:
# NOTE(review): the model is written to "model_v1.pk" in the working directory
# but read back from "../api/models/" — confirm the file is copied there first.
# Loading a pickle/dill file can execute arbitrary code: only open trusted model files.
with open("../api/models/model_v1.pk", "rb") as file:
    model = pickle.load(file)

Deser_Test = pd.read_excel("data.xlsx", parse_dates = ["Дата"], engine = "openpyxl")
# Drop the target from the freshly loaded frame; the previous version dropped it
# from the global `dataset` instead, which discarded the frame read just above.
Deser_Test = Deser_Test.drop("Количество товара", axis = 1)
model.predict(Deser_Test)[:5]

array([182., 213., 193., 207., 204.])