In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from darts.utils.utils import ModelMode, SeasonalityMode
import matplotlib.pyplot as plt
from tasks_support_system_ai.utils.utils import get_correct_data_path
from tasks_support_system_ai.data.parse_data import read_proper_ts_tree, ts_read_daily_tickets
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error
from tqdm import tqdm
from lightgbm import LGBMRegressor

В этом Jupyter Notebook представлен результат анализа, проведенного в другом ноутбуке /TS/boosting_analysis.ipynb. Там были тщательно исследованы различные алгоритмы градиентного бустинга, их параметры и особенности работы на конкретных данных. Итогом этого ноутбука стали - пайплайн предобработки данных и обучение LightGBM с подобранными гиперпараметрами.

In [2]:
df = ts_read_daily_tickets(get_correct_data_path("tickets_daily/tickets_daily.csv"))
tree = read_proper_ts_tree(get_correct_data_path("custom_data/tree_proper.csv"))

In [3]:
top_level_tree = tree[(tree["level"] == 1) & (tree["full_load"] != 0)]
top_level_tree.sort_values("full_load", ascending=False)

Unnamed: 0,queueId,level,immediateDescendants,allDescendants,full_load
7,10,1,"[4472, 472, 4733, 1571, 100118, 2657, 2658, 27...","[10, 104, 462, 472, 1322, 1571, 2190, 2200, 24...",2463146
481,1866,1,"[4307, 4352, 4587, 4667, 4682, 762, 100081, 10...","[762, 1866, 2382, 2733, 3536, 3576, 3617, 3836...",1470013
285,742,1,"[39, 98, 118, 4322, 4412, 2458, 2610, 2921, 3093]","[9, 13, 26, 39, 83, 98, 101, 110, 116, 118, 37...",1387729
26,34,1,"[1, 111, 4522, 842, 1212, 1471, 1531, 1701, 10...","[1, 34, 42, 85, 111, 392, 842, 1212, 1471, 153...",1247019
163,4447,1,"[57, 4457, 4612, 4617, 100254, 100261, 4793, 1...","[57, 382, 1162, 1741, 1881, 1887, 1981, 2007, ...",1121534
485,1871,1,"[4202, 4262, 4723, 100264, 2150, 2170, 2321, 2...","[28, 63, 122, 632, 1062, 1201, 1401, 1871, 198...",1006818
143,4397,1,"[18, 29, 442, 100272, 100042, 2199, 2403, 3750]","[18, 29, 442, 2199, 2240, 2403, 2404, 2535, 32...",986415
90,115,1,"[25, 88, 4547, 4763, 1910, 100044, 100085, 216...","[25, 88, 115, 1910, 2163, 2166, 2219, 2259, 22...",896296
607,2002,1,"[71, 4482, 731, 2226, 2267, 2269, 2298, 2338, ...","[71, 731, 2002, 2226, 2267, 2269, 2298, 2335, ...",825237
41,4147,1,"[7, 14, 4442, 100329, 100034, 100092, 100105, ...","[7, 14, 2544, 3045, 3086, 3095, 3551, 3884, 41...",681754


In [4]:
def get_df_slice(queue_id: int):
    queues = tree[tree["queueId"] == queue_id]["allDescendants"].values[0]
    df_slice = df[df["queueId"].isin(queues)].groupby("date")[["new_tickets"]].sum()
    return df_slice

In [5]:
top_level_queue_ids = top_level_tree['queueId'].tolist()

In [6]:
global_df_top_level = pd.DataFrame()

for queue_id in top_level_queue_ids:
    df_queue = get_df_slice(queue_id)
    df_queue['queue_id'] = queue_id 
    global_df_top_level = pd.concat([global_df_top_level, df_queue])

# Reset index чтобы 'date' снова стал столбцом, а не индексом
global_df_top_level = global_df_top_level.reset_index()

global_df_top_level

Unnamed: 0,date,new_tickets,queue_id
0,2017-01-01,195,10
1,2017-01-02,274,10
2,2017-01-03,300,10
3,2017-01-04,283,10
4,2017-01-05,311,10
...,...,...,...
27819,2020-09-26,299,3936
27820,2020-09-27,330,3936
27821,2020-09-28,494,3936
27822,2020-09-29,554,3936


In [7]:
def create_time_features(df, date_col='date'):
    """
    Создает временные признаки: день недели, месяц, год.
    """
    df[date_col] = pd.to_datetime(df[date_col])
    df['dayofweek'] = df[date_col].dt.dayofweek  # День недели (0-6, понедельник-воскресенье)
    df['month'] = df[date_col].dt.month          # Месяц (1-12)
    df['year'] = df[date_col].dt.year            # Год
    return df

In [8]:
def create_X_y (global_df_top_level, features, target):
    X = global_df_top_level[features]
    y = global_df_top_level[target]
    groups = global_df_top_level['queue_id']
    return X, y, groups

In [9]:
def create_lag_features(df, lags, queue_id_col='queue_id', target_col='new_tickets'):
    """
    Создает лаговые признаки для временного ряда в DataFrame.
    """
    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = df.groupby(queue_id_col)[target_col].shift(lag)
    return df

lags_to_create = [1, 2, 3, 7, 14, 21, 28, 30]

In [10]:
global_df_top_level = global_df_top_level.dropna()

In [11]:
def create_anomaly_feature(df, queue_id_col='queue_id', date_col='date', target_col='new_tickets', contamination=0.05, random_state=42):
    """
    Создает бинарный признак 'is_anomaly' на основе детекции аномалий с помощью Isolation Forest.
    Детекция аномалий выполняется отдельно для каждого временного ряда (очереди).
    """

    df['is_anomaly'] = 0  # Инициализируем колонку 'is_anomaly' нулями

    for queue_id in df[queue_id_col].unique():
        df_queue = df[df[queue_id_col] == queue_id].copy() 

        
        model_if = IsolationForest(contamination=contamination, random_state=random_state)
        model_if.fit(df_queue[[target_col]]) 

       
        anomaly_labels = model_if.predict(df_queue[[target_col]])


        is_anomaly_flag = (anomaly_labels == -1).astype(int) 

  
        df.loc[df[queue_id_col] == queue_id, 'is_anomaly'] = is_anomaly_flag 


    return df

In [12]:
def create_rolling_lag_features(df, lags, windows, queue_id_col='queue_id', target_col='new_tickets', statistics=['mean', 'median', 'std']):
    """
    Создает признаки скользящих статистик для лаговых признаков временного ряда.
    """

    for lag in lags:
        for window in windows:
            for stat in statistics:
                col_name = f'{target_col}_lag_{lag}_rolling_{stat}_{window}'
                if stat == 'mean':
                    df[col_name] = df.groupby(queue_id_col)[f'{target_col}_lag_{lag}'].rolling(window=window, min_periods=1).mean().reset_index(level=0, drop=True)
                elif stat == 'median':
                    df[col_name] = df.groupby(queue_id_col)[f'{target_col}_lag_{lag}'].rolling(window=window, min_periods=1).median().reset_index(level=0, drop=True)
                elif stat == 'std':
                    df[col_name] = df.groupby(queue_id_col)[f'{target_col}_lag_{lag}'].rolling(window=window, min_periods=1).std().reset_index(level=0, drop=True)
                else:
                    raise ValueError(f"Statistic '{stat}' is not supported. Choose from 'mean', 'median', 'std'.")
    return df

In [13]:
class GlobalPreprocessor(BaseEstimator, TransformerMixin):
    """
    Выполняет все этапы предобработки данных, включая создание временных признаков, лаговых признаков,
    детектирование аномалий и создание признаков скользящих статистик.
    """

    def __init__(self, lags, lags_for_rolling, windows_for_rolling, date_col='date', queue_id_col='queue_id',
                 target_col='new_tickets', contamination=0.05, random_state=42, statistics=['mean', 'median', 'std']):
        self.lags = lags
        self.lags_for_rolling = lags_for_rolling
        self.windows_for_rolling = windows_for_rolling
        self.date_col = date_col
        self.queue_id_col = queue_id_col
        self.target_col = target_col
        self.contamination = contamination
        self.random_state = random_state
        self.statistics = statistics

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()

        # 1. Создаем временные признаки
        X_ = create_time_features(X_, date_col=self.date_col)

        # 2. Создаем лаговые признаки
        X_ = create_lag_features(X_, lags=self.lags, queue_id_col=self.queue_id_col, target_col=self.target_col)

        X_ = X_.dropna()

        # 3. Создаем признак аномалии (требует обучения Isolation Forest для каждой очереди)
        X_ = create_anomaly_feature(X_, queue_id_col=self.queue_id_col, date_col=self.date_col, target_col=self.target_col,
                                   contamination=self.contamination, random_state=self.random_state)

        # 4. Создаем признаки скользящих статистик
        X_ = create_rolling_lag_features(X_, lags=self.lags_for_rolling, windows=self.windows_for_rolling,
                                        queue_id_col=self.queue_id_col, target_col=self.target_col, statistics=self.statistics)
        
        X_ = X_.dropna()

        return X_

In [14]:
lags_to_create = [1, 2, 3, 7, 14, 21, 28, 30]
lags_for_rolling = [1, 7, 14]
windows_for_rolling = [3, 7, 14]

global_preprocessor = GlobalPreprocessor(lags=lags_to_create, lags_for_rolling=lags_for_rolling,
                                            windows_for_rolling=windows_for_rolling)

pipeline_global = Pipeline(steps=[
    ('feature_preprocessing', global_preprocessor),
])

global_df_transformed = pipeline_global.fit_transform(global_df_top_level)

In [15]:
features = [col for col in global_df_transformed.columns if col not in ['date', 'new_tickets', 'year']]

target = 'new_tickets'

X, y, groups = create_X_y(
    global_df_top_level= global_df_transformed,
    features= features,
    target= target
    )

In [16]:
def boosting_fitting(model, X, y, groups):
    """
    Обучает и оценивает LGBMRegressor с использованием TimeSeriesSplit кросс-валидации.

    """
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    model_boosting = model

    mae_scores_lgbm = []
    rmse_scores_lgbm = []
    mape_scores_lgbm = []

    for train_index, test_index in tscv.split(X, y, groups=groups):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model_boosting.fit(X_train, y_train) # Обучаем LGBMRegressor

        y_pred_lgbm = model_boosting.predict(X_test)

        mape_lgbm = mean_absolute_percentage_error(y_test, y_pred_lgbm)
        rmse_lgbm = root_mean_squared_error(y_test, y_pred_lgbm) 
        mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)

        mae_scores_lgbm.append(mae_lgbm)
        rmse_scores_lgbm.append(rmse_lgbm)
        mape_scores_lgbm.append(mape_lgbm)

    avg_mae_lgbm = np.mean(mae_scores_lgbm)
    avg_rmse_lgbm = np.mean(rmse_scores_lgbm)
    avg_mape_lgbm = np.mean(mape_scores_lgbm)
    return model_boosting, avg_mae_lgbm, avg_rmse_lgbm, avg_mape_lgbm

Применяем лучшую модель из анализа

In [17]:
lgb_params = {
        'objective': 'regression_l1',
        'metric': 'rmse', 
        'random_state': 42,
        'n_estimators': 880,
        'learning_rate': 0.04134475267814951,
        'num_leaves': 126,
        'max_depth': 5,
        'min_child_samples': 46,
        'subsample': 0.7664953412082741,
        'colsample_bytree': 0.7804768275120312
    }

model_lgb , avg_mae, avg_rmse, avg_mape = boosting_fitting(
    model= LGBMRegressor(**lgb_params),
    X= X,
    y= y,
    groups= groups
    )
print("Результаты LightGBM c лагами и аномалиями:")
print(f"Средний RMSE на кросс-валидации: {avg_rmse:.2f}")
print(f"Средний MAE на кросс-валидации: {avg_mae:.2f}")
print(f"Средний MAPE на кросс-валидации: {avg_mape:.2f}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8952
[LightGBM] [Info] Number of data points in the train set: 4527, number of used features: 39
[LightGBM] [Info] Start training from score 589.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8955
[LightGBM] [Info] Number of data points in the train set: 9050, number of used features: 39
[LightGBM] [Info] Start training from score 344.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8960
[LightGBM] [Info] Number of data points in the train s

Результаты LightGBM c лагами и аномалиями:

Средний RMSE на кросс-валидации: 137.21

Средний MAE на кросс-валидации: 68.33

Средний MAPE на кросс-валидации: 0.35

Данный пайплайн и гиперпараметры LightGBM будут использоваться в сервисе Streamlit.