In [1]:
import pickle
from typing import Any, Callable, Iterator, List, Optional, Tuple

import numpy as np
import pandas as pd
import shap
from lightgbm import LGBMRegressor
from pandas.tseries.offsets import MonthEnd
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import BaseCrossValidator

class DateTimeSeriesSplit:
    """Time-ordered cross-validation splitter driven by a per-row date.

    Every distinct value in ``groups`` is one time step. Folds are taken from
    the end of the timeline: each fold tests on ``test_size`` consecutive
    steps and trains on steps that precede them.

    Args:
        n_splits: number of folds to generate.
        test_size: number of distinct dates in each test fold.
        margin: number of distinct dates left out between the end of the
            training range and the start of the test range.
        window: number of distinct dates in the training range; ``0`` means
            an expanding window that starts at the first date.
    """

    def __init__(self, n_splits: int = 4, test_size: int = 1, margin: int = 1, window: int = 3):
        self.n_splits = n_splits
        self.test_size = test_size
        self.margin = margin
        self.window = window

    def get_n_splits(self, X: Optional[Any] = None, y: Optional[Any] = None, groups: Optional[Any] = None) -> int:
        """Return the number of folds.

        The arguments are ignored; they are accepted so the method can also be
        called as ``get_n_splits(X, y, groups)``.
        """
        return self.n_splits

    def split(self, X: pd.DataFrame, y: Optional[Any] = None, groups: Optional[pd.Series] = None) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
        """Yield ``(train_positions, test_positions)`` for every fold, oldest fold first.

        Args:
            X: rows to split. It is only used for its length/index and is
                never modified.
            y: ignored.
            groups: one date (or any sortable label) per row of ``X``. A
                Series is aligned to ``X.index`` when the indexes differ; any
                other array-like is taken positionally.

        Yields:
            Two integer arrays of row positions (usable with ``.iloc``).

        Raises:
            ValueError: if ``groups`` is missing or holds fewer distinct dates
                than ``n_splits * test_size``.
        """
        if groups is None:
            raise ValueError("DateTimeSeriesSplit.split requires `groups` (one date per row of X)")

        x_index = getattr(X, "index", None)
        if not isinstance(groups, pd.Series):
            groups = pd.Series(np.asarray(groups), index=x_index)
        elif x_index is not None and not groups.index.equals(x_index):
            groups = groups.reindex(x_index)

        # Rank of each row's date in chronological order (0 = earliest date).
        # Kept in a local array so the caller's frame is left untouched.
        row_rank, ordered_dates = pd.factorize(groups, sort=True)
        n_dates = len(ordered_dates)
        if self.n_splits * self.test_size > n_dates:
            raise ValueError(
                f"Cannot build {self.n_splits} folds of {self.test_size} date(s) "
                f"from only {n_dates} distinct date(s)"
            )
        last = n_dates - 1

        for i in reversed(range(1, self.n_splits + 1)):
            left_test = last - i * self.test_size + 1
            right_test = last - (i - 1) * self.test_size + 1
            # Both training bounds are clamped at 0: a negative bound would
            # otherwise be read as "count from the end" and either empty the
            # training fold or pull test/future dates into it.
            right_train = max(0, left_test - self.margin)
            left_train = max(0, right_train - self.window) if self.window > 0 else 0

            train_mask = (row_rank >= left_train) & (row_rank < right_train)
            test_mask = (row_rank >= left_test) & (row_rank < right_test)
            yield np.flatnonzero(train_mask), np.flatnonzero(test_mask)

class Kraken:
    """Greedy forward feature selection scored by cross-validation.

    Typical use: call :meth:`get_rank_dict` to order the candidate features by
    mean absolute SHAP value, then :meth:`get_vars` to add features one at a
    time while the per-fold metric keeps improving.

    Args:
        estimator: model exposing ``fit``/``predict``; it is refitted in place
            many times.
        cv: splitter exposing ``split(X, groups=...)`` and ``get_n_splits()``.
        metric: callable ``metric(y_true, y_pred)``; lower values are treated
            as better.
        meta_info_name: suffix of the CSV file written by :meth:`get_vars`.
    """

    def __init__(self, estimator: BaseEstimator, cv: BaseCrossValidator, metric: Callable, meta_info_name: str):
        self.estimator = estimator
        self.cv = cv
        self.metric = metric
        self.meta_info_name = meta_info_name

    def get_rank_dict(self, X: pd.DataFrame, y: pd.Series, list_of_vars: List[str], group_dt: Optional[pd.Series]):
        """Rank ``list_of_vars`` by mean absolute SHAP value summed over the CV folds.

        For every fold the estimator is fitted on the training rows and SHAP
        values are computed on the validation rows.

        Sets:
            dict_fold_importances: ``{'Feature': names, 'abs_shap': summed importances}``.
            fe_dict: feature name -> summed importance.
            rank_dict: feature name -> rank, 1 being the most important.
        """
        self.dict_fold_importances = {'Feature': list_of_vars, 'abs_shap': np.zeros(len(list_of_vars))}
        for train_idx, val_idx in self.cv.split(X, groups=group_dt):
            X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
            y_train = y.iloc[train_idx].copy()
            self.estimator.fit(X_train[list_of_vars], y_train.values)
            explainer = shap.Explainer(self.estimator)
            shap_values = explainer.shap_values(X_val[list_of_vars])
            self.dict_fold_importances['abs_shap'] += np.abs(shap_values).mean(axis=0)
        self.fe_dict = dict(zip(self.dict_fold_importances['Feature'], self.dict_fold_importances['abs_shap']))
        ordered = sorted(self.fe_dict, key=self.fe_dict.get, reverse=True)
        self.rank_dict = {name: rank for rank, name in enumerate(ordered, 1)}

    def get_cross_val_score(self, X: pd.DataFrame, y: pd.Series, var: str, old_scores: np.ndarray, selected_vars: Optional[List[str]] = None, group_dt: Optional[pd.Series] = None, round_num: int = 3):
        """Score the feature set ``selected_vars + [var]`` on every CV fold.

        Targets and predictions are passed through ``np.exp`` before the
        metric is applied. ``selected_vars`` is not modified.

        Returns:
            fold_scores: array of per-fold metric values rounded to ``round_num``.
            summa: folds that improved on ``old_scores`` minus folds that got worse.
            mean_cv_score: mean of ``fold_scores`` rounded to ``round_num``.
        """
        # Work on a new list so the caller's list is never appended to.
        base_vars = [] if selected_vars is None else list(selected_vars)
        candidate_vars = base_vars + [var]

        list_scores = []
        for train_idx, val_idx in self.cv.split(X, groups=group_dt):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            self.estimator.fit(X_train[candidate_vars], y_train)
            predictions = self.estimator.predict(X_val[candidate_vars])
            list_scores.append(round(self.metric(np.exp(y_val), np.exp(predictions)), round_num))

        fold_scores = np.array(list_scores)
        delta = fold_scores - old_scores
        summa = int(np.sum(delta < 0)) - int(np.sum(delta > 0))
        mean_cv_score = round(np.mean(fold_scores), round_num)
        return fold_scores, summa, mean_cv_score

    def get_vars(self, X: pd.DataFrame, y: pd.Series, early_stopping_rounds: int = 30, summa_approve: int = 1, best_mean_cv: int = 100, vars_in_model: Optional[List] = None, group_dt: Optional[pd.Series] = None, round_num: int = 3, old_scores: Optional[np.ndarray] = None):
        """Greedily add features from ``rank_dict`` while the CV scores improve.

        Each pass walks the remaining candidates in ``rank_dict`` order. A
        candidate becomes the current best when its ``summa`` exceeds the best
        so far, or ties it with a lower mean CV score. The best candidate of
        the pass is added and the next pass starts; the search ends when a
        pass accepts nothing. A pass is cut short after
        ``early_stopping_rounds`` consecutive candidates without improvement.

        Args:
            X, y: data passed to :meth:`get_cross_val_score`.
            early_stopping_rounds: candidates tried without improvement before
                a pass is abandoned.
            summa_approve: ``summa`` a candidate must exceed (or tie with a
                better mean) at the start of every pass.
            best_mean_cv: starting value of the best mean CV score.
            vars_in_model: features already selected. When a list is given it
                is extended in place and returned; when omitted a new list is
                created for this call.
            group_dt: per-row dates forwarded to the splitter.
            round_num: rounding applied to the scores.
            old_scores: per-fold scores to beat; defaults to 100 for each fold.

        Returns:
            The list of selected feature names.

        Side effects:
            Prints progress and rewrites ``df_meta_info_<meta_info_name>.csv``
            after every accepted feature.
        """
        if not hasattr(self, 'rank_dict'):
            raise AttributeError("rank_dict is not set: call get_rank_dict() before get_vars()")
        self.round_num = round_num
        # A fresh list per call: a shared default list would carry features
        # selected by an earlier call into the next one.
        if vars_in_model is None:
            vars_in_model = []
        # `is None`, not `== None`: comparing an array with `==` is
        # element-wise and cannot be used as an `if` condition.
        if old_scores is None:
            old_scores = np.array([100 for _ in range(self.cv.get_n_splits())])
        else:
            old_scores = np.asarray(old_scores)

        candidates = [name for name in self.rank_dict if name not in vars_in_model]
        meta_frames = []
        feature_was_added = True
        while feature_was_added:
            iteration_step = 0
            var_for_add = None
            print('начинаем след этап', best_mean_cv)
            best_positive_groups = summa_approve
            for var in candidates:
                iteration_step += 1
                if iteration_step > early_stopping_rounds:
                    print(f'early_stopping_rounds {early_stopping_rounds}')
                    break
                fold_scores, summa, mean_cv_score = self.get_cross_val_score(
                    X=X, y=y, var=var, old_scores=old_scores,
                    selected_vars=vars_in_model.copy(), group_dt=group_dt, round_num=self.round_num,
                )
                improved = summa > best_positive_groups or (
                    summa == best_positive_groups and mean_cv_score < best_mean_cv
                )
                if improved:
                    best_positive_groups = summa
                    best_mean_cv = mean_cv_score
                    old_scores = fold_scores
                    var_for_add = var
                    iteration_step = 0
                    print(f'new var_for_add ! {var_for_add}')

            if var_for_add is None:
                feature_was_added = False
                continue

            vars_in_model.append(var_for_add)
            candidates.remove(var_for_add)
            print('едем дальше')
            print('в итоге получили список', vars_in_model)

            # One row of meta information per accepted feature.
            record = {'vars': vars_in_model.copy(), 'summa': best_positive_groups, 'mean_cv_scores': best_mean_cv}
            record.update({'cv' + str(k): score for k, score in enumerate(old_scores.tolist(), 1)})
            meta_frames.append(pd.DataFrame([record]))
            pd.concat(meta_frames).to_csv(f'df_meta_info_{self.meta_info_name}.csv')

        print('мы сошлись')
        print(vars_in_model)
        print(best_mean_cv)
        return vars_in_model

class AddTrain:
    """
    Back-test three ways of keeping a pickled regression model up to date.

    * ``scoring_constant_model`` scores the stored model unchanged on every
      month after ``train_end``.
    * ``scoring_update_model`` refits a model with the stored model's
      hyper-parameters on a training period that grows by one month per step.
    * ``scoring_new_model`` re-runs feature selection and refits every month,
      keeping whichever of the old/new model scores better.

    The code reads the columns ``dt``, ``target`` and ``urf_code_map`` of
    ``df_`` plus the columns listed in ``used_features``.

    NOTE(review): ``scoring_constant_model`` / ``scoring_update_model`` apply
    ``np.log`` to ``target`` (or compare it with ``np.exp`` of the
    prediction), while ``scoring_new_model`` fits on ``target`` directly and
    applies ``np.exp`` to it before scoring. Confirm which scale
    ``df_['target']`` is in before comparing results across the methods.

    NOTE(security): the model is read with ``pickle.load``; only point
    ``model_path`` at files from a trusted source.
    """
    def __init__(self, df_: pd.DataFrame, model_path: 'str', train_end: str, oot_dates: List[str], vsp_test: np.array, used_features: List[str]):
        """
        Initialize AddTrain class with given df_, model, train_end, oot_dates.
        Args:
            df_ (pd.DataFrame): dataset with all features and target
            model_path (str): old model path from oper plan
            train_end (str): last report date in train set from develop process
            oot_dates (str): list oot dates in df_ from develop process
            vsp_test (np.array): set of test(oos) urf_code_map
            used_features (List): old model has incorrect naming features, that`s why need write explicit
        """
        self.df_ = df_
        self.model_path = model_path
        self.train_end = train_end
        self.oot_dates = oot_dates
        self.vsp_test = vsp_test
        self.used_features = used_features

    @staticmethod
    def _oot_metrics(dt, y_true, y_pred):
        """Return ``[dt, MAPE, predicted total, actual total, total error in %]`` for one month."""
        mape_oot = round(mean_absolute_percentage_error(y_true, y_pred), 2)
        macro_oot = round(y_pred.sum(), 2)
        macro_fact = y_true.sum()
        ape_macro = round(100*(macro_oot - macro_fact)/macro_fact, 2)
        return [dt, mape_oot, macro_oot, macro_fact, ape_macro]

    @staticmethod
    def _split_counts(X_train: pd.DataFrame, X_test: pd.DataFrame, X_oot: pd.DataFrame) -> pd.DataFrame:
        """Count rows per ``dt`` in the train / out-of-sample / out-of-time parts.

        Returns a frame with columns ``dt, cnt_train, cnt_oos, cnt_oot`` sorted
        by date; a date absent from a part gets NaN in that part's column.
        """
        # The counts are joined on the index instead of merging on a
        # reset_index() column, because the name of that column differs
        # between pandas versions.
        counts = pd.concat(
            [X_train['dt'].value_counts(), X_test['dt'].value_counts(), X_oot['dt'].value_counts()],
            axis=1,
            keys=['cnt_train', 'cnt_oos', 'cnt_oot'],
        ).sort_index()
        counts.index.name = 'dt'
        return counts.reset_index()

    def scoring_constant_model(self):
        """Score the stored model, without refitting, on every month after ``train_end``.

        Prints the number of rows per month and stores one row of metrics per
        month in ``self.results_scor_constant``.
        """
        with open(self.model_path, 'rb') as mod_pkl:
            model = pickle.load(mod_pkl)

        X_oot = self.df_[self.df_['dt'] > self.train_end]
        print(X_oot['dt'].value_counts().sort_index())

        macro_list = [
            self._oot_metrics(dt, subset['target'], np.exp(model.predict(subset[self.used_features])))
            for dt, subset in X_oot.groupby('dt')
        ]

        self.results_scor_constant = pd.DataFrame(macro_list, columns = ['dt', 'const_mape_oot', 'const_macro_oot',
                                                                         'const_macro_fact', 'const_ape_macro'])

    def scoring_update_model(self):
        """Refit on a growing training period and score a later month at every step.

        At step ``i`` (starting at 1) the training period ends
        ``len(oot_dates) + i - 1`` month ends after ``train_end`` and the
        scored month lies ``len(oot_dates) + i + 1`` month ends after
        ``train_end``. Rows whose ``urf_code_map`` is in ``vsp_test`` are
        excluded from training. The number of steps is the number of distinct
        dates after ``train_end`` minus three.

        Stores one row of metrics per step in ``self.results_scor_update``.
        Uses ``display``, which is available in IPython/Jupyter sessions.
        """
        with open(self.model_path, 'rb') as mod_pkl:
            old_model = pickle.load(mod_pkl)

        train_end = pd.to_datetime(self.train_end)
        n_oot = len(self.oot_dates)
        is_test_vsp = self.df_['urf_code_map'].isin(self.vsp_test)
        future_dates = self.df_[self.df_['dt'] > self.train_end]['dt'].unique()
        n_steps = len(future_dates[0:-3])

        macro_list = []

        for i in range(1, n_steps + 1):
            in_train_period = (self.df_['dt'] <= train_end + MonthEnd(n_oot + i - 1))
            X_train = self.df_[in_train_period & ~is_test_vsp]
            y_train = np.log(X_train['target'])
            # Same period, held-out units; used only for the row counts below.
            X_test = self.df_[in_train_period & is_test_vsp]
            X_oot = self.df_[self.df_['dt'] == train_end + MonthEnd(n_oot + i + 1)]

            print('*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-')
            print(f'step_{i}')
            display(self._split_counts(X_train, X_test, X_oot))

            model = LGBMRegressor(**old_model.get_params())
            model.fit(X_train[self.used_features], y_train)

            dt = X_oot['dt'].unique()[0]
            y_pred_oot = np.exp(model.predict(X_oot[self.used_features]))
            macro_list.append(self._oot_metrics(dt, X_oot['target'], y_pred_oot))

        self.results_scor_update = pd.DataFrame(macro_list, columns = ['dt', 'update_w_mape_oot', 'update_w_macro_oot',
                                                                     'update_w_macro_fact', 'update_w_ape_macro'])

    def final_report(self):
        """Join the constant-model and updated-model results on ``dt``.

        Requires ``scoring_constant_model`` and ``scoring_update_model`` to
        have been run. ``diff`` is the relative change, in percent, of the
        updated model's MAPE versus the constant model's MAPE; months without
        an updated-model result keep NaN.
        """
        report = pd.merge(self.results_scor_constant,
                         self.results_scor_update,
                         how = 'left',
                         on = 'dt')
        report['diff'] = round(100*(report['update_w_mape_oot'] - report['const_mape_oot'])/report['const_mape_oot'], 2)
        return report

    def scoring_new_model(self, start_month: str, window: int, n_splits: int, test_size: int, margin: int, lgbm_params: dict, early_stopping_rounds: int, round_num: int, metric: Callable):
        """
        Build a new model for every report date and emulate scoring with additional training.

        For each month, starting at ``start_month``: rows dated before the
        month form the training set and rows dated exactly at the month form
        the OOT set. Features are re-selected with Kraken, a new model is
        fitted and compared with the current model on the OOT rows.

        Parameters:
        start_month (str): first OOT date, parsed with ``pd.to_datetime``.
        window (int): window size for DateTimeSeriesSplit.
        n_splits (int): number of splits in DateTimeSeriesSplit.
        test_size (int): test fold size in DateTimeSeriesSplit.
        margin (int): gap between the train and test sets in DateTimeSeriesSplit.
        lgbm_params (dict): parameters used to initialise LGBMRegressor.
        early_stopping_rounds (int): number of rounds for early stopping in Kraken.
        round_num (int): number of decimals passed to Kraken for rounding scores.
        metric (Callable): metric used by Kraken (for example mean_absolute_percentage_error).

        Returns:
        pd.DataFrame with one row per month: ``month``, ``model`` ('new' or 'old') and ``mape``.

        Side effects:
        When the new model wins, the file at ``self.model_path`` is overwritten
        with it and ``self.used_features`` is replaced by the selected
        features. ``meta_info.csv`` is written at the end.
        """
        # Load the old model
        with open(self.model_path, 'rb') as file:
            old_model = pickle.load(file)

        start_month_dt = pd.to_datetime(start_month)
        results = []
        meta_info = []

        print(f"Начинаем обработку данных, начиная с {start_month_dt.strftime('%Y-%m')}")

        while start_month_dt <= self.df_['dt'].max():
            month_label = start_month_dt.strftime('%Y-%m')
            print(f"Обрабатываем месяц {month_label}")

            # Split into train and OOT
            train_data = self.df_[self.df_['dt'] < start_month_dt].copy()
            oot_data = self.df_[self.df_['dt'] == start_month_dt].copy()

            print(f'Тренировочные данные с {train_data.dt.min()} по {train_data.dt.max()}')
            print(f'Всего  {train_data.shape}')

            print(f'OOT данные с {oot_data.dt.min()} по {oot_data.dt.max()}')
            print(f'Всего  {oot_data.shape}')

            # Set up the splitter, the model and the feature selector
            cv_datetime = DateTimeSeriesSplit(window = window, n_splits = n_splits, test_size = test_size, margin = margin)
            group_dt = train_data['dt']
            model = LGBMRegressor(**lgbm_params)
            selector = Kraken(model, cv_datetime, metric, 'updated_model')

            # Feature selection based on SHAP values
            selector.get_rank_dict(train_data, train_data['target'], self.used_features, group_dt = group_dt)
            new_vars_class = selector.get_vars(train_data, train_data['target'], vars_in_model = [],
                                               early_stopping_rounds = early_stopping_rounds, group_dt = train_data['dt'], round_num = round_num)

            # Fit the new model on the selected features
            model.fit(train_data[new_vars_class], train_data['target'])

            # Score the new model on the OOT rows
            y_true_oot = np.exp(oot_data['target'])
            y_pred_new = np.exp(model.predict(oot_data[new_vars_class]))
            mape_new = round(mean_absolute_percentage_error(y_true_oot, y_pred_new), 3)

            # Score the old model on the OOT rows
            y_pred_old = np.exp(old_model.predict(oot_data[self.used_features]))
            mape_old = round(mean_absolute_percentage_error(y_true_oot, y_pred_old), 3)

            # Keep whichever model scored better
            if mape_new < mape_old:
                print(f"Новая модель ({mape_new}) лучше старой ({mape_old}) для {month_label}")
                old_model = model
                # Persist the new model over the previous one
                with open(self.model_path, 'wb') as file:
                    pickle.dump(model, file)
                results.append({'month': month_label, 'model': 'new', 'mape': mape_new})
                self.used_features = list(new_vars_class)
            else:
                print(f"Старая модель ({mape_old}) лучше новой ({mape_new}) для {month_label}")
                results.append({'month': month_label, 'model': 'old', 'mape': mape_old})

            # Keep the meta information of this month
            meta_info.append({
                'month': month_label,
                'features': new_vars_class,
                'mape_new': mape_new,
                'mape_old': mape_old
            })

            # Move to the last day of the month that follows the current one
            next_month = start_month_dt + pd.DateOffset(months=1)
            start_month_dt = (next_month + pd.DateOffset(months=1)).replace(day=1) - pd.DateOffset(days=1)

        # Save the meta information to CSV
        meta_info_df = pd.DataFrame(meta_info)
        meta_info_df.to_csv('meta_info.csv', index=False)

        return pd.DataFrame(results)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
import pandas as pd
import numpy as np
import random

# Synthetic panel: 10,000 stores observed at every month end of 2022-2023
np.random.seed(0)
# An offset object is used instead of the 'M' string alias: newer pandas
# releases deprecate that alias, while the object form works in all of them.
dates = pd.date_range(start='2022-01-01', end='2023-12-31', freq=pd.offsets.MonthEnd())
stores = np.arange(1, 10001)
n_rows = len(dates) * len(stores)

# One row per (month, store) with a random integer sales figure
data = pd.DataFrame({
    'date': np.repeat(dates, len(stores)),
    'urf_code_map': np.tile(stores, len(dates)),
    'sales': np.random.randint(100, 500, size=n_rows),
})

# 20 additional uniform random features
for i in range(1, 21):
    data[f'feature{i}'] = np.random.rand(n_rows)

# Time axis in years since January 2022 (0 for 2022-01, 1 for 2023-01, ...)
time_scale = data['date'].dt.year + (data['date'].dt.month - 1) / 12 - 2022
data['time_scale'] = time_scale

# Target = sales plus the first 9 features, each weighted by a factor that
# drifts over time: it grows for odd-numbered features and shrinks for even ones
data['target'] = data['sales']
drift = time_scale * 0.2
for i in range(1, 10):
    noise_scale = (1 - drift if i % 2 == 0 else 1 + drift).to_numpy()
    print(noise_scale)
    data['target'] += (data[f'feature{i}'] * noise_scale)

# Log-transform the target
data['target'] = np.log(data['target'] + 1)

data.head()  # first rows as a sanity check


[1.         1.         1.         ... 1.38333333 1.38333333 1.38333333]
[1.         1.         1.         ... 0.61666667 0.61666667 0.61666667]
[1.         1.         1.         ... 1.38333333 1.38333333 1.38333333]
[1.         1.         1.         ... 0.61666667 0.61666667 0.61666667]
[1.         1.         1.         ... 1.38333333 1.38333333 1.38333333]
[1.         1.         1.         ... 0.61666667 0.61666667 0.61666667]
[1.         1.         1.         ... 1.38333333 1.38333333 1.38333333]
[1.         1.         1.         ... 0.61666667 0.61666667 0.61666667]
[1.         1.         1.         ... 1.38333333 1.38333333 1.38333333]


Unnamed: 0,date,urf_code_map,sales,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,time_scale,target
0,2022-01-31,1,272,0.467442,0.390914,0.678622,0.65191,0.037711,0.008113,0.47402,...,0.242483,0.578922,0.722031,0.086884,0.898476,0.299865,0.295363,0.941421,0.0,5.621651
1,2022-01-31,2,147,0.048557,0.885066,0.50323,0.034941,0.931691,0.505793,0.658944,...,0.078565,0.72501,0.211643,0.770336,0.039544,0.618542,0.148859,0.709892,0.0,5.030085
2,2022-01-31,3,217,0.990189,0.231459,0.126226,0.280314,0.561959,0.353323,0.095009,...,0.822902,0.599953,0.203294,0.380129,0.118018,0.364317,0.979488,0.959247,0.0,5.399771
3,2022-01-31,4,292,0.761858,0.770246,0.690714,0.3056,0.745284,0.937345,0.222212,...,0.028639,0.64838,0.277394,0.666883,0.984615,0.214967,0.451599,0.524179,0.0,5.69716
4,2022-01-31,5,423,0.967657,0.76868,0.253583,0.78404,0.395119,0.068872,0.316904,...,0.787294,0.212568,0.436661,0.915605,0.464422,0.94266,0.788172,0.272017,0.0,6.061668


In [3]:
# Mean time_scale for every month-end date
data['time_scale'].groupby(data['date']).mean()

date
2022-01-31    0.000000
2022-02-28    0.083333
2022-03-31    0.166667
2022-04-30    0.250000
2022-05-31    0.333333
2022-06-30    0.416667
2022-07-31    0.500000
2022-08-31    0.583333
2022-09-30    0.666667
2022-10-31    0.750000
2022-11-30    0.833333
2022-12-31    0.916667
2023-01-31    1.000000
2023-02-28    1.083333
2023-03-31    1.166667
2023-04-30    1.250000
2023-05-31    1.333333
2023-06-30    1.416667
2023-07-31    1.500000
2023-08-31    1.583333
2023-09-30    1.666667
2023-10-31    1.750000
2023-11-30    1.833333
2023-12-31    1.916667
Name: time_scale, dtype: float64

In [4]:
# Names of the features offered to the model
model_list = [f'feature{i}' for i in range(1, 21)]
model_list

['feature1',
 'feature2',
 'feature3',
 'feature4',
 'feature5',
 'feature6',
 'feature7',
 'feature8',
 'feature9',
 'feature10',
 'feature11',
 'feature12',
 'feature13',
 'feature14',
 'feature15',
 'feature16',
 'feature17',
 'feature18',
 'feature19',
 'feature20']

In [5]:
from sklearn.metrics import mean_absolute_percentage_error as mape
# Training sample for the base model: rows dated in 2022

data_sample = data.loc[data['date'].dt.year == 2022]
data_sample.shape

(120000, 25)

In [6]:
# Base model: shallow trees, all CPU cores
model_base = LGBMRegressor(n_jobs=-1, max_depth=3)

In [10]:
# Fit the base model on the 2022 sample
model_base.fit(X=data_sample[model_list], y=data_sample['target'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 20
[LightGBM] [Info] Start training from score 5.637951


LGBMRegressor(max_depth=3, n_jobs=-1)

In [12]:
# MAPE of the base model on its own training sample, after np.exp of target and prediction
mape(y_true=np.exp(data_sample['target']), y_pred=np.exp(model_base.predict(data_sample[model_list])))

0.39637882790035894

In [13]:
# MAPE of the base model on the full data set, after np.exp of target and prediction
mape(y_true=np.exp(data['target']), y_pred=np.exp(model_base.predict(data[model_list])))

0.3975944266725568

In [None]:
# Persist the fitted base model; AddTrain loads it from this path
with open("model_base.pkl", 'wb') as model_file:
    pickle.dump(model_base, model_file)

In [None]:
# Distinct store codes as a plain list
list_stores = data['urf_code_map'].unique().tolist()

In [None]:
# Reproducible random sample of store codes used as the out-of-sample set
random.seed(0)
subset_size = 2000
oos_list = random.sample(list_stores, k=subset_size)

In [None]:
# AddTrain reads the report date from a column named 'dt'
data['dt'] = data['date']

In [None]:
# Experiment object shared by the scoring scenarios below
new_exp = AddTrain(
    df_=data,
    model_path='model_base.pkl',
    train_end='2022-10-31',
    oot_dates=['2022-11-30', '2022-12-31'],
    vsp_test=oos_list,
    used_features=model_list,
)

In [None]:
# Score the stored model unchanged on every month after train_end;
# the results are kept in new_exp.results_scor_constant
new_exp.scoring_constant_model()

2022-11-30    10000
2022-12-31    10000
2023-01-31    10000
2023-02-28    10000
2023-03-31    10000
2023-04-30    10000
2023-05-31    10000
2023-06-30    10000
2023-07-31    10000
2023-08-31    10000
2023-09-30    10000
2023-10-31    10000
2023-11-30    10000
2023-12-31    10000
Name: dt, dtype: int64


In [None]:
# Refit on a growing training period, one step per month;
# the results are kept in new_exp.results_scor_update
new_exp.scoring_update_model()

*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_1


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725531
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_2


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 104000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725569
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_3


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 112000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725558
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_4


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725553
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_5


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 128000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725566
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_6


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 136000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725568
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_7


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725614
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_8


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 152000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725721
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_9


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725716
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_10


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 168000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725736
*-*-*-*-*-*-*-*-*-*- start split train/test/oot *-*-*-*-*-*-*-*-*-*-
step_11


Unnamed: 0,dt,cnt_train,cnt_oos,cnt_oot
0,2022-01-31,8000.0,2000.0,
1,2022-02-28,8000.0,2000.0,
2,2022-03-31,8000.0,2000.0,
3,2022-04-30,8000.0,2000.0,
4,2022-05-31,8000.0,2000.0,
5,2022-06-30,8000.0,2000.0,
6,2022-07-31,8000.0,2000.0,
7,2022-08-31,8000.0,2000.0,
8,2022-09-30,8000.0,2000.0,
9,2022-10-31,8000.0,2000.0,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 176000, number of used features: 20
[LightGBM] [Info] Start training from score 1.725728


In [None]:
# Join the constant-model and updated-model results by date
report = new_exp.final_report()

In [None]:
# MAPE of both strategies side by side, with their relative difference in percent
report.loc[:, ['dt', 'const_mape_oot', 'update_w_mape_oot', 'diff']]

Unnamed: 0,dt,const_mape_oot,update_w_mape_oot,diff
0,2022-11-30,0.07,,
1,2022-12-31,0.07,,
2,2023-01-31,0.07,,
3,2023-02-28,0.07,0.07,0.0
4,2023-03-31,0.07,0.07,0.0
5,2023-04-30,0.07,0.07,0.0
6,2023-05-31,0.07,0.07,0.0
7,2023-06-30,0.07,0.07,0.0
8,2023-07-31,0.07,0.07,0.0
9,2023-08-31,0.07,0.07,0.0


In [None]:
# LightGBM settings passed to scoring_new_model
params = dict(
    max_depth=3,
    n_estimators=100,
    objective='mse',
    boosting_type='goss',
    verbose=-1,
    n_jobs=-1,
)

In [None]:
import sys

from contextlib import redirect_stdout

# Run the monthly re-selection scenario with every print() sent to
# 'log_file_scoring_new_model.txt' instead of the notebook output.
# The file is opened as UTF-8 because the messages are in Cyrillic, which a
# platform default encoding may be unable to write.
with open('log_file_scoring_new_model.txt', 'w', encoding='utf-8') as f:
    # redirect_stdout restores the previous sys.stdout on exit, also on error
    with redirect_stdout(f):
        rez = new_exp.scoring_new_model(start_month = '2023-01-31', window = 0, n_splits = 4, test_size = 1, margin = 0,
                          lgbm_params= params, early_stopping_rounds= 30, round_num= 3, metric= mape)

In [None]:
# Same scenario as the logged run, with the progress printed in the notebook
rez = new_exp.scoring_new_model(
    start_month='2023-01-31',
    window=0,
    n_splits=4,
    test_size=1,
    margin=0,
    lgbm_params=params,
    early_stopping_rounds=30,
    round_num=3,
    metric=mape,
)

Начинаем обработку данных, начиная с 2023-01
Обрабатываем месяц 2023-01
Тренировочные данные с 2022-01-31 00:00:00 по 2022-12-31 00:00:00
Всего  (120000, 27)
OOT данные с 2023-01-31 00:00:00 по 2023-01-31 00:00:00
Всего  (10000, 27)
начинаем след этап 100


KeyboardInterrupt: 

In [None]:
# Frame returned by scoring_new_model: one row per month with the kept model and its MAPE
rez

Unnamed: 0,month,model,mape
0,2023-01,old,0.936
1,2023-02,new,0.668
2,2023-03,new,0.249
3,2023-04,old,0.273
4,2023-05,new,0.363
5,2023-06,new,0.444
6,2023-07,new,0.505
7,2023-08,new,0.549
8,2023-09,new,0.579
9,2023-10,new,0.608


In [None]:
import sys
from functools import wraps

def log_to_file(file_name):
    """Decorator factory that sends a function's ``print`` output to a file.

    Every call of the decorated function truncates ``file_name`` and writes
    whatever the function prints to standard output into it. The previous
    ``sys.stdout`` is restored when the call finishes, also if it raises.

    Args:
        file_name: path of the log file. It is written as UTF-8 so that
            non-ASCII output does not depend on the platform default encoding.

    Returns:
        A decorator; the decorated function returns what the original returns.
    """
    from contextlib import redirect_stdout

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            with open(file_name, 'w', encoding='utf-8') as log_file:
                with redirect_stdout(log_file):
                    return func(*args, **kwargs)
        return wrapper
    return decorator

# Example use of the decorator
@log_to_file('log_file.txt')
def test_function():
    """Print one line (captured by the decorator) and return a string."""
    message = "Это сообщение будет записано в файл логов."
    print(message)
    return "Возвращаемое значение функции"

# Call the function to demonstrate the behaviour
result = test_function()
print("Возвращаемое значение:", result)
print("Это сообщение будет напечатано в консоли, а не в файле логов.")


Возвращаемое значение: Возвращаемое значение функции
Это сообщение будет напечатано в консоли, а не в файле логов.
