# Проверка стабильности признаков с использованием подмены задачи

In [2]:
from tqdm import tqdm

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [33]:
# пройдем по всем сочетаниям дат из обучающего набора, чтобы научить модель по каждому признаку отличать одну дату от другой

def check_feature_stability(df, date_column, dates, features_list):
    """Score how well each single feature separates every pair of dates.

    For every pair ``(dates[i], dates[j])`` with ``i < j`` and for every
    feature, an ``LGBMClassifier`` is trained on that one feature to predict
    which of the two dates a row belongs to, and the ROC AUC on a held-out
    33% split is recorded.

    Args:
        df: DataFrame containing ``date_column`` and every feature column.
        date_column: Name of the column holding the date (fold) label.
        dates: Sliceable sequence of date values to compare pairwise.
        features_list: Iterable of feature column names.

    Returns:
        A tuple ``(results, combs)``. ``results`` maps each feature name to a
        list of ROC AUC scores; ``combs`` is the list of
        ``(first_date, second_date)`` pairs, in the same order as the scores.
    """
    results = {}
    combs = []
    # Materialise once: the features are re-iterated for every date pair, and
    # tqdm derives its total from len(), which must be the number of features
    # (a DataFrame passed here would otherwise report its row count).
    feature_names = list(features_list)

    for i, first_date in enumerate(dates):
        for second_date in dates[i+1:]:
            print(f"First date: {first_date}, Second date: {second_date}")
            combs.append((first_date, second_date))

            # The row filter and the target depend only on the date pair,
            # so compute them once instead of once per feature.
            pair_df = df[df[date_column].isin([first_date, second_date])]
            target = (pair_df[date_column] == second_date).astype(int).rename('target')

            for feature in tqdm(feature_names, position=0):
                print('='*50)
                print(f'Feature: {feature}')

                x_train, x_test, y_train, y_test = train_test_split(pair_df[[feature]], target, test_size=0.33, random_state=42)
                estimator = LGBMClassifier(n_estimators=500, max_depth=5, random_state=42, verbose=-1)

                estimator.fit(x_train, y_train)

                preds = estimator.predict_proba(x_test)[:, 1]

                score = roc_auc_score(y_test, preds)
                results.setdefault(feature, []).append(score)

    # Return only after ALL date pairs were processed (previously the return
    # sat inside the outer loop and cut the search short after the first date).
    return results, combs

In [47]:
# сделаем тестовый датасет, зададим в нем искусственную метку времени

from sklearn.datasets import make_classification
import pandas as pd

# Synthetic binary-classification data: 1000 rows, 10 features, none redundant.
x, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_redundant=0,
    flip_y=0.05,
    class_sep=0.8,
    random_state=42,
)
# One column per generated feature, named feature_0 ... feature_9.
df = pd.DataFrame(x, columns=[f'feature_{i}' for i in range(x.shape[1])])

In [48]:
# Artificial "date" label: fold 1 where feature_0 < 0.5 and feature_8 > 1,
# fold 0 everywhere else.
date_column = 'fold'
df[date_column] = (df['feature_0'].lt(0.5) & df['feature_8'].gt(1)).astype(int)

In [49]:
# Preview the first rows, including the artificial fold column.
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,fold
0,-0.245018,-0.924063,2.129718,-0.111319,-0.630249,-2.707544,0.864248,0.833158,0.299086,-0.71999,0
1,-0.200564,1.074333,-1.302468,-0.177918,0.344054,-0.89783,0.348342,-0.41743,-0.912811,-0.487705,0
2,2.338034,-0.998212,-1.384227,-0.62451,0.690074,0.691619,1.08771,0.031492,0.746981,0.273118,0
3,-0.587362,-0.679889,0.279906,0.073863,-0.319826,-1.833706,0.14431,-1.340489,1.193587,-0.833598,1
4,-0.52133,1.796665,-1.137254,-0.760364,-1.312163,-0.041472,-0.397115,0.382433,-0.393612,-1.099039,0


In [50]:
# Distinct fold labels in ascending order; every pair of them gets compared.
dates = sorted(df[date_column].unique())
features = df.drop(date_column, axis=1)

# Pass the column names rather than the DataFrame itself: iterating a
# DataFrame yields its column labels, but its len() is the row count, which
# made the progress bar report 1000 steps for 10 features.
stability_results, stability_combs = check_feature_stability(df, date_column=date_column, dates=dates, features_list=features.columns.tolist())

First date: 0, Second date: 1


  0%|                                                  | 0/1000 [00:00<?, ?it/s]

Feature: feature_0


  0%|                                          | 1/1000 [00:00<05:48,  2.87it/s]

Feature: feature_1


  0%|                                          | 2/1000 [00:00<05:19,  3.12it/s]

Feature: feature_2


  0%|▏                                         | 3/1000 [00:00<05:11,  3.21it/s]

Feature: feature_3


  0%|▏                                         | 4/1000 [00:01<04:56,  3.36it/s]

Feature: feature_4


  0%|▏                                         | 5/1000 [00:01<04:50,  3.43it/s]

Feature: feature_5


  1%|▎                                         | 6/1000 [00:01<04:46,  3.47it/s]

Feature: feature_6


  1%|▎                                         | 7/1000 [00:02<04:56,  3.34it/s]

Feature: feature_7


  1%|▎                                         | 8/1000 [00:02<04:55,  3.35it/s]

Feature: feature_8


  1%|▍                                         | 9/1000 [00:02<04:33,  3.63it/s]

Feature: feature_9


  1%|▍                                        | 10/1000 [00:02<04:49,  3.41it/s]


In [51]:
# Show the ROC AUC scores per feature (one list entry per compared date pair).
stability_results

{'feature_0': [0.6131259936406995],
 'feature_1': [0.5614069952305246],
 'feature_2': [0.5312003179650238],
 'feature_3': [0.49130564387917325],
 'feature_4': [0.5753179650238474],
 'feature_5': [0.5261824324324325],
 'feature_6': [0.4335751192368839],
 'feature_7': [0.5226053259141494],
 'feature_8': [0.9653716216216217],
 'feature_9': [0.5409379968203498]}

In [52]:
# видно, что фича 0 и фича 8 нестабильные, так как ROC AUC у них выше 0.6
# то есть меняются от одного временного фолда к другому, как мы и задавали