# Ноутбук по отбору признаков

[Пример ноутбука, в котором идет речь про отбор признаков](https://github.com/RomanSafronenkov/mlcourse.ai/blob/main/jupyter_russian/topic06_features/topic6_feature_engineering_feature_selection_russian.ipynb)

[Sequential Feature Selection с объяснениями](https://rasbt.github.io/mlxtend/api_subpackages/mlxtend.feature_selection/#sequentialfeatureselector)

[О том, почему плох встроенный в RandomForest feature_importance](https://explained.ai/rf-importance/)

In [1]:
from abc import ABC, abstractmethod

import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.inspection import permutation_importance

from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import shap

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a synthetic dataset that contains informative features, their duplicates,
# combinations of them, and pure-noise features.
x, y = make_classification(
    n_samples=10000, n_features=100, n_informative=15, n_redundant=5, n_repeated=5, n_clusters_per_class=4, shift=0.8, scale=3.0, shuffle=False)

# shuffle=False is passed explicitly, so the columns keep a fixed order:
# informative features first, then their combinations, then the repeats, and the junk last.
# That makes it easier to judge how well each selection method works.

In [3]:
# замерим метрику на кросс валидации
cross_val_score(LGBMClassifier(verbose=-100), x, y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc').mean()

0.9567004795353707

In [5]:
class BaseFeatureSelector(ABC):
    """Select features by their importance across cross-validation folds.

    ``fit`` trains one ``LGBMClassifier`` per fold of a shuffled
    ``StratifiedKFold`` and stores whatever ``_get_importances_from_model``
    returns for the validation part of that fold in the column
    ``importance_<fold>`` of ``self.importances`` (one row per feature, the
    ``feature`` column holds the positional column index).
    ``get_selected_features`` keeps the features that fall inside the top
    ``threshold`` share of importance in every fold.

    Subclasses only have to implement ``_get_importances_from_model``.
    """

    def __init__(self, n_folds=5, random_state=None):
        """
        Args:
            n_folds: number of cross-validation folds used by ``fit``.
            random_state: seed forwarded to ``StratifiedKFold``. The default
                ``None`` keeps the unseeded shuffling.
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.importances = None

    def fit(self, x, y, **importances_kwargs):
        """Compute per-fold feature importances.

        Args:
            x: 2-D feature matrix; it is indexed as ``x[row_indices]`` and must
                expose ``shape``.
            y: target labels, indexed as ``y[row_indices]``.
            **importances_kwargs: forwarded unchanged to
                ``_get_importances_from_model``.

        Returns:
            ``self``, so the call can be chained.
        """
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)

        self.importances = pd.DataFrame({'feature': np.arange(x.shape[1])})

        for i, (train_index, val_index) in enumerate(skf.split(x, y)):
            x_train, y_train = x[train_index], y[train_index]
            x_val, y_val = x[val_index], y[val_index]

            model = LGBMClassifier(max_depth=5, n_estimators=500, learning_rate=0.05, verbose=-100)
            model.fit(x_train, y_train)

            # importances are measured on the held-out part of the fold
            self.importances[f'importance_{i}'] = self._get_importances_from_model(
                model, x_val, y_val, **importances_kwargs)

        return self

    def _select_in_fold(self, column, threshold):
        """Return the set of features inside the top ``threshold`` share of ``column``."""
        ranked = self.importances.loc[:, ['feature', column]].sort_values(column, ascending=False)
        share = ranked[column] / ranked[column].sum()  # normalise so the shares sum to 1
        # the feature whose share pushes the running total above the threshold is left out
        within = share.cumsum() <= threshold
        return set(ranked.loc[within, 'feature'].tolist())

    def get_selected_features(self, threshold):
        """Return the features that pass the cumulative-importance cut in every fold.

        Args:
            threshold: maximum cumulative share of normalised importance
                (features are accumulated from the most important one down).

        Returns:
            Sorted list of feature indices present in the selection of each fold.

        Raises:
            AssertionError: if no fold importances are available yet.
        """
        # Use the columns that were actually stored instead of range(self.n_folds),
        # so the result does not depend on n_folds staying untouched after fit.
        fold_columns = [] if self.importances is None else [
            column for column in self.importances.columns if str(column).startswith('importance_')]
        if not fold_columns:
            # explicit raise (same exception type as before) so the check survives ``python -O``
            raise AssertionError('Сначала нужно обучить, вызвав метод fit')

        per_fold = [self._select_in_fold(column, threshold) for column in fold_columns]
        # only features selected in every fold are kept
        return sorted(set.intersection(*per_fold))

    @abstractmethod
    def _get_importances_from_model(self, model, x, y, **kwargs):
        """Return one importance value per feature for a fitted ``model``.

        Args:
            model: the classifier fitted on the training part of the fold.
            x, y: validation part of the fold.
            **kwargs: extra options passed through from ``fit``.
        """

In [6]:
# feature selection based on LightGBM's built-in feature importance
class LGMFeatureSelection(BaseFeatureSelector):
    """Selector that ranks features by the importance reported by the fitted booster."""

    def __init__(self, n_folds=5):
        super().__init__(n_folds=n_folds)

    def _get_importances_from_model(self, model, x, y, importance_type='split'):
        """Return the booster's importances; ``x`` and ``y`` are not used.

        Args:
            model: fitted model exposing ``booster_``.
            importance_type: forwarded to ``booster_.feature_importance``.
        """
        booster = model.booster_
        return booster.feature_importance(importance_type=importance_type)

In [7]:
%%time
lgm_fs = LGMFeatureSelection()
lgm_fs.fit(x, y, importance_type='split')

CPU times: user 25.1 s, sys: 163 ms, total: 25.2 s
Wall time: 12.6 s


In [8]:
lgm_fs.importances.head(20)

Unnamed: 0,feature,importance_0,importance_1,importance_2,importance_3,importance_4
0,0,361,350,371,335,349
1,1,386,362,387,412,383
2,2,415,405,414,401,415
3,3,281,240,269,237,330
4,4,429,365,396,413,443
5,5,490,455,495,481,407
6,6,275,298,263,297,326
7,7,328,279,294,332,339
8,8,347,348,359,369,332
9,9,291,252,288,315,293


In [9]:
best_features_lgm_fs = lgm_fs.get_selected_features(threshold=0.7)
len(best_features_lgm_fs)

20

In [10]:
best_features_lgm_fs  # видно, что идут по порядку

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [11]:
# замерим метрику на кросс валидации с отобранными признаками
cross_val_score(LGBMClassifier(verbose=-100), x[:, best_features_lgm_fs], y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc').mean()

0.9644985004458443

In [12]:
# the same approach, but driven by SHAP values
class ShapFeatureSelection(BaseFeatureSelector):
    """Selector that ranks features by their mean absolute SHAP value."""

    def __init__(self, n_folds=5):
        super().__init__(n_folds)

    def _get_importances_from_model(self, model, x, y, feature_perturbation='tree_path_dependent'):
        """Return mean |SHAP value| per feature computed on ``x``; ``y`` is not used.

        Args:
            model: fitted tree model handed to ``shap.TreeExplainer``.
            x: samples to explain.
            feature_perturbation: forwarded to ``shap.TreeExplainer``.
        """
        explainer = shap.TreeExplainer(model, feature_perturbation=feature_perturbation)
        shap_values = explainer.shap_values(x)

        if isinstance(shap_values, list):
            # one array per class: use the values for class index 1
            shap_values = shap_values[1]
        else:
            shap_values = np.asarray(shap_values)
            if shap_values.ndim == 3:
                # Newer SHAP releases return a single (samples, features, outputs)
                # array for multi-output models instead of a list; take the same
                # class index as the list branch so the result stays 1-D per feature.
                shap_values = shap_values[:, :, 1]

        return np.abs(shap_values).mean(0)

In [13]:
%%time
shap_fs = ShapFeatureSelection()
shap_fs.fit(x, y, feature_perturbation='tree_path_dependent')

CPU times: user 1min 5s, sys: 192 ms, total: 1min 5s
Wall time: 23.9 s


In [14]:
shap_fs.importances.head(20)

Unnamed: 0,feature,importance_0,importance_1,importance_2,importance_3,importance_4
0,0,0.750796,0.739108,0.727017,0.738715,0.716016
1,1,0.562528,0.555469,0.50783,0.541028,0.534878
2,2,0.321572,0.375776,0.357566,0.31569,0.343478
3,3,0.084946,0.075137,0.083525,0.088487,0.072988
4,4,0.395545,0.340122,0.361653,0.372098,0.333939
5,5,0.804741,0.798588,0.771174,0.779608,0.76233
6,6,0.099124,0.127984,0.111789,0.129219,0.092679
7,7,0.14537,0.163453,0.143607,0.143696,0.135688
8,8,0.220041,0.193617,0.198519,0.186115,0.187533
9,9,0.103831,0.105477,0.100209,0.104462,0.098064


In [15]:
best_features_shap_fs = shap_fs.get_selected_features(threshold=0.85)
len(best_features_shap_fs)

20

In [17]:
best_features_shap_fs

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [16]:
# замерим метрику на кросс валидации с отобранными признаками
cross_val_score(LGBMClassifier(verbose=-100), x[:, best_features_shap_fs], y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc').mean()

0.9642100994230404

In [18]:
# the most intuitive method, but also the slowest one: permutation importance
class PIFeatureSelection(BaseFeatureSelector):
    """Selector that ranks features by permutation importance on the validation fold."""

    def __init__(self, n_folds=5):
        super().__init__(n_folds)

    def _get_importances_from_model(self, model, x, y, **permutation_kwargs):
        """Return non-negative mean permutation importances.

        Args:
            model: fitted estimator to evaluate.
            x, y: validation data that is permuted and scored.
            **permutation_kwargs: forwarded to
                ``sklearn.inspection.permutation_importance``
                (e.g. ``scoring``, ``n_repeats``, ``n_jobs``).
        """
        result = permutation_importance(model, x, y, **permutation_kwargs)
        # A negative mean means shuffling the feature did not hurt the score, i.e. the
        # feature carries no signal. Clip it to zero instead of taking the absolute
        # value, which would turn such noise into positive importance.
        return np.clip(result['importances_mean'], 0, None)

In [19]:
%%time
pi_fs = PIFeatureSelection()
pi_fs.fit(x, y, scoring='roc_auc', n_jobs=-1, n_repeats=5)

CPU times: user 1min 34s, sys: 557 ms, total: 1min 34s
Wall time: 2min 31s


In [20]:
pi_fs.importances.head(20)

Unnamed: 0,feature,importance_0,importance_1,importance_2,importance_3,importance_4
0,0,0.041881,0.039535,0.045238,0.047403,0.042914
1,1,0.037718,0.031919,0.033416,0.033701,0.029483
2,2,0.017671,0.018634,0.02062,0.020181,0.017859
3,3,0.002808,0.003254,0.002846,0.003017,0.002194
4,4,0.025036,0.024792,0.022989,0.029871,0.021749
5,5,0.070125,0.057745,0.066079,0.068766,0.05408
6,6,0.004505,0.003887,0.002299,0.003179,0.003585
7,7,0.009651,0.007468,0.008905,0.011085,0.008549
8,8,0.01012,0.010996,0.011636,0.012031,0.011487
9,9,0.004242,0.004496,0.00652,0.005576,0.005918


In [21]:
best_features_pi_fs = pi_fs.get_selected_features(threshold=0.99)
len(best_features_pi_fs)

22

In [22]:
best_features_pi_fs

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 41, 94]

In [23]:
# замерим метрику на кросс валидации с отобранными признаками
cross_val_score(LGBMClassifier(verbose=-100), x[:, best_features_pi_fs], y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc').mean()

0.9635214953930229

# Жадные методы отбора

In [28]:
class SequentialForwardFeatureSelection:
    """Greedy sequential forward selection with an optional backward step.

    Features are added one at a time, each time picking the candidate that
    gives the best mean cross-validated score. With ``smooth=True`` every
    addition (once more than two features are selected) is followed by an
    attempt to drop features whose removal improves the score.
    """

    def __init__(self, model, n_folds, k, smooth=False, stratified=True, scoring='roc_auc'):
        """
        Args:
            model: estimator passed to ``cross_val_score``.
            n_folds: number of cross-validation folds.
            k: target number of selected features.
            smooth: enable the backward (deletion) step after each addition.
            stratified: use ``StratifiedKFold`` when true, ``KFold`` otherwise.
            scoring: scoring name passed to ``cross_val_score``.
        """
        self.model = model
        self.n_folds = n_folds
        self.k = k
        self.stratified = stratified  # previously hard-coded to True, ignoring the argument
        self.smooth = smooth
        self.scoring = scoring

    def _make_cv(self):
        """Create a new shuffled splitter of the configured kind."""
        splitter_cls = StratifiedKFold if self.stratified else KFold
        return splitter_cls(n_splits=self.n_folds, shuffle=True)

    def _score_subset(self, x, y, subset):
        """Mean cross-validated score of the model on the columns in ``subset``."""
        return cross_val_score(
            estimator=self.model, X=x[:, subset], y=y, cv=self._make_cv(), scoring=self.scoring).mean()

    def _find_feature_to_add(self, x, y, cur_subset, set_to_choose_from):
        """Return the candidate whose addition to ``cur_subset`` scores best.

        Returns ``None`` when there are no candidates or no score compares
        greater than ``-inf`` (for example when every score is NaN).
        """
        best_score = float('-inf')
        best_feature = None

        print('*'*30)
        print('Looking for feature to ADD...')

        for feature in set_to_choose_from:
            score = self._score_subset(x, y, cur_subset + [feature])
            if score > best_score:
                best_feature = feature
                best_score = score

        print(f'Best score = {best_score:.4f}')
        return best_feature

    def _find_feature_to_delete(self, x, y, cur_subset):
        """Return the feature whose removal improves the score most, or ``None``.

        The score of the full ``cur_subset`` is the baseline; a feature is
        reported only if the subset without it scores strictly higher.
        """
        best_score = self._score_subset(x, y, cur_subset)
        worst_feature = None

        print('='*30)
        print('Looking for feature to DELETE...')
        print(f'Best score = {best_score:.4f}')

        for feature in cur_subset:
            reduced_subset = cur_subset.copy()
            reduced_subset.remove(feature)

            score = self._score_subset(x, y, reduced_subset)
            if score > best_score:
                worst_feature = feature
                best_score = score
        return worst_feature

    def _add_best_feature(self, x, y, best_features, set_to_choose_from):
        """Move the best candidate from the pool into ``best_features`` and return it."""
        best_feature = self._find_feature_to_add(
            x=x, y=y, cur_subset=best_features, set_to_choose_from=set_to_choose_from)
        if best_feature is None:
            raise ValueError('No feature could be selected: the candidate pool is empty '
                             'or no candidate produced a comparable score')
        best_features.append(best_feature)
        print(f'Added feature {best_feature}, current subset = {best_features}')
        set_to_choose_from.remove(best_feature)
        return best_feature

    def _drop_feature(self, feature, best_features, set_to_choose_from):
        """Move ``feature`` from ``best_features`` back into the candidate pool."""
        best_features.remove(feature)
        print(f'Deleted feature {feature}, current subset = {best_features}')
        set_to_choose_from.append(feature)

    def fit(self, x, y):
        """Run the selection and return the chosen column indices.

        Args:
            x: 2-D feature matrix, indexed as ``x[:, columns]``.
            y: target values passed to ``cross_val_score``.

        Returns:
            List of selected column indices in the order they were kept.

        Raises:
            ValueError: if ``k`` exceeds the number of columns in ``x`` or no
                candidate can be scored.
        """
        n_features = x.shape[1]
        if self.k > n_features:
            # fail early instead of crashing after the candidate pool runs dry
            raise ValueError(f'k={self.k} exceeds the number of available features ({n_features})')

        best_features = []
        set_to_choose_from = list(range(n_features))

        # the first feature is always added, regardless of k
        self._add_best_feature(x, y, best_features, set_to_choose_from)

        while len(best_features) < self.k:
            best_feature = self._add_best_feature(x, y, best_features, set_to_choose_from)

            if not (self.smooth and len(best_features) > 2):
                continue

            worst_feature = self._find_feature_to_delete(x=x, y=y, cur_subset=best_features)
            if worst_feature == best_feature:
                # dropping the feature that was just added would undo this step, so stop here
                break

            # keep dropping features while that improves the score and more than two remain
            while worst_feature is not None:
                self._drop_feature(worst_feature, best_features, set_to_choose_from)
                if len(best_features) <= 2:
                    break
                worst_feature = self._find_feature_to_delete(x=x, y=y, cur_subset=best_features)
        return best_features

In [29]:
%%time
# самый долгий

seq_fs = SequentialForwardFeatureSelection(
    model=LGBMClassifier(max_depth=5, n_estimators=500, learning_rate=0.05, verbose=-100),
    n_folds=5,
    k=20)
best_features_seq_fs = seq_fs.fit(x, y)
len(best_features_seq_fs)

******************************
Looking for feature to ADD...
Best score = 0.6676
Added feature 0, current subset = [0]
******************************
Looking for feature to ADD...
Best score = 0.7425
Added feature 12, current subset = [0, 12]
******************************
Looking for feature to ADD...
Best score = 0.7948
Added feature 5, current subset = [0, 12, 5]
******************************
Looking for feature to ADD...
Best score = 0.8323
Added feature 1, current subset = [0, 12, 5, 1]
******************************
Looking for feature to ADD...
Best score = 0.8632
Added feature 2, current subset = [0, 12, 5, 1, 2]
******************************
Looking for feature to ADD...
Best score = 0.8893
Added feature 15, current subset = [0, 12, 5, 1, 2, 15]
******************************
Looking for feature to ADD...
Best score = 0.9056
Added feature 21, current subset = [0, 12, 5, 1, 2, 15, 21]
******************************
Looking for feature to ADD...
Best score = 0.9205
Added featu

20

In [31]:
sorted(best_features_seq_fs)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 20, 21, 22]

In [195]:
# замерим метрику на кросс валидации с отобранными признаками
cross_val_score(LGBMClassifier(verbose=-100), x[:, best_features_seq_fs], y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc').mean()

0.9579893735297735

In [197]:
set(best_features_lgm_fs) & set(best_features_shap_fs), set(best_features_lgm_fs) & set(best_features_pi_fs), set(best_features_pi_fs) & set(best_features_shap_fs)

({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 40})

# Но что, если признаков очень много?

In [196]:
x, y = make_classification(
    n_samples=100000, n_features=1000, n_informative=35, n_redundant=7, n_repeated=8, n_clusters_per_class=4, shift=0.8, scale=3.0, shuffle=False)

In [199]:
%%time
lgm_fs = LGMFeatureSelection()
lgm_fs.fit(x, y, importance_type='split')

CPU times: total: 18min 55s
Wall time: 3min 18s


In [200]:
# замерим метрику на кросс валидации
cross_val_score(LGBMClassifier(verbose=-100), x, y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc').mean()

0.9699239396992393

In [201]:
lgm_fs.importances.head(20)

Unnamed: 0,feature,importance_0,importance_1,importance_2,importance_3,importance_4
0,0,360,358,349,396,354
1,1,370,378,361,366,367
2,2,246,245,255,260,288
3,3,306,299,305,293,313
4,4,283,314,294,298,302
5,5,401,398,375,376,396
6,6,269,259,266,263,251
7,7,319,326,333,313,352
8,8,257,269,279,249,286
9,9,276,257,250,261,273


In [207]:
best_features_lgm_fs = lgm_fs.get_selected_features(threshold=0.85)
len(best_features_lgm_fs)

42

In [208]:
# замерим метрику на кросс валидации с отобранными признаками
cross_val_score(LGBMClassifier(verbose=-100), x[:, best_features_lgm_fs], y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc').mean()

0.9699724176997242

In [204]:
# видимо, имеет смысл осторожнее выбирать отсечку либо объединять результаты нескольких методов