# Ноутбук по отбору признаков. Все вместе

In [70]:
from abc import ABC, abstractmethod
import copy

import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, train_test_split
from sklearn.inspection import permutation_importance

from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import shap

import warnings
warnings.filterwarnings('ignore')

In [14]:
x, y = make_classification(
    n_samples=10000,
    n_features=100,
    n_informative=15,
    n_redundant=5,
    weights=(0.8, 0.1, 0.1),
    n_classes=3,
    n_repeated=5,
    n_clusters_per_class=4,
    shift=0.8,
    scale=3.0,
    shuffle=False)

In [31]:
cross_val_score(
    LGBMClassifier(verbose=-100), x, y, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='f1_macro').mean()

0.6416290405617843

In [26]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, stratify=y)

In [36]:
model = LGBMClassifier(max_depth=5, n_estimators=500, learning_rate=0.05, verbose=-100)
model.fit(x_train, y_train)

In [37]:
print(classification_report(y_val, model.predict(x_val)))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      2382
           1       0.71      0.33      0.45       308
           2       0.78      0.27      0.41       310

    accuracy                           0.85      3000
   macro avg       0.78      0.53      0.59      3000
weighted avg       0.84      0.85      0.82      3000



In [32]:
class BaseFeatureSelector(ABC):
    def __init__(self, n_folds=5):
        self.n_folds = n_folds
        self.importances = None

    def fit(self, x, y, **importances_kwargs):
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True)  # можно число фолдов и прочие параметры

        self.importances = pd.DataFrame({'feature': np.arange(x.shape[1])})

        for i, (train_index, val_index) in enumerate(skf.split(x, y)):
            x_train, y_train = x[train_index], y[train_index]
            x_val, y_val = x[val_index], y[val_index]

            model = LGBMClassifier(max_depth=5, n_estimators=500, learning_rate=0.05, verbose=-100)
            model.fit(x_train, y_train, eval_set=(x_val, y_val))

            imp = self._get_importances_from_model(model, x_val, y_val, **importances_kwargs)

            self.importances[f'importance_{i}'] = imp

    def get_selected_features(self, threshold):
        assert self.importances is not None, 'Сначала нужно обучить, вызвав метод fit'

        # сделаем отдельно для 0 итерации
        imps = self.importances.loc[:, ['feature', 'importance_0']].sort_values('importance_0', ascending=False)  # выберем важности признаков с 0 итерации
        imps['importance_0'] /= imps['importance_0'].sum()
        imps['cumsum'] = imps['importance_0'].cumsum()  # так как мы их отнормировали, может посчитать кумулятивную сумму
        features = imps.loc[imps['cumsum'] <= threshold, 'feature'].tolist()  # возьмем только те признаки, которые по кумулятивной сумме удовлетворяют
        
        best_features = set(features)  # сделаем множество
        for i in range(1, self.n_folds):
            imps = self.importances.loc[:, ['feature', f'importance_{i}']].sort_values(f'importance_{i}', ascending=False)
            imps[f'importance_{i}'] /= imps[f'importance_{i}'].sum()
            imps['cumsum'] = imps[f'importance_{i}'].cumsum()
            features = imps.loc[imps['cumsum'] <= threshold, 'feature'].tolist()

            best_features &= set(features)  # смотрим на пересечения множеств на разных итерациях кросс-валидации

        return list(best_features)
        
    @abstractmethod
    def _get_importances_from_model(self, model, x, y, **kwargs):
        pass
    
    
class LGMFeatureSelection(BaseFeatureSelector):
    def __init__(self, n_folds=5):
        super().__init__(n_folds)

    def _get_importances_from_model(self, model, x, y, importance_type='split'):
        return model.booster_.feature_importance(importance_type=importance_type)
    

class ShapFeatureSelection(BaseFeatureSelector):
    def __init__(self, n_folds=5):
        super().__init__(n_folds)

    def _get_importances_from_model(self, model, x, y, 
                                    is_multiclass=False, feature_perturbation='tree_path_dependent'):
        explainer = shap.TreeExplainer(model, feature_perturbation=feature_perturbation)
        shap_values = explainer.shap_values(x)  # for each class, for each instance

        if is_multiclass:
            if isinstance(shap_values, list):
                # if shap_values in list of n_classes np.arrays of shape [n_samples, n_features]
                importances = []
                for cls_ in shap_values:
                    cls_value = np.abs(cls_).mean(axis=0)
                    importances.append(cls_value.reshape(1, -1))
                    
                importances = np.concatenate(importances).mean(axis=0)
            else:
                # if shap_values in np.array of shape [n_samples, n_features, n_classes]
                importances = np.abs(shap_values).mean(axis=(0, 2))
        else:
            if isinstance(shap_values, list):
                # if shap_values in list of n_classes np.arrays of shape [n_samples, n_features]
                importances = np.abs(shap_values[1]).mean(axis=0)
            else:
                # if shap_values in np.array of shape [n_samples, n_features]
                importances = np.abs(shap_values).mean(axis=0)
        return importances

# Первый этап: отбираем признаки с помощью FeatureImportance и SHAP

In [33]:
lgm_fs = LGMFeatureSelection()
lgm_fs.fit(x, y)

In [34]:
shap_fs = ShapFeatureSelection()
shap_fs.fit(x, y, is_multiclass=True)

In [35]:
lgm_features = lgm_fs.get_selected_features(threshold=0.85)
shap_features = shap_fs.get_selected_features(threshold=0.85)

selected = set(lgm_features) & set(shap_features)
len(selected)

23

In [38]:
model = LGBMClassifier(max_depth=5, n_estimators=500, learning_rate=0.05, verbose=-100)
model.fit(x_train[:, list(selected)], y_train)

In [39]:
print(classification_report(y_val, model.predict(x_val[:, list(selected)])))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      2382
           1       0.76      0.48      0.59       308
           2       0.82      0.49      0.61       310

    accuracy                           0.88      3000
   macro avg       0.83      0.65      0.71      3000
weighted avg       0.87      0.88      0.87      3000



# Второй этап: пройдем по отобранным признакам жадным алгоритмом

In [40]:
from catboost import CatBoostClassifier, Pool, EFeaturesSelectionAlgorithm, EShapCalcType

model = CatBoostClassifier(iterations=1000, verbose=False)
summary = model.select_features(
    x_train,
    y_train,
    eval_set=(x_val, y_val),
    features_for_select=list(selected),     # we will select from all features
    num_features_to_select=15,  # we want to select exactly important features
    steps=len(selected)-15,                                     # more steps - more accurate selection
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,            # can be Approximate, Regular and Exact
    train_final_model=False,                          # to train model with selected features
    logging_level='Silent',
    plot=False
)

In [41]:
summary

{'selected_features': [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 17],
 'eliminated_features_names': ['', '', '', '', '', '', '', ''],
 'loss_graph': {'main_indices': [0, 1, 2, 3, 4, 5, 6, 7],
  'removed_features_count': [0, 1, 2, 3, 4, 5, 6, 7, 8],
  'loss_values': [0.35820846120472627,
   0.3543908311093811,
   0.3484200519187857,
   0.3546469304158924,
   0.3578820160975885,
   0.3540652244856185,
   0.3519230538950761,
   0.3591183702395144,
   0.3637786238206219]},
 'eliminated_features': [59, 40, 46, 15, 16, 18, 6, 19],
 'selected_features_names': ['',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '']}

In [77]:
def get_feature_to_drop(model, x, features, feature_perturbation='tree_path_dependent'):
    explainer = shap.TreeExplainer(model, feature_perturbation=feature_perturbation)
    shap_values = explainer.shap_values(x)

#     importances = np.abs(shap_values)[:, :, 1].mean(axis=(0))
    importances = np.abs(shap_values).mean(axis=(0, 2))  # mean overall for multiclass
#     importances = model.booster_.feature_importance(importance_type='split')
    imps = pd.DataFrame({'feature': features, 'importance': importances})\
    .sort_values('importance', ascending=False).reset_index(drop=True)
    
    
    return imps, imps.iloc[-1]['feature']


def recursive_feature_elimination(model, x_train, y_train, x_val, y_val, k, features=None):
    n_features = x.shape[1]
    features = features or list(range(x.shape[1]))
    
    while len(features) > k:
        model_ = copy.deepcopy(model)
        model_.fit(x_train[:, features], y_train, eval_set=(x_val, y_val))
        score = model_.best_score_['valid_0']
        
        imps, feature_to_delete = get_feature_to_drop(model_, x_val[:, features], features)
        
        print(imps.tail(5))
        print(f'F1 macro: {f1_score(y_val, model_.predict(x_val[:, features]), average="macro")}')
        print(f'Eliminated feature: {feature_to_delete}, score: {score}')
        features.remove(feature_to_delete)
    return features

In [78]:
model = LGBMClassifier(max_depth=5, n_estimators=500, learning_rate=0.05, verbose=-100)
bf = recursive_feature_elimination(model, x_train, y_train, x_val, y_val, features=list(selected), k=10)

    feature  importance
18       16    0.091313
19       19    0.088026
20       40    0.080657
21       59    0.079456
22       15    0.056376
F1 macro: 0.7129263408057088
Eliminated feature: 15.0, score: OrderedDict([('multi_logloss', 0.3511714414253112)])
    feature  importance
17       18    0.108647
18       46    0.093642
19       19    0.089021
20       40    0.082309
21       59    0.080921
F1 macro: 0.71236241781039
Eliminated feature: 59.0, score: OrderedDict([('multi_logloss', 0.4482648872221985)])
    feature  importance
16       13    0.118919
17       46    0.095389
18       16    0.094524
19       19    0.092853
20       40    0.080854
F1 macro: 0.718121515388447
Eliminated feature: 40.0, score: OrderedDict([('multi_logloss', 0.4222725326088195)])
    feature  importance
15       13    0.127045
16       16    0.119332
17       18    0.108027
18       46    0.098294
19       19    0.092632
F1 macro: 0.7273602434231928
Eliminated feature: 19.0, score: OrderedDict([('multi

# Смотрим какие фичи дают максимальную метрику + минимальный лосс, обрежем

In [79]:
bf

[0, 1, 2, 3, 4, 5, 7, 8, 11, 17]

In [84]:
selected - set([15, 59, 40, 19, 18, 46])

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17}

In [85]:
model = LGBMClassifier(max_depth=5, n_estimators=500, learning_rate=0.05, verbose=-100)
model.fit(x_train[:, list(selected - set([15, 59, 40, 19, 18, 46]))], y_train)

In [86]:
print(classification_report(y_val, model.predict(x_val[:, list(selected - set([15, 59, 40, 19, 18, 46]))])))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      2382
           1       0.77      0.55      0.64       308
           2       0.84      0.51      0.63       310

    accuracy                           0.89      3000
   macro avg       0.84      0.68      0.74      3000
weighted avg       0.89      0.89      0.88      3000

