# Ноутбук по отбору признаков 2

[mlxtend SequentialFeatureSelection tutorial](https://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/)

[Catboost Feature Selection Tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/feature_selection/select_features_tutorial.ipynb)

In [66]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

from lightgbm import LGBMClassifier

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import shap

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [35]:
# создадим искусственный датасет, в котором будут информативные признаки, их дубликаты, их комбинации и шумовые признаки
x, y = make_classification(
    n_samples=1000, n_classes=3, n_features=100, n_informative=15, n_redundant=5, n_repeated=5, n_clusters_per_class=4, shift=0.3, scale=3.0, shuffle=False)

# если не ставить параметр shuffle в True, то сначала будут идти информативные признаки, потом комбинации и потом повторы, после них - мусор
# так удобнее оценить как это все работает

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Метрики

In [29]:
model = LGBMClassifier(verbose=-100)

In [52]:
cross_val_score(model, x, y, cv=4, n_jobs=-1, scoring='f1_macro').mean()

0.32249425592989556

In [37]:
model.fit(x_train, y_train)

In [38]:
print(classification_report(y_test, model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.61      0.45      0.52       116
           1       0.50      0.66      0.57        87
           2       0.61      0.64      0.63        97

    accuracy                           0.57       300
   macro avg       0.58      0.58      0.57       300
weighted avg       0.58      0.57      0.57       300



In [42]:
confmatrix = confusion_matrix(y_test, model.predict(x_test))
print(confmatrix)

[[52 38 26]
 [17 57 13]
 [16 19 62]]


In [48]:
precisions = []
recalls = []
f1s = []
for cls in range(3):
    print('*'*30)
    print(f"Class: {cls}")
    precision = confmatrix[cls][cls] / (confmatrix[:, cls].sum())
    print(f"Precision: {precision}")
    recall = confmatrix[cls][cls] / confmatrix[cls].sum()
    print(f"Recall: {recall}")
    f1_ = 2 * (precision*recall/(precision+recall))
    print(f"F1-score: {f1_}")
    
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1_)

print('='*30)
print(f"Predicison macro: {np.mean(precisions)}")
print(f"Recall macro: {np.mean(recalls)}")
print(f"F1 macro: {np.mean(f1s)}")

******************************
Class: 0
Precision: 0.611764705882353
Recall: 0.4482758620689655
F1-score: 0.5174129353233832
******************************
Class: 1
Precision: 0.5
Recall: 0.6551724137931034
F1-score: 0.5671641791044776
******************************
Class: 2
Precision: 0.6138613861386139
Recall: 0.6391752577319587
F1-score: 0.6262626262626262
Predicison macro: 0.5752086973403223
Recall macro: 0.5808745111980093
F1 macro: 0.5702799135634957


# Sequential Feature Selection

In [51]:
sfs1 = SFS(model, 
           k_features=25, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='f1_macro',
           cv=4,
           n_jobs=-1
          )

sfs1 = sfs1.fit(x, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished

[2024-08-23 11:51:17] Features: 1/25 -- score: 0.3819632926452176[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:  1.6min finished

[2024-08-23 11:52:56] Features: 2/25 -- score: 0.3987628901059996[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done  98 out of  98 | elapsed:  1.7min finished

[2024-08-23 11:54:41] Features: 3/25 -- score: 0.4271921531444193[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done  97 out o

In [53]:
sfs1.subsets_

{1: {'feature_idx': (3,),
  'cv_scores': array([0.3617878 , 0.33632174, 0.43118659, 0.39855704]),
  'avg_score': 0.3819632926452176,
  'feature_names': ('3',)},
 2: {'feature_idx': (3, 14),
  'cv_scores': array([0.32167069, 0.38418301, 0.43232513, 0.45687273]),
  'avg_score': 0.3987628901059996,
  'feature_names': ('3', '14')},
 3: {'feature_idx': (3, 6, 14),
  'cv_scores': array([0.38921845, 0.35095965, 0.48324144, 0.48534907]),
  'avg_score': 0.4271921531444193,
  'feature_names': ('3', '6', '14')},
 4: {'feature_idx': (3, 6, 14, 95),
  'cv_scores': array([0.40228077, 0.39500091, 0.50739449, 0.48402092]),
  'avg_score': 0.4471742732786563,
  'feature_names': ('3', '6', '14', '95')},
 5: {'feature_idx': (3, 6, 14, 50, 95),
  'cv_scores': array([0.39418935, 0.39410691, 0.52587979, 0.49411699]),
  'avg_score': 0.4520732612414541,
  'feature_names': ('3', '6', '14', '50', '95')},
 6: {'feature_idx': (3, 6, 14, 43, 50, 95),
  'cv_scores': array([0.41913608, 0.37591752, 0.51380756, 0.52009

In [54]:
sfs1.k_feature_idx_

(3,
 6,
 7,
 14,
 21,
 27,
 30,
 31,
 36,
 38,
 41,
 42,
 43,
 46,
 50,
 59,
 60,
 63,
 66,
 73,
 81,
 82,
 90,
 95,
 96)

In [55]:
pd.DataFrame.from_dict(sfs1.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(3,)","[0.36178780240700625, 0.33632174277335575, 0.4...",0.381963,"(3,)",0.057732,0.036016,0.020794
2,"(3, 14)","[0.32167069186457947, 0.38418300653594767, 0.4...",0.398763,"(3, 14)",0.082748,0.051621,0.029803
3,"(3, 6, 14)","[0.3892184498015623, 0.35095965121329736, 0.48...",0.427192,"(3, 6, 14)",0.094076,0.058688,0.033884
4,"(3, 6, 14, 95)","[0.40228077161805303, 0.3950009122422915, 0.50...",0.447174,"(3, 6, 14, 95)",0.079026,0.049299,0.028463
5,"(3, 6, 14, 50, 95)","[0.3941893509324454, 0.39410691479656995, 0.52...",0.452073,"(3, 6, 14, 50, 95)",0.094582,0.059004,0.034066
6,"(3, 6, 14, 43, 50, 95)","[0.41913607548282467, 0.37591752290547475, 0.5...",0.457239,"(3, 6, 14, 43, 50, 95)",0.098866,0.061676,0.035609
7,"(3, 6, 14, 43, 50, 60, 95)","[0.38192116075327753, 0.37742533262642325, 0.5...",0.465943,"(3, 6, 14, 43, 50, 60, 95)",0.138401,0.08634,0.049848
8,"(3, 6, 14, 21, 43, 50, 60, 95)","[0.38192116075327753, 0.37742533262642325, 0.5...",0.465943,"(3, 6, 14, 21, 43, 50, 60, 95)",0.138401,0.08634,0.049848
9,"(3, 6, 14, 21, 30, 43, 50, 60, 95)","[0.43327454438565544, 0.4113562849051126, 0.50...",0.458251,"(3, 6, 14, 21, 30, 43, 50, 60, 95)",0.059297,0.036992,0.021357
10,"(3, 6, 7, 14, 21, 30, 43, 50, 60, 95)","[0.4098992540763846, 0.40078139889460634, 0.54...",0.463631,"(3, 6, 7, 14, 21, 30, 43, 50, 60, 95)",0.096317,0.060086,0.034691


# Recursive based on model

In [142]:
def get_feature_to_drop(model, x, features, feature_perturbation='tree_path_dependent'):
    explainer = shap.TreeExplainer(model, feature_perturbation=feature_perturbation)
    shap_values = explainer.shap_values(x)

#     importances = np.abs(shap_values)[:, :, 1].mean(axis=(0))
    importances = np.abs(shap_values).mean(axis=(0, 2))  # mean overall for multiclass
#     importances = model.booster_.feature_importance(importance_type='split')
    imps = pd.DataFrame({'feature': features, 'importance': importances})\
    .sort_values('importance', ascending=False).reset_index(drop=True)
    
    
    return imps, imps.iloc[-1]['feature']

In [143]:
import copy

def recursive_feature_elimination(model, x_train, y_train, x_val, k):
    n_features = x.shape[1]
    features = list(range(x.shape[1]))
    
    while len(features) > k:
        model_ = copy.deepcopy(model)
        model_.fit(x_train[:, features], y_train)
        
        _, feature_to_delete = get_feature_to_drop(model_, x_val[:, features], features)
        
        print(f'Eliminated feature: {feature_to_delete}')
        features.remove(feature_to_delete)
    return features

In [144]:
model = LGBMClassifier(verbose=-100)
bf = recursive_feature_elimination(model, x_train, y_train, x_test, 15)

Eliminated feature: 20.0
Eliminated feature: 21.0
Eliminated feature: 22.0
Eliminated feature: 23.0
Eliminated feature: 24.0
Eliminated feature: 55.0
Eliminated feature: 62.0
Eliminated feature: 53.0
Eliminated feature: 77.0
Eliminated feature: 26.0
Eliminated feature: 39.0
Eliminated feature: 85.0
Eliminated feature: 98.0
Eliminated feature: 72.0
Eliminated feature: 29.0
Eliminated feature: 67.0
Eliminated feature: 25.0
Eliminated feature: 44.0
Eliminated feature: 59.0
Eliminated feature: 40.0
Eliminated feature: 89.0
Eliminated feature: 36.0
Eliminated feature: 52.0
Eliminated feature: 84.0
Eliminated feature: 58.0
Eliminated feature: 47.0
Eliminated feature: 56.0
Eliminated feature: 71.0
Eliminated feature: 57.0
Eliminated feature: 48.0
Eliminated feature: 95.0
Eliminated feature: 51.0
Eliminated feature: 37.0
Eliminated feature: 73.0
Eliminated feature: 38.0
Eliminated feature: 43.0
Eliminated feature: 45.0
Eliminated feature: 33.0
Eliminated feature: 83.0
Eliminated feature: 35.0


In [145]:
bf

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17]

# CatBoost 

# Тоже рекурсивный

In [146]:
from catboost import CatBoostClassifier, Pool, EFeaturesSelectionAlgorithm, EShapCalcType

In [147]:
model = CatBoostClassifier(verbose=False)

In [148]:
summary = model.select_features(
    x_train,
    y_train,
    eval_set=(x_test, y_test),
    features_for_select=list(range(x_train.shape[1])),     # we will select from all features
    num_features_to_select=15,  # we want to select exactly important features
    steps=100-15,                                     # more steps - more accurate selection
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,            # can be Approximate, Regular and Exact
    train_final_model=False,                          # to train model with selected features
    logging_level='Silent',
    plot=False
)

In [149]:
summary

{'selected_features': [0, 1, 2, 4, 7, 8, 9, 10, 11, 13, 14, 15, 17, 19, 24],
 'eliminated_features_names': ['',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 'loss_graph': {'main_indices': [0,
   2,
   4,
   6,
   9,
   11,
   13,
   14,
   16,
   18,
   20,
   22,
   23,
   25,
   27,
   28,
   30,
   32,
   33,
   35,
   36,
   37,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   63,
   64,
   65,
   66,
   67,
   67,
 