In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [165]:
from sklearn.datasets import make_classification
# Synthetic classification dataset; the fixed random_state keeps the generated x, y reproducible.
x, y = make_classification(scale=1, random_state=1589)

In [166]:
# Results log: check_model records one (name, number of features, accuracy) tuple per evaluated feature set.
models = []

In [167]:
def check_model(x, y, name):
    """Cross-validate a logistic regression on ``x`` and log the result.

    The score is the mean accuracy returned by ``cross_val_score`` (with its
    default splitting), rounded to 4 decimals. The result is stored in the
    module-level ``models`` list as a ``(name, n_features, accuracy)`` tuple.

    Re-running a notebook cell calls this function again with the same
    ``name``; in that case the existing entry is replaced in place instead of
    appending a duplicate row to the summary.

    Note: only the classifier is re-fit inside the folds. Any feature
    selection the caller applied to ``x`` beforehand is not repeated per fold.

    Args:
        x: 2-D feature matrix; ``x.shape[1]`` is recorded as the feature count.
        y: Target labels aligned with the rows of ``x``.
        name: Label identifying this feature set in the summary.

    Returns:
        The rounded mean cross-validated accuracy.
    """
    estimator = LogisticRegression(random_state=1589)
    acc = cross_val_score(estimator, x, y, scoring='accuracy').mean()
    acc = round(acc, 4)

    entry = (name, x.shape[1], acc)
    # Upsert by name so repeated execution of a cell keeps `models` free of duplicates
    # while preserving the order in which methods were first evaluated.
    for position, existing in enumerate(models):
        if existing[0] == name:
            models[position] = entry
            break
    else:
        models.append(entry)
    return acc

In [168]:
# Baseline: score the classifier on the full, unfiltered feature matrix.
check_model(x, y, 'Без отбора')


0.86

# Статистический отбор

In [169]:
# Print the covariance between each feature column and the target.
# np.cov returns a 2x2 matrix; the off-diagonal element [0, 1] is the cross term.
for idx, column in enumerate(x.T):
    print(idx, ' = ', np.cov(column, y)[0, 1])

0  =  0.11001119547980813
1  =  -0.009231576669147978
2  =  -0.049377988793817475
3  =  0.013640489093679199
4  =  0.502999314500629
5  =  0.048228589784666966
6  =  0.03723755104874646
7  =  -0.005629807355083821
8  =  0.011769516361466885
9  =  0.04336993695456568
10  =  0.029169689237087383
11  =  -0.03133795148122613
12  =  0.034414190607436945
13  =  0.45307951985318257
14  =  0.023777256015542578
15  =  -0.46170365912733674
16  =  -0.013413016971818301
17  =  -0.015909557933945957
18  =  -0.013165470622177747
19  =  -0.08457491736055237


Отбираем признаки 0, 4, 6, 15

In [170]:
# Hand-picked column indices, evaluated as their own feature set.
selected_columns = [0, 4, 6, 15]
check_model(x[:, selected_columns], y, 'Корреляция')


0.88

In [171]:
from sklearn.feature_selection import VarianceThreshold

In [172]:
# Keep only the columns that pass the variance threshold of 1, then score them.
variance_filter = VarianceThreshold(threshold=1)
x_high_variance = variance_filter.fit_transform(x)
check_model(x_high_variance, y, 'VarianceThreshold')

0.88

# Дисперсионный анализ

In [173]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [174]:
# Keep the 5 features ranked highest by f_classif, then score them.
anova_selector = SelectKBest(score_func=f_classif, k=5)
x_anova = anova_selector.fit_transform(x, y)
check_model(x_anova, y, 'Дисперсия')

0.89

# Отбор на основе моделей

In [175]:
from sklearn.feature_selection import SelectFromModel

# L1-penalised logistic regression used as the importance source for SelectFromModel.
# threshold='mean' makes the mean importance the cut-off; the estimator is not prefit.
l1_estimator = LogisticRegression(random_state=1589, penalty='l1', solver='liblinear')
lr_selector = SelectFromModel(estimator=l1_estimator, threshold='mean', prefit=False)

In [176]:
# Fit the L1-based selector, keep the columns it retains, and score them.
x_l1 = lr_selector.fit_transform(x, y)
check_model(x_l1, y, 'L1 отбор')


0.9

In [177]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on all features and display its per-feature importances.
rf_selector = RandomForestClassifier(random_state=1589).fit(x, y)
rf_selector.feature_importances_

    


array([0.02807984, 0.01672324, 0.02734461, 0.01761614, 0.28228108,
       0.01913197, 0.01191736, 0.01414359, 0.0176748 , 0.01913196,
       0.01595491, 0.01638639, 0.01678054, 0.12173672, 0.03085703,
       0.23989051, 0.05088359, 0.01459682, 0.01547132, 0.02339757])

In [178]:
# Keep the columns whose forest importance is strictly above the mean importance.
rf_importances = rf_selector.feature_importances_
above_mean_mask = rf_importances > rf_importances.mean()
check_model(x[:, above_mean_mask], y, 'RandomForest')

0.89

# Перебор

In [179]:
from sklearn.feature_selection import SequentialFeatureSelector

# Forward sequential search driven by a logistic regression;
# 'auto' leaves the number of selected features to the selector's default rule.
forward_search = SequentialFeatureSelector(
    estimator=LogisticRegression(random_state=1589),
    direction="forward",
    n_features_to_select='auto',
)
selector = forward_search.fit(x, y)

In [180]:
# Boolean mask over the columns of x: True marks a feature kept by the forward search.
selector.support_

array([ True,  True,  True, False,  True,  True, False,  True,  True,
       False,  True, False,  True, False, False,  True, False, False,
       False, False])

In [181]:
# Score the subset of columns chosen by the sequential selector.
kept_mask = selector.support_
check_model(x[:, kept_mask], y, 'Перебор')

0.89

# Итог

In [183]:
# One summary line per evaluated feature set: label, accuracy, number of features.
for name, n_features, acc in models:
    print(f'{name:20}: ACC: {acc:.4f}, Признаков = {n_features}')

Без отбора          : ACC: 0.8600, Признаков = 20
Корреляция          : ACC: 0.8800, Признаков = 4
VarianceThreshold   : ACC: 0.8800, Признаков = 14
Дисперсия           : ACC: 0.8900, Признаков = 5
L1 отбор            : ACC: 0.9000, Признаков = 5
RandomForest        : ACC: 0.8900, Признаков = 4
Перебор             : ACC: 0.8900, Признаков = 10


Лучший результат дал отбор на основе L1.

В целом каждый из методов отбора повысил точность модели по сравнению с вариантом без отбора (0.86).
