In [21]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, SelectFromModel, SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Генерируем данные
x_data_generated, y_data_generated = make_classification(scale=1)

In [23]:
# Базовая модель логистической регрессии
base_model = LogisticRegression()
base_accuracy = cross_val_score(base_model, x_data_generated, y_data_generated, scoring='accuracy').mean()


In [24]:
# Отбор признаков на основе корреляции
corr_matrix = np.corrcoef(x_data_generated, rowvar=False)
high_corr_indices = np.where(np.abs(corr_matrix) > 0.7)
features_to_remove = set()
for i, j in zip(*high_corr_indices):
    if i != j and i not in features_to_remove:
        features_to_remove.add(i)
selected_features_corr = [i for i in range(x_data_generated.shape[1]) if i not in features_to_remove]
model_corr = LogisticRegression()
accuracy_corr = cross_val_score(model_corr, x_data_generated[:, selected_features_corr], y_data_generated, scoring='accuracy').mean()


In [25]:
# Отбор низковариативных признаков
variance_selector = VarianceThreshold(threshold=0.1)
x_data_low_variance = variance_selector.fit_transform(x_data_generated)
model_low_variance = LogisticRegression()
accuracy_low_variance = cross_val_score(model_low_variance, x_data_low_variance, y_data_generated, scoring='accuracy').mean()


In [26]:
# Отбор признаков на основе дисперсионного анализа
selector_f_classif = SelectKBest(f_classif, k=5)
x_data_f_classif = selector_f_classif.fit_transform(x_data_generated, y_data_generated)
model_f_classif = LogisticRegression()
accuracy_f_classif = cross_val_score(model_f_classif, x_data_f_classif, y_data_generated, scoring='accuracy').mean()


In [27]:
# Отбор признаков с использованием логистической регрессии (L1 регуляризация)
selector_l1 = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))
x_data_l1 = selector_l1.fit_transform(x_data_generated, y_data_generated)
model_l1 = LogisticRegression()
accuracy_l1 = cross_val_score(model_l1, x_data_l1, y_data_generated, scoring='accuracy').mean()


In [28]:
# Отбор признаков с использованием модели RandomForest
selector_rf = SelectFromModel(RandomForestClassifier(n_estimators=100))
x_data_rf = selector_rf.fit_transform(x_data_generated, y_data_generated)
model_rf = LogisticRegression()
accuracy_rf = cross_val_score(model_rf, x_data_rf, y_data_generated, scoring='accuracy').mean()


In [29]:
# Перебор признаков с использованием SequentialFeatureSelector
sfs = SequentialFeatureSelector(base_model, n_features_to_select=5)
x_data_sfs = sfs.fit_transform(x_data_generated, y_data_generated)
model_sfs = LogisticRegression()
accuracy_sfs = cross_val_score(model_sfs, x_data_sfs, y_data_generated, scoring='accuracy').mean()


In [30]:
# Вывод результатов
print("| Способ выбора признаков     | Количество признаков| Средняя точность модели |")
print("|-----------------------------|---------------------|-------------------------|")
print(f"| Базовая модель              | {x_data_generated.shape[1]}                  | {base_accuracy:.4f}                  |")
print(f"| Корреляция                  | {len(selected_features_corr)}                  | {accuracy_corr:.4f}                  |")
print(f"| Низковариативные признаки   | {x_data_low_variance.shape[1]}                  | {accuracy_low_variance:.4f}                  |")
print(f"| Дисперсионный анализ        | {x_data_f_classif.shape[1]}                   | {accuracy_f_classif:.4f}                  |")
print(f"| Логистическая регрессия (L1)| {x_data_l1.shape[1]}                  | {accuracy_l1:.4f}                  |")
print(f"| Модель RandomForest         | {x_data_rf.shape[1]}                   | {accuracy_rf:.4f}                  |")
print(f"| SequentialFeatureSelector   | {x_data_sfs.shape[1]}                   | {accuracy_sfs:.4f}                  |")


| Способ выбора признаков     | Количество признаков| Средняя точность модели |
|-----------------------------|---------------------|-------------------------|
| Базовая модель              | 20                  | 0.7900                  |
| Корреляция                  | 17                  | 0.5200                  |
| Низковариативные признаки   | 20                  | 0.7900                  |
| Дисперсионный анализ        | 5                   | 0.8200                  |
| Логистическая регрессия (L1)| 15                  | 0.8000                  |
| Модель RandomForest         | 3                   | 0.8000                  |
| SequentialFeatureSelector   | 5                   | 0.8500                  |
