# Шаг 1: Подготовка

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Read the whole dataset from the CSV file in the working directory.
data = pd.read_csv('data.csv')

# Positional split: the last column is the target variable,
# every column before it is a feature.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [5]:
# Dataset dimensions as (rows, columns); the column count includes the target column.
data.shape

(35218, 156)

In [6]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Шаг 2: Обучение модели ближайших соседей для K=5

In [10]:
# Baseline model: k-nearest-neighbours classifier with K=5, fitted on the training split.
# fit() returns the estimator itself, so `knn` is the fitted classifier.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Evaluate the baseline on the held-out test split:
# accuracy, per-class precision/recall/F1, and the confusion matrix.
baseline_reports = (
    ("Точность:", accuracy_score),
    ("Отчёт Классификации:\n", classification_report),
    ("Матрица Ошибок:\n", confusion_matrix),
)
for label, metric in baseline_reports:
    print(label, metric(y_test, y_pred))

Точность: 0.9645088018171494
Отчёт Классификации:
               precision    recall  f1-score   support

         0.0       0.96      0.97      0.96      3540
         1.0       0.97      0.96      0.96      3504

    accuracy                           0.96      7044
   macro avg       0.96      0.96      0.96      7044
weighted avg       0.96      0.96      0.96      7044

Матрица Ошибок:
 [[3445   95]
 [ 155 3349]]


Точность базовой модели (K=5) на тестовой выборке: 96.45%

# Шаг 3: Подбор гиперпараметра K с использованием GridSearchCV и RandomizedSearchCV

In [8]:
# Candidate values of K (number of neighbours): 1 through 30 inclusive.
param_grid = {'n_neighbors': range(1, 31)}

# GridSearchCV: exhaustive search — every K in the grid is scored with 5-fold
# cross-validation on the training split only (the test split is not touched).
# n_jobs=-1 runs the independent (candidate, fold) fits on all CPU cores;
# it only affects wall-clock time, not the scores or the selected K.
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters (GridSearchCV):", grid_search.best_params_)
print("Best score (GridSearchCV):", grid_search.best_score_)

# RandomizedSearchCV: scores only n_iter=10 candidates sampled from the same grid;
# random_state fixes which candidates are sampled so the run is reproducible.
random_search = RandomizedSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
print("Best parameters (RandomizedSearchCV):", random_search.best_params_)
print("Best score (RandomizedSearchCV):", random_search.best_score_)

Best parameters (GridSearchCV): {'n_neighbors': 1}
Best score (GridSearchCV): 0.9658905006647748
Best parameters (RandomizedSearchCV): {'n_neighbors': 1}
Best score (RandomizedSearchCV): 0.9658905006647748


# Шаг 4: Сравнение метрик качества исходной и оптимальной моделей

In [9]:
# GridSearchCV was created with the default refit=True, so after grid_search.fit(...)
# best_estimator_ is already fitted on the whole training split with the best K.
# Fitting it again on the same data would only repeat that work.
optimal_knn = grid_search.best_estimator_
y_pred_optimal = optimal_knn.predict(X_test)

# Same metrics as for the baseline model, computed on the same test split,
# so the two models can be compared directly.
print("Optimal KNN Accuracy:", accuracy_score(y_test, y_pred_optimal))
print("Optimal KNN Classification Report:\n", classification_report(y_test, y_pred_optimal))
print("Optimal KNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_optimal))

# Cross-validation with two splitting strategies, both 5 shuffled folds with a fixed seed:
# plain KFold and StratifiedKFold.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Both runs score the tuned model on the full dataset (X, y), not only on the training split.
cv_results_kfold = cross_val_score(optimal_knn, X, y, cv=kfold)
cv_results_stratified_kfold = cross_val_score(optimal_knn, X, y, cv=stratified_kfold)

# Per-fold scores followed by their mean, for each strategy in turn.
cv_summaries = (
    (
        "Cross-validation results (KFold):",
        "Mean CV accuracy (KFold):",
        cv_results_kfold,
    ),
    (
        "Cross-validation results (StratifiedKFold):",
        "Mean CV accuracy (StratifiedKFold):",
        cv_results_stratified_kfold,
    ),
)
for results_label, mean_label, fold_scores in cv_summaries:
    print(results_label, fold_scores)
    print(mean_label, fold_scores.mean())

Optimal KNN Accuracy: 0.9697614991482112
Optimal KNN Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      3540
         1.0       0.97      0.97      0.97      3504

    accuracy                           0.97      7044
   macro avg       0.97      0.97      0.97      7044
weighted avg       0.97      0.97      0.97      7044

Optimal KNN Confusion Matrix:
 [[3440  100]
 [ 113 3391]]
Cross-validation results (KFold): [0.9697615  0.96876775 0.96805792 0.96776942 0.96663354]
Mean CV accuracy (KFold): 0.9681980239339378
Cross-validation results (StratifiedKFold): [0.96749006 0.96876775 0.96550256 0.96776942 0.96677552]
Mean CV accuracy (StratifiedKFold): 0.9672610603332832
