<a href="https://colab.research.google.com/github/UtwoA/Introduction_to_ML/blob/main/ml4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from google.colab import files

In [2]:
# Column labels for the header-less data file, listed in file order.
column_names = [
    'name',
    'hobby',
    'age',
    'educational_level',
    'marital_status',
    'class',
]

# The path is relative, so hayes-roth.data must be in the current working directory.
df = pd.read_csv('hayes-roth.data', names=column_names, header=None)
print("Первые 5 строк датасета:", df.head(), sep="\n")

Первые 5 строк датасета:
   name  hobby  age  educational_level  marital_status  class
0    92      2    1                  1               2      1
1    10      2    1                  3               2      2
2    83      3    1                  4               1      3
3    61      2    4                  2               2      3
4   107      1    1                  3               4      3


In [3]:
# Predictors are the four attribute columns; 'name' is deliberately not among them
# (presumably a record identifier -- confirm against the dataset description).
feature_cols = ['hobby', 'age', 'educational_level', 'marital_status']
# X holds the feature matrix, y the label column to predict.
X, y = df[feature_cols], df['class']

In [4]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class proportions
# similar in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [5]:
# Two-step estimator: standardize the features, then classify with k-nearest neighbours.
# The step names ('scaler', 'knn') are the prefixes used to address hyperparameters
# in a parameter grid (e.g. 'knn__n_neighbors').
# NOTE(review): the attributes look like small integer category codes -- confirm that
# scaling them as numeric values is the intended treatment.
steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps=steps)

In [6]:
# Hyperparameter grid for the 'knn' step of the pipeline (keys use the
# '<step name>__<parameter>' convention).
#
# 'minkowski' is intentionally NOT listed as a metric: the classifier's `p` parameter is
# left at its default of 2, and Minkowski distance with p=2 is exactly the Euclidean
# distance. Listing both only adds duplicate candidates that are fitted and
# cross-validated without ever producing a different score.
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9],           # neighbourhood sizes to try
    'knn__weights': ['uniform', 'distance'],    # equal votes vs. inverse-distance votes
    'knn__metric': ['euclidean', 'manhattan'],  # L2 and L1 distances
}

In [7]:
# Exhaustive search over param_grid: every combination is scored by accuracy with
# 5-fold cross-validation; n_jobs=-1 lets the fits run in parallel on all processors.
grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=5, n_jobs=-1)

In [8]:
# Run the search on the training split only. fit() returns the fitted search object,
# so the winning pipeline can be read straight off the call.
best_model = grid_search.fit(X_train, y_train).best_estimator_

# Report the winning hyperparameters and their mean cross-validated accuracy.
print("\nЛучшие параметры модели:", grid_search.best_params_)
print("Лучшая точность кросс-валидации:", grid_search.best_score_)



Лучшие параметры модели: {'knn__metric': 'euclidean', 'knn__n_neighbors': 9, 'knn__weights': 'distance'}
Лучшая точность кросс-валидации: 0.7081871345029239


In [9]:
# Predict on both splits with the tuned pipeline so the training and test scores
# can be compared side by side.
y_train_pred, y_test_pred = (best_model.predict(features) for features in (X_train, X_test))


In [10]:
def print_metrics(y_true, y_pred, dataset_name):
    """Print accuracy and macro-averaged precision, recall and F1 for one data split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        dataset_name: Label for the split, interpolated into the header line.

    Returns:
        None. The report is written to stdout and ends with a 40-dash separator.
    """
    print(f"\nМетрики для {dataset_name}:")
    print("Accuracy:", accuracy_score(y_true, y_pred))

    # The per-class metrics all share one call shape, so report them from a table.
    macro_metrics = (
        ("Precision (macro):", precision_score),
        ("Recall (macro):", recall_score),
        ("F1-score (macro):", f1_score),
    )
    for caption, metric in macro_metrics:
        print(caption, metric(y_true, y_pred, average='macro'))

    print("-" * 40)


In [11]:
# Report the same metric set for the training split first, then the held-out test split.
for labels_true, labels_pred, split_caption in (
    (y_train, y_train_pred, "обучающей выборки"),
    (y_test, y_test_pred, "тестовой выборки"),
):
    print_metrics(labels_true, labels_pred, split_caption)



Метрики для обучающей выборки:
Accuracy: 0.9347826086956522
Precision (macro): 0.9512195121951219
Recall (macro): 0.9444444444444445
F1-score (macro): 0.9433811802232855
----------------------------------------

Метрики для тестовой выборки:
Accuracy: 0.65
Precision (macro): 0.7380952380952381
Recall (macro): 0.6351851851851852
F1-score (macro): 0.6612080060355923
----------------------------------------


In [12]:
# Per-class precision/recall/F1 breakdown for the held-out test split.
test_report = classification_report(y_test, y_test_pred)
print("\nClassification report (тестовая выборка):", test_report, sep="\n")


Classification report (тестовая выборка):
              precision    recall  f1-score   support

           1       0.57      0.75      0.65        16
           2       0.64      0.60      0.62        15
           3       1.00      0.56      0.71         9

    accuracy                           0.65        40
   macro avg       0.74      0.64      0.66        40
weighted avg       0.69      0.65      0.65        40

