In [92]:
# импорт библиотек
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# [sklearn.metrics.classification_report](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)

In [93]:
# 2. подгружаем датасет
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [94]:
# выведем частоту появления и уникальность
# в данном случае имеем 3 класса (многоклассовая)
df.describe(include='object')

Unnamed: 0,species,island,sex
count,344,344,333
unique,3,3,2
top,Adelie,Biscoe,Male
freq,152,168,168


In [95]:
# посмотрим на количество пропусков
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [96]:
print(df.shape)
# для эксперимента можно удалить пропуски (их менее 1%, кроме столбца sex)
df.dropna(inplace=True)
print(df.shape)

(344, 7)
(333, 7)


In [97]:
# разобьем наши данные на train и test, а также нормируем, так как наш алгоритм чувствителен к масштабу признаков
X = pd.get_dummies(df.drop('species', axis=1))
y = df['species']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.2,
                                                    random_state=42)
# нормирование
mm = MinMaxScaler()
X_train_std = mm.fit_transform(X_train)
X_test_std = mm.transform(X_test)

In [98]:
# обучим модель. По умолчанию у метода k=5 ближайших соседей
clf = KNeighborsClassifier()
clf.fit(X_train_std, y_train)

# [sklearn.neighbors.KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier.score)

In [99]:
# точность на тренировочных и тестовых данных
print(np.round(clf.score(X_train_std, y_train), 4))
print(np.round(clf.score(X_test_std, y_test), 4))

0.9962
0.9851


In [100]:
y_train_proba = clf.predict_proba(X_train_std)
y_test_proba = clf.predict_proba(X_test_std)
# array([[0. , 0. , 1. ],
#        [0. , 1. , 0. ],
#        [1. , 0. , 0. ],

y_train_pred = clf.predict(X_train_std)
y_test_pred = clf.predict(X_test_std)
# array(['Gentoo', 'Chinstrap', 'Adelie', 'Gentoo', 'Gentoo',

In [101]:
# рассмотрим метрики
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        29
   Chinstrap       0.93      1.00      0.97        14
      Gentoo       1.00      1.00      1.00        24

    accuracy                           0.99        67
   macro avg       0.98      0.99      0.98        67
weighted avg       0.99      0.99      0.99        67



# Снова задача о диабете в Индии, но на этот раз решаем её через K-ближайших соседей

In [102]:
diab = pd.read_csv('E:\GitHub репозитории\Data-Science-course-by-miracl6\CSV\pima_indians_diabetes.csv')
diab.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [103]:
# проверка на наличие пропусков
diab.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [104]:
# определяем X и y
X = diab.drop('Outcome', axis=1)
y = diab.Outcome

In [105]:
# разбиваем на трейн, тест
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.2,
                                                    random_state=10)

In [107]:
# нормируем данные
mm = MinMaxScaler()
X_train_std = mm.fit_transform(X_train)
X_test_std = mm.transform(X_test)

In [109]:
# обучаем модель на K(5) ближайших соседях
clf = KNeighborsClassifier()
clf.fit(X_train_std, y_train)

In [111]:
# получим вероятности и предсказания на тестовых данных
y_test_proba = clf.predict_proba(X_test_std)
y_test_predict = clf.predict(X_test_std)

In [114]:
# импортируем необходимые метрики
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [115]:
# выведем отчет по метрикам
print(classification_report(y_test, y_test_predict))

              precision    recall  f1-score   support

           0       0.78      0.77      0.77       100
           1       0.58      0.59      0.59        54

    accuracy                           0.71       154
   macro avg       0.68      0.68      0.68       154
weighted avg       0.71      0.71      0.71       154



In [117]:
# получаем значение метрик
precision = precision_score(y_test, y_test_predict)
recall = recall_score(y_test, y_test_predict)
f1 = f1_score(y_test, y_test_predict)
roc_auc = roc_auc_score(y_test, y_test_predict)

In [120]:
print(f'precision = {precision:.4f}')
print(f'recall = {recall:.4f}')
print(f'f1 = {f1:.4f}')
print(f'roc_auc = {roc_auc:.4f}')

precision = 0.5818
recall = 0.5926
f1 = 0.5872
roc_auc = 0.6813
