In [2]:
import gdown

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder # Кодирование категориальных данных

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler # Масштабирование данных

from sklearn.feature_selection import SelectKBest # Выбор признаков с наивысшими оценками
from sklearn.feature_selection import chi2 # Выбор признаков по Хи квадрат

from sklearn.model_selection import train_test_split # Деление выборки на тестовые и тренировочные данные
from sklearn.model_selection import cross_val_score # Оценка качества работы модели

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Критерий качества, точности

from sklearn.neighbors import KNeighborsClassifier # Обучение модели K-ближайших соседей
from sklearn.linear_model import LinearRegression # Линейная регрессия

In [3]:
# Скачивание данных из Google Disk
# gdown.download(id='1LBDnhITL0Wqwp5G6M6IBI-SSz8BIoNec')

# Загрузка файла из Git репозитория в Pandas
# dataset = pd.read_csv('https://raw.githubusercontent.com/SotGE/innopolis2023/main/lesson12/diabetes.csv', sep=',')

# Загрузка данных из локального хранилища
dataset = pd.read_csv(r"diabetes.csv", sep=',')

# Первые ячейки
dataset.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Размер данных (количество строк, колонок)
dataset.shape

(768, 9)

In [5]:
# Заголовки столбцов в нижнем регистре
dataset.columns = [col.lower() for col in dataset.columns]
dataset.columns

Index(['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin',
       'bmi', 'diabetespedigreefunction', 'age', 'outcome'],
      dtype='object')

In [6]:
# Проверка пропущенных значений
dataset.isnull().mean()

pregnancies                 0.0
glucose                     0.0
bloodpressure               0.0
skinthickness               0.0
insulin                     0.0
bmi                         0.0
diabetespedigreefunction    0.0
age                         0.0
outcome                     0.0
dtype: float64

In [7]:
# Количество неопределенные значений (неправильно считанные)
dataset.isna().mean()

pregnancies                 0.0
glucose                     0.0
bloodpressure               0.0
skinthickness               0.0
insulin                     0.0
bmi                         0.0
diabetespedigreefunction    0.0
age                         0.0
outcome                     0.0
dtype: float64

In [8]:
# Проверка значений на 0
(dataset == 0).sum()

pregnancies                 111
glucose                       5
bloodpressure                35
skinthickness               227
insulin                     374
bmi                          11
diabetespedigreefunction      0
age                           0
outcome                     500
dtype: int64

In [9]:
# Заполнение нулевых значений - медианой
dataset = dataset.replace(0, dataset.median())
dataset

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,30.5,33.6,0.627,50,1
1,1,85,66,29,30.5,26.6,0.351,31,0
2,8,183,64,23,30.5,23.3,0.672,32,1
3,1,89,66,23,94.0,28.1,0.167,21,0
4,3,137,40,35,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180.0,32.9,0.171,63,0
764,2,122,70,27,30.5,36.8,0.340,27,0
765,5,121,72,23,112.0,26.2,0.245,30,0
766,1,126,60,23,30.5,30.1,0.349,47,1


In [10]:
# Проверка значений на 0
(dataset == 0).sum()

pregnancies                   0
glucose                       0
bloodpressure                 0
skinthickness                 0
insulin                       0
bmi                           0
diabetespedigreefunction      0
age                           0
outcome                     500
dtype: int64

In [11]:
# Описательная статистика
dataset.describe(include='all', percentiles=[0.1, 0.25,0.5, 0.75, 0.9]).T

Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
pregnancies,768.0,4.278646,3.021516,1.0,1.0,2.0,3.0,6.0,9.0,17.0
glucose,768.0,121.65625,30.438286,44.0,86.7,99.75,117.0,140.25,167.0,199.0
bloodpressure,768.0,72.386719,12.096642,24.0,58.0,64.0,72.0,80.0,88.0,122.0
skinthickness,768.0,27.334635,9.229014,7.0,18.0,23.0,23.0,32.0,40.0,99.0
insulin,768.0,94.652344,105.547598,14.0,30.5,30.5,31.25,127.25,210.0,846.0
bmi,768.0,32.450911,6.875366,18.2,24.0,27.5,32.0,36.6,41.5,67.1
diabetespedigreefunction,768.0,0.471876,0.331329,0.078,0.165,0.24375,0.3725,0.62625,0.8786,2.42
age,768.0,33.240885,11.760232,21.0,22.0,24.0,29.0,41.0,51.0,81.0
outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [12]:
# Просмотр типов данных в датасете
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pregnancies               768 non-null    int64  
 1   glucose                   768 non-null    int64  
 2   bloodpressure             768 non-null    int64  
 3   skinthickness             768 non-null    int64  
 4   insulin                   768 non-null    float64
 5   bmi                       768 non-null    float64
 6   diabetespedigreefunction  768 non-null    float64
 7   age                       768 non-null    int64  
 8   outcome                   768 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 54.1 KB


In [13]:
# Разделение для задачи классификации на X (data features) и y (outcome)
X = dataset.drop(columns=['outcome'])
y = dataset['outcome']

In [14]:
# Построить распределение для всех числовых переменных
figure = px.box(X)
figure.show()

In [15]:
# Подготовка данных
# Нормализация (StandardScaler)
scalar = StandardScaler()
features = scalar.fit_transform(X, y)
features

array([[ 0.57007018,  0.86604475, -0.03198993, ...,  0.16724016,
         0.46849198,  1.4259954 ],
       [-1.08580689, -1.20506583, -0.5283186 , ..., -0.85155088,
        -0.36506078, -0.19067191],
       [ 1.23242101,  2.01666174, -0.69376149, ..., -1.33183808,
         0.60439732, -0.10558415],
       ...,
       [ 0.23889477, -0.02157407, -0.03198993, ..., -0.90976751,
        -0.68519336, -0.27575966],
       [-1.08580689,  0.14279979, -1.02464727, ..., -0.34215536,
        -0.37110101,  1.17073215],
       [-1.08580689, -0.94206766, -0.19743282, ..., -0.29849289,
        -0.47378505, -0.87137393]])

In [16]:
# Массив в Pandas
X_normalised = pd.DataFrame(features, columns=X.columns)
X_normalised

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age
0,0.570070,0.866045,-0.031990,0.831114,-0.608201,0.167240,0.468492,1.425995
1,-1.085807,-1.205066,-0.528319,0.180566,-0.608201,-0.851551,-0.365061,-0.190672
2,1.232421,2.016662,-0.693761,-0.469981,-0.608201,-1.331838,0.604397,-0.105584
3,-1.085807,-1.073567,-0.528319,-0.469981,-0.006185,-0.633239,-0.920763,-1.041549
4,-0.423456,0.504422,-2.679076,0.831114,0.695378,1.549885,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.894772,-0.679069,0.298896,2.240633,0.809145,0.065361,-0.908682,2.532136
764,-0.754631,0.011301,-0.197433,-0.036283,-0.608201,0.632973,-0.398282,-0.531023
765,0.238895,-0.021574,-0.031990,-0.469981,0.164466,-0.909768,-0.685193,-0.275760
766,-1.085807,0.142800,-1.024647,-0.469981,-0.608201,-0.342155,-0.371101,1.170732


In [17]:
# Построить распределение для всех числовых нормализированных переменных
figure = px.box(X_normalised)
figure.show()

In [18]:
# Разделение на тренировочную и тестовую для классификации
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=512, shuffle=True)

In [19]:
# Построение модели k-Nearest Neighbors (kNN - k-ближайших соседей), загрузить классификатор
model_knn = KNeighborsClassifier()