In [102]:
import gdown

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder # Кодирование категориальных данных

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler # Масштабирование данных

from sklearn.feature_selection import SelectKBest # Выбор признаков с наивысшими оценками
from sklearn.feature_selection import chi2 # Выбор признаков по Хи квадрат

from sklearn.model_selection import train_test_split # Деление выборки на тестовые и тренировочные данные
from sklearn.model_selection import cross_val_score # Оценка качества работы модели

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Критерий качества, точности

from sklearn.neighbors import KNeighborsClassifier # Обучение модели K-ближайших соседей
from sklearn.linear_model import LinearRegression # Линейная регрессия

In [103]:
# Скачивание данных из Google Disk
# gdown.download(id='1LBDnhITL0Wqwp5G6M6IBI-SSz8BIoNec')

# Загрузка файла из Git репозитория в Pandas
# dataset = pd.read_csv('https://raw.githubusercontent.com/SotGE/innopolis2023/main/lesson12/diabetes.csv', sep=',')

# Загрузка данных из локального хранилища
dataset = pd.read_csv(r"diabetes.csv", sep=',')

# Первые ячейки
dataset.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [104]:
# Размер данных (количество строк, колонок)
dataset.shape

(768, 9)

In [105]:
# Заголовки столбцов в нижнем регистре
dataset.columns = [col.lower() for col in dataset.columns]
dataset.columns

Index(['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin',
       'bmi', 'diabetespedigreefunction', 'age', 'outcome'],
      dtype='object')

In [106]:
# Проверка пропущенных значений
dataset.isnull().mean()

pregnancies                 0.0
glucose                     0.0
bloodpressure               0.0
skinthickness               0.0
insulin                     0.0
bmi                         0.0
diabetespedigreefunction    0.0
age                         0.0
outcome                     0.0
dtype: float64

In [107]:
# Количество неопределенные значений (неправильно считанные)
dataset.isna().mean()

pregnancies                 0.0
glucose                     0.0
bloodpressure               0.0
skinthickness               0.0
insulin                     0.0
bmi                         0.0
diabetespedigreefunction    0.0
age                         0.0
outcome                     0.0
dtype: float64

In [108]:
# Проверка значений на 0
(dataset == 0).sum()

pregnancies                 111
glucose                       5
bloodpressure                35
skinthickness               227
insulin                     374
bmi                          11
diabetespedigreefunction      0
age                           0
outcome                     500
dtype: int64

In [109]:
# Заполнение нулевых значений - медианой
dataset = dataset.replace(0, dataset.median())
dataset

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,30.5,33.6,0.627,50,1
1,1,85,66,29,30.5,26.6,0.351,31,0
2,8,183,64,23,30.5,23.3,0.672,32,1
3,1,89,66,23,94.0,28.1,0.167,21,0
4,3,137,40,35,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180.0,32.9,0.171,63,0
764,2,122,70,27,30.5,36.8,0.340,27,0
765,5,121,72,23,112.0,26.2,0.245,30,0
766,1,126,60,23,30.5,30.1,0.349,47,1


In [110]:
# Проверка значений на 0
(dataset == 0).sum()

pregnancies                   0
glucose                       0
bloodpressure                 0
skinthickness                 0
insulin                       0
bmi                           0
diabetespedigreefunction      0
age                           0
outcome                     500
dtype: int64

In [111]:
# Описательная статистика
dataset.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pregnancies,768.0,4.278646,3.021516,1.0,2.0,3.0,6.0,17.0
glucose,768.0,121.65625,30.438286,44.0,99.75,117.0,140.25,199.0
bloodpressure,768.0,72.386719,12.096642,24.0,64.0,72.0,80.0,122.0
skinthickness,768.0,27.334635,9.229014,7.0,23.0,23.0,32.0,99.0
insulin,768.0,94.652344,105.547598,14.0,30.5,31.25,127.25,846.0
bmi,768.0,32.450911,6.875366,18.2,27.5,32.0,36.6,67.1
diabetespedigreefunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [112]:
# Просмотр типов данных в датасете
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pregnancies               768 non-null    int64  
 1   glucose                   768 non-null    int64  
 2   bloodpressure             768 non-null    int64  
 3   skinthickness             768 non-null    int64  
 4   insulin                   768 non-null    float64
 5   bmi                       768 non-null    float64
 6   diabetespedigreefunction  768 non-null    float64
 7   age                       768 non-null    int64  
 8   outcome                   768 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 54.1 KB
