In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
diabetes = pd.read_csv('data/diabetes_data.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


### Признаки данных:
* Pregnancies — количество беременностей.

* Glucose — концентрация глюкозы в плазме через два часа при пероральном тесте на толерантность к глюкозе.

* BloodPressure — диастолическое артериальное давление (мм рт. ст.).

* SkinThickness — толщина кожной складки трицепса (мм).

* Insulin — двухчасовой сывороточный инсулин (ме Ед/мл).

* BMI — индекс массы тела ( вес в кг / (рост в метрах)^2)
* DiabetesPedigreeFunction — функция родословной диабета (чем она выше, тем выше шанс наследственной заболеваемости).

* Age — возраст.

* Outcome — наличие диабета (0 — нет, 1 — да).

In [2]:
diabetes_df = diabetes.drop_duplicates()
print(f'Результирующее число записей: {diabetes_df.shape[0]}')

Результирующее число записей: 768


In [3]:
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.430,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.210,50,0,Female
...,...,...,...,...,...,...,...,...,...,...
763,5,139,64,35,140,28.6,0.411,26,0,Female
764,1,96,122,0,0,22.4,0.207,27,0,Female
765,10,101,86,37,0,45.6,1.136,38,1,Female
766,0,141,0,0,0,42.4,0.205,29,1,Female


In [4]:
#список неинформативных признаков
low_information_cols = [] 
col_usefullness_limit = 0.95

#цикл по всем столбцам
for col in diabetes_df.columns:
    #наибольшая относительная частота в признаке
    top_freq = diabetes_df[col].value_counts(normalize=True).max()
    #доля уникальных значений от размера признака
    nunique_ratio = diabetes_df[col].nunique() / diabetes_df[col].count()
    # сравниваем наибольшую частоту с порогом
    if top_freq > col_usefullness_limit:
        low_information_cols.append(col)
        print(f'{col}: {round(top_freq*100, 2)}% одинаковых значений')
    # сравниваем долю уникальных значений с порогом
    if nunique_ratio > col_usefullness_limit:
        low_information_cols.append(col)
        print(f'{col}: {round(nunique_ratio*100, 2)}% уникальных значений')

Gender: 100.0% одинаковых значений


In [5]:
def zero_replacer(data):
    if data == 0: data = np.nan
    else: data = data
    return data
diabetes_df['Glucose'] = diabetes_df['Glucose'].apply(zero_replacer);
diabetes_df['BloodPressure'] = diabetes_df['BloodPressure'].apply(zero_replacer);
diabetes_df['SkinThickness'] = diabetes_df['SkinThickness'].apply(zero_replacer);
diabetes_df['Insulin'] = diabetes_df['Insulin'].apply(zero_replacer);
diabetes_df['BMI'] = diabetes_df['BMI'].apply(zero_replacer);
diabetes_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_df['Glucose'] = diabetes_df['Glucose'].apply(zero_replacer);
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_df['BloodPressure'] = diabetes_df['BloodPressure'].apply(zero_replacer);
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_df['SkinThickness'] = diabetes_df['SkinThick

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98.0,58.0,33.0,190.0,34.0,0.430,43,0,Female
1,2,112.0,75.0,32.0,,35.7,0.148,21,0,Female
2,2,108.0,64.0,,,30.8,0.158,21,0,Female
3,8,107.0,80.0,,,24.6,0.856,34,0,Female
4,7,136.0,90.0,,,29.9,0.210,50,0,Female
...,...,...,...,...,...,...,...,...,...,...
763,5,139.0,64.0,35.0,140.0,28.6,0.411,26,0,Female
764,1,96.0,122.0,,,22.4,0.207,27,0,Female
765,10,101.0,86.0,37.0,,45.6,1.136,38,1,Female
766,0,141.0,,,,42.4,0.205,29,1,Female


In [6]:
diabetes_df['Insulin'].isnull().value_counts()

Insulin
False    394
True     374
Name: count, dtype: int64

In [15]:
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,190.0,34.0,0.430,43,0
1,2,112.0,75.0,32.0,,35.7,0.148,21,0
2,2,108.0,64.0,,,30.8,0.158,21,0
3,8,107.0,80.0,,,24.6,0.856,34,0
4,7,136.0,90.0,,,29.9,0.210,50,0
...,...,...,...,...,...,...,...,...,...
763,5,139.0,64.0,35.0,140.0,28.6,0.411,26,0
764,1,96.0,122.0,,,22.4,0.207,27,0
765,10,101.0,86.0,37.0,,45.6,1.136,38,1
766,0,141.0,,,,42.4,0.205,29,1


In [14]:
cols_null_percent = diabetes_df.isnull().mean() * 100
cols_with_null = cols_null_percent[cols_null_percent>0].sort_values(ascending=False)
display(cols_with_null)

Insulin          48.697917
SkinThickness    29.557292
BloodPressure     4.557292
BMI               1.432292
Glucose           0.651042
dtype: float64

In [16]:
diabetes_df = diabetes_df.drop('Insulin', axis = 1)
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98.0,58.0,33.0,34.0,0.430,43,0
1,2,112.0,75.0,32.0,35.7,0.148,21,0
2,2,108.0,64.0,,30.8,0.158,21,0
3,8,107.0,80.0,,24.6,0.856,34,0
4,7,136.0,90.0,,29.9,0.210,50,0
...,...,...,...,...,...,...,...,...
763,5,139.0,64.0,35.0,28.6,0.411,26,0
764,1,96.0,122.0,,22.4,0.207,27,0
765,10,101.0,86.0,37.0,45.6,1.136,38,1
766,0,141.0,,,42.4,0.205,29,1


In [21]:
m = diabetes_df.shape[1]
diabetes_df = diabetes_df.dropna(thresh = m - 2, axis = 0)
values = {
  'Pregnancies': diabetes_df['Pregnancies'].mean(),
  'Glucose': diabetes_df['Glucose'].mean(),
  'SkinThickness': diabetes_df['SkinThickness'].mean(),
  'BloodPressure': diabetes_df['BloodPressure'].mean(),
  'BMI': diabetes_df['BMI'].mean(),
  'DiabetesPedigreeFunction': diabetes_df['DiabetesPedigreeFunction'].mean(),
  'Age': diabetes_df['Age'].mean(),
  'Outcome': diabetes_df['Outcome'].mean()
}
diabetes_df = diabetes_df.fillna(values)
diabetes_df['SkinThickness'].mean()

np.float64(29.153419593345657)

In [22]:
def outliers_iqr(data, feature):
    x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75),
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned
outliers, cleaned = outliers_iqr(diabetes_df, 'SkinThickness')
print(f'Число выбросов по методу Тьюки: {outliers.shape[0]}')
print(f'Результирующее число записей: {cleaned.shape[0]}')

Число выбросов по методу Тьюки: 87
Результирующее число записей: 674


In [23]:
def outliers_z_score(data, feature, log_scale=False):
    if log_scale:
        x = np.log(data[feature]+1)
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - 3 * sigma
    upper_bound = mu + 3 * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned
outliers, cleaned = outliers_z_score(diabetes_df, 'SkinThickness', log_scale=False)
print(f'Число выбросов по методу z-отклонения: {outliers.shape[0]}')
print(f'Результирующее число записей: {cleaned.shape[0]}')

Число выбросов по методу z-отклонения: 4
Результирующее число записей: 757


In [27]:
def outliers_iqr(data, feature):
    x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75),
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned
outliers, cleaned = outliers_iqr(diabetes_df, 'DiabetesPedigreeFunction')
print(f'Число выбросов по методу z-отклонения: {outliers.shape[0]}')
print(f'Результирующее число записей: {cleaned.shape[0]}')

Число выбросов по методу z-отклонения: 29
Результирующее число записей: 732


In [26]:
def outliers_z_score(data, feature, log_scale=False):
    if log_scale:
        x = np.log(data[feature])
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - 3 * sigma
    upper_bound = mu + 3 * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x >= lower_bound) & (x <= upper_bound)]
    return outliers, cleaned
outliers, cleaned = outliers_z_score(diabetes_df, 'DiabetesPedigreeFunction', log_scale=True)
print(f'Число выбросов по методу z-отклонения: {outliers.shape[0]}')
print(f'Результирующее число записей: {cleaned.shape[0]}')

Число выбросов по методу z-отклонения: 0
Результирующее число записей: 761
