In [3]:
import pandas as pd
import numpy as np
# Load the raw dataset (comma is read_csv's default separator) and preview
# its first rows.
diabetes = pd.read_csv('data/diabetes_data.csv')
display(diabetes.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


In [5]:
# A row counts as a duplicate only when it repeats an earlier row in every column.
dupl_cols = diabetes.columns.tolist()
mask = diabetes.duplicated(subset=dupl_cols)
# Rows flagged as repeats, and the frame with those repeats removed
# (the first occurrence of each row is kept).
diabetes_dupl = diabetes.loc[mask]
diabetes_dep = diabetes.loc[~mask]
# Number of rows left after de-duplication.
display(len(diabetes_dep))

768

In [8]:
# Collect columns that carry little information: either one value dominates
# the column or nearly every value in it is distinct.
low_information_cols = []
for col in diabetes_dep.columns:
    column = diabetes_dep[col]
    # largest relative frequency of a single value in the column
    top_freq = column.value_counts(normalize=True).max()
    # number of distinct values relative to the number of non-missing values
    nunique_ratio = column.nunique() / column.count()
    # compare the largest frequency with the 95% threshold
    if top_freq > 0.95:
        low_information_cols.append(col)
        print(f'{col}: {round(top_freq*100, 2)}% одинаковых значений')
    # compare the share of unique values with the 95% threshold
    if nunique_ratio > 0.95:
        low_information_cols.append(col)
        print(f'{col}: {round(nunique_ratio*100, 2)}% уникальных значений')

# Work on from a frame without the flagged columns.
inf_diabetes = diabetes_dep.drop(columns=low_information_cols)

Gender: 100.0% одинаковых значений


In [None]:
# Work on an independent (deep) copy so inf_diabetes itself is left untouched
# by the missing-value handling below.
fill = inf_diabetes.copy(deep=True)

def change(x):
    """Return NaN when ``x`` equals 0, otherwise return ``x`` unchanged."""
    return np.nan if x == 0 else x

# In these measurement columns the notebook treats a stored 0 as a
# missing-value marker. Replace it with NaN in one vectorised call instead of
# a per-element Python callback repeated for every column.
zero_as_missing_cols = ['Insulin', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
fill[zero_as_missing_cols] = fill[zero_as_missing_cols].replace(0, np.nan)

# Percentage of missing values per column; keep only the columns that have
# any, ordered from the largest share to the smallest.
cols_null = fill.isna().mean() * 100
cols_with_null = cols_null.loc[cols_null.gt(0)].sort_values(ascending=False)

# Keep only the columns in which at least 70% of the rows are filled.
# dropna documents `thresh` as an integer count of non-missing values, so the
# 70% share is rounded up to whole rows; this is the same cut-off the
# fractional value implied (count >= 537.6 is count >= 538).
n = fill.shape[0]
thresh = int(np.ceil(n * 0.7))
fill = fill.dropna(thresh=thresh, axis=1)

# Then keep only the rows that have at least (number of columns - 2)
# non-missing values, i.e. drop rows missing more than two fields.
m = fill.shape[1]
fill = fill.dropna(thresh=m - 2, axis=0)

# Fill the remaining gaps in each listed column with that column's median.
values = {
    name: fill[name].median()
    for name in ('Glucose', 'BloodPressure', 'SkinThickness', 'BMI')
}
fill = fill.fillna(values)

def outliers_iqr_mod(data, feature, left=1.5, right=1.5, log_scale=False):
    """Split ``data`` into outlier and non-outlier rows using IQR fences on one column.

    The fences are ``Q1 - left * IQR`` and ``Q3 + right * IQR``, where Q1 and
    Q3 are the 0.25 and 0.75 quantiles of the (optionally log-transformed)
    column.

    Args:
        data: Frame to split; indexed by column name and by boolean mask.
        feature: Name of the column the fences are computed on.
        left: Multiplier of the IQR for the lower fence.
        right: Multiplier of the IQR for the upper fence.
        log_scale: If True, the natural log of the column is used
            (no offset is added before taking the log).

    Returns:
        Tuple ``(outliers, cleaned)``: the rows strictly outside the fences
        and the rows within them (fences inclusive). A row whose value fails
        both comparisons (e.g. NaN) ends up in neither frame.
    """
    series = np.log(data[feature]) if log_scale else data[feature]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - (spread * left)
    high_fence = q3 + (spread * right)
    is_outlier = series.lt(low_fence) | series.gt(high_fence)
    is_inlier = series.ge(low_fence) & series.le(high_fence)
    return data[is_outlier], data[is_inlier]
# Flag SkinThickness outliers with the default IQR multipliers and report
# how many rows fall outside the fences.
outliers, cleaned = outliers_iqr_mod(data=fill, feature='SkinThickness')
print(f'Число выбросов по методу Тьюки: {outliers.shape[0]}')



Число выбросов по методу Тьюки: 87


In [41]:
def outliers_z_score_mod(data, feature, log_scale=False, left=3, right=3):
    """Split ``data`` into outlier and non-outlier rows using mean +/- k*std limits.

    The limits are ``mean - left * std`` and ``mean + right * std`` of the
    (optionally log-transformed) column.

    Args:
        data: Frame to split; indexed by column name and by boolean mask.
        feature: Name of the column the limits are computed on.
        log_scale: If True, ``log(value + 1)`` of the column is used.
        left: Number of standard deviations below the mean for the lower limit.
        right: Number of standard deviations above the mean for the upper limit.

    Returns:
        Tuple ``(outliers, cleaned)``: the rows strictly outside the limits
        and the rows within them (limits inclusive). A row whose value fails
        both comparisons (e.g. NaN) ends up in neither frame.
    """
    column = data[feature]
    series = np.log(column + 1) if log_scale else column
    center = series.mean()
    spread = series.std()
    low_limit = center - left * spread
    high_limit = center + right * spread
    is_outlier = series.lt(low_limit) | series.gt(high_limit)
    is_inlier = series.ge(low_limit) & series.le(high_limit)
    return data[is_outlier], data[is_inlier]
# Flag SkinThickness outliers with the default three-sigma limits and show
# the flagged rows.
outliers, cleaned = outliers_z_score_mod(data=fill, feature='SkinThickness')
display(outliers)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
30,0,162.0,76.0,56.0,53.2,0.759,25,1
36,2,197.0,70.0,99.0,34.7,0.575,62,1
382,0,180.0,78.0,63.0,59.4,2.42,25,1
444,0,100.0,88.0,60.0,46.8,0.962,31,0


In [43]:
# NOTE(review): a function with this name and the same behaviour is already
# defined in an earlier cell; this later definition shadows it. Consider
# keeping a single definition.
def outliers_iqr_mod(data, feature, left=1.5, right=1.5, log_scale=False):
    """Split ``data`` into outlier and non-outlier rows using IQR fences on one column.

    The fences are ``Q1 - left * IQR`` and ``Q3 + right * IQR``, where Q1 and
    Q3 are the 0.25 and 0.75 quantiles of the (optionally log-transformed)
    column.

    Args:
        data: Frame to split; indexed by column name and by boolean mask.
        feature: Name of the column the fences are computed on.
        left: Multiplier of the IQR for the lower fence.
        right: Multiplier of the IQR for the upper fence.
        log_scale: If True, the natural log of the column is used
            (no offset is added before taking the log).

    Returns:
        Tuple ``(outliers, cleaned)``: the rows strictly outside the fences
        and the rows within them (fences inclusive). A row whose value fails
        both comparisons (e.g. NaN) ends up in neither frame.
    """
    series = np.log(data[feature]) if log_scale else data[feature]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - (spread * left)
    high_fence = q3 + (spread * right)
    is_outlier = series.lt(low_fence) | series.gt(high_fence)
    is_inlier = series.ge(low_fence) & series.le(high_fence)
    return data[is_outlier], data[is_inlier]
# Repeat the IQR check on DiabetesPedigreeFunction, computing the fences on
# the log of the column, and report how many rows fall outside them.
outliers, cleaned = outliers_iqr_mod(
    data=fill,
    feature='DiabetesPedigreeFunction',
    log_scale=True,
)
print(f'Число выбросов по методу Тьюки: {outliers.shape[0]}')

Число выбросов по методу Тьюки: 0
