In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw dataset; the path is relative to the notebook's working directory.
diabetes = pd.read_csv(filepath_or_buffer='data/diabetes_data.csv')
# Preview the first five rows to sanity-check the load.
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


# 8.1 Remove duplicate rows

In [6]:
# Drop rows that are exact duplicates across all columns (first occurrence is kept).
diabetes_drop_dupl = diabetes.drop_duplicates(keep='first')
# Number of rows remaining after deduplication.
len(diabetes_drop_dupl)

768

# 8.2 Find low-information columns

In [7]:
# Collect columns that carry little information: either a single value covers
# more than 95% of the rows, or more than 95% of the non-null values are unique.
low_info_cols = []
for col in diabetes_drop_dupl.columns:
    series = diabetes_drop_dupl[col]
    # Share of the most frequent value among non-null entries.
    top_freq = series.value_counts(normalize=True).max()
    # Ratio of distinct values to non-null entries.
    top_uniq = series.nunique() / series.count()
    if not (top_freq > 0.95 or top_uniq > 0.95):
        continue
    low_info_cols.append(col)
    print(col)

Gender


# 8.3 Drop low-information columns and mark zeros as missing

In [9]:
# Remove the columns collected in `low_info_cols`; the deduplicated frame itself is left untouched.
info_diabetes = diabetes_drop_dupl.drop(columns=low_info_cols)
info_diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58,33,190,34.0,0.43,43,0
1,2,112,75,32,0,35.7,0.148,21,0
2,2,108,64,0,0,30.8,0.158,21,0
3,8,107,80,0,0,24.6,0.856,34,0
4,7,136,90,0,0,29.9,0.21,50,0


In [10]:
# In these columns a zero is treated as a placeholder for a missing value,
# so every 0 is converted to NaN (all listed columns are handled in one assignment).
cols_with_null = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI']
info_diabetes[cols_with_null] = info_diabetes[cols_with_null].replace(0, np.nan)

# Share of missing values in the Insulin column.
info_diabetes['Insulin'].isna().mean()

0.4869791666666667

# 8.4 Drop columns with more than 30% missing values

In [18]:
# A column is kept only if at least 70% of its rows are non-null.
thresh = len(info_diabetes) * 0.7
info_diabetes_without_nulls = info_diabetes.dropna(thresh=thresh, axis='columns')
info_diabetes_without_nulls.shape

(768, 8)

# 8.5 Drop rows with more than two missing values

In [19]:
# A row is kept only if it has at least (number of columns - 2) non-null cells,
# i.e. rows with more than two missing values are removed.
thresh_rows = len(info_diabetes_without_nulls.columns) - 2
info_diabetes_without_nulls = info_diabetes_without_nulls.dropna(
    thresh=thresh_rows, axis='index'
)
info_diabetes_without_nulls.shape

(761, 8)

# 8.6 Fill the remaining gaps with column medians

In [20]:
# Fraction of missing values in every remaining column.
info_diabetes_without_nulls.isna().mean(axis=0)

Pregnancies                 0.000000
Glucose                     0.006570
BloodPressure               0.036794
SkinThickness               0.289093
BMI                         0.005256
DiabetesPedigreeFunction    0.000000
Age                         0.000000
Outcome                     0.000000
dtype: float64

In [21]:
# Impute the remaining gaps with each column's own median. The medians are
# computed in one call and passed to fillna as a Series keyed by column name.
diabetes_filled = info_diabetes_without_nulls.fillna(
    value=info_diabetes_without_nulls[
        ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
    ].median()
)
# Mean skin thickness after imputation.
diabetes_filled['SkinThickness'].mean()

29.109067017082786

# 8.7 Outlier detection with the IQR (Tukey) method

In [28]:
def outliers_iqr(data:pd.DataFrame, feature, left=1.5, right=1.5,
                 log_scale=False, log_adder=0):
    """Split ``data`` into outliers and regular rows using the IQR rule.

    A row is an outlier when its ``feature`` value lies strictly below
    ``Q1 - left * IQR`` or strictly above ``Q3 + right * IQR``. Every other
    row with a comparable (non-NaN) value goes to the cleaned frame,
    including values that sit exactly on a bound.

    Args:
        data: Source frame; it is not modified.
        feature: Name of the column to analyse.
        left: Number of IQRs subtracted from Q1 to get the lower bound.
        right: Number of IQRs added to Q3 to get the upper bound.
        log_scale: If True, bounds are computed on ``np.log(feature + log_adder)``.
        log_adder: Constant added before taking the logarithm.

    Returns:
        Tuple ``(outliers, cleaned)`` of frames holding rows of ``data``.
        Rows whose analysed value is NaN appear in neither frame, because
        every comparison with NaN is False.
    """
    if log_scale:
        x = np.log(data[feature] + log_adder)
    else:
        x = data[feature]
    quartile_1, quartile_3 = x.quantile(0.25), x.quantile(0.75)
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * left)
    upper_bound = quartile_3 + (iqr * right)
    is_outlier = (x < lower_bound) | (x > upper_bound)
    # Inclusive bounds: a value equal to a bound is not an outlier, so it must
    # be kept. With strict inequalities such rows were lost from both frames
    # (and all of the bulk data vanished whenever IQR == 0).
    is_regular = (x >= lower_bound) & (x <= upper_bound)
    outliers = data[is_outlier]
    cleaned = data[is_regular]
    return outliers, cleaned

In [24]:
# IQR-based outliers of SkinThickness with the default 1.5 * IQR fences.
outliers, cleaned = outliers_iqr(data=diabetes_filled, feature='SkinThickness')
outliers.shape

(87, 8)

# 8.8 Outlier detection with the z-score (three-sigma) method

In [29]:
def outliers_z_score(data:pd.DataFrame, feature, log_scale=False,
                     left=3, right=3, log_adder=0):
    """Split ``data`` into outliers and regular rows using the z-score rule.

    A row is an outlier when its ``feature`` value lies strictly below
    ``mean - left * std`` or strictly above ``mean + right * std``. Every
    other row with a comparable (non-NaN) value goes to the cleaned frame,
    including values that sit exactly on a bound.

    Args:
        data: Source frame; it is not modified.
        feature: Name of the column to analyse.
        log_scale: If True, bounds are computed on ``np.log(feature + log_adder)``.
        left: Number of standard deviations below the mean for the lower bound.
        right: Number of standard deviations above the mean for the upper bound.
        log_adder: Constant added before taking the logarithm.

    Returns:
        Tuple ``(outliers, cleaned)`` of frames holding rows of ``data``.
        Rows whose analysed value is NaN appear in neither frame, because
        every comparison with NaN is False.
    """
    if log_scale:
        x = np.log(data[feature] + log_adder)
    else:
        x = data[feature]
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - left*sigma
    upper_bound = mu + right*sigma
    is_outlier = (x < lower_bound) | (x > upper_bound)
    # Inclusive bounds: a value equal to a bound is not an outlier, so it must
    # be kept. With strict inequalities such rows were lost from both frames
    # (for a constant column, sigma == 0, every row was lost).
    is_regular = (x >= lower_bound) & (x <= upper_bound)
    outliers = data[is_outlier]
    cleaned = data[is_regular]
    return outliers, cleaned

In [30]:
# Z-score outliers of SkinThickness with the default three-sigma bounds.
outliers, cleaned = outliers_z_score(data=diabetes_filled, feature='SkinThickness')
outliers.shape

(4, 8)

# 8.9 IQR outliers of DiabetesPedigreeFunction: linear vs. log scale

In [31]:
# IQR-based outliers of DiabetesPedigreeFunction on the original (linear) scale.
outliers, cleaned = outliers_iqr(
    data=diabetes_filled, feature='DiabetesPedigreeFunction'
)
len(outliers)

29

In [32]:
# Same check with the bounds computed on the log-transformed values.
outliers, cleaned = outliers_iqr(
    data=diabetes_filled, feature='DiabetesPedigreeFunction', log_scale=True
)
len(outliers)

0