In [14]:
# BASED ON https://machinelearningmastery.com/handle-missing-data-python/

import pandas as pd
import numpy as np
import missingno
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [15]:
# Load the diabetes data. The file is read with header=None, i.e. its first
# line is treated as data, so the column labels are attached by hand below.
#
# Attributes, in file order (all numeric-valued):
#   1. num_preg       number of times pregnant
#   2. plasma_gluc    plasma glucose concentration, 2 hours into an oral
#                     glucose tolerance test
#   3. diast_bp       diastolic blood pressure (mm Hg)
#   4. triceps_fold   triceps skin fold thickness (mm)
#   5. serum_insulin  2-hour serum insulin (mu U/ml)
#   6. bmi            body mass index (weight in kg / (height in m)^2)
#   7. diab_pedig_fn  diabetes pedigree function
#   8. age            age (years)
#   9. class          class variable (0 or 1)
df = pd.read_csv('diabetes_dataset/data.csv', header=None)

df.columns = [
    'num_preg',
    'plasma_gluc',
    'diast_bp',
    'triceps_fold',
    'serum_insulin',
    'bmi',
    'diab_pedig_fn',
    'age',
    'class',
]

In [16]:
# Preview the first rows to check that the hand-assigned column labels line up with the data.
df.head()

Unnamed: 0,num_preg,plasma_gluc,diast_bp,triceps_fold,serum_insulin,bmi,diab_pedig_fn,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Missing values in this dataset are encoded as 0 rather than as NaN.

# Example: a BMI or a blood pressure of 0 is not plausible, so a minimum of 0
# in the summary statistics below points at placeholder values, not real readings.
df.describe()

Unnamed: 0,num_preg,plasma_gluc,diast_bp,triceps_fold,serum_insulin,bmi,diab_pedig_fn,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [18]:
# Columns whose zeros are treated as "missing" markers in the rest of the notebook.
missing_value_columns = ['plasma_gluc', 'diast_bp', 'triceps_fold', 'serum_insulin', 'bmi']
# Number of placeholder zeros per affected column.
print(df[missing_value_columns].eq(0).sum())

plasma_gluc        5
diast_bp          35
triceps_fold     227
serum_insulin    374
bmi               11
dtype: int64


In [19]:
# How much data would be lost by dropping every row that has a missing value?
# Build the reduced frame first: placeholder zeros become NaN, then any row
# containing a NaN is removed.
df_miss = df.copy()
df_miss[missing_value_columns] = df_miss[missing_value_columns].replace(0, np.nan)
df_miss = df_miss.dropna()

# Report row count and class balance, first for the full data and then for
# the reduced data.
for frame in (df, df_miss):
    print(len(frame))
    print(frame['class'].value_counts())


768
0    500
1    268
Name: class, dtype: int64
392
0    262
1    130
Name: class, dtype: int64


In [29]:
# SIMPLE LINEAR DISCRIMINANT ANALYSIS
def run_classifier(dataset, label, n_splits=3, random_state=1):
    """Cross-validate a LinearDiscriminantAnalysis model and print its mean accuracy.

    The data is used exactly as passed in: this function performs no
    missing-value handling, so any zero/NaN treatment has to be applied by
    the caller beforehand.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Columns 0-7 (by position) are used as features and column 8 as the
        class label.
    label : str
        Text printed in front of ' Accuracy: ...' to identify the run.
    n_splits : int, default 3
        Number of folds for the shuffled K-fold cross-validation.
    random_state : int, default 1
        Seed for the fold shuffling, so repeated runs give the same splits.

    Returns
    -------
    None
        The mean accuracy is printed, not returned.
    """
    # split dataset into inputs (first eight columns) and output (ninth column)
    values = dataset.values
    X = values[:, 0:8]
    y = values[:, 8]
    # define the model
    model = LinearDiscriminantAnalysis()
    # define the model evaluation procedure
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # evaluate the model: one accuracy score per fold
    result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    # report the mean performance across folds
    print(label + ' Accuracy: %.3f' % result.mean())



In [24]:
# Listwise deletion: mark the placeholder zeros as NaN, then keep only the
# rows that have no NaN at all.
df_miss = df.copy()
df_miss[missing_value_columns] = df_miss[missing_value_columns].replace(0, np.nan)
df_miss = df_miss.dropna()

In [26]:
# Mean imputation: work on a copy, turn the placeholder zeros into NaN, then
# fill every NaN with the mean of its own column.
df_mean = df.copy()
df_mean[missing_value_columns] = df_mean[missing_value_columns].replace(0, np.nan)
df_mean = df_mean.fillna(df_mean.mean())


In [27]:
# Median imputation: work on a copy, turn the placeholder zeros into NaN, then
# fill every NaN with the median of its own column.
df_med = df.copy()
# mark zero values as missing or NaN
df_med[missing_value_columns] = df_med[missing_value_columns].replace(0, np.nan)
# The medians must come from df_med itself (observed values only). Taking them
# from the mean-imputed frame would skew them towards the column means and
# make this cell depend on the mean-imputation cell having been run.
df_med = df_med.fillna(df_med.median())


In [30]:
# Evaluate the same classifier on each variant of the data, in this order:
#   Original          - zeros left in place
#   Missing Removed   - rows with missing data dropped (cuts the dataset roughly in half!)
#   Mean Imputation   - missing values filled with the column mean
#   Median Imputation - missing values filled with the column median
for frame, tag in (
    (df, 'Original'),
    (df_miss, 'Missing Removed'),
    (df_mean, 'Mean Imputation'),
    (df_med, 'Median Imputation'),
):
    run_classifier(frame, tag)

Original Accuracy: 0.763
Missing Removed Accuracy: 0.781
Mean Imputation Accuracy: 0.762
Median Imputation Accuracy: 0.762
