In [3]:
import numpy as np
import pandas as pd

In [8]:
# Create the dataframe: read the diabetes prediction CSV into `data`.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where the file exists at exactly this location.

data = pd.read_csv(r'C:/langchain2/data_diabities/diabetes_prediction_dataset.csv')

In [7]:
# import libraries                  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from yellowbrick.features import rank2d
from yellowbrick.classifier import discrimination_threshold
import eli5
from sklearn.metrics import confusion_matrix, classification_report
import shap 
import matplotlib.gridspec as gridspec
import itertools
from sklearn.svm import SVC
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions

In [10]:
# Display the loaded DataFrame as the cell output
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [11]:
# Define each epidemiologic measure of association

def measure_association(a,b,c,d):
    """Compute measures of association from the four cells of a 2x2 table.

    Cell layout (exposure in rows, outcome in columns):

                      outcome +   outcome -
        exposed           a           b
        unexposed         c           d

    Parameters
    ----------
    a : number
        Count with positive exposure and positive outcome.
    b : number
        Count with positive exposure and negative outcome.
    c : number
        Count with negative exposure and positive outcome.
    d : number
        Count with negative exposure and negative outcome.

    Returns
    -------
    dict
        The four cells, the marginal totals and grand total, the outcome
        proportion among unexposed / exposed / everyone, the exposure
        prevalence, the risk difference, the relative risk and the odds ratio,
        keyed by descriptive labels.

    Notes
    -----
    A zero denominator (an empty row, an empty table, or ``b*c == 0``) never
    raises: the affected measure is ``nan`` for 0/0 and signed ``inf``
    otherwise. Plain Python ints would otherwise raise ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # Division that returns nan (0/0 or nan/0) or signed inf (x/0)
        # instead of raising when the denominator is zero.
        if denominator == 0:
            if numerator == 0 or numerator != numerator:
                return float('nan')
            return float('inf') if numerator > 0 else float('-inf')
        return numerator / denominator

    exposed = a + b
    unexposed = c + d
    total = exposed + unexposed

    # Outcome proportion (incidence or prevalence, depending on study design)
    I_o = _ratio(c, unexposed)   # among unexposed: the baseline risk
    I_e = _ratio(a, exposed)     # among exposed
    I_t = _ratio(a + c, total)   # in the whole population

    # Ratio measures
    RR = _ratio(I_e, I_o)
    OR = _ratio(a * d, b * c)

    # Difference measure
    RD = I_e - I_o

    # Define each measurement
    data_dict = {'Positive exposure & positive outcome' : a,
                 'Positive exposure & negative outcome' : b,
                 'Negative exposure & positive outcome' : c,
                 'Negative exposure & negative outcome' : d,
                 'Total positive outcome' : a+c,
                 'Total negative outcome' : b+d,
                 'Total positive exposure' : exposed,
                 'Total negative exposure' : unexposed,
                 'Total' : total,
                 'Incidence/Prevalence of outcome among unexposed (baseline risk)' : I_o,
                 'Incidence/Prevalence of outcome among exposed' : I_e,
                 'Incidence/Prevalence of outcome in total population (exposed & unexposed)' : I_t,
                 'Prevalence of exposure in the population': _ratio(exposed, total),
                 'Risk Difference' : RD,
                 'Relative Risk' : RR,
                 'Odds Ratio' : OR}
    return data_dict

In [12]:
# Get the exposure outcome 2x2 table

def get_table(a,b,c,d):
    """Lay the four cell counts out as a 3x3 array with marginal totals.

    Rows are exposed, unexposed and column totals; columns are positive
    outcome, negative outcome and row totals. The bottom-right entry is the
    grand total.
    """
    exposed_row = [a, b, a + b]
    unexposed_row = [c, d, c + d]
    totals_row = [a + c, b + d, a + b + c + d]
    return np.array([exposed_row, unexposed_row, totals_row])

In [15]:
# Inspect the category frequencies of smoking_history, including how many
# rows carry the placeholder value 'No Info'

data['smoking_history'].value_counts()

smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64

In [16]:
# Inspect the category frequencies of gender; this cell only counts values.
# Rows with gender 'Other' are dropped later, in the cleaning cell that builds df.

data['gender'].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [17]:
# Build the analysis subset `df` from `data`: drop rows whose smoking_history
# is 'No Info' (treated as missing), 'not current' or 'ever', and rows whose
# gender is 'Other'.

df = data[
    ~data['smoking_history'].isin(['No Info', 'not current', 'ever'])
    & (data['gender'] != 'Other')
]
# Renumber the surviving rows 0..n-1 so the index has no gaps
df.index = list(range(len(df)))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53730 entries, 0 to 53729
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               53730 non-null  object 
 1   age                  53730 non-null  float64
 2   hypertension         53730 non-null  int64  
 3   heart_disease        53730 non-null  int64  
 4   smoking_history      53730 non-null  object 
 5   bmi                  53730 non-null  float64
 6   HbA1c_level          53730 non-null  float64
 7   blood_glucose_level  53730 non-null  int64  
 8   diabetes             53730 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 4.1+ MB


In [18]:
# Get cleaned data shape as a (rows, columns) tuple

df.shape

(53730, 9)

In [19]:
# Preview the first rows of the cleaned data set

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Male,28.0,0,0,never,27.32,5.7,158,0
2,Female,36.0,0,0,current,23.45,5.0,155,0
3,Male,76.0,1,1,current,20.14,4.8,155,0
4,Female,20.0,0,0,never,27.32,6.6,85,0


In [20]:
# Column names grouped by variable type

numeric = [
    'age',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
]
categorical = [
    'gender',
    'hypertension',
    'heart_disease',
    'smoking_history',
    'diabetes',
]

In [24]:
# Get spread of target variable diabetes (row count per class) in the cleaned data
df['diabetes'].value_counts()

diabetes
0    47846
1     5884
Name: count, dtype: int64

In [25]:
# Get aggregated descriptive statistics on numeric variables;
# .T transposes the summary so each variable is one row

df[numeric].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,53730.0,46.213463,19.556779,0.16,31.0,47.0,61.0,80.0
bmi,53730.0,28.42495,6.561159,10.08,24.5,27.32,31.16,91.82
HbA1c_level,53730.0,5.5629,1.094939,3.5,4.8,5.8,6.2,9.0
blood_glucose_level,53730.0,139.679732,42.162461,80.0,100.0,140.0,159.0,300.0


In [26]:
# Check for a relationship between gender and diabetes
# Let the exposure be whether the participant is a male vs. female

print('Exposure: Gender (is male)')
print('Outcome: Diabetes \n')
gen_table = pd.crosstab(df['gender'], df['diabetes'])
# Read the cells by row/column label instead of by position, so the counts
# cannot be silently swapped if the category order of the crosstab changes.
male_diab = gen_table.loc['Male', 1]
male_no_diab = gen_table.loc['Male', 0]
fem_diab = gen_table.loc['Female', 1]
fem_no_diab = gen_table.loc['Female', 0]
print(gen_table,'\n')
print('2x2 Table Construction: ')
print(get_table(male_diab, male_no_diab, fem_diab, fem_no_diab),'\n')
print('Measures of Association: ')
gen_diab = measure_association(male_diab, male_no_diab, fem_diab, fem_no_diab)
# Bare expression so the measures dict is rendered as the cell output
gen_diab

Exposure: Gender (is male)
Outcome: Diabetes 

diabetes      0     1
gender               
Female    29599  3102
Male      18247  2782 

2x2 Table Construction: 
[[ 2782 18247 21029]
 [ 3102 29599 32701]
 [ 5884 47846 53730]] 

Measures of Association: 


{'Positive exposure & positive outcome': 2782,
 'Positive exposure & negative outcome': 18247,
 'Negative exposure & positive outcome': 3102,
 'Negative exposure & negative outcome': 29599,
 'Total positive outcome': 5884,
 'Total negative outcome': 47846,
 'Total positive exposure': 21029,
 'Total negative exposure': 32701,
 'Total': 53730,
 'Incidence/Prevalence of outcome among unexposed (baseline risk)': 0.09485948441943672,
 'Incidence/Prevalence of outcome among exposed': 0.13229349945313615,
 'Incidence/Prevalence of outcome in total population (exposed & unexposed)': 0.10951051554066629,
 'Prevalence of exposure in the population': 0.3913828401265587,
 'Risk Difference': 0.03743401503369943,
 'Relative Risk': 1.3946259592575774,
 'Odds Ratio': 1.454791982091719}

In [13]:
# Get info on the full, unfiltered dataset (column dtypes and non-null counts)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [14]:
# Summary statistics for the numeric columns of the full dataset
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0
