In [1]:
#import dependencies
import warnings
warnings.filterwarnings('ignore')

In [2]:
#import dependencies
import numpy as np
import pandas as pd
import os
from pathlib import Path
from collections import Counter

## File Considerations

The team downloaded and reviewed multiple datasets to determine which source data would be best for the project being built.  Rational for selecting or not selecting a dataset follows the import and review code blocks

### Diabetes Data Set 

This file was sourced from: https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset

Three datasets are provided on this page, and the team based their decision off of the 'Binary Health Indicators' version of the data.  

This file includese multiple binary attributes that a non medical user would most likely be able to answer about the child they are responding for.  Additionally, the file has over 250,000 records, which will provide substantial training data for our algorithm.  

This dataset was selected for use.  Details of preprocessing steps are included beneath the appropriate code blocks below. 

In [3]:
#set file path
file_path3 = Path(r'resources\diabetes.csv')

#read file into memory
diabetes_df = pd.read_csv(file_path3)

#validate load
diabetes_df

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [4]:
#Column_listing
diabetes_df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [5]:
#Determine Null Values 
nan_count = diabetes_df.isna().sum(axis = 1)

Null values are ont contained in any of the records in the dataframe.  Null cleanup is not necessary. 

In [6]:
#determine count of duplicate values 
dupe_records = len(diabetes_df)-len(diabetes_df.drop_duplicates())

print(f"There are {dupe_records} duplicate records")

There are 23899 duplicate records


In [7]:
#Drop all duplicate records
diabetesnd_df = diabetes_df.drop_duplicates() 

#display count of records after drop
diabetesnd_df.count()

Diabetes_012            229781
HighBP                  229781
HighChol                229781
CholCheck               229781
BMI                     229781
Smoker                  229781
Stroke                  229781
HeartDiseaseorAttack    229781
PhysActivity            229781
Fruits                  229781
Veggies                 229781
HvyAlcoholConsump       229781
AnyHealthcare           229781
NoDocbcCost             229781
GenHlth                 229781
MentHlth                229781
PhysHlth                229781
DiffWalk                229781
Sex                     229781
Age                     229781
Education               229781
Income                  229781
dtype: int64

In [8]:
#Check Data types - all fields should be numeric 
diabetesnd_df.dtypes

Diabetes_012            float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

In [9]:
#Count number of each value in Diabetes 012 column
#0-no diabetes, 1-prediabetes 2-diabetes
#this is what we want our model to train on - recommend combining 1&2 into one value to make it binary 'get checked'
diabetesnd_df['Diabetes_012'].value_counts()

0.0    190055
2.0     35097
1.0      4629
Name: Diabetes_012, dtype: int64

In [10]:
#percentage of each value in Diabetes 012 column
diabetesnd_df.Diabetes_012.value_counts(normalize=True)

0.0    0.827114
2.0    0.152741
1.0    0.020145
Name: Diabetes_012, dtype: float64

In [11]:
#Count number of each value in HighBP Column
# 0 - noHBP, 1 -  HBP 
diabetesnd_df['HighBP'].value_counts()


0.0    125359
1.0    104422
Name: HighBP, dtype: int64

In [12]:
#percentage of each value 
diabetesnd_df.HighBP.value_counts(normalize=True)

0.0    0.545559
1.0    0.454441
Name: HighBP, dtype: float64

In [13]:
#Count number of each value in HighChol Column
#0 = no high cholesterol 1 = high cholesterol
diabetesnd_df['HighChol'].value_counts()


0.0    128273
1.0    101508
Name: HighChol, dtype: int64

In [14]:
#percentage
diabetesnd_df.HighChol.value_counts(normalize=True)

0.0    0.55824
1.0    0.44176
Name: HighChol, dtype: float64

In [15]:
#Count number of each value in CholCheck Column
#0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years
diabetesnd_df['CholCheck'].value_counts()


1.0    220483
0.0      9298
Name: CholCheck, dtype: int64

In [16]:
#percentage
diabetesnd_df.CholCheck.value_counts(normalize=True)

1.0    0.959535
0.0    0.040465
Name: CholCheck, dtype: float64

In [17]:
#Count number of each value in BMI Column
#calculate BMI by dividing weight in pounds (lb) by height in inches (in) squared and multiplying by a conversion factor of 703
#Will need to have user enter height and weight of subject 

diabetesnd_df['BMI'].value_counts()


27.0    21551
26.0    17808
24.0    16537
28.0    14933
25.0    14809
        ...  
85.0        1
91.0        1
86.0        1
90.0        1
78.0        1
Name: BMI, Length: 84, dtype: int64

In [18]:
#Determine bucketing for BMI values 
diabetesnd_df.BMI.describe()

count    229781.00000
mean         28.68567
std           6.78636
min          12.00000
25%          24.00000
50%          27.00000
75%          32.00000
max          98.00000
Name: BMI, dtype: float64

In [19]:
# BMI Binning 

In [25]:
bmi_bins = [0, 18.5, 24.9, 29.9, 100]
diabetesnd_df['bmi_bin'] = pd.cut(diabetesnd_df['BMI'], bmi_bins)
diabetesnd_df.bmi_bin.describe()

count            229781
unique                4
top       (29.9, 100.0]
freq              84942
Name: bmi_bin, dtype: object

In [26]:
diabetesnd_df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
       'bmi_bin'],
      dtype='object')

In [27]:
diabetesnd_df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,bmi_bin
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,"(29.9, 100.0]"
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,"(24.9, 29.9]"
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,"(24.9, 29.9]"
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,"(24.9, 29.9]"
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,"(18.5, 24.9]"


### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
10-19
20-29
30-39
40-49
50-59
60-69
70-79
80-89
90-99

In [21]:
#percentage
diabetesnd_df.BMI.value_counts(normalize=True)

27.0    0.093789
26.0    0.077500
24.0    0.071969
28.0    0.064988
25.0    0.064448
          ...   
85.0    0.000004
91.0    0.000004
86.0    0.000004
90.0    0.000004
78.0    0.000004
Name: BMI, Length: 84, dtype: float64

In [22]:
### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
0 Days 
1-5 Days
6-10 Days
11-15 Days
16-20 Days
21-25 Days
26-30 Days

SyntaxError: invalid syntax (1955888305.py, line 3)

In [None]:
#Count number of each value in Smoker Column
#Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes
diabetesnd_df['Smoker'].value_counts()


In [None]:
#percentage
diabetesnd_df.Smoker.value_counts(normalize=True)

In [None]:
#Count number of each value in Stroke Column
#(Ever told) you had a stroke. 0 = no 1 = yes
diabetesnd_df['Stroke'].value_counts()

In [None]:
#Percentage 
diabetesnd_df.Stroke.value_counts(normalize=True)

In [None]:
#Count number of each value in HeartDiseaseorAttack Column
#coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes
diabetesnd_df['HeartDiseaseorAttack'].value_counts()


In [None]:
#percentage
diabetesnd_df.HeartDiseaseorAttack.value_counts(normalize=True)

In [None]:
#Count number of each value in PhysActivity Column
#physical activity in past 30 days - not including job 0 = no 1 = yes
diabetesnd_df['PhysActivity'].value_counts()


In [None]:
#percentage
diabetesnd_df.PhysActivity.value_counts(normalize=True)

In [None]:
#Count number of each value in Fruits Column
#Consume Fruit 1 or more times per day 0 = no 1 = yes
diabetesnd_df['Fruits'].value_counts()


In [None]:
#percentage
diabetesnd_df.Fruits.value_counts(normalize=True)

In [None]:
#Count number of each value in Veggies Column
#Consume Vegetables 1 or more times per day 0 = no 1 = yes
diabetesnd_df['Veggies'].value_counts()


In [None]:
#percentage
diabetesnd_df.Veggies.value_counts(normalize=True)

In [None]:
#Count number of each value in HvyAlcoholConsump Column
#Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week) 0 = no 1 = yes
diabetesnd_df['HvyAlcoholConsump'].value_counts()

In [None]:
#percentage
diabetesnd_df.HvyAlcoholConsump.value_counts(normalize=True)

In [None]:
#Count number of each value in AnyHealthcare Column
#Have any kind of health care coverage, including health insurance, prepaid plans such as HMO, etc. 0 = no 1 = yes
diabetesnd_df['AnyHealthcare'].value_counts()

In [None]:
#percentage
diabetesnd_df.AnyHealthcare.value_counts(normalize=True)

In [None]:
#Count number of each value in NoDocbcCost Column
#Was there a time in the past 12 months when you needed to see a doctor but could not because of cost? 0 = no 1 = yes
diabetesnd_df['NoDocbcCost'].value_counts()

In [None]:
#percentage
diabetesnd_df.NoDocbcCost.value_counts(normalize=True)

In [None]:
#Count number of each value in GenHlth Column
#Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
diabetesnd_df['GenHlth'].value_counts()

In [None]:
#percentage
diabetesnd_df.GenHlth.value_counts(normalize=True)

In [None]:
#Count number of each value in MentHlth Column
#Now thinking about your mental health, which includes stress, depression, and problems with 
#emotions, for how many days during the past 30 days was your mental health not good? scale 1-30 days
diabetesnd_df['MentHlth'].value_counts()


In [None]:
#percentage
diabetesnd_df.MentHlth.value_counts(normalize=True)

### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
0 Days 
1-5 Days
6-10 Days
11-15 Days
16-20 Days
21-25 Days
26-30 Days

In [None]:
#Count number of each value in PhysHlth Column
#Now thinking about your physical health, which includes physical illness and injury, for how many days 
#during the past 30 days was your physical health not good? scale 1-30 days
diabetesnd_df['PhysHlth'].value_counts()


In [None]:
#percentage
diabetesnd_df.PhysHlth.value_counts(normalize=True)

### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
0 Days 
1-5 Days
6-10 Days
11-15 Days
16-20 Days
21-25 Days
26-30 Days

In [None]:
#Count number of each value in DiffWalk Column
#Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes
diabetesnd_df['DiffWalk'].value_counts()


In [None]:
#percentage
diabetesnd_df.DiffWalk.value_counts(normalize=True)

In [None]:
#Count number of each value in Sex Column
#0 = female 1 = male
diabetesnd_df['Sex'].value_counts()

In [None]:
#percentage
diabetesnd_df.Sex.value_counts(normalize=True)

In [None]:
#Count number of each value in Age Column
#13-level age category (_AGEG5YR see codebook) 1 = 18-24 9 = 60-64 13 = 80 or older
#link to codebook here: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
diabetesnd_df['Age'].value_counts()

In [None]:
#Percentage
diabetesnd_df.Age.value_counts(normalize=True)

In [None]:
#Count number of each value in Education Column
#Education level (EDUCA see codebook) scale 1-6 1 = Never attended school or only kindergarten 
#2 = Grades 1 through 8 (Elementary) 3 = Grades 9 through 11 (Some high school) 4 = Grade 12 or GED (High school graduate) 
#5 = College 1 year to 3 years (Some college or technical school) 6 = College 4 years or more (College graduate)
#link to codebook here: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
diabetesnd_df['Education'].value_counts()


In [None]:
#percentage
diabetesnd_df.Education.value_counts(normalize=True)

In [None]:
#Count number of each value in Income Column
#Income scale (INCOME2 see codebook) scale 1-8 1 = less than $10,000 5 = less than $35,000 8 = $75,000 or more
#link to codebook here https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
diabetesnd_df['Income'].value_counts()


In [None]:
#percentage
diabetesnd_df.Income.value_counts(normalize=True)