In [1]:
#import dependencies
import warnings
#silence every warning category for the rest of the session
#NOTE(review): a blanket 'ignore' also hides deprecation/future warnings from pandas - consider narrowing the filter
warnings.filterwarnings('ignore')

In [2]:
#import dependencies
import numpy as np
import pandas as pd
import os
from pathlib import Path
from collections import Counter

In [3]:
#set file path
#built from components so it resolves on POSIX as well as Windows
#(a literal backslash is only a path separator on Windows)
file_path3 = Path('resources') / 'diabetes.csv'

#read file into memory
diabetes_df = pd.read_csv(file_path3)

#validate load
diabetes_df

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [4]:
#Dataframe Column_listing
#show the column labels of the loaded frame
diabetes_df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [5]:
#Determine Null Values
#per-row count of missing values (axis=1 sums across the columns of each row)
nan_count = diabetes_df.isna().sum(axis=1)

#display the dataset-wide total so the null check is actually visible in the cell output
nan_count.sum()

In [6]:
#Check Data types - all fields should be numeric
#lists the dtype pandas inferred for every column
diabetes_df.dtypes

Diabetes_012            float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

In [7]:
#Frequency of each class in the Diabetes_012 column
#0-no diabetes, 1-prediabetes 2-diabetes
#this is the model target - recommend merging 1&2 into a single value so it becomes a binary 'get checked' flag
diabetes_df.Diabetes_012.value_counts()

0.0    213703
2.0     35346
1.0      4631
Name: Diabetes_012, dtype: int64

In [8]:
#share of rows in each Diabetes_012 class
diabetes_df['Diabetes_012'].value_counts(normalize=True)

0.0    0.842412
2.0    0.139333
1.0    0.018255
Name: Diabetes_012, dtype: float64

In [9]:
#collapse Diabetes_012 into two classes
#pd.cut uses right-closed intervals: (-1, 0] gets label 1, (0, 4] gets label 2
status_bin_range = [-1, 0, 4]

#one label per interval
status_bins = [1, 2]

diabetes_df['Diabetes_Status'] = pd.cut(
    x=diabetes_df['Diabetes_012'],
    bins=status_bin_range,
    labels=status_bins,
)

#summary of the new column
diabetes_df['Diabetes_Status'].describe()

count     253680
unique         2
top            1
freq      213703
Name: Diabetes_Status, dtype: int64

In [10]:
#confirm the Diabetes_Status column was appended
diabetes_df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
       'Diabetes_Status'],
      dtype='object')

In [11]:
#preview the first rows, now including Diabetes_Status
diabetes_df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_Status
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,1
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,1
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,1
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,1
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,1


In [12]:
#column-by-column non-null counts and dtypes after adding Diabetes_Status
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 23 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Diabetes_012          253680 non-null  float64 
 1   HighBP                253680 non-null  float64 
 2   HighChol              253680 non-null  float64 
 3   CholCheck             253680 non-null  float64 
 4   BMI                   253680 non-null  float64 
 5   Smoker                253680 non-null  float64 
 6   Stroke                253680 non-null  float64 
 7   HeartDiseaseorAttack  253680 non-null  float64 
 8   PhysActivity          253680 non-null  float64 
 9   Fruits                253680 non-null  float64 
 10  Veggies               253680 non-null  float64 
 11  HvyAlcoholConsump     253680 non-null  float64 
 12  AnyHealthcare         253680 non-null  float64 
 13  NoDocbcCost           253680 non-null  float64 
 14  GenHlth               253680 non-nul

In [13]:
#Frequency of each value in the HighBP column
#0 - no high blood pressure, 1 - high blood pressure
diabetes_df.HighBP.value_counts()


0.0    144851
1.0    108829
Name: HighBP, dtype: int64

In [14]:
#NOTE(review): same info() call as two cells earlier with no transformation in between - candidate for removal
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 23 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Diabetes_012          253680 non-null  float64 
 1   HighBP                253680 non-null  float64 
 2   HighChol              253680 non-null  float64 
 3   CholCheck             253680 non-null  float64 
 4   BMI                   253680 non-null  float64 
 5   Smoker                253680 non-null  float64 
 6   Stroke                253680 non-null  float64 
 7   HeartDiseaseorAttack  253680 non-null  float64 
 8   PhysActivity          253680 non-null  float64 
 9   Fruits                253680 non-null  float64 
 10  Veggies               253680 non-null  float64 
 11  HvyAlcoholConsump     253680 non-null  float64 
 12  AnyHealthcare         253680 non-null  float64 
 13  NoDocbcCost           253680 non-null  float64 
 14  GenHlth               253680 non-nul

In [15]:
#share of rows per HighBP value
diabetes_df['HighBP'].value_counts(normalize=True)

0.0    0.570999
1.0    0.429001
Name: HighBP, dtype: float64

In [16]:
#Frequency of each value in the HighChol column
#0 = no high cholesterol 1 = high cholesterol
diabetes_df.HighChol.value_counts()


0.0    146089
1.0    107591
Name: HighChol, dtype: int64

In [17]:
#share of rows per HighChol value
diabetes_df['HighChol'].value_counts(normalize=True)

0.0    0.575879
1.0    0.424121
Name: HighChol, dtype: float64

In [18]:
#Frequency of each value in the CholCheck column
#0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years
diabetes_df.CholCheck.value_counts()


1.0    244210
0.0      9470
Name: CholCheck, dtype: int64

In [19]:
#share of rows per CholCheck value
diabetes_df['CholCheck'].value_counts(normalize=True)

1.0    0.96267
0.0    0.03733
Name: CholCheck, dtype: float64

In [20]:
#Frequency of each value in the BMI column
#BMI = weight in pounds (lb) / height in inches (in) squared, multiplied by the conversion factor 703
#the user will need to enter the subject's height and weight

diabetes_df.BMI.value_counts()


27.0    24606
26.0    20562
24.0    19550
25.0    17146
28.0    16545
        ...  
85.0        1
91.0        1
86.0        1
90.0        1
78.0        1
Name: BMI, Length: 84, dtype: int64

In [21]:
#summary statistics used to decide on BMI buckets
diabetes_df['BMI'].describe()

count    253680.000000
mean         28.382364
std           6.608694
min          12.000000
25%          24.000000
50%          27.000000
75%          31.000000
max          98.000000
Name: BMI, dtype: float64

In [26]:
#share of rows per BMI value
diabetes_df['BMI'].value_counts(normalize=True)

27.0    0.096996
26.0    0.081055
24.0    0.077066
25.0    0.067589
28.0    0.065220
          ...   
85.0    0.000004
91.0    0.000004
86.0    0.000004
90.0    0.000004
78.0    0.000004
Name: BMI, Length: 84, dtype: float64

In [27]:
# BMI Binning
# Source: https://qpp.cms.gov/docs/QPP_quality_measure_specifications/CQM-Measures/2019_Measure_128_MIPSCQM.pdf
# BMI 1 is less than 18.5
# BMI 2  is 18.5 to 24.9,
# BMI 3  is 25.0 to 29.9
# BMI 4 is 30.0 or higher

In [23]:
# define ranges
# left-closed edges give [0, 18.5), [18.5, 25), [25, 30), [30, inf), matching the documented buckets:
# exactly 18.5 goes to bucket 2 and fractional values such as 24.95 stay in bucket 2
# (the previous right-closed edges 24.9 / 29.9 pushed those into the wrong bucket);
# whole-number BMI values are assigned exactly as before
bmi_bin_range = [0, 18.5, 25, 30, np.inf]

#define bins 
bmi_bins = [1, 2, 3, 4]

#sort data into bins 
diabetes_df['BMI_Range'] = pd.cut(diabetes_df['BMI'], bmi_bin_range, labels=bmi_bins, right=False)

#describe model after binning 
diabetes_df.BMI_Range.describe()

count     253680
unique         4
top            3
freq       93749
Name: BMI_Range, dtype: int64

In [24]:
#Validate columns post binning
#BMI_Range should now appear at the end of the column index
diabetes_df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
       'Diabetes_Status', 'BMI_Range'],
      dtype='object')

In [25]:
#view data
#preview the first rows, now including BMI_Range
diabetes_df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_Status,BMI_Range
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,1,4
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,1,3
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,1,3
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,1,3
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,1,2


In [28]:
#Frequency of each value in the Smoker column
#Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes
diabetes_df.Smoker.value_counts()


0.0    141257
1.0    112423
Name: Smoker, dtype: int64

In [29]:
#share of rows per Smoker value
diabetes_df['Smoker'].value_counts(normalize=True)

0.0    0.556831
1.0    0.443169
Name: Smoker, dtype: float64

In [30]:
#Frequency of each value in the Stroke column
#(Ever told) you had a stroke. 0 = no 1 = yes
diabetes_df.Stroke.value_counts()

0.0    243388
1.0     10292
Name: Stroke, dtype: int64

In [31]:
#share of rows per Stroke value
diabetes_df['Stroke'].value_counts(normalize=True)

0.0    0.959429
1.0    0.040571
Name: Stroke, dtype: float64

In [32]:
#Frequency of each value in the HeartDiseaseorAttack column
#coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes
diabetes_df.HeartDiseaseorAttack.value_counts()


0.0    229787
1.0     23893
Name: HeartDiseaseorAttack, dtype: int64

In [33]:
#share of rows per HeartDiseaseorAttack value
diabetes_df['HeartDiseaseorAttack'].value_counts(normalize=True)

0.0    0.905814
1.0    0.094186
Name: HeartDiseaseorAttack, dtype: float64

In [34]:
#Frequency of each value in the PhysActivity column
#physical activity in past 30 days - not including job 0 = no 1 = yes
diabetes_df.PhysActivity.value_counts()


1.0    191920
0.0     61760
Name: PhysActivity, dtype: int64

In [35]:
#share of rows per PhysActivity value
diabetes_df['PhysActivity'].value_counts(normalize=True)

1.0    0.756544
0.0    0.243456
Name: PhysActivity, dtype: float64

In [36]:
#Frequency of each value in the Fruits column
#Consume Fruit 1 or more times per day 0 = no 1 = yes
diabetes_df.Fruits.value_counts()


1.0    160898
0.0     92782
Name: Fruits, dtype: int64

In [37]:
#share of rows per Fruits value
diabetes_df['Fruits'].value_counts(normalize=True)

1.0    0.634256
0.0    0.365744
Name: Fruits, dtype: float64

In [38]:
#Frequency of each value in the Veggies column
#Consume Vegetables 1 or more times per day 0 = no 1 = yes
diabetes_df.Veggies.value_counts()


1.0    205841
0.0     47839
Name: Veggies, dtype: int64

In [39]:
#share of rows per Veggies value
diabetes_df['Veggies'].value_counts(normalize=True)

1.0    0.81142
0.0    0.18858
Name: Veggies, dtype: float64

In [40]:
#Frequency of each value in the HvyAlcoholConsump column
#Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week) 0 = no 1 = yes
diabetes_df.HvyAlcoholConsump.value_counts()

0.0    239424
1.0     14256
Name: HvyAlcoholConsump, dtype: int64

In [41]:
#share of rows per HvyAlcoholConsump value
diabetes_df['HvyAlcoholConsump'].value_counts(normalize=True)

0.0    0.943803
1.0    0.056197
Name: HvyAlcoholConsump, dtype: float64

In [42]:
#Frequency of each value in the AnyHealthcare column
#Have any kind of health care coverage, including health insurance, prepaid plans such as HMO, etc. 0 = no 1 = yes
diabetes_df.AnyHealthcare.value_counts()

1.0    241263
0.0     12417
Name: AnyHealthcare, dtype: int64

In [43]:
#share of rows per AnyHealthcare value
diabetes_df['AnyHealthcare'].value_counts(normalize=True)

1.0    0.951053
0.0    0.048947
Name: AnyHealthcare, dtype: float64

In [44]:
#Frequency of each value in the NoDocbcCost column
#Was there a time in the past 12 months when you needed to see a doctor but could not because of cost? 0 = no 1 = yes
diabetes_df.NoDocbcCost.value_counts()

0.0    232326
1.0     21354
Name: NoDocbcCost, dtype: int64

In [45]:
#share of rows per NoDocbcCost value
diabetes_df['NoDocbcCost'].value_counts(normalize=True)

0.0    0.915823
1.0    0.084177
Name: NoDocbcCost, dtype: float64

In [46]:
#Frequency of each value in the GenHlth column
#Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
diabetes_df.GenHlth.value_counts()

2.0    89084
3.0    75646
1.0    45299
4.0    31570
5.0    12081
Name: GenHlth, dtype: int64

In [47]:
#share of rows per GenHlth value
diabetes_df['GenHlth'].value_counts(normalize=True)

2.0    0.351167
3.0    0.298195
1.0    0.178567
4.0    0.124448
5.0    0.047623
Name: GenHlth, dtype: float64

In [48]:
#Frequency of each value in the MentHlth column
#Now thinking about your mental health, which includes stress, depression, and problems with
#emotions, for how many days during the past 30 days was your mental health not good? (the counts below include 0 days)
diabetes_df.MentHlth.value_counts()


0.0     175680
2.0      13054
30.0     12088
5.0       9030
1.0       8538
3.0       7381
10.0      6373
15.0      5505
4.0       3789
20.0      3364
7.0       3100
25.0      1188
14.0      1167
6.0        988
8.0        639
12.0       398
28.0       327
21.0       227
29.0       158
18.0        97
9.0         91
16.0        88
27.0        79
22.0        63
17.0        54
26.0        45
11.0        41
13.0        41
23.0        38
24.0        33
19.0        16
Name: MentHlth, dtype: int64

In [49]:
#share of rows per MentHlth value
diabetes_df['MentHlth'].value_counts(normalize=True)

0.0     0.692526
2.0     0.051459
30.0    0.047651
5.0     0.035596
1.0     0.033657
3.0     0.029096
10.0    0.025122
15.0    0.021701
4.0     0.014936
20.0    0.013261
7.0     0.012220
25.0    0.004683
14.0    0.004600
6.0     0.003895
8.0     0.002519
12.0    0.001569
28.0    0.001289
21.0    0.000895
29.0    0.000623
18.0    0.000382
9.0     0.000359
16.0    0.000347
27.0    0.000311
22.0    0.000248
17.0    0.000213
26.0    0.000177
11.0    0.000162
13.0    0.000162
23.0    0.000150
24.0    0.000130
19.0    0.000063
Name: MentHlth, dtype: float64

In [51]:
#### Buckets 
# 0 Days 
# 1-5 Days
# 6-10 Days
# 11-15 Days
# 16-20 Days
# 21-25 Days
# 26-30 Days

In [52]:
#bin edges for pd.cut; intervals are right-closed: (-1, 5], (5, 10], (10, 15], (15, 20], (20, 25], (25, 32]
#NOTE(review): 0 days and 1-5 days both receive label 1 here, while the bucket list above keeps "0 Days" separate - confirm which is intended
mhealth_bin_range = [-1, 5, 10, 15, 20, 25, 32]

#one integer label per interval
mhealth_bins = [1, 2, 3, 4, 5, 6]

diabetes_df['Mental_Health_Range'] = pd.cut(
    x=diabetes_df['MentHlth'],
    bins=mhealth_bin_range,
    labels=mhealth_bins,
)

#summary of the new column
diabetes_df['Mental_Health_Range'].describe()

count     253680
unique         6
top            1
freq      217472
Name: Mental_Health_Range, dtype: int64

In [53]:
#post bucketing column validation
#Mental_Health_Range should now appear at the end of the column index
diabetes_df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
       'Diabetes_Status', 'BMI_Range', 'Mental_Health_Range'],
      dtype='object')

In [54]:
#View data
#preview the first rows, now including Mental_Health_Range
diabetes_df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_Status,BMI_Range,Mental_Health_Range
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,18.0,15.0,1.0,0.0,9.0,4.0,3.0,1,4,4
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,7.0,6.0,1.0,1,3,1
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,30.0,30.0,1.0,0.0,9.0,4.0,8.0,1,3,6
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,11.0,3.0,6.0,1,3,1
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,3.0,0.0,0.0,0.0,11.0,5.0,4.0,1,2,1


In [55]:
#Table info
#column-by-column non-null counts and dtypes after the binning columns were added
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Diabetes_012          253680 non-null  float64 
 1   HighBP                253680 non-null  float64 
 2   HighChol              253680 non-null  float64 
 3   CholCheck             253680 non-null  float64 
 4   BMI                   253680 non-null  float64 
 5   Smoker                253680 non-null  float64 
 6   Stroke                253680 non-null  float64 
 7   HeartDiseaseorAttack  253680 non-null  float64 
 8   PhysActivity          253680 non-null  float64 
 9   Fruits                253680 non-null  float64 
 10  Veggies               253680 non-null  float64 
 11  HvyAlcoholConsump     253680 non-null  float64 
 12  AnyHealthcare         253680 non-null  float64 
 13  NoDocbcCost           253680 non-null  float64 
 14  GenHlth               253680 non-nul

In [None]:
#Frequency of each value in the PhysHlth column
#Now thinking about your physical health, which includes physical illness and injury, for how many days
#during the past 30 days was your physical health not good? scale 1-30 days
diabetes_df.PhysHlth.value_counts()


In [None]:
#share of rows per PhysHlth value
diabetes_df['PhysHlth'].value_counts(normalize=True)

In [None]:
#bins
# 0 Days
# 1-5 Days
# 6-10 Days
# 11-15 Days
# 16-20 Days
# 21-25 Days
# 26-30 Days

In [56]:
#bin edges for pd.cut; intervals are right-closed: (-1, 5], (5, 10], (10, 15], (15, 20], (20, 25], (25, 32]
#NOTE(review): 0 days and 1-5 days both receive label 1 here, while the bins list above keeps "0 Days" separate - confirm which is intended
phealth_bin_range = [-1, 5, 10, 15, 20, 25, 32]

#one integer label per interval
phealth_bins = [1, 2, 3, 4, 5, 6]

diabetes_df['Physical_Health_Range'] = pd.cut(
    x=diabetes_df['PhysHlth'],
    bins=phealth_bin_range,
    labels=phealth_bins,
)

#summary of the new column
diabetes_df['Physical_Health_Range'].describe()

count     253680
unique         6
top            1
freq      206863
Name: Physical_Health_Range, dtype: int64

In [None]:
#confirm the Physical_Health_Range column was appended
diabetes_df.columns

In [None]:
#preview the first rows, now including Physical_Health_Range
diabetes_df.head()

In [None]:
#column-by-column non-null counts and dtypes after adding Physical_Health_Range
diabetes_df.info()

### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
- 0 Days
- 1-5 Days
- 6-10 Days
- 11-15 Days
- 16-20 Days
- 21-25 Days
- 26-30 Days

In [None]:
#Frequency of each value in the DiffWalk column
#Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes
diabetes_df.DiffWalk.value_counts()


In [None]:
#share of rows per DiffWalk value
diabetes_df['DiffWalk'].value_counts(normalize=True)

In [None]:
#Frequency of each value in the Sex column
#0 = female 1 = male
diabetes_df.Sex.value_counts()

In [None]:
#share of rows per Sex value
diabetes_df['Sex'].value_counts(normalize=True)

In [None]:
#Frequency of each value in the Age column
#13-level age category (_AGEG5YR see codebook) 1 = 18-24 9 = 60-64 13 = 80 or older
#codebook: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
diabetes_df.Age.value_counts()

In [None]:
#share of rows per Age category
diabetes_df['Age'].value_counts(normalize=True)

In [None]:
#Frequency of each value in the Education column
#Education level (EDUCA see codebook) scale 1-6:
#1 = Never attended school or only kindergarten, 2 = Grades 1 through 8 (Elementary),
#3 = Grades 9 through 11 (Some high school), 4 = Grade 12 or GED (High school graduate),
#5 = College 1 year to 3 years (Some college or technical school), 6 = College 4 years or more (College graduate)
#codebook: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
diabetes_df.Education.value_counts()


In [None]:
#share of rows per Education level
diabetes_df['Education'].value_counts(normalize=True)

In [None]:
#Frequency of each value in the Income column
#Income scale (INCOME2 see codebook) scale 1-8 1 = less than $10,000 5 = less than $35,000 8 = $75,000 or more
#codebook: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
diabetes_df.Income.value_counts()


In [None]:
#share of rows per Income level
diabetes_df['Income'].value_counts(normalize=True)

In [None]:
#make sure the output folder exists first - to_csv does not create missing directories
output_path = Path('results') / 'diabetes_fullfile.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)

#write the full frame; the row index is written as the first (unnamed) column, as before
diabetes_df.to_csv(output_path, encoding='utf-8')