In [1]:
#Import the warnings module and silence warnings for this notebook session
import warnings
# NOTE(review): 'ignore' with no category/module argument suppresses every warning raised
# after this cell runs (including library deprecation notices) - confirm this blanket
# suppression is intended rather than a filter on a specific warning category
warnings.filterwarnings('ignore')

In [2]:
#import dependencies
import numpy as np
import pandas as pd
import os
from pathlib import Path
from collections import Counter

## File Considerations

The team downloaded and reviewed multiple datasets to determine which source data would be best for the project being built. The rationale for selecting or rejecting each dataset accompanies its import and review code blocks.

### Cancer Data Set Review 
This file was sourced from: https://data.world/cancerdatahp/lung-cancer-data

Although the file contains attributes that a non-medical user would be able to answer about themselves, the team felt there was not enough data to build a robust testing set (there were only 1,000 records).

This dataset was not selected for use.

In [3]:
#set file path
# Join the components with "/" so the path resolves on every OS; a raw
# 'resources\...' string is only treated as folder + file on Windows.
file_path = Path('resources') / 'Cancer_Training_Data.csv'

#read file into memory
cancer_df = pd.read_csv(file_path)

#validate load (the notebook shows a truncated view of the frame)
cancer_df

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,P995,44,1,6,7,7,7,7,6,7,...,5,3,2,7,8,2,4,5,3,High
996,P996,37,2,6,8,7,7,7,6,7,...,9,6,5,7,2,4,3,1,4,High
997,P997,25,2,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
998,P998,18,2,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


In [4]:
# List every column label available in the cancer data
cancer_columns = cancer_df.columns
cancer_columns

Index(['Patient Id', 'Age', 'Gender', 'Air Pollution', 'Alcohol use',
       'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk',
       'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Smoking',
       'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue',
       'Weight Loss', 'Shortness of Breath', 'Wheezing',
       'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold',
       'Dry Cough', 'Snoring', 'Level'],
      dtype='object')

### Cardiovascular Data Set Review 

This file was sourced from: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

This file includes attributes that a non-medical user would most likely not be able to answer about themselves.  Additionally, the team felt there was not enough data to build a robust testing set (there were only 918 records).

This dataset was not selected for use.


In [5]:
#set file path
# Join the components with "/" so the path resolves on every OS; a raw
# 'resources\...' string is only treated as folder + file on Windows.
file_path2 = Path('resources') / 'heart.csv'

#read file into memory
heart_df = pd.read_csv(file_path2)

#validate load (the notebook shows a truncated view of the frame)
heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [6]:
# List every column label available in the heart data
heart_columns = heart_df.columns
heart_columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

### Diabetes Data Set 

This file was sourced from: https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset

Three datasets are provided on this page, and the team based their decision off of the 'Binary Health Indicators' version of the data.  

This file includes multiple binary attributes that a non-medical user would most likely be able to answer about the child they are responding for.  Additionally, the file has over 250,000 records, which will provide substantial training data for our algorithm.  

This dataset was selected for use.  Details of preprocessing steps are included beneath the appropriate code blocks below. 

In [7]:
#set file path
# Join the components with "/" so the path resolves on every OS; a raw
# 'resources\...' string is only treated as folder + file on Windows.
file_path3 = Path('resources') / 'diabetes.csv'

#read file into memory
diabetes_df = pd.read_csv(file_path3)

#validate load (the notebook shows a truncated view of the frame)
diabetes_df

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [8]:
# List every column label available in the diabetes data
diabetes_columns = diabetes_df.columns
diabetes_columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [9]:
#Determine Null Values 
# nan_count holds, for each row, how many of its columns are null
# (isna() marks missing cells; sum(axis=1) adds the marks across each row)
nan_count = diabetes_df.isna().sum(axis = 1)

# Report the result so the conclusion about nulls is backed by visible output;
# previously the counts were computed but never displayed
rows_with_nulls = int((nan_count > 0).sum())
print(f"There are {rows_with_nulls} records containing null values")

Null values are not contained in any of the records in the dataframe.  Null cleanup is not necessary. 

In [10]:
# Count the duplicate rows: duplicated() flags each row that repeats an
# earlier one, so summing the flags gives the number of surplus records
dupe_records = int(diabetes_df.duplicated().sum())

print(f"There are {dupe_records} duplicate records")

There are 23899 duplicate records


In [11]:
# Build the de-duplicated frame, keeping the first occurrence of each distinct row
diabetesnd_df = diabetes_df.drop_duplicates(keep='first')

# Show the number of non-null entries per column once duplicates are gone
remaining_per_column = diabetesnd_df.count()
remaining_per_column

Diabetes_012            229781
HighBP                  229781
HighChol                229781
CholCheck               229781
BMI                     229781
Smoker                  229781
Stroke                  229781
HeartDiseaseorAttack    229781
PhysActivity            229781
Fruits                  229781
Veggies                 229781
HvyAlcoholConsump       229781
AnyHealthcare           229781
NoDocbcCost             229781
GenHlth                 229781
MentHlth                229781
PhysHlth                229781
DiffWalk                229781
Sex                     229781
Age                     229781
Education               229781
Income                  229781
dtype: int64

In [12]:
# Inspect the dtype of each column - every field is expected to be numeric
column_dtypes = diabetesnd_df.dtypes
column_dtypes

Diabetes_012            float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

In [13]:
# Tally of each class in the Diabetes_012 column
#   0 = no diabetes, 1 = prediabetes, 2 = diabetes
# This is the value the model should train on - recommendation: combine 1 and 2
# into a single value so the target becomes a binary 'get checked' flag
diabetes_counts = diabetesnd_df['Diabetes_012'].value_counts()
diabetes_counts

0.0    190055
2.0     35097
1.0      4629
Name: Diabetes_012, dtype: int64

In [14]:
# Fraction of records in each Diabetes_012 class
diabetes_share = diabetesnd_df['Diabetes_012'].value_counts(normalize=True)
diabetes_share

0.0    0.827114
2.0    0.152741
1.0    0.020145
Name: Diabetes_012, dtype: float64

In [15]:
# Tally of each value in the HighBP column
#   0 = no high blood pressure, 1 = high blood pressure
highbp_counts = diabetesnd_df['HighBP'].value_counts()
highbp_counts


0.0    125359
1.0    104422
Name: HighBP, dtype: int64

In [16]:
# Fraction of records with each HighBP value
highbp_share = diabetesnd_df['HighBP'].value_counts(normalize=True)
highbp_share

0.0    0.545559
1.0    0.454441
Name: HighBP, dtype: float64

In [17]:
# Tally of each value in the HighChol column
#   0 = no high cholesterol, 1 = high cholesterol
highchol_counts = diabetesnd_df['HighChol'].value_counts()
highchol_counts


0.0    128273
1.0    101508
Name: HighChol, dtype: int64

In [18]:
# Fraction of records with each HighChol value
highchol_share = diabetesnd_df['HighChol'].value_counts(normalize=True)
highchol_share

0.0    0.55824
1.0    0.44176
Name: HighChol, dtype: float64

In [19]:
# Tally of each value in the CholCheck column
#   0 = no cholesterol check in 5 years, 1 = cholesterol check in 5 years
cholcheck_counts = diabetesnd_df['CholCheck'].value_counts()
cholcheck_counts


1.0    220483
0.0      9298
Name: CholCheck, dtype: int64

In [20]:
# Fraction of records with each CholCheck value
cholcheck_share = diabetesnd_df['CholCheck'].value_counts(normalize=True)
cholcheck_share

1.0    0.959535
0.0    0.040465
Name: CholCheck, dtype: float64

In [21]:
# Tally of each value in the BMI column
# BMI = weight in pounds (lb) / (height in inches (in)) squared, multiplied by
# a conversion factor of 703
# The user will need to enter the subject's height and weight
bmi_counts = diabetesnd_df['BMI'].value_counts()
bmi_counts


27.0    21551
26.0    17808
24.0    16537
28.0    14933
25.0    14809
        ...  
85.0        1
91.0        1
86.0        1
90.0        1
78.0        1
Name: BMI, Length: 84, dtype: int64

In [64]:
# Summary statistics for BMI, used to decide how the values should be bucketed
bmi_summary = diabetesnd_df['BMI'].describe()
bmi_summary

count    229781.00000
mean         28.68567
std           6.78636
min          12.00000
25%          24.00000
50%          27.00000
75%          32.00000
max          98.00000
Name: BMI, dtype: float64

### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
10-19
20-29
30-39
40-49
50-59
60-69
70-79
80-89
90-99

In [22]:
# Fraction of records with each BMI value
bmi_share = diabetesnd_df['BMI'].value_counts(normalize=True)
bmi_share

27.0    0.093789
26.0    0.077500
24.0    0.071969
28.0    0.064988
25.0    0.064448
          ...   
85.0    0.000004
91.0    0.000004
86.0    0.000004
90.0    0.000004
78.0    0.000004
Name: BMI, Length: 84, dtype: float64

In [None]:
# NOTE: this code cell contained Markdown-style notes ("0 Days", "1-5 Days", ...)
# that are not valid Python and raise a SyntaxError when the cell is run.
# The notes are kept as comments so the notebook can be executed top to bottom.
# NOTE(review): these are day-count buckets, identical to the lists shown after the
# MentHlth and PhysHlth cells - presumably they belong there rather than next to
# BMI; confirm and move the text to a Markdown cell.
### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
# 0 Days 
# 1-5 Days
# 6-10 Days
# 11-15 Days
# 16-20 Days
# 21-25 Days
# 26-30 Days

In [23]:
# Tally of each value in the Smoker column
# Question: have you smoked at least 100 cigarettes in your entire life?
# [Note: 5 packs = 100 cigarettes]   0 = no, 1 = yes
smoker_counts = diabetesnd_df['Smoker'].value_counts()
smoker_counts


0.0    122781
1.0    107000
Name: Smoker, dtype: int64

In [24]:
# Fraction of records with each Smoker value
smoker_share = diabetesnd_df['Smoker'].value_counts(normalize=True)
smoker_share

0.0    0.534339
1.0    0.465661
Name: Smoker, dtype: float64

In [25]:
# Tally of each value in the Stroke column
# (Ever told) you had a stroke: 0 = no, 1 = yes
stroke_counts = diabetesnd_df['Stroke'].value_counts()
stroke_counts

0.0    219497
1.0     10284
Name: Stroke, dtype: int64

In [26]:
# Fraction of records with each Stroke value
stroke_share = diabetesnd_df['Stroke'].value_counts(normalize=True)
stroke_share

0.0    0.955244
1.0    0.044756
Name: Stroke, dtype: float64

In [27]:
# Tally of each value in the HeartDiseaseorAttack column
# Coronary heart disease (CHD) or myocardial infarction (MI): 0 = no, 1 = yes
heart_disease_counts = diabetesnd_df['HeartDiseaseorAttack'].value_counts()
heart_disease_counts


0.0    206064
1.0     23717
Name: HeartDiseaseorAttack, dtype: int64

In [28]:
# Fraction of records with each HeartDiseaseorAttack value
heart_disease_share = diabetesnd_df['HeartDiseaseorAttack'].value_counts(normalize=True)
heart_disease_share

0.0    0.896784
1.0    0.103216
Name: HeartDiseaseorAttack, dtype: float64

In [29]:
# Tally of each value in the PhysActivity column
# Physical activity in the past 30 days, not including job: 0 = no, 1 = yes
physactivity_counts = diabetesnd_df['PhysActivity'].value_counts()
physactivity_counts


1.0    168511
0.0     61270
Name: PhysActivity, dtype: int64

In [30]:
# Fraction of records with each PhysActivity value
physactivity_share = diabetesnd_df['PhysActivity'].value_counts(normalize=True)
physactivity_share

1.0    0.733355
0.0    0.266645
Name: PhysActivity, dtype: float64

In [31]:
# Tally of each value in the Fruits column
# Consumes fruit 1 or more times per day: 0 = no, 1 = yes
fruits_counts = diabetesnd_df['Fruits'].value_counts()
fruits_counts


1.0    140848
0.0     88933
Name: Fruits, dtype: int64

In [32]:
# Fraction of records with each Fruits value
fruits_share = diabetesnd_df['Fruits'].value_counts(normalize=True)
fruits_share

1.0    0.612966
0.0    0.387034
Name: Fruits, dtype: float64

In [33]:
# Tally of each value in the Veggies column
# Consumes vegetables 1 or more times per day: 0 = no, 1 = yes
veggies_counts = diabetesnd_df['Veggies'].value_counts()
veggies_counts


1.0    182633
0.0     47148
Name: Veggies, dtype: int64

In [34]:
# Fraction of records with each Veggies value
veggies_share = diabetesnd_df['Veggies'].value_counts(normalize=True)
veggies_share

1.0    0.794813
0.0    0.205187
Name: Veggies, dtype: float64

In [35]:
# Tally of each value in the HvyAlcoholConsump column
# Heavy drinkers (adult men having more than 14 drinks per week and adult women
# having more than 7 drinks per week): 0 = no, 1 = yes
alcohol_counts = diabetesnd_df['HvyAlcoholConsump'].value_counts()
alcohol_counts

0.0    215831
1.0     13950
Name: HvyAlcoholConsump, dtype: int64

In [36]:
# Fraction of records with each HvyAlcoholConsump value
alcohol_share = diabetesnd_df['HvyAlcoholConsump'].value_counts(normalize=True)
alcohol_share

0.0    0.93929
1.0    0.06071
Name: HvyAlcoholConsump, dtype: float64

In [37]:
# Tally of each value in the AnyHealthcare column
# Has any kind of health care coverage, including health insurance, prepaid
# plans such as HMO, etc.: 0 = no, 1 = yes
healthcare_counts = diabetesnd_df['AnyHealthcare'].value_counts()
healthcare_counts

1.0    217390
0.0     12391
Name: AnyHealthcare, dtype: int64

In [38]:
# Fraction of records with each AnyHealthcare value
healthcare_share = diabetesnd_df['AnyHealthcare'].value_counts(normalize=True)
healthcare_share

1.0    0.946075
0.0    0.053925
Name: AnyHealthcare, dtype: float64

In [39]:
# Tally of each value in the NoDocbcCost column
# Question: was there a time in the past 12 months when you needed to see a
# doctor but could not because of cost?   0 = no, 1 = yes
nodoc_counts = diabetesnd_df['NoDocbcCost'].value_counts()
nodoc_counts

0.0    208455
1.0     21326
Name: NoDocbcCost, dtype: int64

In [40]:
# Fraction of records with each NoDocbcCost value
nodoc_share = diabetesnd_df['NoDocbcCost'].value_counts(normalize=True)
nodoc_share

0.0    0.90719
1.0    0.09281
Name: NoDocbcCost, dtype: float64

In [41]:
# Tally of each value in the GenHlth column
# Question: would you say that in general your health is (scale 1-5):
#   1 = excellent, 2 = very good, 3 = good, 4 = fair, 5 = poor
genhlth_counts = diabetesnd_df['GenHlth'].value_counts()
genhlth_counts

2.0    77536
3.0    73714
1.0    34907
4.0    31546
5.0    12078
Name: GenHlth, dtype: int64

In [42]:
# Fraction of records with each GenHlth value
genhlth_share = diabetesnd_df['GenHlth'].value_counts(normalize=True)
genhlth_share

2.0    0.337434
3.0    0.320801
1.0    0.151914
4.0    0.137287
5.0    0.052563
Name: GenHlth, dtype: float64

In [43]:
#Count number of each value in MentHlth Column
#Survey question: Now thinking about your mental health, which includes stress, depression, and problems with 
#emotions, for how many days during the past 30 days was your mental health not good?
#Values are day counts on a 0-30 scale (0.0 appears in the counts shown below this cell)
diabetesnd_df['MentHlth'].value_counts()


0.0     152623
2.0      12697
30.0     12080
5.0       8913
1.0       8309
3.0       7302
10.0      6352
15.0      5501
4.0       3774
20.0      3362
7.0       3090
25.0      1188
14.0      1167
6.0        988
8.0        639
12.0       398
28.0       327
21.0       227
29.0       158
18.0        97
9.0         91
16.0        88
27.0        79
22.0        63
17.0        54
26.0        45
11.0        41
13.0        41
23.0        38
24.0        33
19.0        16
Name: MentHlth, dtype: int64

In [44]:
# Fraction of records with each MentHlth value
menthlth_share = diabetesnd_df['MentHlth'].value_counts(normalize=True)
menthlth_share

0.0     0.664211
2.0     0.055257
30.0    0.052572
5.0     0.038789
1.0     0.036161
3.0     0.031778
10.0    0.027644
15.0    0.023940
4.0     0.016424
20.0    0.014631
7.0     0.013448
25.0    0.005170
14.0    0.005079
6.0     0.004300
8.0     0.002781
12.0    0.001732
28.0    0.001423
21.0    0.000988
29.0    0.000688
18.0    0.000422
9.0     0.000396
16.0    0.000383
27.0    0.000344
22.0    0.000274
17.0    0.000235
26.0    0.000196
11.0    0.000178
13.0    0.000178
23.0    0.000165
24.0    0.000144
19.0    0.000070
Name: MentHlth, dtype: float64

### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
0 Days 
1-5 Days
6-10 Days
11-15 Days
16-20 Days
21-25 Days
26-30 Days

In [45]:
#Count number of each value in PhysHlth Column
#Survey question: Now thinking about your physical health, which includes physical illness and injury, for how many days 
#during the past 30 days was your physical health not good?
#Values are day counts on a 0-30 scale (0.0 appears in the counts shown below this cell)
diabetesnd_df['PhysHlth'].value_counts()


0.0     136877
30.0     19386
2.0      14495
1.0      11074
3.0       8435
5.0       7597
10.0      5588
15.0      4914
7.0       4531
4.0       4521
20.0      3273
14.0      2584
25.0      1336
6.0       1328
8.0        809
21.0       663
12.0       578
28.0       522
29.0       215
9.0        179
18.0       152
16.0       112
27.0        99
17.0        96
24.0        72
22.0        70
26.0        69
13.0        68
11.0        60
23.0        56
19.0        22
Name: PhysHlth, dtype: int64

In [46]:
# Fraction of records with each PhysHlth value
physhlth_share = diabetesnd_df['PhysHlth'].value_counts(normalize=True)
physhlth_share

0.0     0.595685
30.0    0.084367
2.0     0.063082
1.0     0.048194
3.0     0.036709
5.0     0.033062
10.0    0.024319
15.0    0.021386
7.0     0.019719
4.0     0.019675
20.0    0.014244
14.0    0.011245
25.0    0.005814
6.0     0.005779
8.0     0.003521
21.0    0.002885
12.0    0.002515
28.0    0.002272
29.0    0.000936
9.0     0.000779
18.0    0.000661
16.0    0.000487
27.0    0.000431
17.0    0.000418
24.0    0.000313
22.0    0.000305
26.0    0.000300
13.0    0.000296
11.0    0.000261
23.0    0.000244
19.0    0.000096
Name: PhysHlth, dtype: float64

### Recommend bucketing this value to reduce the number of columns 
#### Buckets 
0 Days 
1-5 Days
6-10 Days
11-15 Days
16-20 Days
21-25 Days
26-30 Days

In [47]:
# Tally of each value in the DiffWalk column
# Question: do you have serious difficulty walking or climbing stairs?
#   0 = no, 1 = yes
diffwalk_counts = diabetesnd_df['DiffWalk'].value_counts()
diffwalk_counts


0.0    187155
1.0     42626
Name: DiffWalk, dtype: int64

In [48]:
# Fraction of records with each DiffWalk value
diffwalk_share = diabetesnd_df['DiffWalk'].value_counts(normalize=True)
diffwalk_share

0.0    0.814493
1.0    0.185507
Name: DiffWalk, dtype: float64

In [49]:
# Tally of each value in the Sex column
#   0 = female, 1 = male
sex_counts = diabetesnd_df['Sex'].value_counts()
sex_counts

0.0    128854
1.0    100927
Name: Sex, dtype: int64

In [50]:
# Fraction of records with each Sex value
sex_share = diabetesnd_df['Sex'].value_counts(normalize=True)
sex_share

0.0    0.560769
1.0    0.439231
Name: Sex, dtype: float64

In [51]:
# Tally of each value in the Age column
# 13-level age category (_AGEG5YR, see codebook):
#   1 = 18-24, 9 = 60-64, 13 = 80 or older
# Codebook: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
age_counts = diabetesnd_df['Age'].value_counts()
age_counts

9.0     29736
10.0    29168
8.0     27301
7.0     23140
11.0    22041
6.0     17299
13.0    16813
12.0    15394
5.0     14050
4.0     12234
3.0     10025
2.0      7068
1.0      5512
Name: Age, dtype: int64

In [52]:
# Fraction of records in each Age category
age_share = diabetesnd_df['Age'].value_counts(normalize=True)
age_share

9.0     0.129410
10.0    0.126938
8.0     0.118813
7.0     0.100705
11.0    0.095922
6.0     0.075285
13.0    0.073170
12.0    0.066994
5.0     0.061145
4.0     0.053242
3.0     0.043628
2.0     0.030760
1.0     0.023988
Name: Age, dtype: float64

In [53]:
# Tally of each value in the Education column
# Education level (EDUCA, see codebook), scale 1-6:
#   1 = Never attended school or only kindergarten
#   2 = Grades 1 through 8 (Elementary)
#   3 = Grades 9 through 11 (Some high school)
#   4 = Grade 12 or GED (High school graduate)
#   5 = College 1 year to 3 years (Some college or technical school)
#   6 = College 4 years or more (College graduate)
# Codebook: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
education_counts = diabetesnd_df['Education'].value_counts()
education_counts


6.0    88443
5.0    66499
4.0    61158
3.0     9467
2.0     4040
1.0      174
Name: Education, dtype: int64

In [54]:
# Fraction of records at each Education level
education_share = diabetesnd_df['Education'].value_counts(normalize=True)
education_share

6.0    0.384901
5.0    0.289402
4.0    0.266158
3.0    0.041200
2.0    0.017582
1.0    0.000757
Name: Education, dtype: float64

In [55]:
# Tally of each value in the Income column
# Income scale (INCOME2, see codebook), scale 1-8:
#   1 = less than $10,000, 5 = less than $35,000, 8 = $75,000 or more
# Codebook: https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf
income_counts = diabetesnd_df['Income'].value_counts()
income_counts


8.0    71818
7.0    40189
6.0    35001
5.0    25345
4.0    19957
3.0    15922
2.0    11757
1.0     9792
Name: Income, dtype: int64

In [56]:
# Fraction of records at each Income level
income_share = diabetesnd_df['Income'].value_counts(normalize=True)
income_share

8.0    0.312550
7.0    0.174901
6.0    0.152323
5.0    0.110301
4.0    0.086852
3.0    0.069292
2.0    0.051166
1.0    0.042614
Name: Income, dtype: float64