In [1]:
import pandas as pd

In [2]:
df2019 = pd.read_csv('/content/drive/MyDrive/DM_Fall24_Project/brfss2019.csv')

In [3]:
df2019.shape

(287999, 343)

In [4]:
df2019.dtypes

Unnamed: 0,0
Unnamed: 0,int64
x.state,int64
fmonth,int64
idate,int64
imonth,int64
...,...
x.fruite1,int64
x.vegete1,int64
x.flshot7,float64
x.pneumo3,float64


In [5]:
# select specific columns
df2019_selected = df2019[['diabete4',
                          'x.rfhype5',
                          'toldhi2',
                          'cholchk2',
                          'x.bmi5',
                          'smoke100',
                          'cvdstrk3',
                          'x.michd',
                          'x.totinda',
                          'x.frtlt1a',
                          'x.veglt1a',
                          'x.rfdrhv7',
                          'hlthpln1',
                          'medcost',
                          'genhlth',
                          'menthlth',
                          'physhlth',
                          'diffwalk',
                          'sexvar',
                          'x.ageg5yr',
                          'educa',
                          'income2' ]]

In [6]:
df2019_selected.shape

(287999, 22)

In [7]:
#Drop Missing Values - knocks 45,000 rows out right away
df2019_selected = df2019_selected.dropna()
df2019_selected.shape

(242186, 22)

In [8]:
#Rename the columns to make them more readable
brfss2019 = df2019_selected.rename(columns =
                                   {'diabete4':'Diabetes',
                                    'x.rfhype5':'HighBloodPressure',
                                    'toldhi2':'HighCholesterol',
                                    'cholchk2':'CholesterolCheck',
                                    'x.bmi5':'BodyMassIndex',
                                    'smoke100':'Smoker',
                                    'cvdstrk3':'HadStroke',
                                    'x.michd':'HadHeartDiseaseorAtack',
                                    'x.totinda':'PhysicallyActive',
                                    'x.frtlt1a':'ConsumesFruits',
                                    'x.veglt1a':'ConsumesVeggies',
                                    'x.rfdrhv7':'HeavyAlcoholConsumption',
                                    'hlthpln1':'HaveHealthCoverage',
                                    'medcost':'HaveHealthFinancialIssues',
                                    'genhlth':'GeneralHealthCondition',
                                    'menthlth':'MentalHealthCondition',
                                    'physhlth':'PhysicalHealthCondition',
                                    'diffwalk':'DifficultyInWalking',
                                    'sexvar':'Gender',
                                    'x.ageg5yr':'AgeBand',
                                    'educa':'HighestLevelOfEducation',
                                    'income2':'IncomeLevel' })

In [9]:
brfss2019.Diabetes.value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
3.0,199718
1.0,34646
4.0,5486
2.0,2024
7.0,274
9.0,38


In [10]:
# Diabetes
# going to make this ordinal. 0 is for no diabetes or only during pregnancy, 1 is for pre-diabetes or borderline diabetes, 2 is for yes diabetes
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2019['Diabetes'] = brfss2019['Diabetes'].replace({2:0, 3:0, 1:2, 4:1})
brfss2019 = brfss2019[brfss2019.Diabetes != 7]
brfss2019 = brfss2019[brfss2019.Diabetes != 9]
brfss2019.Diabetes.unique()

array([0., 2., 1.])

In [11]:
# 1 HighBloodPressure
#Change 1 to 0 so it represetnts No high blood pressure and 2 to 1 so it represents high blood pressure
brfss2019['HighBloodPressure'] = brfss2019['HighBloodPressure'].replace({1:0, 2:1})
brfss2019 = brfss2019[brfss2019.HighBloodPressure != 9]
brfss2019.HighBloodPressure.unique()

array([1, 0])

In [12]:
# 2 HighCholesterol
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2019['HighCholesterol'] = brfss2019['HighCholesterol'].replace({2:0})
brfss2019 = brfss2019[brfss2019.HighCholesterol != 7]
brfss2019 = brfss2019[brfss2019.HighCholesterol != 9]
brfss2019.HighCholesterol.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2019['HighCholesterol'] = brfss2019['HighCholesterol'].replace({2:0})


array([1., 0.])

In [13]:
# 3 CholesterolCheck
# Change 3 to 0 and 2 to 0 for Not checked cholesterol in past 5 years
# Remove 9
brfss2019['CholesterolCheck'] = brfss2019['CholesterolCheck'].replace({3:0,2:0})
brfss2019 = brfss2019[brfss2019.CholesterolCheck != 9]
brfss2019.CholesterolCheck.unique()

array([0., 4., 7., 8., 5., 6.])

In [14]:
# 4 BodyMassIndex (no changes, just note that these are BMI * 100. So for example a BMI of 4018 is really 40.18)
brfss2019['BodyMassIndex'] = brfss2019['BodyMassIndex'].div(100).round(0)
brfss2019.BodyMassIndex.unique()

array([ 28.,  19.,  32.,  21.,  33.,  31.,  24.,  17.,  22.,  23.,  26.,
        25.,  27.,  35.,  36.,  30.,  16.,  34.,  41.,  29.,  37.,  20.,
        42.,  43.,  39.,  55.,  38.,  40.,  18.,  50.,  12.,  45.,  53.,
        48.,  47.,  44.,  52.,  46.,  51.,  57.,  49.,  74.,  59.,  60.,
        15.,  14.,  54.,  58.,  61.,  63.,  67.,  13.,  70.,  72.,  56.,
        62.,  64.,  68.,  69.,  71.,  76.,  73.,  65.,  66.,  92.,  86.,
        77.,  75.,  79.,  80.,  82., 100.,  84.,  87.,  88.,  81.,  78.,
        97.,  89.,  83.,  99.,  96.,  91.,  98.])

In [15]:
# 5 Smoker
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2019['Smoker'] = brfss2019['Smoker'].replace({2:0})
brfss2019 = brfss2019[brfss2019.Smoker != 7]
brfss2019 = brfss2019[brfss2019.Smoker != 9]
brfss2019.Smoker.unique()

array([1., 0.])

In [16]:
# 6 HadStroke
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2019['HadStroke'] = brfss2019['HadStroke'].replace({2:0})
brfss2019 = brfss2019[brfss2019.HadStroke != 7]
brfss2019 = brfss2019[brfss2019.HadStroke != 9]
brfss2019.HadStroke.unique()

array([0., 1.])

In [17]:
# 7 HadHeartDiseaseorAtack
# Change 2 to 0 because this means did not have MI or CHD
brfss2019['HadHeartDiseaseorAtack'] = brfss2019['HadHeartDiseaseorAtack'].replace({2: 0})
brfss2019.HadHeartDiseaseorAtack.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2019['HadHeartDiseaseorAtack'] = brfss2019['HadHeartDiseaseorAtack'].replace({2: 0})


array([0., 1.])

In [18]:
# 8 PhysicallyActive
# 1 for physical activity
# change 2 to 0 for no physical activity
# Remove all 9 (don't know/refused)
brfss2019['PhysicallyActive'] = brfss2019['PhysicallyActive'].replace({2:0})
brfss2019 = brfss2019[brfss2019.PhysicallyActive != 9]
brfss2019.PhysicallyActive.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2019['PhysicallyActive'] = brfss2019['PhysicallyActive'].replace({2:0})


array([0, 1])

In [19]:
# 9 ConsumesFruits
# Change 2 to 0. this means no fruit consumed per day. 1 will mean consumed 1 or more pieces of fruit per day
# remove all dont knows and missing 9
brfss2019['ConsumesFruits'] = brfss2019['ConsumesFruits'].replace({2:0})
brfss2019 = brfss2019[brfss2019.ConsumesFruits != 9]
brfss2019.ConsumesFruits.unique()

array([1, 0])

In [20]:
# 10 ConsumesVeggies
# Change 2 to 0. this means no vegetables consumed per day. 1 will mean consumed 1 or more pieces of vegetable per day
# remove all dont knows and missing 9
brfss2019['ConsumesVeggies'] = brfss2019['ConsumesVeggies'].replace({2:0})
brfss2019 = brfss2019[brfss2019.ConsumesVeggies != 9]
brfss2019.ConsumesVeggies.unique()

array([1, 0])

In [21]:
# 11 HeavyAlcoholConsumption
# Change 1 to 0 (1 was no for heavy drinking). change all 2 to 1 (2 was yes for heavy drinking)
# remove all dont knows and missing 9
brfss2019['HeavyAlcoholConsumption'] = brfss2019['HeavyAlcoholConsumption'].replace({1:0, 2:1})
brfss2019 = brfss2019[brfss2019.HeavyAlcoholConsumption != 9]
brfss2019.HeavyAlcoholConsumption.unique()

array([0, 1])

In [22]:
# 12 HaveHealthCoverage
# 1 is yes, change 2 to 0 because it is No health care access
# remove 7 and 9 for don't know or refused
brfss2019['HaveHealthCoverage'] = brfss2019['HaveHealthCoverage'].replace({2:0})
brfss2019 = brfss2019[brfss2019.HaveHealthCoverage != 7]
brfss2019 = brfss2019[brfss2019.HaveHealthCoverage != 9]
brfss2019.HaveHealthCoverage.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2019['HaveHealthCoverage'] = brfss2019['HaveHealthCoverage'].replace({2:0})


array([1., 0.])

In [23]:
# 13 HaveHealthFinancialIssues
# Change 2 to 0 for no, 1 is already yes
# remove 7 for don/t know and 9 for refused
brfss2019['HaveHealthFinancialIssues'] = brfss2019['HaveHealthFinancialIssues'].replace({2:0})
brfss2019 = brfss2019[brfss2019.HaveHealthFinancialIssues != 7]
brfss2019 = brfss2019[brfss2019.HaveHealthFinancialIssues != 9]
brfss2019.HaveHealthFinancialIssues.unique()

array([0., 1.])

In [24]:
# 14 GeneralHealthCondition
# This is an ordinal variable that I want to keep (1 is Excellent -> 5 is Poor)
# Remove 7 and 9 for don't know and refused
brfss2019 = brfss2019[brfss2019.GeneralHealthCondition != 7]
brfss2019 = brfss2019[brfss2019.GeneralHealthCondition != 9]
brfss2019.GeneralHealthCondition.unique()

array([3., 4., 2., 1., 5.])

In [25]:
# 15 MentalHealthCondition
# already in days so keep that, scale will be 0-30
# change 88 to 0 because it means none (no bad mental health days)
# remove 77 and 99 for don't know not sure and refused
brfss2019['MentalHealthCondition'] = brfss2019['MentalHealthCondition'].replace({88:0})
brfss2019 = brfss2019[brfss2019.MentalHealthCondition != 77]
brfss2019 = brfss2019[brfss2019.MentalHealthCondition != 99]
brfss2019.MentalHealthCondition.unique()

array([ 0., 30.,  4.,  1.,  2., 15.,  5.,  7., 10.,  3., 25.,  6., 21.,
       20.,  8., 14., 17., 28., 12., 16., 27., 23., 26., 29., 24.,  9.,
       13., 18., 22., 11., 19.])

In [26]:
# 16 PhysicalHealthCondition
# already in days so keep that, scale will be 0-30
# change 88 to 0 because it means none (no bad mental health days)
# remove 77 and 99 for don't know not sure and refused
brfss2019['PhysicalHealthCondition'] = brfss2019['PhysicalHealthCondition'].replace({88:0})
brfss2019 = brfss2019[brfss2019.PhysicalHealthCondition != 77]
brfss2019 = brfss2019[brfss2019.PhysicalHealthCondition != 99]
brfss2019.PhysicalHealthCondition.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2019['PhysicalHealthCondition'] = brfss2019['PhysicalHealthCondition'].replace({88:0})


array([15., 10.,  0., 30., 20.,  2.,  1.,  7., 14.,  3.,  5., 25.,  4.,
        6., 28., 21., 17.,  8., 16., 27., 12., 23., 18., 13., 29., 19.,
        9., 24., 26., 11., 22.])

In [27]:
# 17 DifficultyInWalking
# change 2 to 0 for no. 1 is already yes
# remove 7 and 9 for don't know not sure and refused
brfss2019['DifficultyInWalking'] = brfss2019['DifficultyInWalking'].replace({2:0})
brfss2019 = brfss2019[brfss2019.DifficultyInWalking != 7]
brfss2019 = brfss2019[brfss2019.DifficultyInWalking != 9]
brfss2019.DifficultyInWalking.unique()

array([1., 0.])

In [28]:
# 18 Gender
# in other words - is respondent male (somewhat arbitrarily chose this change because men are at higher risk for heart disease)
# change 2 to 0 (female as 0). Male is 1
brfss2019['Gender'] = brfss2019['Gender'].replace({2:0})
brfss2019.Gender.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2019['Gender'] = brfss2019['Gender'].replace({2:0})


array([0, 1])

In [29]:
# 19 AgeBand
# already ordinal. 1 is 18-24 all the way up to 13 wis 80 and older. 5 year increments.
# remove 14 because it is don't know or missing
brfss2019 = brfss2019[brfss2019.AgeBand != 14]
brfss2019.AgeBand.unique()

array([13, 11, 10,  8, 12,  7,  6,  5,  9,  4,  3,  2,  1])

In [30]:
# 20 HighestLevelOfEducation
# This is already an ordinal variable with 1 being never attended school or kindergarten only up to 6 being college 4 years or more
# Scale here is 1-6
# Remove 9 for refused:
brfss2019 = brfss2019[brfss2019.HighestLevelOfEducation != 9]
brfss2019.HighestLevelOfEducation.unique()

array([3., 5., 6., 2., 4., 1.])

In [31]:
# 21 IncomeLevel
# Variable is already ordinal with 1 being less than $10,000 all the way up to 8 being $75,000 or more
# Remove 77 and 99 for don't know and refused
brfss2019 = brfss2019[brfss2019.IncomeLevel != 77]
brfss2019 = brfss2019[brfss2019.IncomeLevel != 99]
brfss2019.IncomeLevel.unique()

array([3., 5., 7., 8., 6., 4., 2., 1.])

In [32]:
brfss2019.shape

(175617, 22)

In [33]:
brfss2019.head()

Unnamed: 0,Diabetes,HighBloodPressure,HighCholesterol,CholesterolCheck,BodyMassIndex,Smoker,HadStroke,HadHeartDiseaseorAtack,PhysicallyActive,ConsumesFruits,...,HaveHealthCoverage,HaveHealthFinancialIssues,GeneralHealthCondition,MentalHealthCondition,PhysicalHealthCondition,DifficultyInWalking,Gender,AgeBand,HighestLevelOfEducation,IncomeLevel
0,0.0,1,1.0,0.0,28.0,1.0,0.0,0.0,0,1,...,1.0,0.0,3.0,0.0,15.0,1.0,0,13,3.0,3.0
1,0.0,0,0.0,0.0,19.0,0.0,0.0,0.0,1,1,...,1.0,0.0,4.0,0.0,10.0,0.0,0,11,5.0,5.0
2,2.0,1,0.0,0.0,32.0,0.0,0.0,0.0,1,1,...,1.0,0.0,3.0,30.0,0.0,1.0,0,10,6.0,7.0
6,2.0,0,0.0,0.0,33.0,1.0,0.0,0.0,1,1,...,1.0,0.0,2.0,0.0,30.0,1.0,1,11,6.0,7.0
9,2.0,0,1.0,0.0,17.0,1.0,0.0,0.0,0,0,...,1.0,0.0,5.0,0.0,20.0,1.0,0,11,2.0,3.0


In [34]:
brfss2019.groupby(['Diabetes']).size()

Unnamed: 0_level_0,0
Diabetes,Unnamed: 1_level_1
0.0,147245
1.0,3878
2.0,24494


In [35]:
brfss2019.to_csv('/content/drive/MyDrive/DM_Fall24_Project/Cleaned_BRFSS2019.csv', sep=",", index=False)