In [1]:
import pandas as pd

In [2]:
df2021 = pd.read_csv('/content/drive/MyDrive/DM_Fall24_Project/brfss2021.csv')

In [3]:
df2021.shape

(133873, 304)

In [4]:
df2021.dtypes

Unnamed: 0,0
Unnamed: 0,int64
x.state,int64
fmonth,int64
idate,int64
imonth,int64
...,...
x.veglt1a,float64
x.frt16a,float64
x.veg23a,float64
x.fruite1,float64


In [5]:
df2021_selected = df2021[['diabete4',
                          'x.rfhype6',
                          'toldhi3',
                          'cholchk3',
                          'x.bmi5',
                          'smoke100',
                          'cvdstrk3',
                          'x.michd',
                          'x.totinda',
                          'x.frtlt1a',
                          'x.veglt1a',
                          'x.rfdrhv7',
                          'x.hlthpln',
                          'medcost1',
                          'genhlth',
                          'menthlth',
                          'physhlth',
                          'diffwalk',
                          'sexvar',
                          'x.ageg5yr',
                          'educa',
                          'income3' ]]

In [6]:
df2021_selected.shape

(133873, 22)

In [7]:
#Drop Missing Values - knocks 33,000 rows out right away
df2021_selected = df2021_selected.dropna()
df2021_selected.shape

(100638, 22)

In [8]:
#Rename the columns to make them more readable
brfss2021 = df2021_selected.rename(columns =
                                   {'diabete4':'Diabetes',
                                    'x.rfhype6':'HighBloodPressure',
                                    'toldhi3':'HighCholesterol',
                                    'cholchk3':'CholesterolCheck',
                                    'x.bmi5':'BodyMassIndex',
                                    'smoke100':'Smoker',
                                    'cvdstrk3':'HadStroke',
                                    'x.michd':'HadHeartDiseaseorAtack',
                                    'x.totinda':'PhysicallyActive',
                                    'x.frtlt1a':'ConsumesFruits',
                                    'x.veglt1a':'ConsumesVeggies',
                                    'x.rfdrhv7':'HeavyAlcoholConsumption',
                                    'x.hlthpln':'HaveHealthCoverage',
                                    'medcost1':'HaveHealthFinancialIssues',
                                    'genhlth':'GeneralHealthCondition',
                                    'menthlth':'MentalHealthCondition',
                                    'physhlth':'PhysicalHealthCondition',
                                    'diffwalk':'DifficultyInWalking',
                                    'sexvar':'Gender',
                                    'x.ageg5yr':'AgeBand',
                                    'educa':'HighestLevelOfEducation',
                                    'income3':'IncomeLevel' })

In [9]:
brfss2021.head()

Unnamed: 0,Diabetes,HighBloodPressure,HighCholesterol,CholesterolCheck,BodyMassIndex,Smoker,HadStroke,HadHeartDiseaseorAtack,PhysicallyActive,ConsumesFruits,...,HaveHealthCoverage,HaveHealthFinancialIssues,GeneralHealthCondition,MentalHealthCondition,PhysicalHealthCondition,DifficultyInWalking,Gender,AgeBand,HighestLevelOfEducation,IncomeLevel
0,3.0,1.0,1.0,2.0,1454.0,1.0,2.0,2.0,2.0,1.0,...,1.0,2.0,5.0,10.0,20.0,2.0,2,11.0,4.0,5.0
2,1.0,2.0,2.0,2.0,2829.0,2.0,2.0,1.0,2.0,1.0,...,1.0,2.0,2.0,88.0,88.0,2.0,2,11.0,4.0,3.0
3,1.0,2.0,1.0,2.0,3347.0,2.0,2.0,2.0,1.0,1.0,...,1.0,2.0,2.0,10.0,88.0,2.0,2,9.0,4.0,7.0
4,1.0,1.0,1.0,2.0,2873.0,2.0,1.0,1.0,1.0,1.0,...,1.0,2.0,5.0,88.0,30.0,1.0,1,12.0,3.0,4.0
5,3.0,1.0,2.0,2.0,2437.0,1.0,2.0,2.0,2.0,2.0,...,1.0,2.0,3.0,88.0,88.0,1.0,1,13.0,5.0,6.0


In [10]:
brfss2021.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100638 entries, 0 to 133871
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Diabetes                   100638 non-null  float64
 1   HighBloodPressure          100638 non-null  float64
 2   HighCholesterol            100638 non-null  float64
 3   CholesterolCheck           100638 non-null  float64
 4   BodyMassIndex              100638 non-null  float64
 5   Smoker                     100638 non-null  float64
 6   HadStroke                  100638 non-null  float64
 7   HadHeartDiseaseorAtack     100638 non-null  float64
 8   PhysicallyActive           100638 non-null  float64
 9   ConsumesFruits             100638 non-null  float64
 10  ConsumesVeggies            100638 non-null  float64
 11  HeavyAlcoholConsumption    100638 non-null  float64
 12  HaveHealthCoverage         100638 non-null  float64
 13  HaveHealthFinancialIssues  100638 

In [11]:
brfss2021.Diabetes.value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
3.0,82607
1.0,14539
4.0,2574
2.0,814
7.0,90
9.0,14


In [12]:
# Diabetes
# going to make this ordinal. 0 is for no diabetes or only during pregnancy, 1 is for pre-diabetes or borderline diabetes, 2 is for yes diabetes
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2021['Diabetes'] = brfss2021['Diabetes'].replace({2:0, 3:0, 1:2, 4:1})
brfss2021 = brfss2021[brfss2021.Diabetes != 7]
brfss2021 = brfss2021[brfss2021.Diabetes != 9]
brfss2021.Diabetes.unique()

array([0., 2., 1.])

In [13]:
# 1 HighBloodPressure
#Change 1 to 0 so it represetnts No high blood pressure and 2 to 1 so it represents high blood pressure
brfss2021['HighBloodPressure'] = brfss2021['HighBloodPressure'].replace({1:0, 2:1})
brfss2021 = brfss2021[brfss2021.HighBloodPressure != 9]
brfss2021.HighBloodPressure.unique()

array([0., 1.])

In [14]:
# 2 HighCholesterol
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2021['HighCholesterol'] = brfss2021['HighCholesterol'].replace({2:0})
brfss2021 = brfss2021[brfss2021.HighCholesterol != 7]
brfss2021 = brfss2021[brfss2021.HighCholesterol != 9]
brfss2021.HighCholesterol.unique()

array([1., 0.])

In [15]:
# 3 CholesterolCheck
# Change 3 to 0 and 2 to 0 for Not checked cholesterol in past 5 years
# Remove 9
brfss2021['CholesterolCheck'] = brfss2021['CholesterolCheck'].replace({3:0,2:0})
brfss2021 = brfss2021[brfss2021.CholesterolCheck != 9]
brfss2021.CholesterolCheck.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2021['CholesterolCheck'] = brfss2021['CholesterolCheck'].replace({3:0,2:0})


array([0., 8., 5., 4., 6., 7.])

In [16]:
# 4 BodyMassIndex (no changes, just note that these are BMI * 100. So for example a BMI of 4018 is really 40.18)
brfss2021['BodyMassIndex'] = brfss2021['BodyMassIndex'].div(100).round(0)
brfss2021.BodyMassIndex.unique()

array([15., 28., 33., 29., 24., 46., 23., 40., 27., 35., 18., 30., 25.,
       36., 22., 31., 45., 26., 14., 38., 21., 32., 20., 19., 34., 41.,
       43., 44., 39., 37., 16., 42., 50., 51., 17., 52., 47., 49., 56.,
       57., 48., 58., 61., 53., 63., 64., 54., 68., 55., 62., 13., 59.,
       89., 66., 77., 60., 87., 69., 72., 75., 67., 71., 65., 82., 86.,
       70., 78., 12., 74., 98., 73., 84., 76., 80., 83., 79., 99., 88.,
       81.])

In [17]:
# 5 Smoker
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2021['Smoker'] = brfss2021['Smoker'].replace({2:0})
brfss2021 = brfss2021[brfss2021.Smoker != 7]
brfss2021 = brfss2021[brfss2021.Smoker != 9]
brfss2021.Smoker.unique()

array([1., 0.])

In [18]:
# 6 HadStroke
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss2021['HadStroke'] = brfss2021['HadStroke'].replace({2:0})
brfss2021 = brfss2021[brfss2021.HadStroke != 7]
brfss2021 = brfss2021[brfss2021.HadStroke != 9]
brfss2021.HadStroke.unique()

array([0., 1.])

In [19]:
# 7 HadHeartDiseaseorAtack
# Change 2 to 0 because this means did not have MI or CHD
brfss2021['HadHeartDiseaseorAtack'] = brfss2021['HadHeartDiseaseorAtack'].replace({2: 0})
brfss2021.HadHeartDiseaseorAtack.unique()

array([0., 1.])

In [20]:
# 8 PhysicallyActive
# 1 for physical activity
# change 2 to 0 for no physical activity
# Remove all 9 (don't know/refused)
brfss2021['PhysicallyActive'] = brfss2021['PhysicallyActive'].replace({2:0})
brfss2021 = brfss2021[brfss2021.PhysicallyActive != 9]
brfss2021.PhysicallyActive.unique()

array([0., 1.])

In [21]:
# 9 ConsumesFruits
# Change 2 to 0. this means no fruit consumed per day. 1 will mean consumed 1 or more pieces of fruit per day
# remove all dont knows and missing 9
brfss2021['ConsumesFruits'] = brfss2021['ConsumesFruits'].replace({2:0})
brfss2021 = brfss2021[brfss2021.ConsumesFruits != 9]
brfss2021.ConsumesFruits.unique()

array([1., 0.])

In [22]:
# 10 ConsumesVeggies
# Change 2 to 0. this means no vegetables consumed per day. 1 will mean consumed 1 or more pieces of vegetable per day
# remove all dont knows and missing 9
brfss2021['ConsumesVeggies'] = brfss2021['ConsumesVeggies'].replace({2:0})
brfss2021 = brfss2021[brfss2021.ConsumesVeggies != 9]
brfss2021.ConsumesVeggies.unique()

array([1., 0.])

In [23]:
# 11 HeavyAlcoholConsumption
# Change 1 to 0 (1 was no for heavy drinking). change all 2 to 1 (2 was yes for heavy drinking)
# remove all dont knows and missing 9
brfss2021['HeavyAlcoholConsumption'] = brfss2021['HeavyAlcoholConsumption'].replace({1:0, 2:1})
brfss2021 = brfss2021[brfss2021.HeavyAlcoholConsumption != 9]
brfss2021.HeavyAlcoholConsumption.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brfss2021['HeavyAlcoholConsumption'] = brfss2021['HeavyAlcoholConsumption'].replace({1:0, 2:1})


array([0., 1.])

In [24]:
# 12 HaveHealthCoverage
# 1 is yes, change 2 to 0 because it is No health care access
# remove 7 and 9 for don't know or refused
brfss2021['HaveHealthCoverage'] = brfss2021['HaveHealthCoverage'].replace({2:0})
brfss2021 = brfss2021[brfss2021.HaveHealthCoverage != 9]
brfss2021.HaveHealthCoverage.unique()

array([1., 0.])

In [25]:
# 13 HaveHealthFinancialIssues
# Change 2 to 0 for no, 1 is already yes
# remove 7 for don/t know and 9 for refused
brfss2021['HaveHealthFinancialIssues'] = brfss2021['HaveHealthFinancialIssues'].replace({2:0})
brfss2021 = brfss2021[brfss2021.HaveHealthFinancialIssues != 7]
brfss2021 = brfss2021[brfss2021.HaveHealthFinancialIssues != 9]
brfss2021.HaveHealthFinancialIssues.unique()

array([0., 1.])

In [26]:
# 14 GeneralHealthCondition
# This is an ordinal variable that I want to keep (1 is Excellent -> 5 is Poor)
# Remove 7 and 9 for don't know and refused
brfss2021 = brfss2021[brfss2021.GeneralHealthCondition != 7]
brfss2021 = brfss2021[brfss2021.GeneralHealthCondition != 9]
brfss2021.GeneralHealthCondition.unique()

array([5., 2., 3., 4., 1.])

In [27]:
# 15 MentalHealthCondition
# already in days so keep that, scale will be 0-30
# change 88 to 0 because it means none (no bad mental health days)
# remove 77 and 99 for don't know not sure and refused
brfss2021['MentalHealthCondition'] = brfss2021['MentalHealthCondition'].replace({88:0})
brfss2021 = brfss2021[brfss2021.MentalHealthCondition != 77]
brfss2021 = brfss2021[brfss2021.MentalHealthCondition != 99]
brfss2021.MentalHealthCondition.unique()

array([10.,  0.,  5., 25.,  2.,  7., 30.,  3., 14., 20.,  8.,  1., 15.,
        4., 28., 24., 21., 12.,  6., 22., 27., 18., 13., 17., 16.,  9.,
       19., 29., 23., 11., 26.])

In [28]:
# 16 PhysicalHealthCondition
# already in days so keep that, scale will be 0-30
# change 88 to 0 because it means none (no bad mental health days)
# remove 77 and 99 for don't know not sure and refused
brfss2021['PhysicalHealthCondition'] = brfss2021['PhysicalHealthCondition'].replace({88:0})
brfss2021 = brfss2021[brfss2021.PhysicalHealthCondition != 77]
brfss2021 = brfss2021[brfss2021.PhysicalHealthCondition != 99]
brfss2021.PhysicalHealthCondition.unique()

array([20.,  0., 30., 25.,  1.,  4., 10.,  2.,  3., 15.,  8., 13., 14.,
        5.,  7.,  6., 24., 29., 18.,  9., 16., 17., 26., 28., 12., 21.,
       27., 11., 19., 22., 23.])

In [29]:
# 17 DifficultyInWalking
# change 2 to 0 for no. 1 is already yes
# remove 7 and 9 for don't know not sure and refused
brfss2021['DifficultyInWalking'] = brfss2021['DifficultyInWalking'].replace({2:0})
brfss2021 = brfss2021[brfss2021.DifficultyInWalking != 7]
brfss2021 = brfss2021[brfss2021.DifficultyInWalking != 9]
brfss2021.DifficultyInWalking.unique()

array([0., 1.])

In [30]:
# 18 Gender
# in other words - is respondent male (somewhat arbitrarily chose this change because men are at higher risk for heart disease)
# change 2 to 0 (female as 0). Male is 1
brfss2021['Gender'] = brfss2021['Gender'].replace({2:0})
brfss2021.Gender.unique()

array([0, 1])

In [31]:
# 19 AgeBand
# already ordinal. 1 is 18-24 all the way up to 13 wis 80 and older. 5 year increments.
# remove 14 because it is don't know or missing
brfss2021 = brfss2021[brfss2021.AgeBand != 14]
brfss2021.AgeBand.unique()

array([11.,  9., 12., 13., 10.,  7.,  6.,  8.,  4.,  3.,  5.,  2.,  1.])

In [32]:
# 20 HighestLevelOfEducation
# This is already an ordinal variable with 1 being never attended school or kindergarten only up to 6 being college 4 years or more
# Scale here is 1-6
# Remove 9 for refused:
brfss2021 = brfss2021[brfss2021.HighestLevelOfEducation != 9]
brfss2021.HighestLevelOfEducation.unique()

array([4., 3., 5., 6., 2., 1.])

In [33]:
# 21 IncomeLevel
# Variable is already ordinal with 1 being less than $10,000 all the way up to 8 being $75,000 or more
# Remove 77 and 99 for don't know and refused
brfss2021 = brfss2021[brfss2021.IncomeLevel != 77]
brfss2021 = brfss2021[brfss2021.IncomeLevel != 99]
brfss2021.IncomeLevel.unique()

array([ 5.,  3.,  7.,  4.,  6.,  8.,  2.,  9., 10.,  1., 11.])

In [34]:
brfss2021.shape

(72972, 22)

In [35]:
brfss2021.groupby(['Diabetes']).size()

Unnamed: 0_level_0,0
Diabetes,Unnamed: 1_level_1
0.0,60862
1.0,1848
2.0,10262


In [36]:
brfss2021.to_csv('/content/drive/MyDrive/DM_Fall24_Project/Cleaned_BRFSS2021.csv', sep=",", index=False)