In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Relative location of the raw BRFSS overweight/obesity (BMI) CSV export.
file_one = Path(
    "Resources/BRFSS__Table_of_Overweight_and_Obesity__BMI__20231102.csv"
)

In [3]:
# Load the raw CSV into a DataFrame.
# The encoding is passed explicitly as ISO-8859-1; many CSVs load fine without
# one, but some files need it to decode correctly.
file_one_df = pd.read_csv(
    file_one,
    encoding="ISO-8859-1",
)

In [4]:
# Preview the column names together with the first five rows of the raw data.
file_one_df.head(n=5)

Unnamed: 0,Year,Locationabbr,Locationdesc,Class,Topic,Question,Response,Break_Out,Break_Out_Category,Sample_Size,...,Data_Value_Footnote,DataSource,ClassId,TopicId,LocationID,BreakoutID,BreakOutCategoryID,QuestionID,ResponseID,GeoLocation
0,2020,AK,Alaska,Overweight and Obesity (BMI),BMI Categories,Weight classification by Body Mass Index (BMI)...,Underweight (BMI 12.0-18.4),45-54,Age Group,5,...,Prevalence estimate not available if the unwei...,BRFSS,CLASS14,TOPIC09,2,AGE04,CAT3,_BMI5CAT,RESP042,"(64.84507995700051, -147.72205903599973)"
1,2019,AL,Alabama,Overweight and Obesity (BMI),BMI Categories,Weight classification by Body Mass Index (BMI)...,Underweight (BMI 12.0-18.4),35-44,Age Group,7,...,Prevalence estimate not available if the unwei...,BRFSS,CLASS14,TOPIC09,1,AGE03,CAT3,_BMI5CAT,RESP042,"(32.84057112200048, -86.63186076199969)"
2,2019,AL,Alabama,Overweight and Obesity (BMI),BMI Categories,Weight classification by Body Mass Index (BMI)...,Underweight (BMI 12.0-18.4),"Other, non-Hispanic",Race/Ethnicity,1,...,Prevalence estimate not available if the unwei...,BRFSS,CLASS14,TOPIC09,1,RACE06,CAT4,_BMI5CAT,RESP042,"(32.84057112200048, -86.63186076199969)"
3,2019,AL,Alabama,Overweight and Obesity (BMI),BMI Categories,Weight classification by Body Mass Index (BMI)...,Underweight (BMI 12.0-18.4),"Multiracial, non-Hispanic",Race/Ethnicity,1,...,Prevalence estimate not available if the unwei...,BRFSS,CLASS14,TOPIC09,1,RACE07,CAT4,_BMI5CAT,RESP042,"(32.84057112200048, -86.63186076199969)"
4,2019,AK,Alaska,Overweight and Obesity (BMI),BMI Categories,Weight classification by Body Mass Index (BMI)...,Normal Weight (BMI 18.5-24.9),"Asian, non-Hispanic",Race/Ethnicity,16,...,Prevalence estimate not available if the unwei...,BRFSS,CLASS14,TOPIC09,2,RACE04,CAT4,_BMI5CAT,RESP041,"(64.84507995700051, -147.72205903599973)"


In [10]:
# Keep only the columns needed for the analysis.
# `.copy()` makes this an independent frame instead of a slice tied to
# `file_one_df`, so column assignments on it are unambiguous.
cleaned_columns_df = file_one_df[
    [
        "Year",
        "Locationabbr",
        "Locationdesc",
        "Response",
        "Break_Out",
        "Break_Out_Category",
        "Sample_Size",
        "Data_value",
        "GeoLocation",
    ]
].copy()

# Non-null count per column (reveals which columns have missing entries).
print(cleaned_columns_df.count())

# How many distinct survey years are present.
unique_years = cleaned_columns_df["Year"].unique()
count = len(unique_years)
print(f"Number of unique_years: {count}")

# Preview; this has to be the last expression of the cell to be rendered.
cleaned_columns_df.head()

Year                  64594
Locationabbr          64594
Locationdesc          64594
Response              64594
Break_Out             64594
Break_Out_Category    64594
Sample_Size           64594
Data_value            51506
GeoLocation           64498
dtype: int64
Number of unique_years: 12


In [11]:
# Keep only the rows for survey years 2014 through 2019 (both ends inclusive).
FIRST_YEAR, LAST_YEAR = 2014, 2019

# Series.between is inclusive on both bounds by default, matching >= / <=.
in_year_range = cleaned_columns_df["Year"].between(FIRST_YEAR, LAST_YEAR)

# `.copy()` detaches the result from the frame it was filtered from.
cleaned_columns_df = cleaned_columns_df.loc[in_year_range, :].copy()

# Non-null count per column after the year filter.
print(cleaned_columns_df.count())

# How many distinct survey years remain.
unique_years = cleaned_columns_df["Year"].unique()
count = len(unique_years)
print(f"Number of unique_years: {count}")

# Preview; this has to be the last expression of the cell to be rendered.
cleaned_columns_df.head()

Year                  32450
Locationabbr          32450
Locationdesc          32450
Response              32450
Break_Out             32450
Break_Out_Category    32450
Sample_Size           32450
Data_value            25462
GeoLocation           32402
dtype: int64
Number of unique_years: 6


In [12]:
# Keep only the rows whose Break_Out_Category is "Age Group".
# NOTE(review): the original note suggests this filter is meant to favour rows
# that actually carry a Data_value entry - confirm against the data.

# Cast the column to str through assign(), which returns a new frame, rather
# than assigning into a filtered slice (the pattern that triggers pandas'
# SettingWithCopyWarning).
cleaned_columns_df = cleaned_columns_df.assign(
    Break_Out_Category=cleaned_columns_df["Break_Out_Category"].astype(str)
)
print(cleaned_columns_df.dtypes)

# regex=False: "Age Group" is a plain substring, not a pattern.
is_age_group = cleaned_columns_df["Break_Out_Category"].str.contains(
    "Age Group", regex=False
)
cleaned_columns_df = cleaned_columns_df.loc[is_age_group, :].copy()

# Non-null count per column after the category filter.
print(cleaned_columns_df.count())

# How many distinct survey years remain.
unique_years = cleaned_columns_df["Year"].unique()
count = len(unique_years)
print(f"Number of unique_years: {count}")

# Preview; this has to be the last expression of the cell to be rendered.
cleaned_columns_df.head()

Year                    int64
Locationabbr           object
Locationdesc           object
Response               object
Break_Out              object
Break_Out_Category     object
Sample_Size             int64
Data_value            float64
GeoLocation            object
dtype: object
Year                  7632
Locationabbr          7632
Locationdesc          7632
Response              7632
Break_Out             7632
Break_Out_Category    7632
Sample_Size           7632
Data_value            6594
GeoLocation           7632
dtype: int64
Number of unique_years: 6


In [13]:
# Keep only the rows whose Response mentions "Obese" or "Overweight".
# (Both classes are kept, not only "Obese".)

# Cast the column to str through assign(), which returns a new frame, rather
# than assigning into a filtered slice (the pattern that triggers pandas'
# SettingWithCopyWarning).
cleaned_columns_df = cleaned_columns_df.assign(
    Response=cleaned_columns_df["Response"].astype(str)
)
print(cleaned_columns_df.dtypes)

# One case-sensitive regex alternation replaces the two OR-ed contains() masks.
is_obese_or_overweight = cleaned_columns_df["Response"].str.contains(
    "Obese|Overweight"
)
cleaned_columns_df = cleaned_columns_df.loc[is_obese_or_overweight, :].copy()

# Non-null count per column after the response filter.
print(cleaned_columns_df.count())

# How many distinct survey years remain.
unique_years = cleaned_columns_df["Year"].unique()
count = len(unique_years)
print(f"Number of unique_years: {count}")

# Preview only the first rows rather than dumping the whole frame; this has to
# be the last expression of the cell to be rendered.
cleaned_columns_df.head()

Year                    int64
Locationabbr           object
Locationdesc           object
Response               object
Break_Out              object
Break_Out_Category     object
Sample_Size             int64
Data_value            float64
GeoLocation            object
dtype: object
Year                  3816
Locationabbr          3816
Locationdesc          3816
Response              3816
Break_Out             3816
Break_Out_Category    3816
Sample_Size           3816
Data_value            3813
GeoLocation           3816
dtype: int64
Number of unique_years: 6


In [14]:
# Sanity check on the cleaned data: how many distinct locations and survey
# years are left after all filters.
unique_states = cleaned_columns_df["Locationabbr"].unique()
unique_years = cleaned_columns_df["Year"].unique()

# Distinct location abbreviations.
count = len(unique_states)
print(f"Number of unique states: {count}")

# Distinct survey years.
count = len(unique_years)
print(f"Number of unique_years: {count}")

Number of unique states: 54
Number of unique_years: 6


In [16]:
# Export the cleaned frame as a CSV: header row included, pandas index omitted.
output_path = Path("clean_data/BRFSS__Table_of_Overweight_and_Obesity__BMI__20231102_cleaned.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists; exist_ok=True keeps re-runs from failing when it is already there.
output_path.parent.mkdir(parents=True, exist_ok=True)

cleaned_columns_df.to_csv(output_path, index=False, header=True)