In [104]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

## Health Data by County

In [None]:
# CDC LOCALS health data
# https://data.cdc.gov/500-Cities-Places/PLACES-ZCTA-Data-GIS-Friendly-Format-2023-release/kee5-23sr/about_data

In [128]:
df_health1 = pd.read_csv("../data/raw/PLACES__Local_Data_for_Better_Health__County_Data_2022_release_20240102.csv")

In [129]:
df_health1.Year.unique()

array([2020, 2019])

In [130]:
df_health2 = pd.read_csv("../data/raw/PLACES__Local_Data_for_Better_Health__County_Data_2023_release_20240102.csv")

In [131]:
df_health2.Year.unique()

array([2021, 2020])

In [132]:
df_health1.columns

Index(['Year', 'StateAbbr', 'StateDesc', 'LocationName', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation',
       'LocationID', 'CategoryID', 'MeasureId', 'DataValueTypeID',
       'Short_Question_Text', 'Geolocation'],
      dtype='object')

In [133]:
unique_lists_cols = ['Year', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
        'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'LocationID', 'CategoryID', 'MeasureId', 'DataValueTypeID',
       'Short_Question_Text']

In [134]:
for col in unique_lists_cols:
    print(col)
    print(df_health1[col].unique()[0:4])

Year
[2020 2019]
DataSource
['BRFSS']
Category
['Prevention' 'Health Outcomes' 'Health Risk Behaviors' 'Health Status']
Measure
['Current lack of health insurance among adults aged 18-64 years'
 'Diagnosed diabetes among adults aged >=18 years'
 'Visits to doctor for routine checkup within the past year among adults aged >=18 years'
 'Cervical cancer screening among adult women aged 21-65 years']
Data_Value_Unit
['%']
Data_Value_Type
['Crude prevalence' 'Age-adjusted prevalence']
Data_Value_Footnote_Symbol
[nan]
Data_Value_Footnote
[nan]
LocationID
[  59 1121 1123 1131]
CategoryID
['PREVENT' 'HLTHOUT' 'RISKBEH' 'HLTHSTAT']
MeasureId
['ACCESS2' 'DIABETES' 'CHECKUP' 'CERVICAL']
DataValueTypeID
['CrdPrv' 'AgeAdjPrv']
Short_Question_Text
['Health Insurance' 'Diabetes' 'Annual Checkup'
 'Cervical Cancer Screening']


In [135]:
for col in df_health1.columns:
    print(col)
    print(df_health1[col].nunique())

Year
2
StateAbbr
52
StateDesc
52
LocationName
1840
DataSource
1
Category
4
Measure
30
Data_Value_Unit
1
Data_Value_Type
2
Data_Value
903
Data_Value_Footnote_Symbol
0
Data_Value_Footnote
0
Low_Confidence_Limit
900
High_Confidence_Limit
911
TotalPopulation
3068
LocationID
3144
CategoryID
4
MeasureId
30
DataValueTypeID
2
Short_Question_Text
30
Geolocation
3143


In [136]:
df_health1.StateAbbr.unique()

array(['US', 'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'FL', 'GA', 'ID',
       'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'MD', 'MA', 'MI', 'MN', 'MS',
       'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH',
       'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA',
       'WY', 'WA', 'WV', 'WI', 'DE', 'DC', 'HI', 'ME'], dtype=object)

In [137]:
df_health1.LocationName.nunique()

1840

In [138]:
df_health1.DataSource.unique()

array(['BRFSS'], dtype=object)

In [139]:
df_health1.Category.unique()

array(['Prevention', 'Health Outcomes', 'Health Risk Behaviors',
       'Health Status'], dtype=object)

## Economic Data by County

In [95]:
#Economic Data - by county
# https://www.bea.gov/data/gdp/gdp-county-metro-and-other-areas
file_path = "../data/raw/lagdp1223.xlsx"
df = pd.read_excel(file_path, skiprows=3,skipfooter=4,names=['Location','GDP_2019','GDP_2020','GDP_2021','GDP_2022','GDP_RankInState_2022','PctChange_2020','PctChange_2021','PctChange_2022','PctChange_RankInState_2022'])

In [96]:
# Initialize an empty list to hold the state names as we go
states = []

# Initialize the current state variable
current_state = None

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    # Check if the Location is USA
    if row['Location'] == 'United States':
            # Set both State and County as 'Total'
            df.at[index, 'State'] = 'TOTAL'
            df.at[index, 'County'] = 'TOTAL'
            current_state = 'United States'  # Set the current state to 'United States'
            states.append(current_state)  # Add to our list of states


    elif pd.isna(row['Location']):
        # Set the next row's Location as the current state
        # Check if the next row exists to avoid IndexError at the end of the DataFrame
        if index + 1 < len(df):
            current_state = df.at[index + 1, 'Location']
            states.append(current_state)  # Add the state to our list
    else:
        # If it's not NaN and not a state (since states were added to the list),
        # it must be a county, so we assign the current state to it
        if row['Location'] not in states:
            df.at[index, 'State'] = current_state
            df.at[index, 'County'] = row['Location']
        else:
            # It's a state, so we assign it as 'Total' to the County column
            df.at[index, 'State'] = row['Location']
            df.at[index, 'County'] = 'TOTAL'

# Drop the NaN rows from the DataFrame as they are just separators
df.dropna(subset=['Location'], inplace=True)
df.drop(columns=['Location'],inplace=True)
# The DataFrame now has 'State' and 'County' for each row
df = df[['State', 'County'] + [c for c in df.columns if c not in ['State', 'County']]]



In [97]:
df.head()  # Display the first few rows to check our work

Unnamed: 0,State,County,GDP_2019,GDP_2020,GDP_2021,GDP_2022,GDP_RankInState_2022,PctChange_2020,PctChange_2021,PctChange_2022,PctChange_RankInState_2022
0,TOTAL,TOTAL,20573580103,20106509888,21271676784,21668520913,--,-2.3,5.8,1.9,--
2,Alabama,TOTAL,224944577,222081439,231892626,235807320,--,-1.3,4.4,1.7,--
3,Alabama,Autauga,1730861,1722438,1727818,1929264,23,-0.5,0.3,11.7,1
4,Alabama,Baldwin,8148786,8102009,8738819,8924207,7,-0.6,7.9,2.1,27
5,Alabama,Barbour,762557,731636,747888,745349,42,-4.1,2.2,-0.3,43


In [98]:
df.tail()

Unnamed: 0,State,County,GDP_2019,GDP_2020,GDP_2021,GDP_2022,GDP_RankInState_2022,PctChange_2020,PctChange_2021,PctChange_2022,PctChange_RankInState_2022
3214,Wyoming,Sweetwater,3695022,3405807,3288513,3095421,5,-7.8,-3.4,-5.9,19
3215,Wyoming,Teton,2620496,2659859,3591537,3507901,4,1.5,35.0,-2.3,16
3216,Wyoming,Uinta,882964,783682,800735,813461,13,-11.2,2.2,1.6,7
3217,Wyoming,Washakie,346359,344531,346237,342039,19,-0.5,0.5,-1.2,12
3218,Wyoming,Weston,303330,286131,285672,279273,21,-5.7,-0.2,-2.2,13


In [106]:
df[df.County=='Ware']

Unnamed: 0,State,County,GDP_2019,GDP_2020,GDP_2021,GDP_2022,GDP_RankInState_2022,PctChange_2020,PctChange_2021,PctChange_2022,PctChange_RankInState_2022
558,Georgia,Ware,1415915,1330060,1333672,1338301,56,-6.1,0.3,0.3,100


In [110]:
df.County.nunique()

array(['GA', 'IN', 'IA', 'TX', 'AL', 'AK', 'CA', 'AR', 'US', 'AZ', 'IL',
       'ID', 'KS', 'KY', 'CO', 'CT', 'DE', 'FL', 'DC', 'HI', 'LA', 'MI',
       'MN', 'MS', 'MD', 'MA', 'ME', 'MT', 'NE', 'MO', 'NY', 'ND', 'OR',
       'OH', 'OK', 'PA', 'NV', 'NC', 'NJ', 'NM', 'NH', 'SD', 'SC', 'TN',
       'RI', 'VA', 'UT', 'WI', 'WY', 'WV', 'VT', 'WA'], dtype=object)

## Food Access Data by County

In [None]:
# food access data by county
#https://www.ers.usda.gov/data-products/food-environment-atlas/data-access-and-documentation-downloads/#Current%20Version
# for example has rate of fast food per county

In [140]:
df_food = pd.read_csv("../data/raw/FoodEnvironmentAtlas/StateAndCountyData.csv")

In [141]:
df_food.head()

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga,LACCESS_POP10,18428.43969
1,1001,AL,Autauga,LACCESS_POP15,17496.69304
2,1001,AL,Autauga,PCH_LACCESS_POP_10_15,-5.056026
3,1001,AL,Autauga,PCT_LACCESS_POP10,33.769657
4,1001,AL,Autauga,PCT_LACCESS_POP15,32.062255


In [142]:
df_food.State.unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', ' AK', ' AZ', ' AR',
       ' CA', ' CO', ' CT', ' DE', ' DC', ' FL', ' GA', ' HI', ' ID',
       ' IL', ' IN', ' IA', ' KS', ' KY', ' LA', ' ME', ' MD', ' MA',
       ' MI', ' MN', ' MS', ' MO', ' MT', ' NE', ' NV', ' NH', ' NJ',
       ' NM', ' NY', ' NC', ' ND', ' OH', ' OK', ' OR', ' PA', ' RI',
       ' SC', ' SD', ' TN', ' TX', ' UT', ' VT', ' VA', ' WA', ' WV',
       ' WI', ' WY'], dtype=object)

In [143]:
df_food.County.nunique()

3708

In [144]:
df_food.Variable_Code.unique()

array(['LACCESS_POP10', 'LACCESS_POP15', 'PCH_LACCESS_POP_10_15',
       'PCT_LACCESS_POP10', 'PCT_LACCESS_POP15', 'LACCESS_LOWI10',
       'LACCESS_LOWI15', 'PCH_LACCESS_LOWI_10_15', 'PCT_LACCESS_LOWI10',
       'PCT_LACCESS_LOWI15', 'LACCESS_HHNV10', 'LACCESS_HHNV15',
       'PCH_LACCESS_HHNV_10_15', 'PCT_LACCESS_HHNV10',
       'PCT_LACCESS_HHNV15', 'LACCESS_SNAP15', 'PCT_LACCESS_SNAP15',
       'LACCESS_CHILD10', 'LACCESS_CHILD15', 'LACCESS_CHILD_10_15',
       'PCT_LACCESS_CHILD10', 'PCT_LACCESS_CHILD15', 'LACCESS_SENIORS10',
       'LACCESS_SENIORS15', 'PCH_LACCESS_SENIORS_10_15',
       'PCT_LACCESS_SENIORS10', 'PCT_LACCESS_SENIORS15',
       'LACCESS_WHITE15', 'PCT_LACCESS_WHITE15', 'LACCESS_BLACK15',
       'PCT_LACCESS_BLACK15', 'LACCESS_HISP15', 'PCT_LACCESS_HISP15',
       'LACCESS_NHASIAN15', 'PCT_LACCESS_NHASIAN15', 'LACCESS_NHNA15',
       'PCT_LACCESS_NHNA15', 'LACCESS_NHPI15', 'PCT_LACCESS_NHPI15',
       'LACCESS_MULTIR15', 'PCT_LACCESS_MULTIR15', 'GROC11', 'GROC16',


In [145]:
df_food.Variable_Code.nunique()

332

In [146]:
df_food[df_food.County=='Ware']

Unnamed: 0,FIPS,State,County,Variable_Code,Value
21882,13299,GA,Ware,LACCESS_POP10,13610.323490
21883,13299,GA,Ware,LACCESS_POP15,14253.192790
21884,13299,GA,Ware,PCH_LACCESS_POP_10_15,4.723395
21885,13299,GA,Ware,PCT_LACCESS_POP10,37.481613
21886,13299,GA,Ware,PCT_LACCESS_POP15,39.252018
...,...,...,...,...,...
783275,13299,GA,Ware,PERPOV10,1.000000
783276,13299,GA,Ware,CHILDPOVRATE15,45.600000
783277,13299,GA,Ware,PERCHLDPOV10,1.000000
783278,13299,GA,Ware,METRO13,0.000000
