In [None]:
from dotenv import load_dotenv
import os

load_dotenv("../.env")   # from notebooks/ folder

In [2]:
import pandas as pd
import os


In [3]:
epa_path = "../data/epa_pm25_annual_by_county.csv"
pm25_df = pd.read_csv(epa_path)
pm25_df.head()


Unnamed: 0,county_name,state_fips,county_fips,year,annual_pm25
0,"Los Angeles County, CA",6,37,2018,11.924489
1,"Los Angeles County, CA",6,37,2019,10.113096
2,"Los Angeles County, CA",6,37,2020,13.43545
3,"Los Angeles County, CA",6,37,2021,12.161321
4,"Los Angeles County, CA",6,37,2022,10.929975


In [4]:
pm25_df["state_fips"] = pm25_df["state_fips"].astype(str).str.zfill(2)
pm25_df["county_fips"] = pm25_df["county_fips"].astype(str).str.zfill(3)

pm25_df["full_fips"] = pm25_df["state_fips"] + pm25_df["county_fips"]

target_fips = set(pm25_df["full_fips"])

print("✅ Correct EPA FIPS:")
print(sorted(target_fips))


✅ Correct EPA FIPS:
['04013', '06037', '17031', '48201']


In [5]:
asthma_raw = pd.read_csv("../data/cdc_places_asthma.csv")

asthma_raw.columns



  asthma_raw = pd.read_csv("../data/cdc_places_asthma.csv")


Index(['Year', 'StateAbbr', 'StateDesc', 'LocationName', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation',
       'TotalPop18plus', 'LocationID', 'CategoryID', 'MeasureId',
       'DataValueTypeID', 'Short_Question_Text', 'Geolocation'],
      dtype='object')

In [6]:
asthma = asthma_raw[
    asthma_raw["Measure"].str.contains("Asthma", case=False, na=False)
].copy()

asthma.head()


Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,...,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,TotalPop18plus,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text,Geolocation
1,2023,AR,Arkansas,Fulton,BRFSS,Health Outcomes,Current asthma among adults,%,Crude prevalence,10.6,...,9.2,11.9,12421,9795,5049,HLTHOUT,CASTHMA,CrdPrv,Current Asthma,POINT (-91.817888079321 36.3816206347765)
9,2023,CO,Colorado,Lake,BRFSS,Health Outcomes,Current asthma among adults,%,Crude prevalence,10.2,...,9.0,11.6,7365,5869,8065,HLTHOUT,CASTHMA,CrdPrv,Current Asthma,POINT (-106.344971513974 39.2024367117474)
64,2023,ME,Maine,Hancock,BRFSS,Health Outcomes,Current asthma among adults,%,Crude prevalence,12.1,...,10.7,13.5,56526,47589,23009,HLTHOUT,CASTHMA,CrdPrv,Current Asthma,POINT (-68.3588356549835 44.6432336584807)
66,2023,MD,Maryland,Carroll,BRFSS,Health Outcomes,Current asthma among adults,%,Crude prevalence,10.1,...,8.9,11.4,176639,137462,24013,HLTHOUT,CASTHMA,CrdPrv,Current Asthma,POINT (-77.0227635353375 39.5629025323817)
83,2023,NE,Nebraska,Thomas,BRFSS,Health Outcomes,Current asthma among adults,%,Crude prevalence,9.0,...,7.9,10.1,677,525,31171,HLTHOUT,CASTHMA,CrdPrv,Current Asthma,POINT (-100.555663288734 41.9135892531917)


In [7]:
asthma = asthma_raw.copy()

possible_fips_cols = ["CountyFIPS", "LocationID", "County_FIPS", "CountyFips", "FIPS"]

fips_col = None
for col in possible_fips_cols:
    if col in asthma.columns:
        fips_col = col
        break

if fips_col is None:
    raise ValueError("❌ No county FIPS column found in CDC dataset.")

print("✅ CDC FIPS column detected as:", fips_col)

asthma[fips_col] = asthma[fips_col].astype(str).str.zfill(5)
asthma["state_fips"] = asthma[fips_col].str[:2]
asthma["county_fips"] = asthma[fips_col].str[2:]
asthma["full_fips"] = asthma["state_fips"] + asthma["county_fips"]


✅ CDC FIPS column detected as: LocationID


In [8]:
asthma = asthma[
    asthma["full_fips"].isin(target_fips) &
    asthma["Measure"].str.contains("Asthma", case=False, na=False)
]

asthma = asthma[["state_fips", "county_fips", "Year", "Data_Value"]]
asthma = asthma.rename(columns={
    "Year": "year",
    "Data_Value": "asthma_prevalence"
})

asthma.head()


Unnamed: 0,state_fips,county_fips,year,asthma_prevalence
8012,6,37,2023,9.1
11373,4,13,2023,10.1
11579,6,37,2023,9.0
16146,4,13,2023,10.0
49301,17,31,2023,9.8


In [9]:
output_path = "../data/cdc_asthma_by_county.csv"
asthma.to_csv(output_path, index=False)

print("✅ CDC asthma dataset saved to:", output_path)


✅ CDC asthma dataset saved to: ../data/cdc_asthma_by_county.csv
