In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

In [3]:
DATA_DIR = './data/'
CANCER_DATA = 'cancer_incd_rate_2016_2020.csv'
AQI_DATA = lambda year: f'annual_aqi_by_county_{year}.csv'
US_COUNTIES = 'uscounties/uscounties.csv'

In [4]:
def get_cancer_data():
    df = pd.read_csv(DATA_DIR + CANCER_DATA, skiprows=8, skipfooter=31, engine='python')
    df.insert(0, 'State', df['County'].apply(lambda x: x.split(', ')[-1][:-3]))
    df['County'] = df['County'].apply(lambda x: x.split(', ')[0])
    df = df.iloc[1:].rename({
        'Age-Adjusted Incidence Rate([rate note]) - cases per 100,000': 'Incidence Rate per 100k', 
        'Recent 5-Year Trend ([trend note]) in Incidence Rates': 'Recent 5-Year Trend',
        ' FIPS': 'FIPS'
    }, axis=1)
    df.replace('data not available', np.nan, inplace=True)
    df['Recent 5-Year Trend'] = pd.to_numeric(df['Recent 5-Year Trend'], errors='coerce')
    df['FIPS'] = pd.to_numeric(df['FIPS'], errors='coerce')
    df["FIPS"] = df["FIPS"].astype('Int64')
    return df
get_cancer_data()

Unnamed: 0,State,County,FIPS,Incidence Rate per 100k,Lower 95% Confidence Interval,Upper 95% Confidence Interval,CI*Rank([rank note]),Lower CI (CI*Rank),Upper CI (CI*Rank),Average Annual Count,Recent Trend,Recent 5-Year Trend,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
1,Florida,Union County,12125,1237.4,1165.6,1312.8,,1,1,237,stable,0.6,-0.5,1.9
2,Iowa,Palo Alto County,19147,658.1,591.1,731.1,,1,6,82,rising,4.8,0.2,15.4
3,Montana,Treasure County,30103,652.2,401,1007.4,,1,55,7,stable,-1.1,-5.6,3.3
4,Texas,Polk County,48373,633.6,604.6,663.7,,1,4,425,rising,2.2,1.2,4.2
5,Kentucky,Floyd County,21071,616.8,584.3,650.7,,1,19,295,stable,1.5,-1.8,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3139,Kansas,Wyandotte County,20209,data not available,,data not available,,,,,,,,
3140,Virginia,Wythe County,51197,data not available,,data not available,,,,,,,,
3141,Alaska,Yakutat City and Borough,2282,*,*,*,*,*,*,3 or fewer,*,,*,*
3142,Minnesota,Yellow Medicine County,27173,data not available,,data not available,,,,,,,,


In [20]:
def average_aqi(df: pd.DataFrame):

    pass

# compile aqi data into 5 year avaerage
def compile_aqi_data():
    dfs = []
    
    for year in range(2016, 2021):
        aqi = pd.read_csv(DATA_DIR + AQI_DATA(year))
        dfs.append(aqi)
    
    annual_aqi_2016_2020 = pd.concat(dfs)
    annual_aqi_2016_2020 = annual_aqi_2016_2020.groupby(['State', 'County']).agg(
        {
            'Good Days': 'mean', 
            'Moderate Days': 'mean', 
            'Unhealthy for Sensitive Groups Days': 'mean',
            'Unhealthy Days': 'mean',
            'Very Unhealthy Days': 'mean',
            'Hazardous Days': 'mean',
            'Max AQI': 'max',
            'Median AQI': 'mean',
            'Days CO': 'sum',
            'Days NO2': 'sum',
            'Days Ozone': 'sum',
            'Days PM2.5': 'sum',
            'Days PM10': 'sum',
        }
    ).reset_index()
    annual_aqi_2016_2020.melt(id_vars=['State', 'County'])
    
    return annual_aqi_2016_2020

compile_aqi_data()

Unnamed: 0,State,County,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,244.0,27.6,0.20,0.0,0.0,0.0,108,36.20,0,0,1059,300,0
1,Alabama,Clay,102.4,9.4,0.00,0.0,0.0,0.0,86,28.60,0,0,0,559,0
2,Alabama,Colbert,256.5,19.5,0.25,0.0,0.0,0.0,115,36.75,0,0,874,231,0
3,Alabama,DeKalb,324.6,31.6,0.20,0.0,0.0,0.0,119,37.80,0,0,1615,167,0
4,Alabama,Elmore,187.2,10.8,0.00,0.0,0.0,0.0,100,35.60,0,0,990,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060,Wyoming,Sublette,264.2,96.2,4.20,0.8,0.0,0.0,200,45.80,0,0,1756,70,1
1061,Wyoming,Sweetwater,254.8,105.0,3.80,1.4,0.2,0.2,1250,46.00,0,13,1515,28,271
1062,Wyoming,Teton,312.0,50.6,1.80,1.0,0.0,0.0,161,42.80,0,0,1720,104,3
1063,Wyoming,Uinta,315.2,49.0,1.00,0.0,0.0,0.2,696,42.80,0,8,1646,0,173


In [6]:
def get_uscounties():
    df = pd.read_csv(DATA_DIR + US_COUNTIES).rename({'county': 'County'}, axis=1)
    df["county_fips"] = pd.to_numeric(df["county_fips"], errors='coerce')
    df["county_fips"] = df["county_fips"].astype('Int64')
    df = df.rename(columns={"county_fips": "FIPS"})
    return df
get_uscounties()

Unnamed: 0,County,county_ascii,county_full,FIPS,state_id,state_name,lat,lng,population
0,Los Angeles,Los Angeles,Los Angeles County,6037,CA,California,34.3219,-118.2247,9936690
1,Cook,Cook,Cook County,17031,IL,Illinois,41.8401,-87.8168,5225367
2,Harris,Harris,Harris County,48201,TX,Texas,29.8578,-95.3938,4726177
3,Maricopa,Maricopa,Maricopa County,4013,AZ,Arizona,33.3490,-112.4915,4430871
4,San Diego,San Diego,San Diego County,6073,CA,California,33.0343,-116.7351,3289701
...,...,...,...,...,...,...,...,...,...
3139,Blaine,Blaine,Blaine County,31009,NE,Nebraska,41.9128,-99.9768,384
3140,King,King,King County,48269,TX,Texas,33.6165,-100.2558,216
3141,Kenedy,Kenedy,Kenedy County,48261,TX,Texas,26.9285,-97.7017,116
3142,Loving,Loving,Loving County,48301,TX,Texas,31.8493,-103.5800,96


# Cancer EDA

In [7]:
coords_df = get_uscounties()
cancer_df = get_cancer_data()
cancer_df = cancer_df.merge(
    coords_df, on='FIPS'
).drop(
    columns=['County_x', 'state_id', 'county_full','county_ascii', 'State']
).rename(
    {'County_y': 'County', 'state_name': 'State', 'lat': 'Lat', 'lng': 'Lon', 'population': 'Population'}, 
    axis=1
)

In [8]:
fig = px.scatter_geo(
    cancer_df, 
    lat='Lat', 
    lon='Lon', 
    hover_name='County', 
    hover_data=['Incidence Rate per 100k', 'Recent 5-Year Trend', 'Population'],
    color='Incidence Rate per 100k',
    size='Population',
    projection='albers usa'
)
fig.show()