In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
DATA_DIR = './data/'
CANCER_DATA = 'cancer_incd_rate_2016_2020.csv'
AQI_DATA = lambda year: f'annual_aqi_by_county_{year}.csv'
US_COUNTIES = 'uscounties/uscounties.csv'

In [3]:
def get_cancer_data():
    df = pd.read_csv(DATA_DIR + CANCER_DATA, skiprows=8, skipfooter=31, engine='python')
    df.insert(0, 'State', df['County'].apply(lambda x: x.split(', ')[-1][:-3]))
    df['County'] = df['County'].apply(lambda x: x.split(', ')[0])
    df = df.iloc[1:].rename({
        'Age-Adjusted Incidence Rate([rate note]) - cases per 100,000': 'Incidence Rate per 100k', 
        'Recent 5-Year Trend ([trend note]) in Incidence Rates': 'Recent 5-Year Trend',
        ' FIPS': 'FIPS'
    }, axis=1)
    df.replace('data not available', np.nan, inplace=True)
    df['Recent 5-Year Trend'] = pd.to_numeric(df['Recent 5-Year Trend'], errors='coerce')
    df['FIPS'] = pd.to_numeric(df['FIPS'], errors='coerce')
    df["FIPS"] = df["FIPS"].astype('Int64')
    return df
get_cancer_data()

  df = pd.read_csv(DATA_DIR + CANCER_DATA, skiprows=8, skipfooter=31)


Unnamed: 0,State,County,FIPS,Incidence Rate per 100k,Lower 95% Confidence Interval,Upper 95% Confidence Interval,CI*Rank([rank note]),Lower CI (CI*Rank),Upper CI (CI*Rank),Average Annual Count,Recent Trend,Recent 5-Year Trend,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
1,Florida,Union County,12125,1237.4,1165.6,1312.8,,1,1,237,stable,0.6,-0.5,1.9
2,Iowa,Palo Alto County,19147,658.1,591.1,731.1,,1,6,82,rising,4.8,0.2,15.4
3,Montana,Treasure County,30103,652.2,401,1007.4,,1,55,7,stable,-1.1,-5.6,3.3
4,Texas,Polk County,48373,633.6,604.6,663.7,,1,4,425,rising,2.2,1.2,4.2
5,Kentucky,Floyd County,21071,616.8,584.3,650.7,,1,19,295,stable,1.5,-1.8,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3139,Kansas,Wyandotte County,20209,data not available,,data not available,,,,,,,,
3140,Virginia,Wythe County,51197,data not available,,data not available,,,,,,,,
3141,Alaska,Yakutat City and Borough,2282,*,*,*,*,*,*,3 or fewer,*,,*,*
3142,Minnesota,Yellow Medicine County,27173,data not available,,data not available,,,,,,,,


In [4]:
# compile aqi data into 5 year avaerage
def compile_aqi_data():
    dfs = []
    
    for year in range(2016, 2021):
        aqi = pd.read_csv(DATA_DIR + AQI_DATA(year))
        dfs.append(aqi)
    
    annual_aqi_2016_2020 = pd.concat(dfs)
    
    return annual_aqi_2016_2020

compile_aqi_data()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,2016,279,247,32,0,0,0,0,87,51,37,0,0,221,58,0
1,Alabama,Clay,2016,116,109,7,0,0,0,0,56,45,30,0,0,0,116,0
2,Alabama,Colbert,2016,282,258,23,1,0,0,0,115,50,38,0,0,219,63,0
3,Alabama,DeKalb,2016,348,304,43,1,0,0,0,119,54,40,0,0,321,27,0
4,Alabama,Elmore,2016,117,107,10,0,0,0,0,77,48,40,0,0,117,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,Wyoming,Sublette,2020,366,297,64,4,1,0,0,151,61,44,0,0,355,11,0
999,Wyoming,Sweetwater,2020,366,238,116,7,4,0,1,1250,76,46,0,5,253,7,101
1000,Wyoming,Teton,2020,366,318,36,7,5,0,0,161,54,42,0,0,329,37,0
1001,Wyoming,Uinta,2020,366,323,41,2,0,0,0,122,51,40,0,2,225,0,139


In [17]:
def get_uscounties():
    df = pd.read_csv(DATA_DIR + US_COUNTIES).rename({'county': 'County'}, axis=1)
    df["county_fips"] = pd.to_numeric(df["county_fips"], errors='coerce')
    df["county_fips"] = df["county_fips"].astype('Int64')
    df = df.rename(columns={"county_fips": "FIPS"})
    return df
get_uscounties()

Unnamed: 0,County,county_ascii,county_full,FIPS,state_id,state_name,lat,lng,population
0,Los Angeles,Los Angeles,Los Angeles County,6037,CA,California,34.3219,-118.2247,9936690
1,Cook,Cook,Cook County,17031,IL,Illinois,41.8401,-87.8168,5225367
2,Harris,Harris,Harris County,48201,TX,Texas,29.8578,-95.3938,4726177
3,Maricopa,Maricopa,Maricopa County,4013,AZ,Arizona,33.3490,-112.4915,4430871
4,San Diego,San Diego,San Diego County,6073,CA,California,33.0343,-116.7351,3289701
...,...,...,...,...,...,...,...,...,...
3139,Blaine,Blaine,Blaine County,31009,NE,Nebraska,41.9128,-99.9768,384
3140,King,King,King County,48269,TX,Texas,33.6165,-100.2558,216
3141,Kenedy,Kenedy,Kenedy County,48261,TX,Texas,26.9285,-97.7017,116
3142,Loving,Loving,Loving County,48301,TX,Texas,31.8493,-103.5800,96


# Cancer EDA

In [21]:
coords_df = get_uscounties()
cancer_df = get_cancer_data()
coords_df.merge(cancer_df, on='FIPS')

  df = pd.read_csv(DATA_DIR + CANCER_DATA, skiprows=8, skipfooter=31)


Unnamed: 0,County_x,county_ascii,county_full,FIPS,state_id,state_name,lat,lng,population,State,...,Lower 95% Confidence Interval,Upper 95% Confidence Interval,CI*Rank([rank note]),Lower CI (CI*Rank),Upper CI (CI*Rank),Average Annual Count,Recent Trend,Recent 5-Year Trend,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
0,Los Angeles,Los Angeles,Los Angeles County,6037,CA,California,34.3219,-118.2247,9936690,California,...,366.3,369.5,,44,52,40485,falling,-0.9,-1.4,-0.2
1,Cook,Cook,Cook County,17031,IL,Illinois,41.8401,-87.8168,5225367,Illinois,...,436.4,441.3,,83,97,25975,stable,-0.4,-0.7,0.3
2,Harris,Harris,Harris County,48201,TX,Texas,29.8578,-95.3938,4726177,Texas,...,391.2,396.6,,148,182,16988,stable,1.0,-0.3,2.0
3,Maricopa,Maricopa,Maricopa County,4013,AZ,Arizona,33.3490,-112.4915,4430871,Arizona,...,377.8,382.7,,4,6,19166,falling,-1.1,-2.0,-0.8
4,San Diego,San Diego,San Diego County,6073,CA,California,33.0343,-116.7351,3289701,California,...,415.5,421.5,,15,24,15343,rising,1.1,0.3,1.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3127,Blaine,Blaine,Blaine County,31009,NE,Nebraska,41.9128,-99.9768,384,Nebraska,...,349.1,972,,1,87,4,stable,3.2,-1.1,8.9
3128,King,King,King County,48269,TX,Texas,33.6165,-100.2558,216,Texas,...,*,*,*,*,*,3 or fewer,*,,*,*
3129,Kenedy,Kenedy,Kenedy County,48261,TX,Texas,26.9285,-97.7017,116,Texas,...,*,*,*,*,*,3 or fewer,*,,*,*
3130,Loving,Loving,Loving County,48301,TX,Texas,31.8493,-103.5800,96,Texas,...,*,*,*,*,*,3 or fewer,*,,*,*
