In [1]:
%reload_ext autoreload
%autoreload 2

from pathlib import Path

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests

# Do not truncate long cell text when DataFrames are rendered.
pd.set_option('display.max_colwidth', None)

# Indicator key: passed to the config lookup and the docs helpers, and used in the output file name.
INDICATOR = 'unhealthy_aqi'
# Settings for this indicator; later cells read 'raw_dir', 'clean_dir' and 'indicators_xlsx_path' from it.
CONFIG = config.get_config(INDICATOR, '../config.toml')

display(Markdown('## Raw data path'))
# Replace Windows backslashes with forward slashes before rendering the path as a Markdown link.
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(Markdown(f"[{raw_dir_path}]({raw_dir_path})"))




  from .autonotebook import tqdm as notebook_tqdm


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/environment/air_quality/unhealthy_aqi](C:/Users/tan/src/regional-pm-2023/data/raw/environment/air_quality/unhealthy_aqi)

# Healthy Environment: Air Quality

## Unhealthy Air Days

In [2]:
# Render the indicator's description (presumably read from the indicators workbook at
# CONFIG['indicators_xlsx_path'] -- confirm against the docs module).
docs.describe_indicator(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Number of days with AQI exceeding federal health standards.

nan

In [3]:
# Show the output schema for this indicator: one row per column with its name, description and type.
docs.list_schema(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
unhealthy_aqi_days,Unhealthy AQI Days,"Number of days with AQI Exceeding Federal Standards, in a given year.",int


In [4]:
# Show the data sources recorded for this indicator, including which ones are still active.
docs.list_sources(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
air_data,Air Data,Environmental Protection Agency,True,"New source, consistent API."
select_8,Select 8 Summary,California Air Resources Board,False,"Old source, really old reporting tool, doesn't have all pollutants AQI is based on."


In [5]:
# Keep the update steps in `steps`: later cells display individual rows with steps.loc[<n>]
# as a heading for the code that implements that step.
steps = docs.list_update_steps(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)
steps

Unnamed: 0,step
0,Download annual county level AQI data from EPA's Air Data platform.
1,Extract/Calculate unhealthy days in San Diego county by excluding Good and Moderate days.


In [6]:
# Show the free-form remarks (author and note) recorded for this indicator.
docs.list_remarks(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0,author,note
0,TAN,Current SME is Keith Greer <Keith.Greer@sandag.org>.
1,TAN,"It takes the EPA time to validate individual days of data. Although some days exist for 2022 and 2023,"


### Step 0: Download

In [7]:
# Show the text of update step 0 above the code that implements it.
display(steps.loc[0])

def download_aqi_by_county(year: int) -> None:
    """Download the EPA Air Data annual county-level AQI archive for one year.

    The archive is saved as ``air_data/annual_aqi_by_county_{year}.zip`` under
    ``CONFIG['raw_dir']``. The ``air_data`` directory (and any missing parents)
    is created first.

    Args:
        year: Calendar year of the archive to fetch.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. Nothing
            is written in that case, so a failed download cannot be mistaken
            for an already-downloaded archive by an "exists" check.
        requests.RequestException: On connection failures or when the request
            exceeds the timeout.
    """
    air_data_dir = Path(CONFIG['raw_dir']/'air_data')
    # parents/exist_ok keep this working on a fresh checkout and on re-runs.
    air_data_dir.mkdir(parents=True, exist_ok=True)

    response = requests.get(
        f'https://aqs.epa.gov/aqsweb/airdata/annual_aqi_by_county_{year}.zip',
        # Without a timeout a stalled connection would hang the notebook indefinitely.
        timeout=60,
    )
    # Fail loudly instead of saving an HTTP error page under a .zip name.
    response.raise_for_status()

    with open(air_data_dir/f'annual_aqi_by_county_{year}.zip', mode='wb') as output_file:
        output_file.write(response.content)

# Fetch every year's archive once; archives already present in the raw directory are reused.
for year in tqdm(list(range(2005, 2024))):
    if Path(CONFIG['raw_dir']/f'air_data/annual_aqi_by_county_{year}.zip').exists():
        # Already downloaded on a previous run.
        continue
    download_aqi_by_county(year)

step    Download annual county level AQI data from EPA's Air Data platform.
Name: 0, dtype: object

100%|██████████| 19/19 [00:00<00:00, 6332.28it/s]


### Step 1: Extract/Calculate


In [8]:
display(steps.loc[1])

# Peek at a single year's archive to see the column layout before writing the extractor.
(
    pd.read_csv(CONFIG['raw_dir'] / 'air_data/annual_aqi_by_county_2020.zip')
    .head(2)
)

step    Extract/Calculate unhealthy days in San Diego county by excluding Good and Moderate days.
Name: 1, dtype: object

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,2020,269,250,19,0,0,0,0,74,49,36,0,0,198,71,0
1,Alabama,Clay,2020,108,99,9,0,0,0,0,86,49,26,0,0,0,108,0


In [9]:
# Iterate extracts over all zips and combine

def extract_unhealthy_aqi_days(raw_zip_path: Path) -> pd.DataFrame:
    """Read one annual county AQI file and return San Diego's unhealthy-day count.

    Args:
        raw_zip_path: Path to a file readable by ``pd.read_csv`` that has at
            least the columns ``County``, ``Year``, ``Days with AQI``,
            ``Good Days`` and ``Moderate Days``.

    Returns:
        A frame indexed by ``year`` (as a datetime) with the columns
        ``days_with_aqi``, ``good_days``, ``moderate_days`` and
        ``unhealthy_aqi_days``, holding only rows whose county is "San Diego".
    """
    all_counties = pd.read_csv(raw_zip_path)

    # Keep just the San Diego County row and switch the headers to snake_case.
    san_diego = all_counties.loc[all_counties['County'] == 'San Diego']
    san_diego = san_diego.rename(columns=lambda name: name.lower().replace(' ', '_'))

    day_counts = san_diego[['year', 'days_with_aqi', 'good_days', 'moderate_days']].copy()

    # Unhealthy days = days with a recorded AQI minus the healthy (good or moderate) days.
    healthy_days = day_counts['good_days'] + day_counts['moderate_days']
    day_counts['unhealthy_aqi_days'] = day_counts['days_with_aqi'] - healthy_days

    # Turn the integer year into a date value and index by it.
    day_counts['year'] = pd.to_datetime(day_counts['year'], format='%Y')
    return day_counts.set_index('year')

unhealthy_aqi = (
    # Extract the San Diego row from each downloaded archive, then stack the yearly rows.
    pd.concat(
        [
            extract_unhealthy_aqi_days(zip_path)
            # Path.iterdir() yields entries in arbitrary order and would also hand any
            # stray non-archive file to read_csv, so match the download file naming
            # and sort the paths explicitly.
            for zip_path in sorted(
                (CONFIG['raw_dir']/'air_data').glob('annual_aqi_by_county_*.zip')
            )
        ]
    )
    # Only keep years with a whole year reported.
    .query('`days_with_aqi` >= 365')
    [['unhealthy_aqi_days']]
    # Guarantee chronological order so tail() below is the latest years on every machine.
    .sort_index()
)

unhealthy_aqi.tail(5)

Unnamed: 0_level_0,unhealthy_aqi_days
year,Unnamed: 1_level_1
2017-01-01,62
2018-01-01,35
2019-01-01,25
2020-01-01,49
2021-01-01,16


### Save data

In [10]:
display(Markdown('#### Clean data path'))
# Replace Windows backslashes with forward slashes before rendering the path as a Markdown link.
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(Markdown(f"[{clean_dir_path}]({clean_dir_path})"))

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/environment/air_quality/unhealthy_aqi](C:/Users/tan/src/regional-pm-2023/data/clean/environment/air_quality/unhealthy_aqi)

In [11]:
# Write the cleaned indicator to <INDICATOR>_odp.csv inside the clean data directory.
unhealthy_aqi.to_csv(CONFIG['clean_dir'] / f'{INDICATOR}_odp.csv')