# Supplementary Notebook 1: AQI Breakpoints Transformation



In [3]:
import pandas as pd
import requests
from io import StringIO
import hashlib
from pathlib import Path
import os

In [6]:
# Working directories used by the notebook, all given as relative paths.
DATASET_DIR = Path("datasets/")
CACHE_DIR = Path("cache/")
WEIGHTS_DIR = Path("weights/")
OUTPUT_DIR = Path("output/")

# Create whichever directories are missing; ones that already exist are left
# untouched, so re-running this cell is harmless.
for _directory in (DATASET_DIR, CACHE_DIR, WEIGHTS_DIR, OUTPUT_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

In [8]:
# Molar mass (g/mol) of each gaseous pollutant, keyed by its short code.
# Particulates (PM25 / PM10) have no entry here.
MOLAR_MASSES = dict(
    NO2=46.0055,
    O3=48.00,
    CO=28.01,
    SO2=64.066,
)

# Maps the "Parameter" label found in the breakpoint table to the short
# pollutant code used throughout this notebook.
GAS_MAPPINGS = {
    "Carbon monoxide":          "CO",
    "Nitrogen dioxide (NO2)":   "NO2",
    "Ozone":                    "O3",
    "PM2.5 - Local Conditions": "PM25",
    "PM10 Total 0-10um STP":    "PM10",
    "Sulfur dioxide":           "SO2",
}

In [9]:
# EPA AQS code table with the AQI breakpoints for every parameter/duration.
AQI_BREAKPOINTS_URL = "https://aqs.epa.gov/aqsweb/documents/codetables/aqi_breakpoints.csv"

# Build the frame as a single chain so that every step yields a fresh object.
# Writing a column into a boolean-filtered frame (`df = df[mask]` followed by
# `df[col] = ...`) raises SettingWithCopyWarning on pandas versions without
# copy-on-write; `.assign` avoids that.
df = (
    pd.read_csv(AQI_BREAKPOINTS_URL)
    .drop(columns=['Parameter Code', 'Low AQI', 'High AQI'])
    # Keep only the pollutants listed in GAS_MAPPINGS ...
    .loc[lambda table: table['Parameter'].isin(list(GAS_MAPPINGS))]
    # ... and replace their long EPA labels with the short codes.
    .assign(Parameter=lambda table: table['Parameter'].map(GAS_MAPPINGS))
)

# Parameter/duration combinations that are discarded.
# NOTE(review): presumably this leaves a single averaging period per
# pollutant -- confirm against the full EPA table.
unwanted_duration = (
    ((df['Parameter'] == 'O3') & (df['Duration Description'] == '8-HR RUN AVG BEGIN HOUR')) |
    ((df['Parameter'] == 'PM10') & (df['Duration Description'] == '24-HR BLK AVG')) |
    ((df['Parameter'] == 'PM25') & (df['Duration Description'] == '24-HR BLK AVG')) |
    ((df['Parameter'] == 'SO2') & (df['Duration Description'] == '24-HR BLK AVG'))
)

# Explicit copy: later cells add columns to `df`, which must not be a view-like
# slice of the unfiltered frame. The original row index is preserved.
df = df.loc[~unwanted_duration].copy()

In [10]:
def to_ugm3(row, col):
    """Return the breakpoint in ``row[col]`` expressed in ug/m3.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing a
            ``'Parameter'`` pollutant code and the value under ``col``.
        col: Name of the field holding the concentration to convert.

    Returns:
        The value unchanged when the pollutant is ``PM25``/``PM10`` or the
        value is missing; otherwise ``value * molar_mass / 24.45``, with an
        extra factor of 1000 for gases treated as ppm.

    Raises:
        KeyError: If a non-particulate pollutant has no entry in
            ``MOLAR_MASSES``.
    """
    pollutant = row['Parameter']
    concentration = row[col]

    # Particulates and missing values are passed through untouched.
    if pollutant in ("PM25", "PM10") or pd.isna(concentration):
        return concentration

    mass = MOLAR_MASSES[pollutant]

    # 24.45 is presumably the molar volume (L/mol) of an ideal gas at 25 degC
    # and 1 atm -- TODO confirm the intended reference conditions.
    # NO2 and SO2 are treated as ppb; every other gas as ppm (hence x1000).
    if pollutant in ("NO2", "SO2"):
        return concentration * mass / 24.45
    return concentration * 1000 * mass / 24.45

# Add a ug/m3 counterpart for both breakpoint columns. `assign` returns a new
# frame instead of writing into `df`, so no SettingWithCopyWarning is raised
# even though `df` is the result of boolean filtering, and re-running the cell
# simply recomputes the two columns.
df = df.assign(**{
    f"{column} (ug/m3)": df.apply(to_ugm3, axis=1, args=(column,))
    for column in ("Low Breakpoint", "High Breakpoint")
})


In [11]:
# Show the table so the new ug/m3 columns can be checked by eye.
df

Unnamed: 0,Parameter,Duration Code,Duration Description,AQI Category,Low Breakpoint,High Breakpoint,Low Breakpoint (ug/m3),High Breakpoint (ug/m3)
14,CO,Z,8-HR RUN AVG END HOUR,GOOD,0.0,4.4,0.0,5040.654
15,CO,Z,8-HR RUN AVG END HOUR,MODERATE,4.5,9.4,5155.214724,10768.67
16,CO,Z,8-HR RUN AVG END HOUR,UNHEALTHY FOR SENSITIVE,9.5,12.4,10883.231084,14205.48
17,CO,Z,8-HR RUN AVG END HOUR,UNHEALTHY,12.5,15.4,14320.0409,17642.29
18,CO,Z,8-HR RUN AVG END HOUR,VERY UNHEALTHY,15.5,30.4,17756.850716,34826.34
19,CO,Z,8-HR RUN AVG END HOUR,HAZARDOUS,30.5,50.4,34940.899796,57738.4
20,CO,Z,8-HR RUN AVG END HOUR,HAZARDOUS,50.4,99999.9,57738.404908,114560200.0
21,NO2,1,1 HOUR,GOOD,0.0,53.0,0.0,99.72562
22,NO2,1,1 HOUR,MODERATE,54.0,100.0,101.607239,188.1616
23,NO2,1,1 HOUR,UNHEALTHY FOR SENSITIVE,101.0,360.0,190.04317,677.3816


In [12]:
# Write the converted breakpoints into DATASET_DIR; the row index is not
# included in the CSV.
breakpoints_csv = DATASET_DIR / "aqi_breakpoints.csv"
df.to_csv(breakpoints_csv, index=False)