# Supplementary Notebook 1: AQI Breakpoints Transformation



In [None]:
import pandas as pd
import requests
from io import StringIO
import hashlib
from pathlib import Path
import os
from src.constants import (
    DATASET_DIR,
    CACHE_DIR,
    MODELS_DIR,
    TQDM_DISABLE,
    CITY_NAMES,
    POLLUTANT_COLUMNS,
)

In [None]:
# Molar masses (g/mol) of the gaseous pollutants; to_ugm3 looks them up by
# pollutant code when converting mixing ratios to mass concentrations.
MOLAR_MASSES = {
    "NO2": 46.0055,
    "O3": 48.00,
    "CO": 28.01,
    "SO2": 64.066,
}

# Values of the EPA table's "Parameter" column that this notebook keeps,
# mapped to the short pollutant codes used from here on.
GAS_MAPPINGS = {
    "Carbon monoxide": "CO",
    "Nitrogen dioxide (NO2)": "NO2",
    "Ozone": "O3",
    "PM2.5 - Local Conditions": "PM25",
    "PM10 Total 0-10um STP": "PM10",
    "Sulfur dioxide": "SO2",
}

In [None]:
# Fetch the EPA AQI breakpoint code table and discard the columns this
# notebook does not use.
df = pd.read_csv("https://aqs.epa.gov/aqsweb/documents/codetables/aqi_breakpoints.csv")
df = df.drop(columns=["Parameter Code", "Low AQI", "High AQI"])

# Keep only the parameters listed in GAS_MAPPINGS, then replace the EPA
# parameter names with their short pollutant codes.
is_tracked_parameter = df["Parameter"].isin(GAS_MAPPINGS.keys())
df = df[is_tracked_parameter]
df["Parameter"] = df["Parameter"].map(GAS_MAPPINGS)

# Remove the (pollutant, duration) combinations listed below; every other
# row of the table is kept.
excluded_durations = [
    ("O3", "8-HR RUN AVG BEGIN HOUR"),
    ("PM10", "24-HR BLK AVG"),
    ("PM25", "24-HR BLK AVG"),
    ("SO2", "24-HR BLK AVG"),
]
is_excluded = pd.Series(False, index=df.index)
for pollutant, duration in excluded_durations:
    is_excluded |= (df["Parameter"] == pollutant) & (df["Duration Description"] == duration)
df = df[~is_excluded]

In [None]:
def to_ugm3(row, col):
    """Return the breakpoint stored in ``row[col]`` converted to ug/m3.

    Args:
        row: A mapping-like table row that provides a ``'Parameter'``
            pollutant code and the value under ``col``.
        col: Name of the column holding the breakpoint to convert.

    Returns:
        The value unchanged for PM25/PM10 rows and for missing values;
        otherwise the value scaled by the pollutant's molar mass. NO2 and
        SO2 values are treated as ppb, all other gases as ppm.

    Raises:
        KeyError: If the pollutant is a gas with no MOLAR_MASSES entry.
    """
    pollutant = row['Parameter']
    concentration = row[col]

    # Particulate rows are returned as-is, and missing values stay missing.
    if pollutant in ("PM25", "PM10") or pd.isna(concentration):
        return concentration

    molar_mass = MOLAR_MASSES[pollutant]

    # Bring everything to ppb first: ppm values are 1000 times larger in ppb.
    in_ppb = concentration if pollutant in ("NO2", "SO2") else concentration * 1000

    # ug/m3 = ppb * molar mass / molar volume; 24.45 is presumably the molar
    # volume in L/mol at 25 degC and 1 atm -- confirm against the EPA
    # reference conditions.
    return in_ppb * molar_mass / 24.45

# Add a ug/m3 counterpart for each breakpoint column by running to_ugm3
# over every row of the table.
converted_columns = {
    "Low Breakpoint": "Low Breakpoint (ug/m3)",
    "High Breakpoint": "High Breakpoint (ug/m3)",
}
for source_col, target_col in converted_columns.items():
    df[target_col] = df.apply(to_ugm3, axis=1, args=(source_col,))


In [None]:
# Show the transformed table as this cell's output for a visual check.
df

In [None]:
# Save the transformed breakpoint table under DATASET_DIR/processed.
output_path = DATASET_DIR / "processed" / "aqi_breakpoints.csv"
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists first (assumes DATASET_DIR is a pathlib.Path, as the
# "/" joins suggest -- confirm in src.constants).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)