In [1]:
import pandas as pd
import numpy as np

In [2]:
# === 1. Load & prepare ===
df = pd.read_csv("Chonburi.csv")
df.columns = df.columns.str.strip().str.lower()

In [3]:
# Convert date to datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)
df = df.dropna(subset=['date'])
df = df.sort_values('date')
df['date'] = df['date'].dt.strftime('%d/%m/%Y')


  df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)


In [4]:
# === Prepare pollutant data ===
pollutants = ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']
for p in pollutants:
    if p in df.columns:
        df[p] = pd.to_numeric(df[p], errors='coerce')

# === Convert O3 if in µg/m³ ===
if 'o3' in df.columns and df['o3'].max() > 100:
    MW = 48.00
    df['o3'] = (df['o3'] * 24.45) / MW
    print("Converted O3 µg/m³ → ppb")

Converted O3 µg/m³ → ppb


In [5]:
# === AQI breakpoints (EPA) ===
breakpoints = {
    'pm25': [(0,12,0,50),(12.1,35.4,51,100),(35.5,55.4,101,150),
             (55.5,150.4,151,200),(150.5,250.4,201,300),(250.5,350.4,301,400),
             (350.5,500.4,401,500)],
    'pm10': [(0,54,0,50),(55,154,51,100),(155,254,101,150),
             (255,354,151,200),(355,424,201,300),(425,504,301,400),
             (505,604,401,500)],
    'o3': [(0,54,0,50),(55,70,51,100),(71,85,101,150),(86,105,151,200),
           (106,200,201,300)],
    'no2': [(0,53,0,50),(54,100,51,100),(101,360,101,150),(361,649,151,200),
            (650,1249,201,300),(1250,1649,301,400),(1650,2049,401,500)],
    'so2': [(0,35,0,50),(36,75,51,100),(76,185,101,150),(186,304,151,200),
            (305,604,201,300),(605,804,301,400),(805,1004,401,500)],
    'co':  [(0,4.4,0,50),(4.5,9.4,51,100),(9.5,12.4,101,150),(12.5,15.4,151,200),
            (15.5,30.4,201,300),(30.5,40.4,301,400),(40.5,50.4,401,500)]
}

In [6]:
# === AQI calculation ===
def calc_aqi(p, val):
    if pd.isna(val) or p not in breakpoints:
        return np.nan
    for Clow, Chigh, Ilow, Ihigh in breakpoints[p]:
        if Clow <= val <= Chigh:
            return (Ihigh - Ilow) / (Chigh - Clow) * (val - Clow) + Ilow
    return np.nan

for p in pollutants:
    if p in df.columns:
        df[f"{p}_aqi"] = df[p].apply(lambda v: calc_aqi(p, v))

In [7]:
# === Combine AQI values ===
aqi_cols = [f"{p}_aqi" for p in pollutants if f"{p}_aqi" in df.columns]
df['overall_aqi'] = df[aqi_cols].max(axis=1, skipna=True)

# Keep only rows with valid date & AQI
df = df.dropna(subset=['date', 'overall_aqi'])

In [8]:
#=== Save clean file ===
out = df[['date', 'overall_aqi']].sort_values('date').drop_duplicates(subset=['date'])
out.to_csv("Pattaya2024.csv", index=False)

print(f"Saved clean file 'Khon_Kaen_AQI.csv' with {len(out)} rows")

Saved clean file 'Khon_Kaen_AQI.csv' with 3800 rows
