In [None]:
2️⃣ Transform (transform.py)
Each city’s JSON must be flattened into tabular format with one row per hour.
A. Required Columns
city
time
pm10
pm2_5
carbon_monoxide
nitrogen_dioxide
sulphur_dioxide
ozone
uv_index
B. Derived Features (Feature Engineering)
1. AQI based on PM2.5
0–50     → Good
51–100   → Moderate
101–200  → Unhealthy
201–300  → Very Unhealthy
>300     → Hazardous
2. Pollution Severity Score
Use weighted pollutants:
severity = (pm2_5 * 5) + (pm10 * 3) +
           (nitrogen_dioxide * 4) + (sulphur_dioxide * 4) +
           (carbon_monoxide * 2) + (ozone * 3)
3. Risk Classification
severity > 400 → "High Risk"
severity > 200 → "Moderate Risk"
else           → "Low Risk"
4. Temperature Hour-of-Day Feature (Optional)
Extract hour:
hour = time.hour
C. Transform Requirements
Convert timestamps into datetime format
Convert all pollutant values to numeric
Remove records where all pollutant readings are missing
Save transformed data into:
data/staged/air_quality_transformed

In [None]:
# Per-hour arrays from the payload; an absent "hourly" key yields an empty dict.
hourly = payload.get("hourly", {})

# One column per entry in COLS (empty list when the payload lacks it), then the
# timestamp column and a city label taken from the first "_"-separated token of
# the raw file's stem.
# NOTE(review): presumably file stems look like "<city>_..."; a multi-word city
# separated by "_" would be truncated to its first word — confirm the naming.
df = pd.DataFrame(
    {name: hourly.get(name, []) for name in COLS}
).assign(
    time=hourly.get("time", []),
    city=raw_file.stem.split("_")[0].capitalize(),
)
df.head()

In [None]:
# Parse the timestamp column; values that cannot be parsed become NaT.
df["time"] = df["time"].pipe(pd.to_datetime, errors="coerce")

# Coerce each column listed in COLS to numeric; unparseable entries become NaN.
for col in COLS:
    df[col] = df[col].pipe(pd.to_numeric, errors="coerce")

# Remove, in place, the rows in which every COLS column is missing.
df.dropna(how="all", subset=COLS, inplace=True)
df.head()


In [None]:
# Derive the hour-of-day feature from the timestamp column.
# The .dt accessor needs a datetime-typed column, so this relies on the
# pd.to_datetime conversion of df["time"] having run first.
df["hour"] = df["time"].dt.hour
df.head()


In [None]:
def classify_aqi(pm2_5):
    """Map a PM2.5 reading to its AQI category label.

    Bands (upper bounds inclusive): up to 50 is "Good", up to 100 is
    "Moderate", up to 200 is "Unhealthy", up to 300 is "Very Unhealthy",
    and anything above 300 is "Hazardous".

    Args:
        pm2_5: PM2.5 value as a number; may be missing (None or NaN).

    Returns:
        The category label, or None when the reading is missing.
    """
    # NaN compares False against every bound, so without this guard a missing
    # reading would fall through to the last branch and be labelled "Hazardous".
    if pd.isna(pm2_5):
        return None
    if pm2_5 <= 50:
        return "Good"
    elif pm2_5 <= 100:
        return "Moderate"
    elif pm2_5 <= 200:
        return "Unhealthy"
    elif pm2_5 <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"
# Label every row by passing its pm2_5 value through classify_aqi, then preview.
df["AQI_category"] = df["pm2_5"].apply(classify_aqi)
df.head()


In [None]:
# Weighted pollution severity: pm2_5 x5, pm10 x3, nitrogen_dioxide x4,
# sulphur_dioxide x4, carbon_monoxide x2, ozone x3, summed in that order.
# A missing value in any one pollutant makes the whole row's score NaN.
df["severity_score"] = (
    df["pm2_5"].mul(5)
    .add(df["pm10"].mul(3))
    .add(df["nitrogen_dioxide"].mul(4))
    .add(df["sulphur_dioxide"].mul(4))
    .add(df["carbon_monoxide"].mul(2))
    .add(df["ozone"].mul(3))
)
df.head()


In [None]:
def risk_classification(severity):
    """Translate a pollution severity score into a risk label.

    Args:
        severity: Numeric severity score; may be missing (None or NaN).

    Returns:
        "High Risk" when severity > 400, "Moderate Risk" when severity > 200,
        "Low Risk" otherwise, or None when the score is missing.
    """
    # NaN fails both ">" checks, so without this guard a row whose score could
    # not be computed would be silently reported as "Low Risk".
    if pd.isna(severity):
        return None
    if severity > 400:
        return "High Risk"
    elif severity > 200:
        return "Moderate Risk"
    else:
        return "Low Risk"
# Label every row by passing its severity_score through risk_classification, then preview.
df["risk"] = df["severity_score"].apply(risk_classification)
df.head()
