In [2]:
import boto3
import pandas as pd
import io
import os
import numpy as np
from datetime import timedelta
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Create predictions directory
os.makedirs("predictions", exist_ok=True)

# S3 setup
s3 = boto3.client('s3')
bucket = 'sagemaker-us-west-2-090826559334'
prefix = 'airqualitydata/'

response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
csv_keys = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')]

for key in csv_keys:
    print(f"\nReading {key}")
    obj = s3.get_object(Bucket=bucket, Key=key)
    df = pd.read_csv(io.BytesIO(obj['Body'].read()))

    # Drop rows missing AQI
    df = df.dropna(subset=["AQI"])
    print(f"{len(df)} rows remaining after AQI cleanup")

    if df.empty:
        print("Skipping — no valid AQI rows")
        continue

    if "datetime" not in df.columns:
        print("Skipping — 'datetime' column missing")
        continue

    df.rename(columns={"datetime": "date"}, inplace=True)
    df["date"] = pd.to_datetime(df["date"], errors='coerce', utc=True)
    df = df.dropna(subset=["date"])

    df = df.sort_values("date")
    df = df.groupby("date").agg({"AQI": "mean"}).reset_index()

    if df.empty:
        print("Skipping — no valid daily AQI data")
        continue

    # Add features
    df["day_of_week"] = df["date"].dt.dayofweek
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["AQI_t-1"] = df["AQI"].shift(1)
    df["AQI_t-2"] = df["AQI"].shift(2)
    df["AQI_t-3"] = df["AQI"].shift(3)
    df["target_AQI"] = df["AQI"].shift(-1)

    df.dropna(inplace=True)

    features = ["AQI", "AQI_t-1", "AQI_t-2", "AQI_t-3", "day_of_week", "month", "day"]
    X = df[features]
    y = df["target_AQI"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    model = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"R² Score: {r2_score(y_test, y_pred):.3f}")

    # Predict next-day AQI for existing data
    df["predicted_next_day_AQI"] = model.predict(X)
    df["alert"] = df["predicted_next_day_AQI"].apply(lambda x: "YES" if x > 100 else "NO")

    # Forecast until July 1, 2025
    target_end_date = pd.Timestamp("2025-07-01", tz="UTC")
    last_known = df.iloc[-1]
    future_forecasts = []

    while last_known["date"] < target_end_date:
        next_date = last_known["date"] + timedelta(days=1)
        next_features = {
            "AQI": last_known["AQI"],
            "AQI_t-1": last_known["AQI_t-1"],
            "AQI_t-2": last_known["AQI_t-2"],
            "AQI_t-3": last_known["AQI_t-3"],
            "day_of_week": next_date.dayofweek,
            "month": next_date.month,
            "day": next_date.day
        }

        next_X = pd.DataFrame([next_features])
        predicted = model.predict(next_X)[0]

        future_forecasts.append({
            "date": next_date,
            "AQI": np.nan,
            "predicted_next_day_AQI": predicted,
            "alert": "YES" if predicted > 100 else "NO"
        })

        # Shift values for next step
        last_known = {
            "date": next_date,
            "AQI": predicted,
            "AQI_t-1": last_known["AQI"],
            "AQI_t-2": last_known["AQI_t-1"],
            "AQI_t-3": last_known["AQI_t-2"]
        }
    forecast_df = df[["date", "AQI", "predicted_next_day_AQI", "alert"]].copy()
    forecast_df = pd.concat([forecast_df, pd.DataFrame(future_forecasts)], ignore_index=True)

    # Save
    output_filename = f"predictions/{key.split('/')[-1].replace('.csv', '_predicted.csv')}"
    forecast_df.to_csv(output_filename, index=False)
    print(f"Saved predictions to {output_filename}")

print("\nAll files processed.")



Reading airqualitydata/bv_with_aqi.csv
58454 rows remaining after AQI cleanup
MAE: 1.84
R² Score: 0.774
Saved predictions to predictions/bv_with_aqi_predicted.csv

Reading airqualitydata/cavecamp_with_aqi.csv
20051 rows remaining after AQI cleanup
MAE: 14.22
R² Score: 0.142
Saved predictions to predictions/cavecamp_with_aqi_predicted.csv

Reading airqualitydata/constellationlabs_with_aqi.csv
3384 rows remaining after AQI cleanup
MAE: 2.80
R² Score: 0.390
Saved predictions to predictions/constellationlabs_with_aqi_predicted.csv

Reading airqualitydata/copperview_with_aqi.csv
112091 rows remaining after AQI cleanup
MAE: 1.87
R² Score: 0.814
Saved predictions to predictions/copperview_with_aqi_predicted.csv

Reading airqualitydata/erda_with_aqi.csv
143910 rows remaining after AQI cleanup
MAE: 2.66
R² Score: 0.759
Saved predictions to predictions/erda_with_aqi_predicted.csv

Reading airqualitydata/federalheights_with_aqi.csv
0 rows remaining after AQI cleanup
Skipping — no valid AQI rows
