In [None]:
import pandas as pd
import xgboost as xgb
import joblib
import numpy as np
from sqlalchemy import text
from database import engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Progress banner marking the beginning of the training run.
print("--- Starting Model Training --- ")

--- Starting Model Training --- 


In [None]:
# Training-set query.
# - Each air-quality row is joined to the weather-forecast row whose `time`
#   equals the reading's timestamp truncated to the hour (after AT TIME ZONE 'UTC').
# - The `aqi` column is the first non-null value among aqi, pm25, pm10, o3,
#   no2, so2 and co; rows where all of them are NULL are filtered out.
# NOTE(review): the fallback columns are presumably on the same scale as
# `aqi` -- confirm, since they all end up in the regression target.
query = """
SELECT 
    aq.time,
    COALESCE(aq.aqi, aq.pm25, aq.pm10, aq.o3, aq.no2, aq.so2, aq.co) as aqi,
    wf.temperature_2m,
    wf.relative_humidity_2m,
    wf.precipitation,
    wf.wind_speed_10m
FROM air_quality_data aq
JOIN weather_forecasts wf ON date_trunc('hour', aq.time AT TIME ZONE 'UTC') = wf.time
WHERE COALESCE(aq.aqi, aq.pm25, aq.pm10, aq.o3, aq.no2, aq.so2, aq.co) IS NOT NULL;
"""

print("Loading and joining data from the database...")
# Read the joined rows, order them chronologically and renumber the index
# so that positional operations further down follow time order.
df = (
    pd.read_sql(query, engine)
    .sort_values(by='time')
    .reset_index(drop=True)
)
print(f"Loaded {len(df)} rows of combined data.")

Loading and joining data from the database...
Loaded 168 rows of combined data.


In [None]:
# Feature engineering: calendar features plus lagged AQI values.
#
# Every column added to `df` here is derived from `time` or `aqi`, which this
# cell never trims, so the cell can be re-run safely. The rows that cannot be
# used for training are removed in a separate frame (`df_model`) instead of
# overwriting `df`; overwriting it would make each re-run recompute the lags
# on already-trimmed data and drop another 24 rows.
df['time'] = pd.to_datetime(df['time'])
df['hour'] = df['time'].dt.hour
df['dayofweek'] = df['time'].dt.dayofweek
df['month'] = df['time'].dt.month

# --- THIS IS THE KEY UPGRADE ---
# Lagged copies of the target, shifted by row position.
# NOTE(review): shift() counts rows, not hours, so "aqi_lag_24" is the value
# from the same time yesterday only if the data has exactly one row per hour
# with no gaps or duplicates -- confirm against the source tables.
lag_steps = {'aqi_lag_1': 1, 'aqi_lag_2': 2, 'aqi_lag_24': 24}
for lag_column, step in lag_steps.items():
    df[lag_column] = df['aqi'].shift(step)

features = ['temperature_2m', 'relative_humidity_2m', 'precipitation', 'wind_speed_10m', 'hour', 'dayofweek', 'month', 'aqi_lag_1', 'aqi_lag_2', 'aqi_lag_24']
target = 'aqi'

# Drop rows with NaN values (including those created by the shift) into a
# new frame so `df` keeps every loaded row.
df_model = df.dropna()

X = df_model[features]
y = df_model[target]

print(f"Data prepared for training with {len(df_model)} clean rows.")

Data prepared for training with 144 clean rows.


In [None]:
# Baseline (V1) inputs: weather and calendar features only, with no AQI lags.
#
# They are kept under their own names on purpose. Assigning them to
# `features`, `X` and `y` would replace the lag-based inputs prepared in the
# previous cell, and the model trained below would silently lose its lag
# features.
baseline_features = ['temperature_2m', 'relative_humidity_2m', 'precipitation', 'wind_speed_10m', 'hour', 'dayofweek', 'month']
target = 'aqi'

# Derive the calendar columns on a copy (DataFrame.assign returns a new
# frame) so this cell does not modify `df`.
baseline_times = pd.to_datetime(df['time'])
baseline_df = df.assign(
    hour=baseline_times.dt.hour,
    dayofweek=baseline_times.dt.dayofweek,
    month=baseline_times.dt.month,
)

X_baseline = baseline_df[baseline_features]
y_baseline = baseline_df[target]

print("Baseline (no-lag) data prepared for comparison.")

Data prepared for training.


In [None]:
# Chronological three-way split. Do not shuffle time-series data: with
# shuffle=False each split takes the most recent rows as its second part.
#   oldest 64%  -> training
#   next   16%  -> validation (drives early stopping)
#   newest 20%  -> test (only used for the final score)
# Early stopping must not watch the test rows: the number of trees would then
# be tuned on the same data the RMSE is reported on, biasing it downwards.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, shuffle=False)

model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,   # upper bound; early stopping normally ends training sooner
    learning_rate=0.05,
    max_depth=5,
    early_stopping_rounds=10  # stop once the validation metric has not improved for 10 rounds
)

print("Training the improved XGBoost model...")
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Score on the held-out test rows, which took no part in fitting or early stopping.
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print(f"V2 Model training complete. New RMSE: {rmse:.2f}")

Training the XGBoost model...
Model training complete. RMSE: 74.13


In [None]:
# Persist the fitted estimator to disk with joblib and report the path written.
model_filename = 'aqi_forecaster.joblib'
joblib.dump(value=model, filename=model_filename)
print(f"✅ Model saved successfully as '{model_filename}'")

✅ Model saved successfully as 'aqi_forecaster.joblib'
