In [1]:
import sys
import os
import glob
import pandas as pd
import numpy as np

In [2]:
# Make the parent of the current working directory importable so the
# project-local `src` modules below can be resolved. The membership check keeps
# the cell idempotent: re-running it does not append duplicate sys.path entries.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from src.modeling import date_based_split, train_xgboost
from src.metrics import calculate_metrics

In [3]:
# Paths are relative to the directory the notebook is run from.
# Location of the processed data (the per-site feature CSVs are read from this
# same path further down).
PROCESSED_DIR = '../data/processed/'
# Destination for the per-site prediction CSVs; created here if it does not
# exist yet, and left untouched if it already does.
PREDICTIONS_DIR = '../data/predictions/'
os.makedirs(PREDICTIONS_DIR, exist_ok=True)

In [4]:
# Model input columns, grouped by kind. The concatenation order below is the
# column order handed to the model.
forecast_features = [
    'O3_forecast',
    'NO2_forecast',
    'T_forecast',
    'q_forecast',
    'u_forecast',
    'v_forecast',
    'w_forecast',
]
time_features = ['hour_sin', 'hour_cos', 'month_sin', 'month_cos']
lag_features = [
    'O3_target_lag_1',
    'O3_target_lag_24',
    'NO2_target_lag_1',
    'NO2_target_lag_24',
]
features = forecast_features + time_features + lag_features

# Columns to predict; each one has a matching '<pollutant>_forecast' column
# among the inputs.
targets = ['O3_target', 'NO2_target']

In [5]:
# Discover the per-site feature files under PROCESSED_DIR instead of repeating
# the directory literal. Sorting is lexicographic, so a future 'site_10' would
# be listed before 'site_2'; this only affects processing order.
site_files = sorted(glob.glob(os.path.join(PROCESSED_DIR, 'site_*_hourly_features.csv')))
print(f"Found {len(site_files)} sites: {[os.path.basename(f) for f in site_files]}")

Found 7 sites: ['site_1_hourly_features.csv', 'site_2_hourly_features.csv', 'site_3_hourly_features.csv', 'site_4_hourly_features.csv', 'site_5_hourly_features.csv', 'site_6_hourly_features.csv', 'site_7_hourly_features.csv']


In [6]:
def run_site_modeling(file_path):
    """Train and evaluate one model per target column for a single site.

    Loads the site's feature CSV, splits it with ``date_based_split``, fits a
    model with ``train_xgboost`` for every entry of the module-level
    ``targets`` list using the module-level ``features`` columns, and writes
    the test rows plus one ``<target>_pred`` column per target to
    ``PREDICTIONS_DIR/<site>_predictions.csv``.

    Parameters
    ----------
    file_path : str
        Path to a ``<site>_hourly_features.csv`` file. It must contain a
        ``timestamp`` column (used as the index), every column named in
        ``features`` and ``targets``, and for each target the baseline column
        obtained by replacing ``'target'`` with ``'forecast'`` in its name.

    Returns
    -------
    list
        One entry per target: the object returned by ``calculate_metrics``
        with ``'Site'`` and ``'Pollutant'`` keys added.

    Raises
    ------
    ValueError
        If the split yields an empty train or test set, or if a required
        column is missing from either part.
    """
    # Extract site name
    site_name = os.path.basename(file_path).replace('_hourly_features.csv', '')
    print(f"Training models for {site_name}...", end=" ")

    # Load Data
    df = pd.read_csv(file_path, parse_dates=['timestamp'], index_col='timestamp')

    # Split Data
    train_df, test_df = date_based_split(df)
    if len(train_df) == 0 or len(test_df) == 0:
        raise ValueError(
            f"{site_name}: date-based split produced {len(train_df)} train rows "
            f"and {len(test_df)} test rows; both must be non-empty"
        )

    # Fail early with an explicit list of missing columns rather than a bare
    # KeyError raised part-way through training.
    baseline_cols = {target: target.replace('target', 'forecast') for target in targets}
    train_required = [*features, *targets]
    test_required = [*train_required, *baseline_cols.values()]
    missing = sorted(
        {col for col in train_required if col not in train_df.columns}
        | {col for col in test_required if col not in test_df.columns}
    )
    if missing:
        raise ValueError(f"{site_name} is missing required columns: {missing}")

    # The feature matrices are identical for every target, so slice them once.
    X_train = train_df[features]
    X_test = test_df[features]

    site_results = []
    # Copy so the prediction columns are not written into the split frame.
    test_preds = test_df.copy()

    for target in targets:
        y_train, y_test = train_df[target], test_df[target]

        model = train_xgboost(X_train, y_train)

        # Predict
        preds = model.predict(X_test)
        test_preds[f'{target}_pred'] = preds

        # Calculate Metrics (third argument is the matching '*_forecast' baseline column)
        metrics = calculate_metrics(y_test, preds, test_df[baseline_cols[target]])
        metrics['Site'] = site_name
        metrics['Pollutant'] = target.split('_')[0]
        site_results.append(metrics)

    # Save Predictions
    pred_path = os.path.join(PREDICTIONS_DIR, f'{site_name}_predictions.csv')
    test_preds.to_csv(pred_path)
    print("Done.")

    return site_results

In [7]:
# Run the modeling for every site and flatten the per-target metrics into one
# list. A failing site does not stop the others, but it is recorded and
# reported at the end so it is not silently absent from the summary tables.
all_metrics = []
failed_sites = []

for file_path in site_files:
    try:
        site_metrics = run_site_modeling(file_path)
        all_metrics.extend(site_metrics)
    except Exception as e:
        failed_sites.append(file_path)
        # Include the exception type: the message alone can be cryptic
        # (e.g. a KeyError prints only the missing key).
        print(f"\nFailed to process {file_path}: {type(e).__name__}: {e}")

if failed_sites:
    print(
        f"\nWARNING: {len(failed_sites)} of {len(site_files)} sites failed and are "
        f"missing from the results: {[os.path.basename(f) for f in failed_sites]}"
    )

Training models for site_1... Done.
Training models for site_2... Done.
Training models for site_3... Done.
Training models for site_4... Done.
Training models for site_5... Done.
Training models for site_6... Done.
Training models for site_7... Done.


In [8]:
# One row per (site, pollutant) entry collected in all_metrics.
results_df = pd.DataFrame(all_metrics)

In [9]:
# Display the raw per-site, per-pollutant metrics table.
results_df

Unnamed: 0,RMSE,R2,RIA,Baseline_RMSE,Improvement,Site,Pollutant
0,7.011981,0.922405,0.869153,32.306177,25.294196,site_1,O3
1,9.500163,0.862862,0.862456,66.125125,56.624962,site_1,NO2
2,7.589394,0.917538,0.88562,35.768988,28.179595,site_2,O3
3,7.958585,0.858732,0.841732,50.637315,42.67873,site_2,NO2
4,11.194513,0.906906,0.875442,43.532461,32.337948,site_3,O3
5,9.774677,0.876418,0.853447,69.525624,59.750946,site_3,NO2
6,8.802184,0.935854,0.891972,42.900997,34.098813,site_4,O3
7,7.72684,0.825514,0.824758,69.715157,61.988317,site_4,NO2
8,9.020554,0.928705,0.888934,42.014861,32.994306,site_5,O3
9,13.41973,0.884206,0.872765,63.536577,50.116847,site_5,NO2


In [10]:
# Presentation order: identifiers first, then baseline vs. model error, then
# the remaining scores.
cols = [
    'Site',
    'Pollutant',
    'Baseline_RMSE',
    'RMSE',
    'Improvement',
    'RIA',
    'R2',
]
results_df = results_df.loc[:, cols]

In [11]:
# Print a heading, then display the metrics table as the cell's output.
print("\nFinal Model Performance Summary")
results_df


Final Model Performance Summary


Unnamed: 0,Site,Pollutant,Baseline_RMSE,RMSE,Improvement,RIA,R2
0,site_1,O3,32.306177,7.011981,25.294196,0.869153,0.922405
1,site_1,NO2,66.125125,9.500163,56.624962,0.862456,0.862862
2,site_2,O3,35.768988,7.589394,28.179595,0.88562,0.917538
3,site_2,NO2,50.637315,7.958585,42.67873,0.841732,0.858732
4,site_3,O3,43.532461,11.194513,32.337948,0.875442,0.906906
5,site_3,NO2,69.525624,9.774677,59.750946,0.853447,0.876418
6,site_4,O3,42.900997,8.802184,34.098813,0.891972,0.935854
7,site_4,NO2,69.715157,7.72684,61.988317,0.824758,0.825514
8,site_5,O3,42.014861,9.020554,32.994306,0.888934,0.928705
9,site_5,NO2,63.536577,13.41973,50.116847,0.872765,0.884206


In [12]:
# Mean of each selected metric across sites, one row per pollutant.
summary_cols = ['Baseline_RMSE', 'RMSE', 'Improvement', 'RIA']

print("\nAverage Improvement")
per_pollutant = results_df.groupby('Pollutant')
avg_stats = per_pollutant[summary_cols].mean()
print(avg_stats)


Average Improvement
           Baseline_RMSE      RMSE  Improvement       RIA
Pollutant                                                
NO2            63.361105  9.420643    53.940462  0.847155
O3             39.716948  9.883492    29.833456  0.874775


In [13]:
# Persist the summary table under PROCESSED_DIR (row index omitted) instead of
# repeating the directory literal.
results_df.to_csv(os.path.join(PROCESSED_DIR, 'modeling_summary_metrics.csv'), index=False)