Model that predicts pollutant concentrations on days with fire

In [None]:
import xarray as xr
import pandas as pd
import numpy as np
from pathlib import Path

# -------------------------------
# CONFIG
# -------------------------------
# Regions for which a dataset is assembled; each name is interpolated into
# the CAMS file names and the fire variable names further down.
regions = ["Portugal", "Spain", "Italy", "Greece"]

# Root directory under which the ERA5 and CAMS files are looked up.
base_path = Path("D:/IPMA")

# ERA5 input files, keyed by the short name of the quantity they contain.
era5_files = dict(
    precip=base_path.joinpath("ERA5/Precipitation/daily_precipitation_stats_1999_2024_regrid.nc"),
    temp=base_path.joinpath("ERA5/Temperature/daily_temperature_stats_regrid.nc"),
    wind=base_path.joinpath("ERA5/UV_wind/daily_wind_speed_stats_regrid.nc"),
)

# Pollutant identifiers; each one selects a "CAMS/<pollutant>_fire_<region>.nc" file.
pollutants = ["co", "no", "no2", "pm2p5", "pm10"]

# -------------------------------
# HELPER: aggregate dataset by region
# -------------------------------
def aggregate_region(ds, region_name, variables, time_dim="time"):
    """
    Collapses the "latitude" and "longitude" dimensions of the requested
    variables and returns the results as one DataFrame.

    For every name in ``variables`` that is present in ``ds`` two columns are
    produced: ``<name>_mean`` and ``<name>_max``. Names that are not present
    in ``ds`` are skipped without an error.

    Parameters:
        ds: mapping-like dataset (an xarray Dataset in this file) that
            supports ``name in ds`` and ``ds[name]``.
        region_name: value written to the constant "region" column.
        variables: iterable of variable names to aggregate.
        time_dim: accepted for interface compatibility; the body does not
            use it.

    Returns:
        pandas.DataFrame with the aggregated columns, a "region" column and
        an index named "date".
    """
    spatial_dims = ("latitude", "longitude")
    columns = {}

    for name in variables:
        if name not in ds:
            continue
        field = ds[name]
        spatial_mean = field.mean(dim=spatial_dims)
        spatial_max = field.max(dim=spatial_dims)
        columns[f"{name}_mean"] = spatial_mean.to_series()
        columns[f"{name}_max"] = spatial_max.to_series()

    frame = pd.DataFrame(columns)
    frame["region"] = region_name
    frame.index.name = "date"
    return frame

# -------------------------------
# STEP 1: Load Meteorology
# -------------------------------
def load_meteorology():
    """
    Loads every file listed in ``era5_files`` and returns one DataFrame of
    spatially aggregated (mean + max) meteorological statistics.

    Columns are named ``<key>_<variable>_<mean|max>`` (for example
    ``temp_Mean_max``), where ``<key>`` is the ``era5_files`` key. The prefix
    is needed because the temperature and wind files share the variable names
    "Mean" and "Max"; without it the concatenated frame would contain
    duplicate column labels. The constant "region" column added by
    ``aggregate_region`` is dropped for the same reason (it would otherwise
    appear once per file).

    Returns:
        pandas.DataFrame holding the per-file column blocks side by side,
        aligned on the index produced by ``aggregate_region``.

    Raises:
        KeyError: if ``era5_files`` contains a key for which no variable list
            is configured below.
    """
    # Variables to aggregate for each era5_files key.
    variables_by_key = {
        "temp": ["Mean", "Max", "Min"],
        "precip": ["Total_Precipitation"],
        "wind": ["Mean", "Max"],
    }

    dfs = []
    for var, path in era5_files.items():
        # Fail loudly instead of reusing the previous iteration's variable
        # list (or hitting an unbound local) for an unrecognised key.
        if var not in variables_by_key:
            raise KeyError(f"No variable list configured for ERA5 key {var!r}")
        variables = variables_by_key[var]

        # The context manager closes the file handle once the aggregated
        # values have been materialised as pandas objects.
        with xr.open_dataset(path) as raw:
            # Convert Year, Month, Day into datetime
            # NOTE(review): "time" is assigned as a coordinate and is then
            # also the name of the stacked (Year, Month, Day) dimension;
            # confirm against the real file layout that both steps succeed.
            times = pd.to_datetime(dict(year=raw["Year"], month=raw["Month"], day=raw["Day"]))
            ds = raw.assign_coords(time=("time", times.values)).stack(time=("Year", "Month", "Day"))

            df = aggregate_region(ds, "ALL", variables)  # global first

        # Make the labels unique across files.
        df = df.drop(columns="region").add_prefix(f"{var}_")
        dfs.append(df)

    meteo_df = pd.concat(dfs, axis=1)
    return meteo_df

# -------------------------------
# STEP 2: Load Pollutants + Fire for a region
# -------------------------------
def load_pollutants_fire(region):
    """
    Loads the CAMS pollutant files of one region and returns their spatially
    aggregated statistics together with the fire variables.

    For every entry of ``pollutants`` the file
    ``CAMS/<pollutant>_fire_<region>.nc`` under ``base_path`` is opened and
    its "Mean" and "Max" variables are aggregated with ``aggregate_region``.
    The resulting columns are prefixed with the pollutant name (for example
    ``co_Mean_max``) because every file uses the same variable names; without
    the prefix the concatenated frame would contain duplicate column labels.
    The constant "region" column added by ``aggregate_region`` is dropped;
    ``build_dataset`` assigns the region after joining.

    The fire variables (``fire_binary_<region>``, ``fire_label_<region>``,
    ``frp_sum_<region>``, ``frp_count_<region>``) are reduced with a mean over
    "latitude" and "longitude" and added unprefixed, taken from the first
    file that contains them.

    Parameters:
        region: region name used in the file names and fire variable names.

    Returns:
        pandas.DataFrame with the per-pollutant column blocks and the fire
        columns side by side.
    """
    variables = ["Mean", "Max"]
    fire_vars = [
        f"fire_binary_{region}",
        f"fire_label_{region}",
        f"frp_sum_{region}",
        f"frp_count_{region}",
    ]

    dfs = []
    fire_seen = set()
    for pol in pollutants:
        path = base_path / f"CAMS/{pol}_fire_{region}.nc"

        # The context manager closes the file handle once everything needed
        # from it has been converted to pandas objects.
        with xr.open_dataset(path) as ds:
            df_pol = aggregate_region(ds, region, variables)
            # Make the labels unique across pollutant files.
            df_pol = df_pol.drop(columns="region").add_prefix(f"{pol}_")

            # Fire variables are looked for in every file, but each one is
            # added only once so the final frame has no duplicate labels.
            for fire_var in fire_vars:
                if fire_var in ds and fire_var not in fire_seen:
                    df_pol[fire_var] = ds[fire_var].mean(dim=("latitude", "longitude")).to_series()
                    fire_seen.add(fire_var)

        dfs.append(df_pol)

    return pd.concat(dfs, axis=1)

# -------------------------------
# STEP 3: Build final dataset
# -------------------------------
def build_dataset():
    """
    Builds the final table: meteorology joined with the pollutant and fire
    data of every region in ``regions``, stacked row-wise.

    For each region the frames are inner-joined on their index, a binary
    "target" column (1 where ``fire_binary_<region>`` is greater than 0,
    else 0) and a "region" column are added, and the index is turned into
    regular columns.

    Any "region" column already present in the inputs is dropped before the
    join, because a label shared by both frames makes ``DataFrame.join``
    raise. Any other label shared by the two frames still makes the join
    raise.

    Returns:
        pandas.DataFrame with the rows of all regions and a fresh 0..n-1
        index.

    Raises:
        KeyError: if the joined frame of a region has no
            ``fire_binary_<region>`` column.
    """
    # Meteorology does not depend on the region, so read the ERA5 files once
    # instead of once per region.
    df_meteo = load_meteorology().drop(columns="region", errors="ignore")

    all_regions = []
    for region in regions:
        df_firepoll = load_pollutants_fire(region).drop(columns="region", errors="ignore")

        # Merge on date
        df = df_meteo.join(df_firepoll, how="inner")

        # Target = fire_binary (0/1)
        fire_col = f"fire_binary_{region}"
        if fire_col not in df.columns:
            raise KeyError(
                f"load_pollutants_fire({region!r}) returned no '{fire_col}' column; "
                "cannot build the target"
            )
        df["target"] = (df[fire_col] > 0).astype(int)

        # Add region name
        df["region"] = region

        all_regions.append(df.reset_index())

    # ignore_index avoids repeating the same 0..n labels for every region.
    return pd.concat(all_regions, axis=0, ignore_index=True)

# -------------------------------
# RUN
# -------------------------------
# Assemble the table, then show its first rows followed by its shape.
df = build_dataset()
print(df.head(), df.shape, sep="\n")
