In [2]:
!pip install tqdm
import os, math
import numpy as np
import pandas as pd
from tqdm import tqdm
import rasterio
from rasterio.transform import from_origin

# ---------------- USER SETTINGS ----------------
# All tunables for this cell live here; everything below reads these names.
CATALOG_CSV = r"D:/Earthquake_Project/Datasets/Earthquake_datasets.csv"  # must provide columns: time, latitude, longitude, depth, mag
OUTPUT_DIR  = r"D:/Earthquake_Project/pga_outputs"  # GeoTIFF and CSV outputs are written here
GRID_STEP_DEG = 1.0          # grid cell size in degrees (1.0° ≈ 111 km). Use 0.5 or 0.25 for finer grids (slower).
MIN_MAG = 4.5                # events below this magnitude are dropped to keep runtime reasonable; set 2.5 if you need all
MAX_DIST_KM = 300.0          # an event only contributes to cells within this epicentral distance
VS30_DEFAULT = 760.0         # rock site proxy; NOTE(review): not referenced anywhere in the visible code
YEARS_RANGE = (1950, 2025)   # historical window, inclusive on both ends
FORECAST_YEARS = [2030, 2050, 2100]  # years for which extrapolated maps are produced
# ------------------------------------------------

# Create the output folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------- LOAD CATALOG ----------------
# Read the catalogue, normalise header names (trimmed, lower-case) and make
# sure every column the pipeline needs is present.
required_cols = ["time", "latitude", "longitude", "depth", "mag"]
eq = pd.read_csv(CATALOG_CSV).rename(columns=lambda c: c.strip().lower())
missing_cols = [c for c in required_cols if c not in eq.columns]
if missing_cols:
    raise ValueError(f"Missing column: {missing_cols[0]}")

# Unparseable timestamps become NaT and are dropped together with any other
# incomplete record.
eq["time"] = pd.to_datetime(eq["time"], errors="coerce")
eq = eq.dropna(subset=required_cols)

# Keep events inside the historical window (inclusive) and at or above MIN_MAG.
event_year = eq["time"].dt.year
keep = (event_year >= YEARS_RANGE[0]) & (event_year <= YEARS_RANGE[1]) & (eq["mag"] >= MIN_MAG)
eq = eq[keep].copy()
eq["year"] = eq["time"].dt.year.astype(int)

print(f"Events used: {len(eq):,}")

# ---------------- BUILD GLOBAL GRID ----------------
# Cell centres: first centre sits half a step inside the south / west edge.
half_step = GRID_STEP_DEG/2
lats = np.arange(-90 + half_step, 90, GRID_STEP_DEG)
lons = np.arange(-180 + half_step, 180, GRID_STEP_DEG)
nlat = len(lats)
nlon = len(lons)
print(f"Grid: {nlat} x {nlon} cells (step {GRID_STEP_DEG}°)")

# One time slot per calendar year of the historical window (inclusive).
years = np.arange(YEARS_RANGE[0], YEARS_RANGE[1]+1, dtype=int)
ny = len(years)
year_to_idx = dict(zip(years, range(ny)))

# store annual MAX PGA per cell (in g); zero means "no contributing event"
pga_max = np.zeros((ny, nlat, nlon), dtype=np.float32)

# quick look-up: (floor(lat), floor(lon)) -> list of (row, col) cells whose
# centre falls inside that 1° x 1° integer bin
lat_bins = np.floor(lats).astype(int)
lon_bins = np.floor(lons).astype(int)
bin_index = {}
for row_i, col_j in np.ndindex(nlat, nlon):
    key = (lat_bins[row_i], lon_bins[col_j])
    bin_index.setdefault(key, []).append((row_i, col_j))

# ---------------- HELPERS ----------------
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between (lat1, lon1) and (lat2, lon2).

    All inputs are in degrees. The arithmetic is plain NumPy, so array
    arguments broadcast and an array of distances comes back. The Earth is
    treated as a sphere of radius 6371 km.
    """
    rad = np.pi/180.0
    phi1 = lat1*rad
    phi2 = lat2*rad
    half_dphi = (lat2 - lat1)*rad/2.0
    half_dlam = (lon2 - lon1)*rad/2.0
    hav = np.sin(half_dphi)**2 + np.cos(phi1)*np.cos(phi2)*np.sin(half_dlam)**2
    # Clip guards against rounding pushing the haversine term outside [0, 1].
    central_angle = np.arcsin(np.sqrt(np.clip(hav, 0, 1)))
    return 6371.0 * 2.0 * central_angle

# Simple global GMPE-like proxy (OK for big-picture maps; not a design PSHA):
#   ln(PGA[g]) = c0 + c1*M - c2*ln(R + h)
# R is the hypocentral-style distance sqrt(epicentral_distance^2 + depth^2)
# supplied by the caller; h keeps the logarithm finite at R = 0.
# Coefficients tuned to produce reasonable g-values at global scale for rock sites.
c0, c1, c2, h = -1.5, 0.9, 1.1, 10.0

def pga_g_from_M_Rdepth(Mw, R_hyp_km):
    """Return the proxy PGA (in g) for magnitude Mw at distance R_hyp_km.

    Evaluates exp(c0 + c1*Mw - c2*ln(R_hyp_km + h)) with NumPy, so scalar or
    array inputs both work.
    """
    attenuation = c2*np.log(R_hyp_km + h)
    return np.exp(c0 + c1*Mw - attenuation)

# ---------------- AGGREGATE PER EVENT ----------------
# For every event, find the grid cells within MAX_DIST_KM and raise that
# year's per-cell maximum PGA where this event produces a larger value.
ddeg = MAX_DIST_KM / 111.0           # latitude half-width of the search box (~degrees)
ang_radius = MAX_DIST_KM / 6371.0    # same reach as a central angle (radians) on the haversine sphere

# Iterate over plain column arrays: far cheaper than building a Series per row.
event_columns = zip(
    eq["year"].to_numpy(),
    eq["latitude"].to_numpy(dtype=float),
    eq["longitude"].to_numpy(dtype=float),
    eq["depth"].to_numpy(dtype=float),
    eq["mag"].to_numpy(dtype=float),
)

for y, ev_lat, ev_lon, ev_dep, Mw in tqdm(event_columns, total=len(eq), desc="Computing annual max PGA per cell"):
    iy = year_to_idx[int(y)]
    ev_lat = float(ev_lat); ev_lon = float(ev_lon); Mw = float(Mw)
    dep = max(0.0, float(ev_dep))   # negative depths are treated as surface events

    # candidate latitude bins
    lat_lo = math.floor(ev_lat - ddeg); lat_hi = math.floor(ev_lat + ddeg)

    # Candidate longitude bins. A fixed distance spans more degrees of
    # longitude away from the equator, so the half-width is derived from the
    # event latitude; if the search circle reaches a pole every longitude is a
    # candidate. Bins are wrapped into [-180, 180) so events near the
    # antimeridian also reach the cells on the other side.
    lat_rad = math.radians(ev_lat)
    if abs(lat_rad) + ang_radius >= math.pi/2:
        lon_keys = range(-180, 180)
    else:
        dlon = math.degrees(math.asin(min(1.0, math.sin(ang_radius)/math.cos(lat_rad))))
        lon_keys = {
            ((lo + 180) % 360) - 180
            for lo in range(math.floor(ev_lon - dlon), math.floor(ev_lon + dlon) + 1)
        }

    cand_idx = []
    for la in range(lat_lo, lat_hi+1):
        for lo in lon_keys:
            cells = bin_index.get((la, lo))
            if cells:
                cand_idx.extend(cells)
    if not cand_idx:
        continue

    # compute great-circle distance only for these candidates
    ci, cj = zip(*cand_idx)
    ci = np.array(ci); cj = np.array(cj)
    dist_km = haversine_km(ev_lat, ev_lon, lats[ci], lons[cj])
    mask = dist_km <= MAX_DIST_KM
    if not np.any(mask):
        continue

    ci = ci[mask]
    cj = cj[mask]
    R_hyp = np.sqrt(dist_km[mask]**2 + dep**2)

    pga_vals = pga_g_from_M_Rdepth(Mw, R_hyp)

    # update annual MAX per cell
    pga_max[iy, ci, cj] = np.maximum(pga_max[iy, ci, cj], pga_vals.astype(np.float32))

# ---------------- HISTORICAL SUMMARY (1950–2025) ----------------
# long-term max per cell (useful as base hazard layer)
pga_hist_max = np.nanmax(pga_max, axis=0)

# ---------------- PER-CELL TREND & FORECAST ----------------
# Log-linear fit of annual max PGA against year, using only the years in which
# the cell actually recorded shaking. Years with no contributing event are
# stored as 0 and carry no amplitude information, so they are excluded from
# the fit instead of being replaced by an arbitrary floor value.
MIN_FIT_YEARS = 10
x = years.astype(float)
x_mid = x.mean()   # centre the abscissa so the fit is well conditioned

# prepare output rasters
fc_maps = {yr: np.zeros((nlat, nlon), dtype=np.float32) for yr in FORECAST_YEARS}

for i in range(nlat):
    row = pga_max[:, i, :]  # shape (ny, nlon)
    # only visit longitudes with at least one non-zero year; others stay zero
    for j in np.flatnonzero(row.any(axis=0)):
        ts = row[:, j]
        valid = ts > 0
        if valid.sum() < MIN_FIT_YEARS:
            # not enough years to fit; use historical max as flat forecast
            flat_value = ts.max()
            for yr in FORECAST_YEARS:
                fc_maps[yr][i, j] = flat_value
            continue
        # linear fit log10(pga) = a*(year - x_mid) + b
        a, b = np.polyfit(x[valid] - x_mid, np.log10(ts[valid]), 1)
        for yr in FORECAST_YEARS:
            yhat = a*(yr - x_mid) + b
            fc_maps[yr][i, j] = float(np.power(10.0, yhat))

# ---------------- WRITE OUTPUTS (GeoTIFF + CSV) ----------------
def write_geotiff(path, arr2d, dtype="float32"):
    """Write a 2-D grid to `path` as a single-band, LZW-compressed GeoTIFF.

    The raster is georeferenced in EPSG:4326 from the module-level `lats` /
    `lons` cell centres and GRID_STEP_DEG. Rows of `arr2d` are flipped
    vertically before writing so that the first raster row is the northernmost
    one (the grid latitudes are built in ascending order).
    """
    step = GRID_STEP_DEG
    west_edge = lons.min() - step/2
    north_edge = lats.max() + step/2
    profile = {
        "driver": "GTiff",
        "height": arr2d.shape[0],
        "width": arr2d.shape[1],
        "count": 1,
        "dtype": dtype,
        "crs": "EPSG:4326",
        "transform": from_origin(west_edge, north_edge, step, step),
        "compress": "lzw",
    }
    with rasterio.open(path, "w", **profile) as dst:
        north_up = np.flipud(arr2d.astype(dtype))
        dst.write(north_up, 1)

# Historical base map
hist_tif = os.path.join(OUTPUT_DIR, "PGA_hist_max_1950_2025.tif")
write_geotiff(hist_tif, pga_hist_max)

# Forecast maps: one GeoTIFF per forecast year; paths are kept for the summary
fc_tifs = []
for yr, grid in ((yr, fc_maps[yr]) for yr in FORECAST_YEARS):
    tif = os.path.join(OUTPUT_DIR, f"PGA_forecast_{yr}.tif")
    write_geotiff(tif, grid)
    fc_tifs.append(tif)

# Also save CSVs (grid centers)
def save_grid_csv(path, arr2d, colname):
    """Dump a grid to CSV with one row per cell: lat, lon, <colname>.

    Coordinates are the module-level `lats` / `lons` cell centres. Rows are in
    row-major order (latitude varies slowest), matching `arr2d.ravel()`.
    """
    lat_col = np.repeat(lats, len(lons))   # each latitude repeated across its row
    lon_col = np.tile(lons, len(lats))     # longitudes cycle within every row
    table = pd.DataFrame({"lat": lat_col, "lon": lon_col, colname: arr2d.ravel()})
    table.to_csv(path, index=False)

# CSV twins of the rasters: historical base map first, then each forecast year.
csv_jobs = [("PGA_hist_max_1950_2025.csv", pga_hist_max)]
csv_jobs += [(f"PGA_forecast_{yr}.csv", fc_maps[yr]) for yr in FORECAST_YEARS]
for csv_name, csv_grid in csv_jobs:
    save_grid_csv(os.path.join(OUTPUT_DIR, csv_name), csv_grid, "pga_g")

# Run summary
print("\nOutputs:")
print("Base map:", hist_tif)
for tif_path in fc_tifs:
    print("Forecast:", tif_path)
print("CSVs saved alongside the GeoTIFFs.")


Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Events used: 284,368
Grid: 180 x 360 cells (step 1.0°)


Computing annual max PGA per cell: 100%|█████████████████████████████████████| 284368/284368 [01:00<00:00, 4713.48it/s]



Outputs:
Base map: D:/Earthquake_Project/pga_outputs\PGA_hist_max_1950_2025.tif
Forecast: D:/Earthquake_Project/pga_outputs\PGA_forecast_2030.tif
Forecast: D:/Earthquake_Project/pga_outputs\PGA_forecast_2050.tif
Forecast: D:/Earthquake_Project/pga_outputs\PGA_forecast_2100.tif
CSVs saved alongside the GeoTIFFs.


In [7]:
import os, math, warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import rasterio
from rasterio.transform import from_origin
import geopandas as gpd

warnings.filterwarnings("ignore")

# ================== CONFIG ==================
# Inputs / outputs
CATALOG_CSV       = r"D:/Earthquake_Project/Datasets/Earthquake_datasets.csv"
ACTIVE_FAULTS_MARGINS_SHP = r"D:/Earthquake_Project/Datasets/Active faults/gem_active_faults.shp"
OUT_DIR           = r"D:/Earthquake_Project/pga_outputs2"
# Grid and catalogue filtering
GRID_STEP_DEG     = 1.0        # grid cell size in degrees
MIN_MAG           = 4.5        # events below this magnitude are dropped
MAX_DIST_KM       = 300.0      # an event only contributes to cells within this distance
YEARS_RANGE       = (1950, 2025)  # historical window, inclusive on both ends
FORECAST_YEARS    = [2030, 2050, 2100]
# Modelling
LAGS              = 5          # number of previous annual values used as features. Set to 3 if RAM is an issue
SEED              = 42         # random_state passed to the tree models
C0, C1, C2, H     = -1.5, 0.9, 1.1, 10.0     # GMPE-like proxy: ln(PGA) = C0 + C1*M - C2*ln(R + H)
ACTIVE_CUTOFF     = 1e-6       # Keep cells whose HIST max PGA > this
# ============================================

# Create the output folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUT_DIR, exist_ok=True)

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two points given in degrees.

    Plain NumPy arithmetic, so array arguments broadcast. Uses a spherical
    Earth of radius 6371 km.
    """
    deg_to_rad = np.pi / 180.0
    phi_a = lat1*deg_to_rad
    phi_b = lat2*deg_to_rad
    d_phi = (lat2-lat1)*deg_to_rad
    d_lam = (lon2-lon1)*deg_to_rad
    lat_term = np.sin(d_phi/2)**2
    lon_term = np.cos(phi_a) * np.cos(phi_b) * np.sin(d_lam/2)**2
    # Clip guards against rounding pushing the haversine term outside [0, 1].
    hav = np.clip(lat_term + lon_term, 0, 1)
    return 6371.0 * 2 * np.arcsin(np.sqrt(hav))

def pga_proxy(Mw, R_hyp_km):
    """Return the proxy PGA for magnitude Mw at distance R_hyp_km.

    Evaluates exp(C0 + C1*Mw - C2*ln(R_hyp_km + H)) using the module-level
    coefficients; H keeps the logarithm finite at zero distance.
    """
    distance_term = C2 * np.log(R_hyp_km + H)
    return np.exp(C0 + C1 * Mw - distance_term)

def write_geotiff(path, arr2d, lats, lons, dtype="float32"):
    """Write a 2-D grid to `path` as a single-band, LZW-compressed GeoTIFF.

    `lats` / `lons` are the cell-centre coordinates; the pixel size is taken
    from the spacing of their first two entries and the raster is tagged
    EPSG:4326. Rows are flipped vertically before writing so the first raster
    row is the northernmost one.
    """
    pixel_w = lons[1] - lons[0]
    pixel_h = lats[1] - lats[0]
    # Upper-left corner of the raster = outer edge of the north-west cell.
    geo_transform = from_origin(lons.min() - pixel_w/2, lats.max() + pixel_h/2, pixel_w, pixel_h)
    profile = {
        "driver": "GTiff",
        "height": arr2d.shape[0],
        "width": arr2d.shape[1],
        "count": 1,
        "dtype": dtype,
        "crs": "EPSG:4326",
        "transform": geo_transform,
        "compress": "lzw",
    }
    with rasterio.open(path, "w", **profile) as dst:
        dst.write(np.flipud(arr2d.astype(dtype)), 1)

def array_to_csv(path, arr2d, lats, lons, colname="pga_g"):
    """Dump a grid to CSV with one row per cell: lat, lon, <colname>.

    Rows are in row-major order (latitude varies slowest), matching
    `arr2d.ravel()`. No index column is written.
    """
    lat_col = np.repeat(lats, len(lons))   # each latitude repeated across its row
    lon_col = np.tile(lons, len(lats))     # longitudes cycle within every row
    table = pd.DataFrame({"lat": lat_col, "lon": lon_col, colname: arr2d.ravel()})
    table.to_csv(path, index=False)

def evaluate(y_true, y_pred, name):
    """Print and return MAE, RMSE and R² of `y_pred` against `y_true`.

    Both arrays are cast to float32 before scoring. The returned dict has the
    keys "model", "MAE", "RMSE" and "R2".
    """
    observed = y_true.astype(np.float32)
    predicted = y_pred.astype(np.float32)
    scores = {
        "model": name,
        "MAE": mean_absolute_error(observed, predicted),
        "RMSE": np.sqrt(mean_squared_error(observed, predicted)),
        "R2": r2_score(observed, predicted),
    }
    print(f"{name} -> MAE: {scores['MAE']:.6f} | RMSE: {scores['RMSE']:.6f} | R²: {scores['R2']:.4f}")
    return scores

# Load single shapefile for faults and margins
faults_margins = gpd.read_file(ACTIVE_FAULTS_MARGINS_SHP)

def nearest_distance_to_geometry(lat, lon, geometry_gdf):
    """Approximate distance in km from (lat, lon) to the closest geometry.

    The distance is measured by geopandas in the layer's own coordinate units
    and multiplied by 111 (rough km per degree), so it is only a coarse proxy.
    Returns NaN when `geometry_gdf` has no rows.
    """
    site = gpd.points_from_xy([lon], [lat])[0]
    km_to_each = geometry_gdf.distance(site) * 111  # Convert degrees to km (approx.)
    if km_to_each.empty:
        return np.nan
    return km_to_each.min()

# 1) Load catalog
# Normalise header names (trimmed, lower-case) and verify the required columns.
required_cols = ["time", "latitude", "longitude", "depth", "mag"]
eq = pd.read_csv(CATALOG_CSV).rename(columns=lambda c: c.strip().lower())
missing_cols = [c for c in required_cols if c not in eq.columns]
if missing_cols:
    raise ValueError(f"Missing column: {missing_cols[0]}")

# Unparseable timestamps become NaT and are dropped with other incomplete rows.
eq["time"] = pd.to_datetime(eq["time"], errors="coerce")
eq = eq.dropna(subset=required_cols)

# Keep events inside the historical window (inclusive) and at or above MIN_MAG.
event_year = eq["time"].dt.year
keep = (event_year >= YEARS_RANGE[0]) & (event_year <= YEARS_RANGE[1]) & (eq["mag"] >= MIN_MAG)
eq = eq[keep].copy()
eq["year"] = eq["time"].dt.year.astype(int)
print(f"Events used: {len(eq):,}")

# Add distance feature to earthquake data
# The row-wise apply() measured every fault geometry once per event, which
# does not finish in reasonable time for a few hundred thousand events. A
# single nearest-neighbour join uses the spatial index and yields the same
# planar distance (layer units, scaled by ~111 km per degree).
if hasattr(gpd, "sjoin_nearest"):
    _event_pts = gpd.GeoDataFrame(
        index=eq.index,
        geometry=gpd.points_from_xy(eq["longitude"], eq["latitude"]),
        crs=faults_margins.crs,
    )
    _nearest = gpd.sjoin_nearest(
        _event_pts,
        faults_margins[[faults_margins.geometry.name]],
        how="left",
        distance_col="_dist_deg",
    )
    # Ties produce several rows per event; keep the minimum per event and put
    # the result back in catalogue order (events without a match stay NaN).
    eq["dist_to_fault_margin_km"] = (
        _nearest.groupby(level=0)["_dist_deg"].min().reindex(eq.index).to_numpy() * 111
    )
else:
    # Older geopandas without sjoin_nearest: fall back to the per-event scan.
    eq["dist_to_fault_margin_km"] = eq.apply(lambda row: nearest_distance_to_geometry(row["latitude"], row["longitude"], faults_margins), axis=1)

# 2) Grid
# Cell centres: first centre sits half a step inside the south / west edge.
half_step = GRID_STEP_DEG/2
lats = np.arange(-90 + half_step, 90, GRID_STEP_DEG)
lons = np.arange(-180 + half_step, 180, GRID_STEP_DEG)
nlat = len(lats)
nlon = len(lons)

# One time slot per calendar year of the historical window (inclusive).
years = np.arange(YEARS_RANGE[0], YEARS_RANGE[1]+1, dtype=int)
ny = len(years)
year_to_idx = dict(zip(years, range(ny)))
print(f"Grid: {nlat} x {nlon} cells")

# Look-up: (floor(lat), floor(lon)) -> list of (row, col) cells whose centre
# falls inside that integer-degree bin.
lat_bins = np.floor(lats).astype(int)
lon_bins = np.floor(lons).astype(int)
bin_index = {}
for row_i, col_j in np.ndindex(nlat, nlon):
    key = (lat_bins[row_i], lon_bins[col_j])
    bin_index.setdefault(key, []).append((row_i, col_j))

# 3) Annual max PGA per cell
# For every event, find the grid cells within MAX_DIST_KM and raise that
# year's per-cell maximum where this event produces a larger proxy PGA.
pga_year = np.zeros((ny, nlat, nlon), dtype=np.float32)
ddeg = MAX_DIST_KM / 111.0           # latitude half-width of the search box (~degrees)
ang_radius = MAX_DIST_KM / 6371.0    # same reach as a central angle (radians) on the haversine sphere

# Iterate over plain column arrays: far cheaper than building a Series per row.
event_columns = zip(
    eq["year"].to_numpy(),
    eq["latitude"].to_numpy(dtype=float),
    eq["longitude"].to_numpy(dtype=float),
    eq["depth"].to_numpy(dtype=float),
    eq["mag"].to_numpy(dtype=float),
)
for y, ev_lat, ev_lon, ev_dep, Mw in event_columns:
    iy = year_to_idx[int(y)]
    ev_lat = float(ev_lat); ev_lon = float(ev_lon); Mw = float(Mw)
    dep = max(0.0, float(ev_dep))   # negative depths are treated as surface events

    lat_lo = math.floor(ev_lat - ddeg); lat_hi = math.floor(ev_lat + ddeg)

    # Longitude window: a fixed distance spans more degrees of longitude away
    # from the equator, so the half-width depends on the event latitude. If
    # the search circle reaches a pole every longitude is a candidate. Bins
    # are wrapped into [-180, 180) so events near the antimeridian also reach
    # the cells on the other side.
    lat_rad = math.radians(ev_lat)
    if abs(lat_rad) + ang_radius >= math.pi/2:
        lon_keys = range(-180, 180)
    else:
        dlon = math.degrees(math.asin(min(1.0, math.sin(ang_radius)/math.cos(lat_rad))))
        lon_keys = {
            ((lo + 180) % 360) - 180
            for lo in range(math.floor(ev_lon - dlon), math.floor(ev_lon + dlon) + 1)
        }

    cand = []
    for la in range(lat_lo, lat_hi+1):
        for lo in lon_keys:
            cells = bin_index.get((la, lo))
            if cells:
                cand.extend(cells)
    if not cand: continue

    ci, cj = zip(*cand)
    ci = np.array(ci); cj = np.array(cj)
    dist = haversine_km(ev_lat, ev_lon, lats[ci], lons[cj])
    mask = dist <= MAX_DIST_KM
    if not np.any(mask): continue
    ci = ci[mask]; cj = cj[mask]
    R_hyp = np.sqrt(dist[mask]**2 + dep**2)
    pga_v = pga_proxy(Mw, R_hyp).astype(np.float32)
    pga_year[iy, ci, cj] = np.maximum(pga_year[iy, ci, cj], pga_v)

# Historical base & active mask
# Per-cell maximum over all years, exported as both raster and table.
hist_max = np.nanmax(pga_year, axis=0).astype(np.float32)
hist_stem = os.path.join(OUT_DIR, "PGA_hist_max_1950_2025")
write_geotiff(hist_stem + ".tif", hist_max, lats, lons)
array_to_csv(hist_stem + ".csv", hist_max, lats, lons)

# A cell is "active" when its historical maximum exceeds ACTIVE_CUTOFF.
active_mask = hist_max > ACTIVE_CUTOFF
n_active = int(active_mask.sum())
print("Active cells:", n_active, "of", nlat*nlon)

# 4) Long format + lags (active cells only) with distance features
LAT, LON = np.meshgrid(lats, lons, indexing="ij")
lat_grid, lon_grid = LAT.ravel(), LON.ravel()

# Distance to nearest fault/margin per grid cell. It does not depend on the
# year, so it is computed once instead of once per year, and only for active
# cells: inactive cells get NaN PGA below and are dropped from the table.
dist_fault_margin = np.full((nlat, nlon), np.nan, dtype=float)
for i, j in zip(*np.nonzero(active_mask)):
    dist_fault_margin[i, j] = nearest_distance_to_geometry(lats[i], lons[j], faults_margins)
dist_fault_margin_flat = dist_fault_margin.ravel()

rows = []
for yi, y in enumerate(years):
    # NaN marks inactive cells so dropna() removes them after concatenation
    arr = np.where(active_mask, pga_year[yi], np.nan)
    rows.append(pd.DataFrame({
        "year": y,
        "lat": lat_grid,
        "lon": lon_grid,
        "pga_g": arr.ravel(),
        "dist_to_fault_margin_km": dist_fault_margin_flat
    }))
long_df = pd.concat(rows, ignore_index=True).dropna(subset=["pga_g"])

# Lag features: value of the same cell k years earlier (NaN for the first k years)
long_df = long_df.sort_values(["lat", "lon", "year"])
for k in range(1, LAGS+1):
    long_df[f"pga_lag{k}"] = long_df.groupby(["lat", "lon"])["pga_g"].shift(k)
# Modelling table: only rows with every feature available
ml_df = long_df.dropna().reset_index(drop=True)

# 5) Time-aware split
# Train on everything up to and including 2015, validate on 2016-2025.
is_train = ml_df["year"] <= 2015
is_valid = (ml_df["year"] > 2015) & (ml_df["year"] <= 2025)
train = ml_df[is_train].copy()
valid = ml_df[is_valid].copy()

lag_features = [f"pga_lag{k}" for k in range(1, LAGS+1)]
FEATURES = lag_features + ["lat", "lon", "year", "dist_to_fault_margin_km"]
TARGET = "pga_g"

# Downcast to float32 to cut memory
X_tr, y_tr = train[FEATURES].to_numpy(dtype=np.float32), train[TARGET].to_numpy(dtype=np.float32)
X_va, y_va = valid[FEATURES].to_numpy(dtype=np.float32), valid[TARGET].to_numpy(dtype=np.float32)

# Validation scores of every model are collected here and saved at the end.
metrics_rows = []

# 6) Random Forest
rf_params = {
    "n_estimators": 150,
    "max_depth": 12,
    "max_features": 0.6,
    "min_samples_split": 4,
    "min_samples_leaf": 2,
    "bootstrap": True,
    "max_samples": 0.6,
    "random_state": SEED,
    "n_jobs": 1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_tr, y_tr)
pred_va_rf = rf.predict(X_va).astype(np.float32)
rf_scores = evaluate(y_va, pred_va_rf, "RandomForest")
metrics_rows.append(rf_scores)

# 7) XGBoost
# Optional model: any failure (missing package or training error) is reported
# and the step is skipped; xgb_available tells later steps whether to use it.
xgb_available = True
try:
    from xgboost import XGBRegressor
    xgb_params = {
        "n_estimators": 400,
        "learning_rate": 0.05,
        "max_depth": 8,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "reg_lambda": 1.0,
        "tree_method": "hist",
        "max_bin": 64,
        "random_state": SEED,
        "objective": "reg:squarederror",
        "n_jobs": 1,
    }
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    pred_va_xgb = xgb.predict(X_va).astype(np.float32)
    xgb_scores = evaluate(y_va, pred_va_xgb, "XGBoost")
    metrics_rows.append(xgb_scores)
except Exception as e:
    xgb_available = False
    print("XGBoost not available, skipping. Reason:", e)

# 8) LSTM
# Optional model. Each sample is a window of LAGS consecutive annual values of
# one cell with two channels per time step: the PGA value and the cell's
# (constant) distance to the nearest fault/margin. Any failure is reported and
# the step is skipped; lstm_available records whether outputs were produced.
lstm_available = True
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models

    train_end_year = 2015   # targets up to this year train the model
    last_hist_year = 2025   # targets after train_end_year up to here validate it

    # Prepare sequences for active cells, and in the same pass remember each
    # cell's last LAGS historical values as the seed of the rolling forecast.
    seqs_X_tr, seqs_y_tr = [], []
    seqs_X_va, seqs_y_va = [], []
    seed_cells, seed_tails, seed_dists = [], [], []
    for (lat, lon), grp in long_df.groupby(["lat", "lon"]):
        g = grp.sort_values("year")
        vals = g["pga_g"].values.astype("float32")
        yrs = g["year"].values
        dist_fault_margin = g["dist_to_fault_margin_km"].iloc[0]  # Use first value as constant per cell

        hist_vals = vals[yrs <= last_hist_year]
        if len(hist_vals) >= LAGS:
            seed_cells.append((lat, lon))
            seed_tails.append(hist_vals[-LAGS:])
            seed_dists.append(dist_fault_margin)

        if len(vals) < (LAGS + 1): continue
        dist_col = np.full(LAGS, dist_fault_margin, dtype=np.float32)
        for t in range(LAGS, len(vals)):
            feat_seq = np.column_stack([vals[t-LAGS:t], dist_col])
            target = vals[t]
            if yrs[t] <= train_end_year:
                seqs_X_tr.append(feat_seq); seqs_y_tr.append(target)
            elif yrs[t] <= last_hist_year:
                seqs_X_va.append(feat_seq); seqs_y_va.append(target)

    if len(seqs_X_tr) and len(seqs_X_va):
        X_tr_seq = np.array(seqs_X_tr, dtype=np.float32)
        y_tr_seq = np.array(seqs_y_tr, dtype=np.float32)
        X_va_seq = np.array(seqs_X_va, dtype=np.float32)
        y_va_seq = np.array(seqs_y_va, dtype=np.float32)

        model = models.Sequential([
            layers.Input(shape=(LAGS, 2)),  # channels: lagged PGA, fault/margin distance
            layers.LSTM(48, return_sequences=False),
            layers.Dense(1)
        ])
        model.compile(optimizer="adam", loss="mae")
        model.fit(X_tr_seq, y_tr_seq, epochs=5, batch_size=256, verbose=1)

        lstm_pred_va = model.predict(X_va_seq, verbose=0).ravel().astype(np.float32)
        metrics_rows.append(evaluate(y_va_seq, lstm_pred_va, "LSTM"))

        # Rolling forecast for LSTM.
        # All cells advance together, one batched predict() per forecast year,
        # and the input keeps the same two channels the model was trained on.
        # The roll is continuous: it moves forward from the last historical
        # year and a snapshot is taken whenever a target year is reached, so
        # every target year is reached with the correct number of steps.
        lstm_fc_grids = {yr: np.full((len(lats), len(lons)), np.nan, dtype=np.float32) for yr in FORECAST_YEARS}
        if seed_cells:
            lat_pos = {v: k for k, v in enumerate(lats)}
            lon_pos = {v: k for k, v in enumerate(lons)}
            rows_i = np.array([lat_pos[lat] for lat, _ in seed_cells])
            cols_j = np.array([lon_pos[lon] for _, lon in seed_cells])

            pga_tail = np.stack(seed_tails).astype(np.float32)          # (n_cells, LAGS), oldest -> newest
            dist_chan = np.repeat(np.asarray(seed_dists, dtype=np.float32)[:, None], LAGS, axis=1)

            current_year = last_hist_year
            for yr in sorted(FORECAST_YEARS):
                while current_year < yr:
                    x_in = np.stack([pga_tail, dist_chan], axis=-1)    # (n_cells, LAGS, 2)
                    step_pred = model.predict(x_in, batch_size=4096, verbose=0).ravel().astype(np.float32)
                    # slide the window: drop the oldest value, append the prediction
                    pga_tail = np.concatenate([pga_tail[:, 1:], step_pred[:, None]], axis=1)
                    current_year += 1
                lstm_fc_grids[yr][rows_i, cols_j] = pga_tail[:, -1]

        for yr, grid in lstm_fc_grids.items():
            write_geotiff(os.path.join(OUT_DIR, f"PGA_LSTM_{yr}.tif"), grid, lats, lons)
            array_to_csv(os.path.join(OUT_DIR, f"PGA_LSTM_{yr}.csv"), grid, lats, lons)
    else:
        # No maps were written, so the final summary must not advertise them.
        lstm_available = False
        print("Not enough sequences for LSTM; skipping.")
except Exception as e:
    lstm_available = False
    print("LSTM step skipped. Reason:", e)

# 9) Rolling forecast for RF and XGB
def roll_forecast(model, start_year=2025, target_years=FORECAST_YEARS):
    """Roll a fitted lag-feature regressor forward and return forecast grids.

    Starting from each cell's last LAGS values up to `start_year`, the model
    predicts one year at a time; every prediction is fed back as the newest
    lag. The roll is continuous across `target_years`: a snapshot of the most
    recent value per cell is taken whenever a target year is reached.

    Parameters
    ----------
    model : fitted estimator exposing `predict(X)` for columns in FEATURES order.
    start_year : last year whose values are taken from `long_df`.
    target_years : years for which a grid is returned.

    Returns
    -------
    dict mapping each target year to a float32 array of shape
    (len(lats), len(lons)); cells absent from `long_df` are NaN. Cells with
    fewer than LAGS historical values cannot be rolled and keep their last
    observed value.
    """
    hist = long_df[long_df["year"] <= start_year].sort_values(["lat", "lon", "year"])

    lat_pos = {v: k for k, v in enumerate(lats)}
    lon_pos = {v: k for k, v in enumerate(lons)}

    cell_lat, cell_lon, cell_i, cell_j, tails, dists = [], [], [], [], [], []
    frozen = []   # (row, col, value) for cells with too little history to roll
    for (lat, lon), g in hist.groupby(["lat", "lon"], sort=False):
        i, j = lat_pos.get(lat), lon_pos.get(lon)
        if i is None or j is None:
            continue   # not a grid cell: nothing to write for it
        vals = g["pga_g"].to_numpy(dtype=np.float32)
        if len(vals) >= LAGS:
            cell_lat.append(lat); cell_lon.append(lon)
            cell_i.append(i); cell_j.append(j)
            tails.append(vals[-LAGS:])
            dists.append(g["dist_to_fault_margin_km"].iloc[0])
        else:
            frozen.append((i, j, vals[-1]))

    n_cells = len(tails)
    if n_cells:
        tail_arr = np.stack(tails)                      # (n_cells, LAGS), oldest -> newest
        lat_arr = np.asarray(cell_lat, dtype=np.float32)
        lon_arr = np.asarray(cell_lon, dtype=np.float32)
        dist_arr = np.asarray(dists, dtype=np.float32)
        rows_i = np.asarray(cell_i); cols_j = np.asarray(cell_j)

    preds = {}
    year_now = start_year
    for yr in sorted(target_years):
        while n_cells and year_now < yr:
            year_now += 1
            # lag k is the value k steps back, i.e. column -k of the window
            cols = {f"pga_lag{k}": tail_arr[:, -k] for k in range(1, LAGS + 1)}
            cols.update({
                "lat": lat_arr,
                "lon": lon_arr,
                "year": np.full(n_cells, year_now, dtype=np.float32),
                "dist_to_fault_margin_km": dist_arr,
            })
            F = pd.DataFrame(cols)[FEATURES].astype(np.float32).values
            yhat = np.asarray(model.predict(F), dtype=np.float32).ravel()
            # slide the window: drop the oldest value, append the prediction
            tail_arr = np.concatenate([tail_arr[:, 1:], yhat[:, None]], axis=1)

        grid = np.full((len(lats), len(lons)), np.nan, dtype=np.float32)
        if n_cells:
            grid[rows_i, cols_j] = tail_arr[:, -1]
        for i, j, last_val in frozen:
            grid[i, j] = last_val
        preds[yr] = grid
    return preds

def _export_forecast(prefix, grids):
    """Write every forecast grid of one model as PGA_<prefix>_<year>.tif/.csv."""
    for yr, grid in grids.items():
        stem = os.path.join(OUT_DIR, f"PGA_{prefix}_{yr}")
        write_geotiff(stem + ".tif", grid, lats, lons)
        array_to_csv(stem + ".csv", grid, lats, lons)

# XGBoost forecasts only exist when the model was trained successfully.
if xgb_available:
    xgb_fc = roll_forecast(xgb)
    _export_forecast("XGB", xgb_fc)

rf_fc = roll_forecast(rf)
_export_forecast("RF", rf_fc)

# 10) Save metrics
# One row per evaluated model, written next to the forecast files.
metrics_path = os.path.join(OUT_DIR, "model_metrics_validation_2016_2025.csv")
metrics_df = pd.DataFrame(metrics_rows)
metrics_df.to_csv(metrics_path, index=False)

# Run summary: list only the outputs of models that were actually available.
summary_lines = [
    "\nSaved metrics to: " + metrics_path,
    "Forecast files saved in: " + OUT_DIR,
    "Expected outputs include:",
    "  - PGA_RF_{2030,2050,2100}.tif / .csv",
]
if xgb_available:
    summary_lines.append("  - PGA_XGB_{2030,2050,2100}.tif / .csv")
if lstm_available:
    summary_lines.append("  - PGA_LSTM_{2030,2050,2100}.tif / .csv")
for line in summary_lines:
    print(line)

Events used: 284,368


KeyboardInterrupt: 