# In [None]:
# -*- coding: utf-8 -*-
"""
Random Forest – zone-wise (biome-wise) relative importance
Pipeline:
1) Load dependent variable raster (masked to valid range).
2) Load biome classification raster and reproject to match the dependent raster.
3) Load all explanatory rasters, align to the dependent raster, mask to valid pixels,
   and clip each to [1st, 99th] percentile to reduce outliers.
4) Train a RandomForestRegressor to compute normalized feature importances:
   - overall (all pixels)
   - per biome (TBMF, TCF, BF, TGSS, MGS, TUN, MFWS, DXS)
5) Save a CSV with overall and biome-wise importances.
"""

import os
import numpy as np
import pandas as pd
import rioxarray as rxr
from sklearn.ensemble import RandomForestRegressor
from joblib import Parallel, delayed

# -----------------------------
# 0) File inputs (placeholders)
# -----------------------------
# Raster whose pixel values are the regression target (y) of the random forest.
dep_file = r"/path/to/dependent_variable.tif"          # e.g., threshold map (SPEI-triggered SOS delay)
# Raster of biome codes; its codes are interpreted through `biome_map` below.
classification_file = r"/path/to/biome_classification.tif"

# Explanatory variables used (examples across climate, vegetation, soil/structure, and resilience):
#   - Aridity Index (AI), VOD-based resilience proxy, Shortwave radiation (Srad), Vapor pressure deficit (VPD),
#   - Wind speed (Wind), Annual precipitation (PPT), Vegetation species richness (VegSpecies),
#   - NDVI/LAI composite metric (e.g., NDVI_EOS_POS_Difference_LAI), Root/plant hydraulic proxy (rplant),
#   - Pre-season drought/temperature indices (e.g., SPEI, STImin), Mean NDVI (mean_NDVI), Soil organic carbon & N (SOC_N)
#
# NOTE: the order of this list fixes the order of the feature columns fed to the model,
# and each file's basename (without extension) becomes the feature name written to the CSV.
explanatory_files = [
    r"/path/to/AI.tif",
    r"/path/to/VOD_resilience.tif",
    r"/path/to/Srad.tif",
    r"/path/to/VPD.tif",
    r"/path/to/Wind.tif",
    r"/path/to/Annual_PPT.tif",
    r"/path/to/Vegetation_species.tif",
    r"/path/to/NDVI_EOS_POS_Difference_LAI.tif",
    r"/path/to/rplant_proxy.tif",
    r"/path/to/SPEI_STImin.tif",
    r"/path/to/mean_NDVI.tif",
    r"/path/to/SOC_N.tif",
]

# Biome codes -> labels (example mapping)
# Keys are the integer codes compared against the classification raster; values become the
# per-biome column names in the output CSV. Codes not listed here are never analysed.
biome_map = {
    4: "TBMF", 5: "TCF", 6: "BF", 8: "TGSS",
    10: "MGS", 11: "TUN", 12: "MFWS", 13: "DXS",
}

# -----------------------------
# 1) Load dependent variable
# -----------------------------
# Read the dependent raster and drop length-1 dimensions (e.g. a single band axis).
dep = rxr.open_rasterio(dep_file).squeeze()

# Keep only pixels with -10 < value <= -0.5; all other pixels become NaN and are
# therefore excluded from every later step (adjust the bounds as needed for your study).
# NOTE(review): an earlier version of this comment gave the range as (-10, 0]; the code's
# upper bound is -0.5 — confirm which cutoff is intended.
dep = dep.where((dep > -10) & (dep <= -0.5))

# -----------------------------------------
# 2) Load & align biome classification map
# -----------------------------------------
# Put the biome raster on the dependent raster's grid so the two can be compared
# pixel by pixel.
# NOTE(review): no resampling method is passed, so the library default applies
# (presumably nearest-neighbour, which suits categorical codes) — confirm.
biome = rxr.open_rasterio(classification_file).squeeze()
biome = biome.rio.reproject_match(dep)

# ----------------------------------------------------------
# 3) Load, align, mask, and robustly clip explanatory layers
# ----------------------------------------------------------
def load_align_clip(fp, ref_da):
    """Load one explanatory raster, align it to ``ref_da`` and clip outliers.

    The raster is opened with its nodata value masked to NaN, reprojected onto
    the grid of ``ref_da``, blanked wherever ``ref_da`` is null, and finally
    clipped to its own [1st, 99th] percentile range.

    Parameters
    ----------
    fp : str
        Path of the raster file to open with ``rioxarray.open_rasterio``.
    ref_da : xarray.DataArray
        Reference raster; provides the target grid and the valid-pixel mask.

    Returns
    -------
    xarray.DataArray
        The aligned layer, NaN outside the valid pixels of ``ref_da``.
    """
    # masked=True turns the file's declared nodata value into NaN. Without it a
    # sentinel such as -9999 stays finite, skews the percentiles, is clipped up
    # to ``lo`` and ends up in the training data as if it were a real value.
    x = rxr.open_rasterio(fp, masked=True).squeeze()
    x = x.rio.reproject_match(ref_da)

    # Mask to valid dependent pixels
    x = x.where(ref_da.notnull())

    # Robust clip to [1st, 99th] percentile, computed on finite values only
    vals = x.values
    valid = vals[np.isfinite(vals)]
    if valid.size:
        lo, hi = np.percentile(valid, [1, 99])
        # Guard against degenerate lo/hi (e.g. a constant layer): leave as is
        if np.isfinite(lo) and np.isfinite(hi) and lo < hi:
            x = x.clip(min=lo, max=hi)
    return x

# Feature names are the raster basenames without their extension, in list order.
feature_names = [
    os.path.splitext(base)[0]
    for base in map(os.path.basename, explanatory_files)
]

# Load all explanatory layers in parallel, one job per file, each aligned to `dep`.
exps = Parallel(n_jobs=-1)(
    [delayed(load_align_clip)(path, dep) for path in explanatory_files]
)

# ------------------------------------------------
# 4) Utility: normalized RF feature importances
# ------------------------------------------------
def rf_relative_importance(dep_da, exp_list, rnd=42, min_samples=100, n_estimators=500):
    """Return random-forest feature importances normalized to sum to 1.

    Only pixels that are finite in the dependent layer and in every
    explanatory layer are used for training.

    Parameters
    ----------
    dep_da : array container exposing ``.values``
        Dependent variable (regression target).
    exp_list : sequence of array containers exposing ``.values``
        Explanatory layers, each with the same number of elements as
        ``dep_da``. Their order defines the order of the returned values.
    rnd : int, default 42
        Random seed passed to the forest.
    min_samples : int, default 100
        Minimum number of valid pixels required to train; below this the
        result is all-NaN.
    n_estimators : int, default 500
        Number of trees in the forest.

    Returns
    -------
    numpy.ndarray
        One float per explanatory layer. All-NaN when there are too few valid
        pixels or when the importances sum to zero; empty when ``exp_list``
        is empty.
    """
    n_features = len(exp_list)
    if n_features == 0:
        # Nothing to rank; stacking an empty list would raise.
        return np.empty(0, dtype=float)

    y = np.asarray(dep_da.values).ravel()
    columns = [np.asarray(e.values).ravel() for e in exp_list]

    # Build the validity mask layer by layer, so the full (pixels x features)
    # matrix including all the NaN pixels is never materialized.
    mask = np.isfinite(y)
    for col in columns:
        mask &= np.isfinite(col)

    # Avoid training on too few samples (and never on zero samples)
    if int(mask.sum()) < max(min_samples, 1):
        return np.full(n_features, np.nan, dtype=float)

    Xm = np.column_stack([col[mask] for col in columns])
    ym = y[mask]

    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        n_jobs=-1,
        random_state=rnd,
        # Optional: you may set max_features="sqrt" or similar if desired
    )
    rf.fit(Xm, ym)

    imp = np.asarray(rf.feature_importances_, dtype=float)
    total = imp.sum()
    if total > 0:
        return imp / total
    # A zero sum means no feature was ever used for a split; report NaN
    # instead of dividing by zero.
    return np.full_like(imp, np.nan, dtype=float)
# -------------------------------
# 5) Overall relative importance
# -------------------------------
# Importance computed over every valid pixel, regardless of biome.
overall = rf_relative_importance(dep, exps)

# --------------------------------------------
# 6) Biome-wise (zone-wise) relative importance
# --------------------------------------------
# One column per biome label: restrict the dependent layer and every explanatory
# layer to the pixels of that biome, then rank the features again.
result = {"Feature": feature_names, "Overall": overall}
for code, name in biome_map.items():
    zone_mask = biome == code
    result[name] = rf_relative_importance(
        dep.where(zone_mask),
        [layer.where(zone_mask) for layer in exps],
    )

# ----------------
# 7) Save to CSV
# ----------------
# One row per feature; columns are "Feature", "Overall" and one per biome label.
df = pd.DataFrame(result)

# Sort rows by Overall importance, most important first, for readability.
# Sorting on an existing numeric column does not fail (NaN importances are placed
# last), so the sort is done directly rather than inside a blanket
# `except Exception: pass`, which would only hide a genuine problem.
df = df.sort_values(by="Overall", ascending=False)

# "utf-8-sig" writes a UTF-8 byte-order mark at the start of the file.
out_csv = r"./RF_Importance_Zonewise.csv"
df.to_csv(out_csv, index=False, encoding="utf-8-sig")
print(f"✅ Saved overall and biome-wise importances to: {out_csv}")