In [1]:
from pathlib import Path
import re, itertools, math
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# --- Repository layout ---
# All locations are anchored on the current working directory; data and
# outputs live one level above it.
REPO_ROOT = Path.cwd().resolve()
DATA_ROOT = REPO_ROOT.joinpath("..", "data", "CARMINE-T2.4").resolve()
TABLE_DIR = REPO_ROOT.joinpath("..", "outputs", "tables").resolve()
FIG_DIR = REPO_ROOT.joinpath("..", "outputs", "figures").resolve()

# Output folders: "coverage" products and "compare" products, tables and figures each.
OUT_TBL_COV = TABLE_DIR.joinpath("obs_coverage").resolve()
OUT_FIG_COV = FIG_DIR.joinpath("obs_coverage").resolve()
OUT_TBL_CMP = TABLE_DIR.joinpath("obs_compare").resolve()
OUT_FIG_CMP = FIG_DIR.joinpath("obs_compare").resolve()

# Create every output folder up front (no error if it already exists).
for _out_dir in [OUT_TBL_COV, OUT_FIG_COV, OUT_TBL_CMP, OUT_FIG_CMP]:
    _out_dir.mkdir(parents=True, exist_ok=True)

# Well-known table files written/read by later cells.
READY = OUT_TBL_COV.joinpath("obs_compare_ready.csv")
INV = OUT_TBL_COV.joinpath("obs_inventory.csv")
STATS = OUT_TBL_CMP.joinpath("obs_compare_stats.csv")

In [2]:
# Environment check: show which Python interpreter this kernel is running.
import sys
print(sys.executable)

C:\Users\localadmin\miniconda3\envs\carmine\python.exe


In [3]:
# Environment check: importing xesmf here fails early if the package is missing
# (align_on imports it lazily for non-rectilinear grids).
import importlib
importlib.import_module("xesmf")

<module 'xesmf' from 'C:\\Users\\localadmin\\miniconda3\\envs\\carmine\\Lib\\site-packages\\xesmf\\__init__.py'>

In [4]:
# Environment check: print the GDAL_DATA environment variable (None if unset).
import os
print("GDAL_DATA =", os.environ.get("GDAL_DATA"))

GDAL_DATA = C:\Users\localadmin\miniconda3\envs\carmine\Library\share\gdal


In [5]:
# Environment check: import the geospatial stack and report the installed versions.
import geopandas as gpd
import pyogrio, fiona
print("geopandas OK:", gpd.__version__)
print("pyogrio OK:", pyogrio.__version__)
print("fiona OK:", fiona.__version__)

geopandas OK: 1.1.1
pyogrio OK: 0.5.1
fiona OK: 1.9.1


In [6]:
# Same FUA boundary shapefile as used in plot-map_2D.ipynb.
# The location is derived from DATA_ROOT (set in the first cell) instead of a
# user-specific absolute path, so the notebook runs from any checkout.
FUA_SHP = DATA_ROOT / "shapefile" / "UI-boundaries-FUA" / "FUA_Boundaries.shp"
if not FUA_SHP.exists():
    raise FileNotFoundError(f"FUA shapefile not found: {FUA_SHP}")

# Load once (fast + avoids re-reading for every plot) and reproject to EPSG:4326.
FUA_GDF = gpd.read_file(FUA_SHP).to_crs(epsg=4326)

def fua_boundary_for_csa(csa: str):
    """
    Select the rows of ``FUA_GDF`` whose ``FUA_NAME`` equals ``csa``.

    Both sides are compared as strings. Returns the matching GeoDataFrame
    subset, or ``None`` when nothing matches.

    Raises:
        ValueError: if ``FUA_GDF`` has no ``FUA_NAME`` column.
    """
    if "FUA_NAME" in FUA_GDF.columns:
        name_matches = FUA_GDF["FUA_NAME"].astype(str) == str(csa)
        subset = FUA_GDF[name_matches]
        if subset.empty:
            return None
        return subset
    raise ValueError(f"Expected column 'FUA_NAME' in {FUA_SHP.name}, but found: {list(FUA_GDF.columns)}")

In [7]:
import os  # optional; you can remove this too if you switch fully to Path
from pathlib import Path

def resolve_relpath(relpath: str, data_root: Path) -> Path | None:
    """
    Resolve a file path under data_root.
    Fallbacks:
      1) Search by basename inside **/INDICATORS/** (case variants)
      2) Search by basename anywhere, preferring paths containing "INDICATORS"
    Returns absolute Path or None.
    """
    p = (data_root / relpath).resolve()
    if p.exists():
        return p

    basename = Path(relpath).name

    # Prefer within INDICATORS folders
    for pat in (f"**/INDICATORS/{basename}", f"**/indicators/{basename}"):
        hits = list(data_root.glob(pat))
        if hits:
            return hits[0].resolve()

    # Last resort: anywhere; prefer INDICATORS
    hits = list(data_root.glob(f"**/{basename}"))
    if hits:
        hits_sorted = sorted(
            hits,
            key=lambda x: ("indicators" not in str(x).lower(), len(str(x)))
        )
        return hits_sorted[0].resolve()

    return None

In [8]:
# Inventory candidates in order of preference: the INDICATORS-only exploded
# manifest first (two possible folders), then the older inventory tables.
candidates = [
    folder / file_name
    for folder, file_name in (
        (OUT_TBL_COV, "repo_manifest_INDONLY_exploded.csv"),
        (TABLE_DIR, "repo_manifest_INDONLY_exploded.csv"),
        (TABLE_DIR, "indicator_inventory_STANDARD.csv"),
        (TABLE_DIR, "indicator_inventory_clean.csv"),
    )
]
# First candidate that exists on disk wins.
COVER = next(filter(Path.exists, candidates), None)
if COVER is None:
    raise FileNotFoundError("No inventory CSV found; re-run the inventory notebook.")

print("Using inventory file:", COVER)

# Read the table and normalise headers (trimmed, lower-case).
df = pd.read_csv(COVER).rename(columns=lambda c: c.strip().lower())

# Minimal set of columns the downstream cells rely on.
req = {"csa", "indicator_token", "period", "relpath"}
missing = req.difference(df.columns)
if missing:
    raise ValueError(f"Inventory missing columns: {missing}")

# Keep rows with a YYYY-YYYY period whose relpath mentions INDICATORS
# (the latter is a safety check even for INDONLY manifests).
df = df[
    df["period"].astype(str).str.match(r"^\d{4}-\d{4}$", na=False)
    & df["relpath"].str.contains("INDICATORS", case=False, na=False)
].copy()

def canon_token(s: str) -> str:
    """Lower-case ``str(s)`` and drop every character outside ``a-z`` / ``0-9``."""
    lowered = str(s).lower()
    return re.compile(r"[^a-z0-9]+").sub("", lowered)

# Canonical indicator key used for all later grouping and matching.
df["indicator"] = df["indicator_token"].apply(canon_token)

Using inventory file: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\tables\repo_manifest_INDONLY_exploded.csv


In [9]:
# --- Target-grid policy ---
# Decides which dataset of a pair becomes "A", i.e. the grid the other one is
# regridded onto (see order_by_policy).
# prefer_eobs  -> if EOBS is in the pair, make EOBS the target (A). Otherwise pick the *coarser* dataset.
# coarser      -> always pick the nominally coarser dataset as target (A).
# Any other value is treated like "coarser" by order_by_policy.
TARGET_POLICY = "prefer_eobs"

def nominal_res_km(dataset: str) -> float:
    """
    Nominal grid spacing in km for a dataset tag (case-insensitive).

    Unknown, empty or ``None`` tags get the 20.0 km fallback. The EMO value is
    a placeholder to be adjusted once the exact resolution is known.
    """
    known_km = {
        "CERRA": 5.5,
        "EOBS": 11.0,   # 0.10°
        "ERA5": 31.0,   # ~0.25° global
        "EMO": 10.0,    # adjust if you know exact
    }
    tag = dataset.upper() if dataset else ""
    return known_km.get(tag, 20.0)

def order_by_policy(ds1: str, ds2: str, policy: str = TARGET_POLICY):
    """
    Order a dataset pair as ``(target, source)``; tags are upper-cased.

    With ``policy == "prefer_eobs"`` and EOBS in the pair, EOBS is the target.
    In every other case the dataset with the larger ``nominal_res_km`` is the
    target; on a tie the input order is kept.
    """
    first = (ds1 or "").upper()
    second = (ds2 or "").upper()

    if policy == "prefer_eobs" and "EOBS" in (first, second):
        # EOBS must be A (target grid); the other tag becomes the source.
        other = second if first == "EOBS" else first
        return ("EOBS", other)

    # Coarser-of-two (explicit "coarser" policy, or fallback).
    if nominal_res_km(first) < nominal_res_km(second):
        return (second, first)
    return (first, second)

In [10]:
OBS_TAGS  = {"cerra": "CERRA", "eobs": "EOBS", "era5": "ERA5", "emo": "EMO"}
OBS_SET   = set(OBS_TAGS.values())
HORIZONS  = {"1981-2010", "1991-2020"}

# Precompile patterns once (faster + cleaner).
# A tag only counts when it is delimited: start of string or one of "_ - / \"
# before it, and one of "_ - . / \" or end of string after it. The backslash is
# included so Windows-style relpaths ("...\cerra\file.nc") are recognised too.
_PATTERNS = {
    k: re.compile(rf"(^|[_\-/\\]){k}([_\-\.\/\\]|$)")
    for k in OBS_TAGS
}

def detect_dataset(path: str) -> str:
    """
    Return the dataset tag ("CERRA", "EOBS", "ERA5", "EMO") found in ``path``.

    The path is lower-cased and tested against ``_PATTERNS`` in ``OBS_TAGS``
    order; the first tag that matches wins. Returns "OTHER" if none match.
    """
    name = str(path).lower()
    for k, pat in _PATTERNS.items():
        if pat.search(name):
            return OBS_TAGS[k]
    return "OTHER"

# Tag every inventory row with the dataset detected from its relative path.
df["dataset"] = df["relpath"].apply(detect_dataset)

# Restrict to the observation datasets of interest and the two reference periods.
obs = df.loc[df["dataset"].isin(OBS_SET) & df["period"].isin(HORIZONS)].copy()

print("Rows (obs only):", obs.shape[0])
print("Datasets seen:", sorted(set(obs["dataset"])))
print("Periods seen:", sorted(set(obs["period"])))

Rows (obs only): 294
Datasets seen: ['CERRA', 'EMO', 'EOBS', 'ERA5']
Periods seen: ['1981-2010', '1991-2020']


In [11]:
# One row per observation file that a comparison could use.
inv_obs = obs[["csa","indicator","period","dataset","relpath"]]
inv_obs = inv_obs.sort_values(["csa","indicator","period","dataset"])
inv_obs = inv_obs.reset_index(drop=True)
inv_obs.to_csv(OUT_TBL_COV/"obs_inventory.csv", index=False)
print("Saved:", OUT_TBL_COV/"obs_inventory.csv")

# For each CSA×indicator×period: the sorted, de-duplicated list of datasets
# available, plus how many there are.
summ = inv_obs.groupby(["csa","indicator","period"])["dataset"].agg(lambda s: sorted(set(s)))
summ = summ.reset_index().rename(columns={"dataset":"datasets"})
summ["n_datasets"] = summ["datasets"].apply(len)
summ.to_csv(OUT_TBL_COV/"obs_availability_summary.csv", index=False)
print("Saved:", OUT_TBL_COV/"obs_availability_summary.csv")

# A combination is compare-ready once at least two datasets cover it.
ready = summ.loc[summ["n_datasets"] >= 2].copy()
ready.to_csv(OUT_TBL_COV/"obs_compare_ready.csv", index=False)
print("Saved:", OUT_TBL_COV/"obs_compare_ready.csv")

print("\nTop lines of compare-ready:")
display(ready.head(10))

Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\tables\obs_coverage\obs_inventory.csv
Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\tables\obs_coverage\obs_availability_summary.csv
Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\tables\obs_coverage\obs_compare_ready.csv

Top lines of compare-ready:


Unnamed: 0,csa,indicator,period,datasets,n_datasets
10,Barcelona,hd,1981-2010,"[CERRA, EOBS]",2
11,Barcelona,hd,1991-2020,"[CERRA, EOBS]",2
14,Barcelona,rr,1981-2010,"[CERRA, EOBS]",2
15,Barcelona,rr,1991-2020,"[CERRA, EOBS]",2
20,Barcelona,su,1981-2010,"[CERRA, EOBS]",2
21,Barcelona,su,1991-2020,"[CERRA, EOBS]",2
30,Birmingham,hd,1981-2010,"[CERRA, EOBS]",2
31,Birmingham,hd,1991-2020,"[CERRA, EOBS]",2
34,Birmingham,rr,1981-2010,"[CERRA, EOBS]",2
35,Birmingham,rr,1991-2020,"[CERRA, EOBS]",2


In [12]:
# --- Target policy helpers (define BEFORE building pairs) ---

def normalize_pairs_df(pairs_df: pd.DataFrame) -> pd.DataFrame:
    """
    Re-order every (ds_a, ds_b) pair so that ds_a is the target grid.

    Each row is passed through ``order_by_policy`` with ``TARGET_POLICY``; rows
    that become identical on (csa, indicator, period, ds_a, ds_b) are dropped
    and the index is reset.

    An empty input returns an empty frame (keeping the input's columns, or the
    five key columns if it has none) instead of failing in ``drop_duplicates``.
    """
    key_cols = ["csa", "indicator", "period", "ds_a", "ds_b"]
    if pairs_df.empty:
        return pd.DataFrame(columns=list(pairs_df.columns) or key_cols)

    rows = []
    for _, r in pairs_df.iterrows():
        a, b = order_by_policy(r["ds_a"], r["ds_b"], TARGET_POLICY)
        rows.append({**r, "ds_a": a, "ds_b": b})
    return (pd.DataFrame(rows)
            .drop_duplicates(subset=key_cols)
            .reset_index(drop=True))

In [13]:
# --- Section 2: Dataset selector (robust) ---

import ast
import itertools
import re

ALLOWED = {"CERRA","EOBS","ERA5","EMO"}

def parse_datasets_field(x):
    """
    Turn a "datasets" cell into a clean list of dataset tags.

    Accepts:
      - Python list, tuple, set or frozenset (e.g., ["EOBS","CERRA"])
      - Stringified list (e.g., '["EOBS","CERRA"]')
      - Bracketed or delimited text that is not a Python literal
        (e.g., "[CERRA, EOBS]" or "CERRA; EOBS"), split on , ; |
      - Single string (e.g., "EOBS")
      - Integer/float (e.g., 2 or NaN) -> return [] (a count, not a list)
      - None or any other type -> []
    Returns a unique, sorted list containing only tags from ALLOWED.
    """
    if x is None:
        return []
    if isinstance(x, (list, tuple, set, frozenset)):
        seq = list(x)
    elif isinstance(x, str):
        s = x.strip()
        seq = None
        if s.startswith("[") and s.endswith("]"):
            try:
                seq = list(ast.literal_eval(s))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                seq = None
        if seq is None:
            # Not a Python literal: strip the brackets and split on list-like
            # delimiters so unquoted lists still yield their tags.
            seq = [part for part in re.split(r"[,;|]", s.strip("[]")) if part.strip()]
    else:
        # numbers (a count, not the actual list) and anything else
        return []

    # Normalize tokens: keep letters/digits only, upper-case, filter to ALLOWED
    out = []
    for item in seq:
        t = re.sub(r"[^a-z0-9]+","", str(item).lower()).upper()
        if t in ALLOWED:
            out.append(t)
    return sorted(set(out))

# Manual mode: set all five fields to run a single comparison; leave them
# as None to run in batch mode.
#   csa        e.g. "Prague"
#   indicator  e.g. "su"
#   period     "1981-2010" or "1991-2020"
#   ds_a/ds_b  "EOBS", "CERRA", "ERA5", "EMO"
MANUAL = {
    "csa": None,
    "indicator": None,
    "period": None,
    "ds_a": None,
    "ds_b": None,
}

# Expand every compare-ready row into all unordered dataset pairs. Rows whose
# "datasets" field parses to fewer than two tags contribute nothing, because
# combinations() of such a list is empty.
pairs_rows = [
    dict(
        csa=r["csa"],
        indicator=re.sub(r"[^a-z0-9]+","", str(r["indicator"]).lower()),
        period=r["period"],
        ds_a=a,
        ds_b=b,
    )
    for _, r in ready.iterrows()
    for a, b in itertools.combinations(parse_datasets_field(r.get("datasets")), 2)
]
# Apply the target-grid policy so that ds_a is always the target.
pairs = normalize_pairs_df(pd.DataFrame(pairs_rows))
print("Expanded pairs:", len(pairs))
display(pairs.head(10))

Expanded pairs: 32


Unnamed: 0,csa,indicator,period,ds_a,ds_b
0,Barcelona,hd,1981-2010,EOBS,CERRA
1,Barcelona,hd,1991-2020,EOBS,CERRA
2,Barcelona,rr,1981-2010,EOBS,CERRA
3,Barcelona,rr,1991-2020,EOBS,CERRA
4,Barcelona,su,1981-2010,EOBS,CERRA
5,Barcelona,su,1991-2020,EOBS,CERRA
6,Birmingham,hd,1981-2010,EOBS,CERRA
7,Birmingham,hd,1991-2020,EOBS,CERRA
8,Birmingham,rr,1981-2010,EOBS,CERRA
9,Birmingham,rr,1991-2020,EOBS,CERRA


In [14]:
def normalize_pairs_df(pairs_df):
    """
    Re-order every (ds_a, ds_b) pair so that ds_a is the target grid.

    NOTE: this re-defines the helper already declared in the earlier
    "Target policy helpers" cell; the later definition is the one in effect.

    Each row is passed through ``order_by_policy`` with ``TARGET_POLICY``; rows
    that become identical on (csa, indicator, period, ds_a, ds_b) are dropped
    and the index is reset. An empty input returns an empty frame (keeping the
    input's columns, or the five key columns if it has none) instead of
    failing in ``drop_duplicates``.
    """
    key_cols = ["csa", "indicator", "period", "ds_a", "ds_b"]
    if pairs_df.empty:
        return pd.DataFrame(columns=list(pairs_df.columns) or key_cols)

    rows = []
    for _, r in pairs_df.iterrows():
        a, b = order_by_policy(r["ds_a"], r["ds_b"], TARGET_POLICY)
        rows.append({**r, "ds_a": a, "ds_b": b})
    out = (pd.DataFrame(rows)
           .drop_duplicates(subset=key_cols)
           .reset_index(drop=True))
    return out

# Re-apply the target-grid policy. `pairs` was already normalised when it was
# built, so this is expected to leave it unchanged (the displayed table matches
# the earlier one).
pairs = normalize_pairs_df(pairs)
print("Pairs after target policy (A is target grid):")
display(pairs.head(8))

Pairs after target policy (A is target grid):


Unnamed: 0,csa,indicator,period,ds_a,ds_b
0,Barcelona,hd,1981-2010,EOBS,CERRA
1,Barcelona,hd,1991-2020,EOBS,CERRA
2,Barcelona,rr,1981-2010,EOBS,CERRA
3,Barcelona,rr,1991-2020,EOBS,CERRA
4,Barcelona,su,1981-2010,EOBS,CERRA
5,Barcelona,su,1991-2020,EOBS,CERRA
6,Birmingham,hd,1981-2010,EOBS,CERRA
7,Birmingham,hd,1991-2020,EOBS,CERRA


In [15]:
# By CSA & period: number of (indicator) combos with >=2 datasets.
# Note: "n_pairs" counts rows of `ready` (one per CSA×indicator×period
# combination), not expanded dataset pairs.
pairs_by_csa_period = (ready.groupby(["csa","period"])
                       .size()
                       .reset_index(name="n_pairs"))
pairs_by_csa_period.to_csv(OUT_TBL_COV/"obs_pairs_by_csa_period.csv", index=False)
print("Saved:", OUT_TBL_COV/"obs_pairs_by_csa_period.csv")

# By indicator & period (across all CSAs): same row count, grouped by indicator.
pairs_by_ind_period = (ready.groupby(["indicator","period"])
                       .size()
                       .reset_index(name="n_pairs"))
pairs_by_ind_period.to_csv(OUT_TBL_COV/"obs_pairs_by_indicator_period.csv", index=False)
print("Saved:", OUT_TBL_COV/"obs_pairs_by_indicator_period.csv")

# Preview: per period, largest counts first.
display(pairs_by_csa_period.sort_values(["period","n_pairs"], ascending=[True,False]).head(12))
display(pairs_by_ind_period.sort_values(["period","n_pairs"], ascending=[True,False]).head(12))

Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\tables\obs_coverage\obs_pairs_by_csa_period.csv
Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\tables\obs_coverage\obs_pairs_by_indicator_period.csv


Unnamed: 0,csa,period,n_pairs
0,Barcelona,1981-2010,3
2,Birmingham,1981-2010,3
4,Bologna,1981-2010,3
6,Brasov,1981-2010,3
8,Leipzig,1981-2010,2
10,Prague,1981-2010,2
1,Barcelona,1991-2020,3
3,Birmingham,1991-2020,3
5,Bologna,1991-2020,3
7,Brasov,1991-2020,3


Unnamed: 0,indicator,period,n_pairs
0,hd,1981-2010,6
4,su,1981-2010,6
2,rr,1981-2010,4
1,hd,1991-2020,6
5,su,1991-2020,6
3,rr,1991-2020,4


In [16]:
# --- Bar: compare-ready counts per CSA for each period ---
# One PNG per period in OUT_FIG_COV; periods without rows are reported and skipped.
for per in sorted(HORIZONS):
    d = pairs_by_csa_period[pairs_by_csa_period["period"] == per].sort_values("n_pairs", ascending=False)
    if d.empty:
        print(f"No compare-ready pairs for {per}")
        continue
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.bar(d["csa"], d["n_pairs"])
    ax.set_title(f"Compare-ready (≥2 datasets) per CSA — {per}")
    ax.set_ylabel("# CSA×indicator combos")
    ax.set_xlabel("CSA")
    ax.tick_params(axis="x", rotation=45)
    # pyplot call: acts on the current figure, which is the one just created
    plt.tight_layout()
    out = OUT_FIG_COV / f"obs_pairs_by_csa_{per}.png"
    # close after saving so figures do not accumulate across iterations
    fig.savefig(out, dpi=200, bbox_inches="tight"); plt.close(fig)
    print("Saved:", out)

# --- Heatmap: #datasets available per CSA×indicator (one heatmap per period) ---
for per in sorted(HORIZONS):
    base = summ[summ["period"] == per]
    if base.empty:
        print(f"No rows for period {per}")
        continue
    # CSA × indicator matrix of dataset counts; combinations without data become 0
    mat = (base.pivot_table(index="csa", columns="indicator", values="n_datasets", aggfunc="max")
                 .fillna(0).astype(int))
    if mat.empty:
        print(f"No matrix for {per}")
        continue
    # figure height grows with the number of CSAs (rows)
    fig, ax = plt.subplots(figsize=(12, max(4, 0.35*len(mat))))
    # colour scale fixed to 0..4 (four obs datasets), so counts are capped at 4 visually
    im = ax.imshow(mat.values, aspect="auto", vmin=0, vmax=4)
    ax.set_xticks(range(mat.shape[1])); ax.set_xticklabels(mat.columns, rotation=45, ha="right")
    ax.set_yticks(range(mat.shape[0])); ax.set_yticklabels(mat.index)
    ax.set_title(f"Obs dataset availability (count) — {per}")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="#datasets (CERRA/EOBS/ERA5/EMO)")
    plt.tight_layout()
    out = OUT_FIG_COV / f"obs_availability_heatmap_{per}.png"
    fig.savefig(out, dpi=200, bbox_inches="tight"); plt.close(fig)
    print("Saved:", out)

Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\figures\obs_coverage\obs_pairs_by_csa_1981-2010.png
Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\figures\obs_coverage\obs_pairs_by_csa_1991-2020.png
Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\figures\obs_coverage\obs_availability_heatmap_1981-2010.png
Saved: C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\outputs\figures\obs_coverage\obs_availability_heatmap_1991-2020.png


In [17]:
from pathlib import Path
import re, itertools, math
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# --- Paths ---
# Resolved to absolute paths at definition time: the values then agree with the
# ones set in the first cell instead of shadowing them with cwd-relative paths,
# and later cells keep working if the working directory changes.
DATA_ROOT = Path("../data/CARMINE-T2.4").resolve()
TABLE_DIR = Path("../outputs/tables").resolve()
FIG_DIR   = Path("../outputs/figures").resolve()

OUT_TBL   = TABLE_DIR / "obs_compare"
OUT_FIG   = FIG_DIR / "obs_compare"
OUT_TBL.mkdir(parents=True, exist_ok=True)
OUT_FIG.mkdir(parents=True, exist_ok=True)

# --- Load "compare-ready" list from the coverage notebook ---
# (If you used a different name, adjust this path.)
READY = TABLE_DIR / "obs_coverage" / "obs_compare_ready.csv"
INV   = TABLE_DIR / "obs_coverage" / "obs_inventory.csv"   # needed to get relpaths per dataset

if not READY.exists() or not INV.exists():
    raise FileNotFoundError("Missing input tables. Run `30_obs_dataset_coverage.ipynb` first.")

# Note: after the CSV round-trip the "datasets" column holds strings such as
# "['CERRA', 'EOBS']", not Python lists.
ready = pd.read_csv(READY)
inv   = pd.read_csv(INV)

# Normalize column names. Done per frame on purpose: a `for df in (...)` loop
# would rebind the notebook-level `df` (the inventory frame loaded earlier).
ready.columns = [c.lower() for c in ready.columns]
inv.columns = [c.lower() for c in inv.columns]

# Helper: canonical key form (lower-case, letters and digits only)
def canon(s: str) -> str:
    """Lower-case ``str(s)`` and strip every character that is not ``a-z`` or ``0-9``."""
    text = str(s).lower()
    return re.compile(r"[^a-z0-9]+").sub("", text)
# Bring the indicator keys of both tables into canonical form so they match.
ready["indicator"] = ready["indicator"].apply(canon)
inv["indicator"] = inv["indicator"].apply(canon)

print("Compare-ready rows:", len(ready))
print("Sample ready rows:")
display(ready.head(3))

Compare-ready rows: 32
Sample ready rows:


Unnamed: 0,csa,indicator,period,datasets,n_datasets
0,Barcelona,hd,1981-2010,"['CERRA', 'EOBS']",2
1,Barcelona,hd,1991-2020,"['CERRA', 'EOBS']",2
2,Barcelona,rr,1981-2010,"['CERRA', 'EOBS']",2


In [18]:
# Choose **one** of the two modes below:

# --- Mode A: Manual selection (edit here) ---
# All five fields start as None, which selects batch mode. To run a single
# comparison, assign every field, e.g.:
#   csa="Leipzig", indicator="rx5day", period="1981-2010" or "1991-2020",
#   ds_a / ds_b = one of "CERRA" / "EOBS" / "ERA5" / "EMO"
MANUAL = dict.fromkeys(("csa", "indicator", "period", "ds_a", "ds_b"))

# --- Mode B: Batch over all compare-ready pairs (recommended to start) ---
# We’ll expand “datasets” into pairs (A vs B)
def list_pairs(datasets_list):
    """
    Return all unordered pairs of the distinct entries in ``datasets_list``.

    ``datasets_list`` may be an iterable of tags or the text form a list takes
    after a CSV round-trip (e.g. "['CERRA', 'EOBS']"). Text is parsed with
    ``ast.literal_eval`` (never ``eval``: the value comes from a file); if that
    fails, the text is split on commas after stripping brackets and quotes.
    A plain string such as "EOBS" is one entry, not a sequence of characters.
    Non-iterable input (None, NaN, numbers) yields ``[]``.

    Entries are converted to ``str``, stripped, de-duplicated and sorted before
    pairing, so each pair is in alphabetical order.
    """
    import ast  # local import: keeps this helper independent of other cells

    if isinstance(datasets_list, str):
        text = datasets_list.strip()
        items = None
        if text.startswith("["):
            try:
                items = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError):
                items = None
        if not isinstance(items, (list, tuple, set, frozenset)):
            items = [part.strip(" '\"") for part in text.strip("[]").split(",")]
    else:
        try:
            items = list(datasets_list)
        except TypeError:
            return []

    unique = sorted({str(item).strip() for item in items if str(item).strip()})
    return list(itertools.combinations(unique, 2))

# One row per (CSA, indicator, period, dataset pair), built in a single pass.
# Note: this replaces any `pairs` table built earlier; the pairs here are in
# alphabetical order, and run_one applies the target-grid policy itself.
pairs = pd.DataFrame([
    dict(csa=r["csa"], indicator=r["indicator"], period=r["period"], ds_a=a, ds_b=b)
    for _, r in ready.iterrows()
    for (a, b) in list_pairs(r["datasets"])
])
print("Expanded pairs:", len(pairs))
display(pairs.head(5))

Expanded pairs: 32


Unnamed: 0,csa,indicator,period,ds_a,ds_b
0,Barcelona,hd,1981-2010,CERRA,EOBS
1,Barcelona,hd,1991-2020,CERRA,EOBS
2,Barcelona,rr,1981-2010,CERRA,EOBS
3,Barcelona,rr,1991-2020,CERRA,EOBS
4,Barcelona,su,1981-2010,CERRA,EOBS


In [23]:
def pick_var(ds: xr.Dataset, token: str) -> str:
    """
    Choose the data variable of ``ds`` that corresponds to ``token``.

    Matching is case-insensitive, in this order:
      1) a variable whose name equals the token,
      2) the first variable whose name contains the token,
      3) the first data variable (fallback).

    Raises:
        ValueError: if ``ds`` has no data variables (previously a bare
        IndexError from the fallback).
    """
    wanted = str(token).lower()
    names = list(ds.data_vars)
    if not names:
        raise ValueError("Dataset contains no data variables.")

    lowered = [str(name).lower() for name in names]

    # Exact name first: a short token like "su" may also occur inside other names.
    for name, low in zip(names, lowered):
        if low == wanted:
            return name
    for name, low in zip(names, lowered):
        if wanted in low:
            return name
    return names[0]

def to_days_if_timedelta(da: xr.DataArray) -> xr.DataArray:
    """
    Return ``da`` as float64.

    Timedelta data is divided by one day first and its ``units`` attribute is
    set to "days"; any other dtype is only cast (without a forced copy).
    """
    if not np.issubdtype(da.dtype, np.timedelta64):
        return da.astype("float64", copy=False)

    one_day = np.timedelta64(1, "D")
    as_days = (da / one_day).astype("float64")
    as_days.attrs["units"] = "days"
    return as_days

def subset_period_time_mean(da: xr.DataArray, period: str) -> xr.DataArray:
    """
    Average ``da`` over the years named by ``period`` ("YYYY-YYYY" or "YYYY_YYYY").

    Without a "time" dimension the array is returned squeezed and unchanged.
    Otherwise the time steps whose year lies in [first, last] (inclusive) are
    selected and averaged with NaNs skipped; the result is squeezed.
    """
    if "time" in da.dims:
        step_years = pd.to_datetime(da["time"].values).year
        first, last = (int(part) for part in period.replace("_","-").split("-"))
        in_window = (step_years >= first) & (step_years <= last)
        selected = da.sel(time=in_window)
        return selected.mean("time", skipna=True).squeeze()
    return da.squeeze()

def load_obs_map(csa: str, indicator: str, period: str, dataset: str) -> xr.DataArray:
    """
    Load the period-mean field of one indicator for one CSA and dataset.

    Candidate files come from the ``inv`` table (rows matching all four keys),
    tried in order of increasing ``relpath`` length. Each candidate is located
    with ``resolve_relpath`` under ``DATA_ROOT``, opened, reduced with
    ``pick_var`` -> ``to_days_if_timedelta`` -> ``subset_period_time_mean``,
    and the first one that works is returned, loaded into memory.

    Raises:
        FileNotFoundError: if ``inv`` has no matching row, or if no candidate
        could be located, opened and processed. In the latter case the last
        error is the message and is chained as ``__cause__``.
    """
    rows = inv[(inv["csa"]==csa) & (inv["indicator"]==indicator) &
               (inv["period"]==period) & (inv["dataset"]==dataset)]
    if rows.empty:
        raise FileNotFoundError(f"No file for {csa}/{indicator}/{period}/{dataset}")

    # Prefer the shortest relpath (often cleaner)
    rows = rows.sort_values(by="relpath", key=lambda s: s.str.len()).reset_index(drop=True)

    last_err = None
    for rel in rows["relpath"].astype(str):
        f = resolve_relpath(rel, DATA_ROOT)
        if f is None:
            last_err = FileNotFoundError(f"Unresolved path for basename: {Path(rel).name}")
            continue
        if not f.exists():
            last_err = FileNotFoundError(f"Not on disk: {f}")
            continue
        try:
            # The context manager closes the file on success and on failure, so
            # no manual close (and no stale/unbound `ds`) in the error path.
            with xr.open_dataset(f, decode_timedelta=True) as ds:
                da = ds[pick_var(ds, indicator)]
                da = to_days_if_timedelta(da)
                da = subset_period_time_mean(da, period)
                # Make sure the values are in memory before the file is closed.
                return da.load()
        except Exception as e:
            # Best effort: remember the failure and try the next candidate.
            last_err = e

    # If we’re here, none of the candidates opened successfully
    raise FileNotFoundError(last_err if last_err is not None else "Unknown path error") from last_err

def align_on(A, B, method="nearest"):
    """
    Put B (source) on the grid of A (target).

    Two strategies, tried in order:
      1) Both arrays have 1-D lat/lon dimensions ("latitude"/"lat" and
         "longitude"/"lon"): ``B.interp`` onto A's coordinates with ``method``.
      2) Both arrays carry lat/lon coordinates (1-D or 2-D): xESMF regridding,
         "nearest_s2d" when ``method == "nearest"``, otherwise "bilinear".

    Returns:
        (A2, B2): A and the regridded B after an inner ``xr.align``, squeezed.

    Raises:
        ValueError: if xESMF regridding fails (original error chained as
        ``__cause__``), or if neither strategy is applicable.
    """

    def _rectilinear_latlon(da):
        # Names of the 1-D lat/lon dimensions of `da`, or None if it has none.
        for lat in ("latitude", "lat"):
            for lon in ("longitude", "lon"):
                if (lat in da.dims) and (lon in da.dims) and da[lat].ndim == 1 and da[lon].ndim == 1:
                    return lat, lon
        return None

    # 1) Rectilinear -> use xarray.interp (fast, simple)
    axesA = _rectilinear_latlon(A)
    axesB = _rectilinear_latlon(B)
    if axesA and axesB:
        latA, lonA = axesA
        latB, lonB = axesB
        B_i = B.interp({latB: A[latA], lonB: A[lonA]}, method=method)
        A2, B2 = xr.align(A, B_i, join="inner")
        return A2.squeeze(), B2.squeeze()

    # 2) Non-rectilinear case (e.g., CERRA on y/x) -> try xESMF if lat/lon coords exist
    #    If not possible, fail loudly instead of cropping shapes.
    def _has_latlon_coords(da):
        return (("lat" in da.coords or "latitude" in da.coords) and ("lon" in da.coords or "longitude" in da.coords))

    if _has_latlon_coords(A) and _has_latlon_coords(B):
        try:
            # Imported lazily so the rectilinear path works without xESMF installed.
            import xesmf as xe

            def _grid(da):
                # Build the {"lon", "lat"} dict of 2-D arrays passed to the regridder.
                lat = da["lat"] if "lat" in da.coords else da["latitude"]
                lon = da["lon"] if "lon" in da.coords else da["longitude"]
                if lat.ndim == 1 and lon.ndim == 1:
                    lon2d, lat2d = np.meshgrid(lon.values, lat.values)
                    return {"lon": lon2d, "lat": lat2d}
                if lat.ndim == 2 and lon.ndim == 2:
                    return {"lon": lon.values, "lat": lat.values}
                raise ValueError("Unsupported lat/lon coordinate shapes for regridding.")

            rmethod = "nearest_s2d" if method == "nearest" else "bilinear"
            R = xe.Regridder(_grid(B), _grid(A), rmethod, reuse_weights=False)
            # Wrap the regridded values in A's coordinates and dimensions.
            B_i = xr.DataArray(R(B.values), coords=A.coords, dims=A.dims, name=B.name)
            A2, B2 = xr.align(A, B_i, join="inner")
            return A2.squeeze(), B2.squeeze()

        except Exception as e:
            # Keep the original traceback attached for debugging.
            raise ValueError(f"Cannot regrid non-rectilinear grids (xESMF failed): {type(e).__name__}: {e}") from e

    raise ValueError("Cannot align grids: missing usable lat/lon for xESMF and not rectilinear for xarray.interp.")

def summary_stats(a: np.ndarray, b: np.ndarray) -> dict:
    """
    Point-wise comparison statistics of ``b`` against ``a``.

    Inputs may be arrays or anything ``np.asarray`` can convert to float
    (e.g. lists); both must have the same shape. Only positions where both
    values are finite are used.

    Returns a dict with:
        n      number of valid pairs
        mean_a, mean_b
        bias   mean of (b - a)
        rmse   root-mean-square of (b - a)
        mae    mean absolute (b - a)
        r      Pearson correlation; NaN for fewer than 3 pairs or when either
               field is constant (avoids the divide-by-zero warning)
    With no valid pairs, n is 0 and every statistic is NaN.

    Raises:
        ValueError: if the shapes of ``a`` and ``b`` differ.
    """
    a = np.asarray(a, dtype="float64")
    b = np.asarray(b, dtype="float64")
    if a.shape != b.shape:
        raise ValueError(f"Shape mismatch: {a.shape} vs {b.shape}")

    mask = np.isfinite(a) & np.isfinite(b)
    n_valid = int(mask.sum())
    if n_valid == 0:
        return dict(n=0, mean_a=np.nan, mean_b=np.nan, bias=np.nan, rmse=np.nan, mae=np.nan, r=np.nan)

    A = a[mask]; B = b[mask]
    diff = B - A

    # Correlation is undefined for a constant field; report NaN without a warning.
    has_spread = A.min() != A.max() and B.min() != B.max()
    r = float(np.corrcoef(A, B)[0, 1]) if (n_valid > 2 and has_spread) else np.nan

    return dict(
        n      = n_valid,
        mean_a = float(np.mean(A)),
        mean_b = float(np.mean(B)),
        bias   = float(np.mean(diff)),
        rmse   = float(np.sqrt(np.mean(diff**2))),
        mae    = float(np.mean(np.abs(diff))),
        r      = r,
    )


In [24]:
# Safety guards: give MANUAL and pairs neutral defaults when the cells that
# normally define them have not been run.
if "MANUAL" not in globals():
    MANUAL = dict(csa=None, indicator=None, period=None, ds_a=None, ds_b=None)

if "pairs" not in globals():
    pairs = pd.DataFrame(columns=["csa","indicator","period","ds_a","ds_b"])

In [27]:
def plot_map(a, b, csa, indicator, period, ds_a, ds_b, outdir: Path):
    """
    Save a difference map (B - A) in lon/lat degrees, with the FUA outline.

    ``b`` is first put on ``a``'s grid with ``align_on``. The colour range is
    symmetric about zero, limited by the 98th percentile of |difference|.

    Returns the path of the PNG written into ``outdir``, or ``None`` (with a
    warning printed) when the difference has no finite values. The figure is
    always closed, also when plotting or saving raises.
    """
    # Difference map (B - A)
    a2, b2 = align_on(a, b)
    diff = (b2 - a2)

    # vmin/vmax symmetric about zero, using robust percentiles
    vals = diff.values.ravel()
    vals = vals[np.isfinite(vals)]
    if vals.size == 0:
        print("  (warn) No finite values for map.")
        return None
    vmax = np.nanpercentile(np.abs(vals), 98)
    vmin = -vmax

    # lon/lat coords (degrees); looked up before the figure exists so that a
    # missing coordinate cannot leave an open figure behind
    lat = diff["lat"] if "lat" in diff.coords else diff["latitude"]
    lon = diff["lon"] if "lon" in diff.coords else diff["longitude"]

    fig, ax = plt.subplots(figsize=(6.0, 4.8))
    try:
        if lat.ndim == 1 and lon.ndim == 1:
            # ensure (lat, lon) order
            if "latitude" in diff.dims and "longitude" in diff.dims:
                diff2 = diff.transpose("latitude", "longitude")
            elif "lat" in diff.dims and "lon" in diff.dims:
                diff2 = diff.transpose("lat", "lon")
            else:
                diff2 = diff
            im = ax.pcolormesh(lon.values, lat.values, diff2.values,
                               shading="auto", vmin=vmin, vmax=vmax)
        else:
            # 2-D coordinate arrays: plotted as they are
            im = ax.pcolormesh(lon.values, lat.values, diff.values,
                               shading="auto", vmin=vmin, vmax=vmax)

        fig.colorbar(im, ax=ax)
        ax.set_xlabel("Longitude [°E]")
        ax.set_ylabel("Latitude [°N]")

        # Overlay FUA boundary (if available); a failure here must not lose the map
        try:
            fua = fua_boundary_for_csa(csa)
            if fua is not None:
                fua.boundary.plot(ax=ax, edgecolor="black", linewidth=1.0)
        except Exception as e:
            print("  (warn) Could not load/plot FUA shapefile:", e)

        ax.set_title(f"{csa} — {indicator} — {period}\n{ds_b} minus {ds_a}")
        fig.tight_layout()
        out = outdir / f"{csa}_{indicator}_{period}_{ds_b}-minus-{ds_a}_map.png"
        fig.savefig(out, dpi=200, bbox_inches="tight")
    finally:
        # Runs on success and on error, so batch runs do not pile up figures.
        plt.close(fig)
    print("  Saved map:", out)
    return out

def plot_scatter(a, b, csa, indicator, period, ds_a, ds_b, outdir: Path):
    """
    Save a scatter plot of B against A (after ``align_on``) with a 1:1 line.

    Axis limits are the 1st/99th percentiles of both fields pooled; a text box
    shows n, bias, RMSE and r from ``summary_stats``. Returns the PNG path, or
    ``None`` (with a warning printed) when fewer than 10 valid points remain.
    """
    target, source = align_on(a, b)
    xs = target.values.ravel()
    ys = source.values.ravel()

    both_finite = np.isfinite(xs) & np.isfinite(ys)
    xs = xs[both_finite]
    ys = ys[both_finite]
    if xs.size < 10:
        print("  (warn) too few valid points for scatter.")
        return None

    # Statistics for the annotation, and robust limits shared by both axes
    stats = summary_stats(xs, ys)
    pooled = np.concatenate([xs, ys])
    lim_min = float(np.nanpercentile(pooled, 1))
    lim_max = float(np.nanpercentile(pooled, 99))
    if not (math.isfinite(lim_min) and math.isfinite(lim_max)):
        lim_min = float(np.nanmin(xs))
        lim_max = float(np.nanmax(xs))

    fig, ax = plt.subplots(figsize=(5.2, 5.2))
    ax.scatter(xs, ys, s=5, alpha=0.25)
    # 1:1 reference line
    ax.plot([lim_min, lim_max], [lim_min, lim_max], lw=1.0)
    ax.set_xlim(lim_min, lim_max)
    ax.set_ylim(lim_min, lim_max)

    ax.set_xlabel(f"{ds_a}")
    ax.set_ylabel(f"{ds_b}")
    ax.set_title(f"{csa} — {indicator} — {period}\nScatter: {ds_b} vs {ds_a}")

    txt = f"n={stats['n']}  bias={stats['bias']:.2f}  RMSE={stats['rmse']:.2f}  r={stats['r']:.2f}"
    box_style = dict(boxstyle="round", fc="white", alpha=0.7, lw=0.3)
    ax.text(0.02, 0.98, txt, transform=ax.transAxes,
            va="top", ha="left", fontsize=9, bbox=box_style)

    fig.tight_layout()
    out = outdir / f"{csa}_{indicator}_{period}_{ds_b}-vs-{ds_a}_scatter.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    plt.close(fig)
    print("  Saved scatter:", out)
    return out


def plot_hist(a, b, csa, indicator, period, ds_a, ds_b, outdir: Path):
    """
    Save a 40-bin histogram of the finite differences B - A (after ``align_on``).

    Returns the PNG path, or ``None`` (with a warning printed) when fewer than
    10 finite differences are available.
    """
    target, source = align_on(a, b)
    delta = (source - target).values.ravel()
    finite_delta = delta[np.isfinite(delta)]
    if finite_delta.size < 10:
        print("  (warn) too few valid points for histogram.")
        return None

    fig, ax = plt.subplots(figsize=(5.6, 3.4))
    ax.hist(finite_delta, bins=40)
    ax.set_title(f"{csa} — {indicator} — {period}\n{ds_b} − {ds_a} (histogram)")
    ax.set_xlabel("Difference (units of indicator)")
    ax.set_ylabel("Frequency")
    fig.tight_layout()

    out = outdir / f"{csa}_{indicator}_{period}_{ds_b}-minus-{ds_a}_hist.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    plt.close(fig)
    print("  Saved hist:", out)
    return out

In [28]:
def run_one(csa, indicator, period, ds_a, ds_b):
    """
    Compare two observation datasets for one CSA / indicator / period.

    The pair is first re-ordered with ``order_by_policy`` so that ``ds_a`` is
    the target grid. Both fields are loaded, the source is put on the target
    grid, statistics are computed, and the map, scatter and histogram figures
    are written to ``OUT_FIG/<canonical csa>/``.

    Returns the ``summary_stats`` dict extended with csa, indicator, period,
    ds_a and ds_b (as ordered by the policy). Errors from loading or aligning
    propagate to the caller.
    """
    # Enforce target-grid policy here (robust against cell execution order / manual mode)
    ds_a, ds_b = order_by_policy(ds_a, ds_b, TARGET_POLICY)

    print(f"\n=== {csa} | {indicator} | {period} | target={ds_a} | source={ds_b} ===")

    A = load_obs_map(csa, indicator, period, ds_a)  # target grid
    B = load_obs_map(csa, indicator, period, ds_b)  # will be regridded to A

    # Compute stats at field level (aligned)
    A2, B2 = align_on(A, B)
    stats = summary_stats(A2.values, B2.values)
    stats.update(dict(
        csa=csa, indicator=indicator, period=period, ds_a=ds_a, ds_b=ds_b
    ))

    # Output dir per CSA
    outdir = OUT_FIG / canon(csa)
    outdir.mkdir(parents=True, exist_ok=True)

    # Plots: each function aligns the fields itself and prints the file it
    # saved; the returned paths are not needed here.
    plot_map(A, B, csa, indicator, period, ds_a, ds_b, outdir)
    plot_scatter(A, B, csa, indicator, period, ds_a, ds_b, outdir)
    plot_hist(A, B, csa, indicator, period, ds_a, ds_b, outdir)

    return stats

# --- Choose mode ---
# Manual mode when every MANUAL field is set; otherwise batch over `pairs`.
results = []
if all(MANUAL.values()):
    results.append(run_one(**MANUAL))
else:
    for row in pairs.itertuples(index=False):
        try:
            results.append(run_one(row.csa, row.indicator, row.period, row.ds_a, row.ds_b))
        except FileNotFoundError as e:
            # missing input file: report and continue with the next pair
            print("  (skip)", e)
        except Exception as e:
            # any other failure: report its type and continue
            print("  (error)", type(e).__name__, e)

# Save summary table (one row per successful comparison)
if not results:
    print("No results produced.")
else:
    df_stats = pd.DataFrame(results)
    out_csv = OUT_TBL / "obs_compare_stats.csv"
    df_stats.to_csv(out_csv, index=False)
    print("\nSaved summary stats:", out_csv)
    display(df_stats.head(10))


=== Barcelona | hd | 1981-2010 | target=EOBS | source=CERRA ===
  Saved map: ..\outputs\figures\obs_compare\barcelona\Barcelona_hd_1981-2010_CERRA-minus-EOBS_map.png
  Saved scatter: ..\outputs\figures\obs_compare\barcelona\Barcelona_hd_1981-2010_CERRA-vs-EOBS_scatter.png
  Saved hist: ..\outputs\figures\obs_compare\barcelona\Barcelona_hd_1981-2010_CERRA-minus-EOBS_hist.png

=== Barcelona | hd | 1991-2020 | target=EOBS | source=CERRA ===
  Saved map: ..\outputs\figures\obs_compare\barcelona\Barcelona_hd_1991-2020_CERRA-minus-EOBS_map.png
  Saved scatter: ..\outputs\figures\obs_compare\barcelona\Barcelona_hd_1991-2020_CERRA-vs-EOBS_scatter.png
  Saved hist: ..\outputs\figures\obs_compare\barcelona\Barcelona_hd_1991-2020_CERRA-minus-EOBS_hist.png

=== Barcelona | rr | 1981-2010 | target=EOBS | source=CERRA ===
  Saved map: ..\outputs\figures\obs_compare\barcelona\Barcelona_rr_1981-2010_CERRA-minus-EOBS_map.png
  Saved scatter: ..\outputs\figures\obs_compare\barcelona\Barcelona_rr_1981-2

Unnamed: 0,n,mean_a,mean_b,bias,rmse,mae,r,csa,indicator,period,ds_a,ds_b
0,462,0.737157,0.82376,0.086602,0.759192,0.355611,0.899195,Barcelona,hd,1981-2010,EOBS,CERRA
1,462,1.386724,1.365224,-0.021501,0.980802,0.494372,0.909691,Barcelona,hd,1991-2020,EOBS,CERRA
2,462,61.927922,102.775225,40.847303,47.830443,41.093201,0.778621,Barcelona,rr,1981-2010,EOBS,CERRA
3,462,62.280375,103.433911,41.153535,48.072926,41.31241,0.784059,Barcelona,rr,1991-2020,EOBS,CERRA
4,462,53.136508,49.661422,-3.475086,22.43188,18.107054,0.85886,Barcelona,su,1981-2010,EOBS,CERRA
5,462,57.839394,55.079798,-2.759596,25.726959,20.69798,0.837852,Barcelona,su,1991-2020,EOBS,CERRA
6,210,0.003968,0.015568,0.0116,0.023023,0.012552,0.453479,Birmingham,hd,1981-2010,EOBS,CERRA
7,210,0.012063,0.024921,0.012857,0.029725,0.018254,0.669344,Birmingham,hd,1991-2020,EOBS,CERRA
8,210,140.599524,143.033333,2.43381,5.338025,4.374689,0.85961,Birmingham,rr,1981-2010,EOBS,CERRA
9,210,142.203175,149.566508,7.363333,9.10719,7.827778,0.834712,Birmingham,rr,1991-2020,EOBS,CERRA
