#### Analysis of EURO-CORDEX annual data

- The EURO-CORDEX experiments selected for CARMINE influence the climate indicators computed for the CARMINE CSAs in Europe.
- This notebook quantifies the resulting uncertainties in the indicators for the CSAs and provides communication material for stakeholders and scientific conferences.

In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Iterable, List

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that contains ``.git``.

    *start* is resolved to an absolute path first; the resolved path itself is
    checked before its parents, closest parent first.

    Raises:
        RuntimeError: if no directory on the way up contains a ``.git`` entry.
    """
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        git_marker = candidate / ".git"
        if git_marker.exists():
            return candidate
    raise RuntimeError("Could not find repo root (.git not found).")

# Locate the repository once; every path below is derived from it.
REPO_ROOT = find_repo_root(Path.cwd())

# The dataset tree sits directly under the repository root.
DATA_ROOT: Path = REPO_ROOT

# Generated artefacts (should NOT be committed): tables and figures under outputs/.
OUT_ROOT: Path = REPO_ROOT / "outputs"
TABLE_DIR: Path = OUT_ROOT / "tables"
FIG_DIR: Path = OUT_ROOT / "figures"
for _out_dir in (TABLE_DIR, FIG_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Quick confirmation that the expected layout is in place.
print("REPO_ROOT:", REPO_ROOT)
print("TABLE_DIR exists:", TABLE_DIR.exists())
print("FIG_DIR exists:", FIG_DIR.exists())

In [None]:
# =============================================================================
# EURO-CORDEX test data paths (temporary testing workflows)
# =============================================================================

# Anchor both locations at REPO_ROOT when an earlier cell defined it; otherwise
# fall back to paths relative to the current working directory.
if "REPO_ROOT" in globals():
    # Root folder containing zipped EURO-CORDEX test data
    EUROCORDEX_TESTDATA_ROOT = REPO_ROOT / "2601_EURO_CORDEX_testing_data"
    # Dedicated output root for EURO-CORDEX test results
    EUROCORDEX_TEST_OUTPUT_ROOT = REPO_ROOT / "outputs" / "eurocordex_testing"
else:
    EUROCORDEX_TESTDATA_ROOT = Path("./2601_EURO_CORDEX_testing_data")
    EUROCORDEX_TEST_OUTPUT_ROOT = Path("./outputs/eurocordex_testing")

# Unzip cache folder (kept short to avoid Windows path length issues)
EUROCORDEX_TEST_UNZIP_ROOT = EUROCORDEX_TESTDATA_ROOT / "_unzipped"

# Make sure the output directory exists before any results are written.
EUROCORDEX_TEST_OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

print("DATA:", EUROCORDEX_TESTDATA_ROOT)

In [None]:
# =============================================================================
# Unzip EURO-CORDEX test data (zip -> _unzipped)
# =============================================================================
# Every *.zip found (recursively) under EUROCORDEX_TESTDATA_ROOT is unpacked into
# EUROCORDEX_TEST_UNZIP_ROOT/<zip stem>/. Directory structure inside an archive
# is dropped (flattened), so each member lands directly in that folder.
from pathlib import Path
import shutil
import zipfile

zip_root = EUROCORDEX_TESTDATA_ROOT
unzip_root = EUROCORDEX_TEST_UNZIP_ROOT

zip_files = sorted(zip_root.rglob("*.zip"))
print(f"Found {len(zip_files)} zip file(s)")

for zpath in zip_files:
    out_dir = unzip_root / zpath.stem
    out_dir.mkdir(parents=True, exist_ok=True)

    print("\nZIP:", zpath.name)
    print(" -> unpack to:", out_dir)

    with zipfile.ZipFile(zpath, "r") as z:
        # Names ending in "/" are directory entries and carry no data.
        members = [m for m in z.namelist() if not m.endswith("/")]
        print("   entries:", len(members))

        written = []
        seen_names = set()
        for m in members:
            target = out_dir / Path(m).name  # flatten: ignore internal paths
            if target.name in seen_names:
                # Flattening maps members from different internal folders onto
                # the same file; make the overwrite visible instead of silent.
                print("   WARNING: duplicate name after flattening, overwriting:", target.name)
            seen_names.add(target.name)

            # Stream the member to disk in chunks rather than loading the
            # whole decompressed file into memory at once.
            with z.open(m) as src, target.open("wb") as dst:
                shutil.copyfileobj(src, dst)
            written.append(target)

    print("   written files:", len(written))
    if written:
        print("   first file:", written[0].name, "bytes:", written[0].stat().st_size)

In [None]:
# Path-length check for one extracted file: report the length of the full path
# and of its containing directory (everything before the last backslash).
p = r"C:\Users\reinhvlr\OneDrive\Documents\RUB\Projects\CARMINE\Work\carmine-analyses\data\CARMINE-T2.4VR\CARMINE-T2.4\2601_EURO_CORDEX_testing_data\_unzipped\67a8bafca25cded2f165c94dc94b23ce\tas_EUR-11_NCC-NorESM1-M_rcp26_r1i1p1_KNMI-RACMO22E_v1_mon_203101-204012.nc"
print("len(path) =", len(p))
print("len(dir)  =", len(p.rpartition("\\")[0]))


In [None]:
# =============================================================================
# Inventory (unzipped NetCDFs): files, timesteps, time coverage, data origin
# =============================================================================
from __future__ import annotations

from pathlib import Path
import warnings
import pandas as pd
import xarray as xr

# Folder that the unzip cell extracted the archives into.
UNZIP_ROOT = EUROCORDEX_TEST_UNZIP_ROOT
print("UNZIP_ROOT:", UNZIP_ROOT)

# Collect every path below UNZIP_ROOT whose suffix (case-insensitive) is one of
# the recognised NetCDF extensions, in sorted order.
nc_files = sorted(
    candidate
    for candidate in UNZIP_ROOT.rglob("*")
    if candidate.suffix.lower() in (".nc", ".nc4", ".cdf")
)
print(f"Found {len(nc_files)} NetCDF file(s) under {UNZIP_ROOT}")

def _first_attr(ds: xr.Dataset, keys: list[str]) -> str | None:
    for k in keys:
        if k in ds.attrs and ds.attrs.get(k) not in (None, ""):
            return str(ds.attrs.get(k))
    return None

def _open_for_inventory(fp: Path) -> "xr.Dataset":
    """Open *fp* with xarray, retrying without time decoding if the first attempt fails."""
    try:
        return xr.open_dataset(fp, decode_times=True)
    except Exception:
        # Retry with raw time values so sizes and attributes can still be
        # inventoried. If the file itself is unreadable this raises again and
        # the caller records the error.
        return xr.open_dataset(fp, decode_times=False)


def summarize_netcdf(fp: Path) -> dict:
    """Build a one-row inventory summary for a single NetCDF file.

    The returned dict always contains the same keys:

    * ``zip_folder``, ``file_name``, ``path``, ``bytes`` -- taken from the
      file system (``zip_folder`` is the name of the parent directory).
    * ``vars``, ``dims`` -- comma-separated data-variable names and
      ``name:size`` pairs.
    * ``n_time``, ``time_start``, ``time_end`` -- length of the ``time``
      dimension and the first/last value of the ``time`` coordinate.
    * ``institution``, ``source``, ``driving_model``, ``rcm_model``,
      ``ensemble_member``, ``experiment``, ``domain``, ``grid`` -- best-effort
      provenance read from global attributes via ``_first_attr``.

    Fields that cannot be determined stay ``None``. If the dataset cannot be
    opened or read, ``vars`` and ``dims`` are set to ``"ERROR"`` and
    ``experiment`` carries ``"ERROR: <message>"``; the exception is not
    propagated. A missing file still raises from ``fp.stat()``.
    """
    fp = Path(fp)
    row = {
        "zip_folder": fp.parent.name,          # extracted zip stem folder
        "file_name": fp.name,
        "path": str(fp),
        "bytes": fp.stat().st_size,
        "vars": None,
        "dims": None,
        "n_time": None,
        "time_start": None,
        "time_end": None,
        # "data origin" / provenance (best-effort from global attrs)
        "institution": None,
        "source": None,
        "driving_model": None,
        "rcm_model": None,
        "ensemble_member": None,
        "experiment": None,
        "domain": None,
        "grid": None,
    }

    try:
        # Decoded times are preferred; if decoding fails the dataset is opened
        # undecoded so n_time and the attributes are still captured.
        ds = _open_for_inventory(fp)
        try:
            row["vars"] = ",".join(ds.data_vars.keys())
            row["dims"] = ",".join(f"{k}:{v}" for k, v in ds.sizes.items())

            if "time" in ds.sizes:
                row["n_time"] = int(ds.sizes["time"])
            if "time" in ds.coords and ds.sizes.get("time", 0) > 0:
                time_values = ds["time"].values
                first, last = time_values[0], time_values[-1]
                if time_values.dtype.kind in "iuf":
                    # Undecoded numeric offsets: pd.to_datetime would read
                    # them as nanoseconds since 1970, so report them verbatim.
                    row["time_start"] = str(first)
                    row["time_end"] = str(last)
                else:
                    try:
                        t0 = pd.to_datetime(first)
                        t1 = pd.to_datetime(last)
                        row["time_start"] = str(t0)
                        row["time_end"] = str(t1)
                    except Exception:
                        # Values pandas cannot convert are stringified as-is.
                        row["time_start"] = str(first)
                        row["time_end"] = str(last)

            row["institution"] = _first_attr(ds, ["institution", "institute_id"])
            row["source"] = _first_attr(ds, ["source", "title"])
            row["driving_model"] = _first_attr(ds, ["driving_model_id", "driving_model"])
            row["rcm_model"] = _first_attr(ds, ["model_id", "rcm_model", "regional_model_id"])
            row["ensemble_member"] = _first_attr(ds, ["driving_model_ensemble_member", "ensemble_member", "realization"])
            row["experiment"] = _first_attr(ds, ["experiment_id", "scenario", "scenario_id"])
            row["domain"] = _first_attr(ds, ["domain_id", "CORDEX_domain"])
            row["grid"] = _first_attr(ds, ["grid", "grid_label"])

        finally:
            ds.close()

    except Exception as e:
        # Best-effort inventory: one bad file must not abort the whole scan.
        row["vars"] = "ERROR"
        row["dims"] = "ERROR"
        row["experiment"] = f"ERROR: {e}"

    return row

if nc_files:
    # One summary row per NetCDF file, collected into a table.
    rows = list(map(summarize_netcdf, nc_files))
    df = pd.DataFrame(rows)

    # Compact view: identification, time coverage, provenance, then structure.
    key_cols = [
        "zip_folder", "file_name", "n_time", "time_start", "time_end",
        "experiment", "driving_model", "rcm_model", "ensemble_member",
        "institution", "source", "domain", "grid",
        "vars", "dims",
    ]
    display(df[key_cols])

    # File counts per (experiment, driving model, RCM) combination, largest
    # first; rows with missing values are kept as their own groups.
    origin_counts = (
        df.groupby(["experiment", "driving_model", "rcm_model"], dropna=False)
        .size()
        .reset_index(name="n_files")
        .sort_values("n_files", ascending=False)
    )
    display(origin_counts)
else:
    warnings.warn("No NetCDFs found. Did the unzip cell run successfully?")
