# EMD Graph Fetcher

Fetches all JSON-LD graphs for each EMD vocabulary endpoint using `cmipld`.

Mirrors the branch-aware logic in `helpers/data_loader.py`:
- **`docs` branch** — fetches from remote prefix URLs (`emd:endpoint/_graph.json`)
- **`main`/`pages`/production** — mounts local files with `cmipld.map_current()`

Endpoints covered (matching `summaries_config.py`):

| Stage | Endpoints |
|---|---|
| Stage 1: Grid Foundations | `horizontal_grid_cells`, `horizontal_subgrid` |
| Stage 2: Computational Grids | `horizontal_computational_grid`, `vertical_computational_grid` |
| Families | `model_family` |
| Stage 3: Components | `model_component`, `component_config` |
| Stage 4: Models | `model` |

## 1. Setup

In [None]:
import sys
import json
import subprocess
from pathlib import Path
from pprint import pprint

# Add the scripts helpers to path so we can reuse data_loader
SCRIPT_DIR = Path(".").resolve()
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

# Also add the cmipld package if running from a dev checkout
CMIPLD_DEV = Path("../../../CMIP-LD").resolve()
if CMIPLD_DEV.exists() and str(CMIPLD_DEV) not in sys.path:
    sys.path.insert(0, str(CMIPLD_DEV))

try:
    import cmipld
    print(f"cmipld loaded from: {cmipld.__file__}")
    CMIPLD_AVAILABLE = True
except ImportError as e:
    print(f"cmipld not available: {e}")
    CMIPLD_AVAILABLE = False
    
# cmipld.client.cache_clear()

cmipld loaded from: /Users/daniel.ellis/WIPwork/CMIP-LD/cmipld/__init__.py
Cache cleared: 8 entries removed


8

In [74]:
def get_current_branch() -> str:
    try:
        result = subprocess.run(
            ['git', 'rev-parse', '--abbrev-ref', 'HEAD'],
            capture_output=True, text=True, check=True,
            cwd=SCRIPT_DIR
        )
        return result.stdout.strip()
    except Exception:
        return "unknown"


def setup_cmipld():
    """Mirror data_loader.setup_for_branch() — mount local files on production branches."""
    if not CMIPLD_AVAILABLE:
        raise RuntimeError("cmipld is not available")

    branch = get_current_branch()
    prefix = cmipld.prefix()
    print(f"Branch : {branch}")
    print(f"Prefix : {prefix}")

    if branch == "docs":
        print("Mode   : remote (fetching from prefix URLs)")
        use_local = False
    else:
        # Mount the repo root so emd:foo/bar resolves locally
        repo_root = subprocess.run(
            ['git', 'rev-parse', '--show-toplevel'],
            capture_output=True, text=True, check=True,
            cwd=SCRIPT_DIR
        ).stdout.strip() + '/'
        cmipld.map_current(prefix, path=repo_root)
        print(f"Mode   : local  (mounted {repo_root})")
        use_local = True

    return prefix, use_local


PREFIX, USE_LOCAL = setup_cmipld()

Branch : docs
Prefix : emd
Mode   : remote (fetching from prefix URLs)


## 2. Endpoint definitions

All endpoints are taken directly from `summaries_config.py`.

In [75]:
# Ordered to follow the submission pipeline stages
ENDPOINTS = [
    # Stage 1: Grid Foundations
    {"key": "horizontal_grid_cells",        "label": "Horizontal Grid Cells",         "stage": "Stage 1"},
    {"key": "horizontal_subgrid",            "label": "Horizontal Subgrids",           "stage": "Stage 1"},
    # Stage 2: Computational Grids
    {"key": "horizontal_computational_grid", "label": "Horizontal Computational Grids","stage": "Stage 2"},
    {"key": "vertical_computational_grid",   "label": "Vertical Computational Grids",  "stage": "Stage 2"},
    # Families
    {"key": "model_family",                  "label": "Model Families",                "stage": "Families"},
    # Stage 3: Components
    {"key": "model_component",               "label": "Model Components",              "stage": "Stage 3"},
    {"key": "component_config",              "label": "Component Configurations",      "stage": "Stage 3"},
    # Stage 4: Models
    {"key": "model",                         "label": "Models",                        "stage": "Stage 4"},
]

print(f"{'Endpoint':<40} {'Stage'}")
print("-" * 55)
for ep in ENDPOINTS:
    print(f"{ep['key']:<40} {ep['stage']}")

Endpoint                                 Stage
-------------------------------------------------------
horizontal_grid_cells                    Stage 1
horizontal_subgrid                       Stage 1
horizontal_computational_grid            Stage 2
vertical_computational_grid              Stage 2
model_family                             Families
model_component                          Stage 3
component_config                         Stage 3
model                                    Stage 4


## 3. Fetch all graphs

In [76]:
def _normalise_contents(raw):
    """Coerce 'contents' to a list regardless of depth-dependent compaction shape."""
    if not raw: return []
    if isinstance(raw, list): return raw
    if isinstance(raw, dict): return list(raw.values())  # keyed by @id at depth>=4
    return []

def fetch_graph(endpoint_key: str, depth: int = 4) -> dict | None:
    """Fetch a single endpoint's _graph.json and return the raw compact JSON-LD."""
    url = f"{PREFIX}:{endpoint_key}/_graph.json"
    try:
        data = cmipld.get(url, depth=depth)
        return data
    except Exception as e:
        print(f"  ERROR fetching {url}: {e}")
        return None


def fetch_all_graphs(depth: int = 4) -> dict:
    """Fetch graphs for every endpoint. Returns {endpoint_key: data}."""
    graphs = {}
    print(f"{'Endpoint':<40} {'Entries':>7}  Status")
    print("-" * 60)
    for ep in ENDPOINTS:
        key = ep["key"]
        data = fetch_graph(key, depth=depth)
        graphs[key] = data
        if data is not None:
            n = len(_normalise_contents(data.get("contents")))
            print(f"{key:<40} {n:>7}  OK")
        else:
            print(f"{key:<40} {'?':>7}  FAILED")
    return graphs


GRAPHS = fetch_all_graphs(depth=4)
print(f"\nFetched {sum(1 for v in GRAPHS.values() if v is not None)}/{len(GRAPHS)} graphs successfully.")

Endpoint                                 Entries  Status
------------------------------------------------------------
[Cache MISS] emd:horizontal_grid_cells/_graph.json (depth=4)
horizontal_grid_cells                          5  OK
[Cache MISS] emd:horizontal_subgrid/_graph.json (depth=4)
horizontal_subgrid                             5  OK
[Cache MISS] emd:horizontal_computational_grid/_graph.json (depth=4)
horizontal_computational_grid                  4  OK
[Cache MISS] emd:vertical_computational_grid/_graph.json (depth=4)
vertical_computational_grid                    7  OK
[Cache MISS] emd:model_family/_graph.json (depth=4)
model_family                                  30  OK
[Cache MISS] emd:model_component/_graph.json (depth=4)
model_component                               10  OK
[Cache MISS] emd:component_config/_graph.json (depth=4)
component_config                              14  OK
[Cache MISS] emd:model/_graph.json (depth=4)
model                                         16

## 4. Inspect individual graphs

### 4a. Stage 1 — Grid Foundations

In [77]:
# ── Horizontal Grid Cells ─────────────────────────────────────────────────────
hgc = GRAPHS.get("horizontal_grid_cells")
if hgc:
    items = _normalise_contents(hgc.get("contents"))
    print(f"horizontal_grid_cells: {len(items)} entries")
    print("\nFirst entry:")
    pprint(items[0] if items else {})

horizontal_grid_cells: 5 entries

First entry:
{'@id': 'g100',
 '@type': ['emd', 'wcrp:horizontal_grid_cells', 'esgvoc:HorizontalGridCells'],
 'description': 'Global regular latitude-longitude grid with 1.25° x 0.9° '
                'resolution and 55296 cells.',
 'grid_mapping': {'@id': 'latitude-longitude',
                  '@type': ['universal',
                            'wcrp:grid_mapping',
                            'esgvoc:GridMapping',
                            'constants'],
                  'description': 'A projection that treats latitude and '
                                 'longitude as planar X and Y coordinates, as '
                                 'defined by the CF conventions.',
                  'ui_label': 'Latitude-Longitude',
                  'validation_key': 'latitude_longitude'},
 'grid_type': {'@id': 'regular-latitude-longitude',
               '@type': ['universal',
                         'wcrp:grid_type',
                         'esgvoc:GridType

In [78]:
# ── Horizontal Subgrids ───────────────────────────────────────────────────────
hsg = GRAPHS.get("horizontal_subgrid")
if hsg:
    items = _normalise_contents(hsg.get("contents"))
    print(f"horizontal_subgrid: {len(items)} entries")
    print("\nFirst entry:")
    pprint(items[0] if items else {})

horizontal_subgrid: 5 entries

First entry:
{'@id': 's103',
 '@type': ['emd', 'wcrp:horizontal_subgrid', 'esgvoc:HorizontalSubgrid'],
 'cell_variable_type': [{'@id': 'mass',
                         '@type': ['universal',
                                   'wcrp:cell_variable_type',
                                   'esgvoc:CellVariableType',
                                   'constants'],
                         'description': 'Mass-related variables, including '
                                        'those representing thermodynamic and '
                                        'hydrodynamic quantities.',
                         'ui_label': 'Mass',
                         'validation_key': 'mass'},
                        {'@id': 'x-velocity',
                         '@type': ['universal',
                                   'wcrp:cell_variable_type',
                                   'esgvoc:CellVariableType',
                                   'constants'],
                

### 4b. Stage 2 — Computational Grids

In [79]:
# ── Horizontal Computational Grids ───────────────────────────────────────────
hcg = GRAPHS.get("horizontal_computational_grid")
if hcg:
    items = _normalise_contents(hcg.get("contents"))
    print(f"horizontal_computational_grid: {len(items)} entries")
    print("\nFirst entry:")
    pprint(items[0] if items else {})

horizontal_computational_grid: 4 entries

First entry:
{'@id': 'c100',
 '@type': ['emd',
           'wcrp:horizontal_computational_grid',
           'esgvoc:HorizontalComputationalGrid'],
 'arrangement': {'@id': 'arakawa-c',
                 '@type': ['universal',
                           'wcrp:arrangement',
                           'esgvoc:Arrangement',
                           'constants'],
                 'description': 'The Arakawa C grid places velocity-related '
                                'quantities at the centres of mass cell edges, '
                                'such that each component is perpendicular to '
                                'its edge.',
                 'ui_label': 'Arakawa C-grid',
                 'validation_key': 'arakawa-c'},
 'description': 'Arakawa-C grid combining regular latitude-longitude and '
                'spectral Gaussian subgrids for atmospheric modelling.',
 'horizontal_subgrids': [{'@id': 's100',
                          '@t

In [80]:
# ── Vertical Computational Grids ─────────────────────────────────────────────
vcg = GRAPHS.get("vertical_computational_grid")
if vcg:
    items = _normalise_contents(vcg.get("contents"))
    print(f"vertical_computational_grid: {len(items)} entries")
    print("\nFirst entry:")
    pprint(items[0] if items else {})

vertical_computational_grid: 7 entries

First entry:
{'@id': 'v102',
 '@type': ['emd',
           'wcrp:vertical_computational_grid',
           'esgvoc:VerticalComputationalGrid'],
 'bottom_layer_thickness': 4,
 'description': '14 soil layers (0.01, 0.04, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, '
                '1.5, 2.0, 3.0, 5.0, 8.0, and 12.0 m) but the soil moisture '
                'profile is computed only over the rooting depth, from 0.2 '
                '(rocks) to 8 m (tropical forest) according to the land cover '
                'type, while the soil temperature is computed down to a depth '
                'of 12 m.',
 'n_z': 14,
 'top_layer_thickness': 0.01,
 'total_thickness': 12,
 'ui_label': '',
 'validation_key': 'v102',
 'vertical_coordinate': {'@id': 'depth',
                         '@type': ['universal',
                                   'wcrp:vertical_coordinate',
                                   'esgvoc:VerticalCoordinate',
                                   'consta

### 4c. Families

In [81]:
# ── Model Families (component + ESM) ─────────────────────────────────────────
mf = GRAPHS.get("model_family")
if mf:
    items = _normalise_contents(mf.get("contents"))
    component_families = [x for x in items if x.get("family_type") == "component"]
    model_families     = [x for x in items if x.get("family_type") == "model"]
    other_families     = [x for x in items if x.get("family_type") not in ("component", "model")]

    print(f"model_family total : {len(items)}")
    print(f"  component_family : {len(component_families)}")
    print(f"  model_family     : {len(model_families)}")
    print(f"  other / unset    : {len(other_families)}")
    print("\nFirst model family entry:")
    pprint(model_families[0] if model_families else items[0] if items else {})

model_family total : 30
  component_family : 14
  model_family     : 16
  other / unset    : 0

First model family entry:
{'@id': 'icon',
 '@type': ['emd', 'wcrp:model_family', 'esgvoc:ModelFamily'],
 'collaborative_institutions': {'@id': 'mpi-m',
                                '@type': ['wcrp:organisation',
                                          'esgvoc:Organisation',
                                          'universal',
                                          'constants'],
                                'acronyms': 'MPI-M',
                                'aliases': [],
                                'description': '',
                                'established': 1975,
                                'kind': 'facility',
                                'labels': ['Max Planck Institute for '
                                           'Meteorology',
                                           'Max-Planck-Institut für '
                                           'Meteorologie']

### 4d. Stage 3 — Components

In [82]:
# ── Model Components ──────────────────────────────────────────────────────────
mc = GRAPHS.get("model_component")
if mc:
    items = _normalise_contents(mc.get("contents"))
    print(f"model_component: {len(items)} entries")
    print("\nFirst entry:")
    pprint(items[0] if items else {})

model_component: 10 entries

First entry:
{'@id': 'surfex-v8-modeling-platform',
 '@type': ['emd', 'wcrp:model_component', 'esgvoc:ModelComponent'],
 'code_base': 'https://www.umr-cnrm.fr/surfex/spip.php?rubrique8',
 'component': {'@id': 'land-surface',
               '@type': ['wcrp:scientific_domain',
                         'esgvoc:ScientificDomain',
                         'universal',
                         'constants'],
               'alias': 'land',
               'description': 'Land Surface and Subsurface',
               'ui_label': 'Land Surface and Subsurface',
               'validation_key': 'land_surface'},
 'description': 'SURFEXv8.0 encompasses several submodules for modeling the '
                'interactions between the atmosphere, the ocean, the lakes and '
                'the land surface.',
 'family': {'@id': 'surfex',
            '@type': ['emd', 'wcrp:model_family', 'esgvoc:ModelFamily'],
            'collaborative_institutions': [],
            'common_s

In [83]:
# ── Component Configurations ──────────────────────────────────────────────────
cc = GRAPHS.get("component_config")
if cc:
    items = _normalise_contents(cc.get("contents"))
    print(f"component_config: {len(items)} entries")
    print("\nFirst entry:")
    pprint(items[0] if items else {})

component_config: 14 entries

First entry:
{'@id': 'ocean-biogeochemistry-piscesv2-gas-c101-v103',
 '@type': ['emd', 'wcrp:component_config', 'esgvoc:ComponentConfig'],
 'description': 'Configuration for PISCESv2-gas with horizontal grid c101 and '
                'vertical grid v103.',
 'horizontal_computational_grid': {'@id': 'c101',
                                   '@type': ['emd',
                                             'wcrp:horizontal_computational_grid',
                                             'esgvoc:HorizontalComputationalGrid'],
                                   'arrangement': {'@id': 'arakawa-c',
                                                   '@type': ['universal',
                                                             'wcrp:arrangement',
                                                             'esgvoc:Arrangement',
                                                             'constants'],
                                                   'descrip

### 4e. Stage 4 — Models

In [84]:
# ── Models ────────────────────────────────────────────────────────────────────
models = GRAPHS.get("model")
if models:
    items = _normalise_contents(models.get("contents"))
    print(f"model: {len(items)} entries")
    print("\nFirst entry:")
    pprint(items[0] if items else {})

model: 16 entries

First entry:


KeyError: 0

## 5. Summary table

In [None]:
print(f"{'Stage':<12} {'Endpoint':<40} {'Entries':>7}")
print("-" * 62)
for ep in ENDPOINTS:
    key   = ep["key"]
    stage = ep["stage"]
    data  = GRAPHS.get(key)
    n = len(_normalise_contents(data.get("contents"))) if data else "ERR"
    print(f"{stage:<12} {key:<40} {str(n):>7}")

## 6. Generate D3 / visualization graphs (graphify)

Uses `cmipld.generate.graphify` to produce:
- `_graph.json` per vocab directory
- `_d3graph.json` — entity-level relationship graph
- `_d3structure.json` — folder-level structure graph

In [None]:
try:
    from cmipld.generate.graphify import (
        find_vocab_directories,
        generate_jsonld_graph,
        process_all,
        print_summary,
    )
    GRAPHIFY_AVAILABLE = True
    print("graphify imported OK")
except ImportError as e:
    GRAPHIFY_AVAILABLE = False
    print(f"graphify not available: {e}")

In [None]:
if GRAPHIFY_AVAILABLE:
    # Run from the repo root (two levels above docs/scripts/)
    REPO_ROOT = (SCRIPT_DIR / "../.." ).resolve()
    print(f"Repo root: {REPO_ROOT}")

    vocab_dirs = find_vocab_directories(REPO_ROOT)
    print(f"Found {len(vocab_dirs)} vocab directories:")
    for d in vocab_dirs:
        print(f"  {d.name}")

In [None]:
if GRAPHIFY_AVAILABLE and vocab_dirs:
    # Generate _graph.json for each vocab directory
    print("Generating JSON-LD graph files...\n")
    for vd in vocab_dirs:
        print(f"  {vd.name}")
        result = generate_jsonld_graph(vd, verbose=True)
        if result["status"] != "success":
            print(f"    WARNING: {result['status']} — {result['message']}")

In [None]:
if GRAPHIFY_AVAILABLE and vocab_dirs:
    # Full run: JSON-LD + RDF + D3 visualisation graphs
    # Set generate_rdf=False if rdflib is not installed
    try:
        import rdflib
        GEN_RDF = True
    except ImportError:
        GEN_RDF = False
        print("rdflib not installed — skipping RDF/Turtle and D3 graph generation")

    results = process_all(
        base_path=REPO_ROOT,
        vocab_dirs=vocab_dirs,
        generate_rdf=GEN_RDF,
        generate_viz=GEN_RDF,   # D3 graphs require RDF triples
        verbose=True,
    )

    print_summary(results)

## 7. Visualise the structure graph (optional)

Requires `networkx` and `ipysigma`:
```
pip install networkx ipysigma
```

In [None]:
structure_file = REPO_ROOT / "_d3structure.json"

if structure_file.exists():
    try:
        import networkx as nx
        from ipysigma import Sigma

        with open(structure_file) as f:
            struct_data = json.load(f)

        G_struct = nx.node_link_graph(struct_data)
        print(f"Structure graph: {G_struct.number_of_nodes()} nodes, {G_struct.number_of_edges()} edges")
        Sigma(G_struct, node_color="color", edge_color="color", height=700)
    except ImportError:
        print("networkx / ipysigma not installed — skipping visualisation")
        print(f"Structure graph saved to: {structure_file}")
else:
    print("_d3structure.json not found — run Section 6 first (requires rdflib)")

In [None]:
entity_file = REPO_ROOT / "_d3graph.json"

if entity_file.exists():
    try:
        import networkx as nx
        from ipysigma import Sigma

        with open(entity_file) as f:
            entity_data = json.load(f)

        G_entity = nx.node_link_graph(entity_data)
        print(f"Entity graph: {G_entity.number_of_nodes()} nodes, {G_entity.number_of_edges()} edges")
        Sigma(G_entity, node_color="color", edge_color="color", height=700)
    except ImportError:
        print("networkx / ipysigma not installed — skipping visualisation")
        print(f"Entity graph saved to: {entity_file}")
else:
    print("_d3graph.json not found — run Section 6 first (requires rdflib)")