In [None]:
import os
import glob
from pathlib import Path
import xarray as xr
import pandas as pd

# Base directory for CLIMBRA data
inputs_base = "../../workspace/demo_inputs"
climbra_base = Path(inputs_base) 


## Step 1: Explore a specific folder structure

Let's pick the `Compound/FWI/ensemble` folder as an example to understand the file structure.


In [None]:
# Pick a specific folder to explore
example_folder = climbra_base / "hazards" / "Compound" / "FWI" / "ensemble"

# List all files in this folder
print(f"Files in {example_folder}:")
print("=" * 60)

files_info = []
for file in sorted(example_folder.glob("*.nc")):
    file_size = file.stat().st_size / (1024 * 1024)  # Size in MB
    files_info.append({
        'filename': file.name,
        'size_mb': f"{file_size:.2f}"
    })
    print(f"  {file.name:45s} ({file_size:.2f} MB)")

print(f"\nTotal files: {len(files_info)}")

# Quick peek at one file structure
if files_info:
    sample_file = example_folder / files_info[0]['filename']
    print(f"\n{'='*60}")
    print(f"Structure of {sample_file.name}:")
    print("=" * 60)
    ds = xr.open_dataset(sample_file)
    print(ds)
    ds.close()


Files in ../../workspace/demo_inputs/Compound/FWI/ensemble:

Total files: 0


## Step 2: Find all files matching `*_return_periods.nc` pattern

Now let's search across all CLIMBRA folders for files matching the pattern.


In [12]:
# Find all files matching *_return_periods.nc pattern
return_period_files_1 = list(climbra_base.glob("**/ensemble/*ensemble_return_periods.nc"))
return_period_files_2 = list(climbra_base.glob("**/ensemble/*ensemble_return_period.nc"))
return_period_files_3 = list(climbra_base.glob("**/Flood/GIRI_flood_depth_cube.nc"))
return_period_files = return_period_files_1 + return_period_files_2 + return_period_files_3

print(f"Found {len(return_period_files)} files matching '*_return_periods.nc' pattern:")
print("=" * 80)

for file in sorted(return_period_files):
    # Get relative path from climbra_base
    rel_path = file.relative_to(climbra_base)
    file_size = file.stat().st_size / (1024 * 1024)  # Size in MB
    print(f"  {str(rel_path):60s} ({file_size:.2f} MB)")


Found 11 files matching '*_return_periods.nc' pattern:
  hazards/Compound/FWI/ensemble/ensemble_return_periods.nc     (2.57 MB)
  hazards/Compound/HI/ensemble/ensemble_return_period.nc       (2.55 MB)
  hazards/Drought/CDD/ensemble/ensemble_return_period.nc       (2.83 MB)
  hazards/Drought/SPI6/ensemble/ensemble_return_period.nc      (2.66 MB)
  hazards/ExtremeRainfall/CWD/ensemble/ensemble_return_period.nc (2.90 MB)
  hazards/ExtremeRainfall/Rx1day/ensemble/ensemble_return_period.nc (2.87 MB)
  hazards/ExtremeRainfall/Rx5day/ensemble/ensemble_return_period.nc (2.79 MB)
  hazards/Flood/GIRI_flood_depth_cube.nc                       (7539.85 MB)
  hazards/Heat/Frost/ensemble/ensemble_return_period.nc        (0.95 MB)
  hazards/Heat/TNN/ensemble/ensemble_return_period.nc          (2.78 MB)
  hazards/Heat/TXX/ensemble/ensemble_return_period.nc          (2.32 MB)


## Step 3: Load all return period files into a dictionary

We'll organize the data by hazard type and model type (ensemble vs individual).

In [13]:
# Load all return period files into a structured dictionary
return_period_data = {}

print("Loading return period datasets...")
print("=" * 80)

for file in sorted(return_period_files):
    # Parse the path to extract metadata
    parts = file.relative_to(climbra_base).parts
    
    # Create a hierarchical key structure
    # e.g., "Compound/FWI/ensemble"
    category = parts[0]  # e.g., "Compound", "Drought", "Heat"
    hazard = parts[1] if len(parts) > 1 else "unknown"  # e.g., "FWI", "CDD"
    model_type = parts[2] if len(parts) > 2 else "unknown"  # e.g., "ensemble", "individual_models"
    filename = file.name
    
    # Create nested structure
    if category not in return_period_data:
        return_period_data[category] = {}
    if hazard not in return_period_data[category]:
        return_period_data[category][hazard] = {}
    if model_type not in return_period_data[category][hazard]:
        return_period_data[category][hazard][model_type] = {}
    
    # Load the dataset
    try:
        ds = xr.open_dataset(file)
        return_period_data[category][hazard][model_type][filename] = ds
        print(f"✓ Loaded: {category}/{hazard}/{model_type}/{filename}")
    except Exception as e:
        print(f"✗ Failed to load {file}: {e}")

print(f"\n{'='*80}")
print(f"Successfully loaded {len(return_period_files)} return period datasets")
print(f"{'='*80}")


Loading return period datasets...
✓ Loaded: hazards/Compound/FWI/ensemble_return_periods.nc
✓ Loaded: hazards/Compound/HI/ensemble_return_period.nc
✓ Loaded: hazards/Drought/CDD/ensemble_return_period.nc
✓ Loaded: hazards/Drought/SPI6/ensemble_return_period.nc
✓ Loaded: hazards/ExtremeRainfall/CWD/ensemble_return_period.nc
✓ Loaded: hazards/ExtremeRainfall/Rx1day/ensemble_return_period.nc
✓ Loaded: hazards/ExtremeRainfall/Rx5day/ensemble_return_period.nc
✓ Loaded: hazards/Flood/GIRI_flood_depth_cube.nc/GIRI_flood_depth_cube.nc
✓ Loaded: hazards/Heat/Frost/ensemble_return_period.nc
✓ Loaded: hazards/Heat/TNN/ensemble_return_period.nc
✓ Loaded: hazards/Heat/TXX/ensemble_return_period.nc

Successfully loaded 11 return period datasets


## Step 4: Explore the loaded data structure

Let's see what we have loaded and examine one dataset.


In [14]:
# Display the structure of loaded data
print("Data structure:")
print("=" * 80)

for category in return_period_data:
    print(f"\n{category}/")
    for hazard in return_period_data[category]:
        print(f"  {hazard}/")
        for model_type in return_period_data[category][hazard]:
            files = list(return_period_data[category][hazard][model_type].keys())
            print(f"    {model_type}/ ({len(files)} files)")
            for file in files:
                print(f"      - {file}")

# Access example: Get the FWI ensemble return periods dataset
if 'Compound' in return_period_data and 'FWI' in return_period_data['Compound']:
    if 'ensemble' in return_period_data['Compound']['FWI']:
        example_ds = list(return_period_data['Compound']['FWI']['ensemble'].values())[0]
        print(f"\n{'='*80}")
        print("Example dataset (Compound/FWI/ensemble):")
        print("=" * 80)
        print(example_ds)


Data structure:

hazards/
  Compound/
    FWI/ (1 files)
      - ensemble_return_periods.nc
    HI/ (1 files)
      - ensemble_return_period.nc
  Drought/
    CDD/ (1 files)
      - ensemble_return_period.nc
    SPI6/ (1 files)
      - ensemble_return_period.nc
  ExtremeRainfall/
    CWD/ (1 files)
      - ensemble_return_period.nc
    Rx1day/ (1 files)
      - ensemble_return_period.nc
    Rx5day/ (1 files)
      - ensemble_return_period.nc
  Flood/
    GIRI_flood_depth_cube.nc/ (1 files)
      - GIRI_flood_depth_cube.nc
  Heat/
    Frost/ (1 files)
      - ensemble_return_period.nc
    TNN/ (1 files)
      - ensemble_return_period.nc
    TXX/ (1 files)
      - ensemble_return_period.nc


## Accessing the Data

You can now access the loaded datasets using the hierarchical structure:

```python
# Access specific dataset
ds = return_period_data['Compound']['FWI']['ensemble']['ensemble_return_periods.nc']

# Or iterate through all datasets
for category in return_period_data:
    for hazard in return_period_data[category]:
        for model_type in return_period_data[category][hazard]:
            for filename, dataset in return_period_data[category][hazard][model_type].items():
                # Do something with dataset
                pass
```


In [15]:

# Or iterate through all datasets
for category in return_period_data:
    for hazard in return_period_data[category]:
        for model_type in return_period_data[category][hazard]:
            for filename, dataset in return_period_data[category][hazard][model_type].items():
                # Do something with dataset
                print(dataset)

<xarray.Dataset> Size: 9MB
Dimensions:        (ensemble: 4, GWL: 4, lat: 162, lon: 168, return_period: 5)
Coordinates:
  * ensemble       (ensemble) <U6 96B 'mean' 'median' 'p10' 'p90'
  * GWL            (GWL) <U7 112B 'present' '1.5' '2' '3'
  * lat            (lat) float64 1kB -34.12 -33.88 -33.62 ... 5.625 5.875 6.125
  * lon            (lon) float64 1kB -74.12 -73.88 -73.62 ... -32.62 -32.38
  * return_period  (return_period) int64 40B 5 10 25 50 100
Data variables:
    FWI_max        (ensemble, GWL, lat, lon, return_period) float32 9MB ...
<xarray.Dataset> Size: 9MB
Dimensions:        (ensemble: 4, GWL: 4, return_period: 5, lat: 162, lon: 168)
Coordinates:
  * ensemble       (ensemble) <U6 96B 'mean' 'median' 'p10' 'p90'
  * GWL            (GWL) <U7 112B 'present' '1.5' '2' '3'
  * return_period  (return_period) int64 40B 5 10 25 50 100
  * lat            (lat) float64 1kB -34.12 -33.88 -33.62 ... 5.625 5.875 6.125
  * lon            (lon) float64 1kB -74.12 -73.88 -73.62 ... -32.

## Step 5: Copy files to demo_inputs/hazards


Copy all return period files to `workspace/demo_inputs/hazards` while preserving the folder structure.


In [None]:
import shutil

# Define destination directory
dest_base = Path(inputs_base) / "hazards"

print(f"Copying {len(return_period_files)} files to {dest_base}")
print("=" * 80)

# Copy each file while preserving folder structure
copied_count = 0
failed_count = 0

for file in sorted(return_period_files):
    # Get relative path from CLIMBRA base
    rel_path = file.relative_to(climbra_base)
    
    # Create destination path
    dest_file = dest_base / rel_path
    
    # Create parent directories if they don't exist
    dest_file.parent.mkdir(parents=True, exist_ok=True)
    
    try:
        # Copy the file
        shutil.copy2(file, dest_file)
        copied_count += 1
        print(f"✓ Copied: {rel_path}")
    except Exception as e:
        failed_count += 1
        print(f"✗ Failed to copy {rel_path}: {e}")

print(f"\n{'='*80}")
print(f"Summary:")
print(f"  Successfully copied: {copied_count} files")
print(f"  Failed: {failed_count} files")
print(f"  Destination: {dest_base.absolute()}")
print(f"{'='*80}")


Copying 11 files to demo_inputs/hazards
✓ Copied: hazards/Compound/FWI/ensemble/ensemble_return_periods.nc
✓ Copied: hazards/Compound/HI/ensemble/ensemble_return_period.nc
✓ Copied: hazards/Drought/CDD/ensemble/ensemble_return_period.nc
✓ Copied: hazards/Drought/SPI6/ensemble/ensemble_return_period.nc
✓ Copied: hazards/ExtremeRainfall/CWD/ensemble/ensemble_return_period.nc
✓ Copied: hazards/ExtremeRainfall/Rx1day/ensemble/ensemble_return_period.nc
✓ Copied: hazards/ExtremeRainfall/Rx5day/ensemble/ensemble_return_period.nc
✓ Copied: hazards/Flood/GIRI_flood_depth_cube.nc
✓ Copied: hazards/Heat/Frost/ensemble/ensemble_return_period.nc
✓ Copied: hazards/Heat/TNN/ensemble/ensemble_return_period.nc
✓ Copied: hazards/Heat/TXX/ensemble/ensemble_return_period.nc

Summary:
  Successfully copied: 11 files
  Failed: 0 files
  Destination: /Users/bertrandgallice/code/Theia-Finance-Labs/climate.risk.tool/data-raw/notebooks/demo_inputs/hazards
