# Process Variable Registry CSVs to WCRP-universe JSON Format

This notebook processes CSV files from the Variable Registry and converts them to JSON format compatible with WCRP-universe.

In [52]:
# Import required libraries
import pandas as pd
import json
import os
import sys
from pathlib import Path
import subprocess
from glob import glob

# Add CMIP-LD to path for imports
sys.path.append('/Users/daniel.ellis/WIPwork/CMIP-LD')

# Import cmipld modules
# from cmipld.generate.validate_json import validate_json_file
# from cmipld.generate.create_readme import create_readme

In [53]:
# Define paths.
# The Variable-Registry checkout location can be overridden with the
# VARIABLE_REGISTRY_ROOT environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
REGISTRY_ROOT = Path(
    os.environ.get('VARIABLE_REGISTRY_ROOT', '/Users/daniel.ellis/WIPwork/Variable-Registry')
)
VARIABLE_REGISTRY_PATH = REGISTRY_ROOT / '.src'   # CSV exports are read from here
OUTPUT_PATH = REGISTRY_ROOT / 'src-data'          # generated JSON is written here

# A missing source directory would otherwise just look like "0 CSV files"
if not VARIABLE_REGISTRY_PATH.is_dir():
    print(f"Warning: source directory not found: {VARIABLE_REGISTRY_PATH}")

# List CSV files in the directory
csv_files = list(VARIABLE_REGISTRY_PATH.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files:")
for csv_file in csv_files:
    print(f"  - {csv_file.name}")

Found 3 CSV files:
  - Coordinates and Dimensions-Grid view.csv
  - Temporal Shape-Grid view.csv
  - Cell Methods-Grid view.csv


In [54]:
# Helper functions
def clean_value(value):
    """Normalise a single CSV cell.

    Strings are returned with surrounding whitespace removed, missing values
    (as judged by ``pd.isna``) become an empty string, and anything else is
    returned unchanged.
    """
    if isinstance(value, str):
        return value.strip()
    return "" if pd.isna(value) else value

def create_context_file(output_dir):
    """Write the ``_context_`` JSON-LD context file into ``output_dir``.

    Args:
        output_dir: Directory (``Path`` or ``str``) that receives the file.
            It must already exist.

    The file is overwritten if it is already present.
    """
    # All three IRIs must point at the same repository; "@base" and "@vocab"
    # previously misspelled it as "Variable-Regirstry".
    registry_iri = "https://wcrp-cmip.github.io/Variable-Registry/"
    context_data = {
        "@context": {
            "@base": registry_iri,
            "@vocab": registry_iri,
            "vr": registry_iri,
        }
    }

    context_path = Path(output_dir) / '_context_'
    with open(context_path, 'w', encoding='utf-8') as f:
        json.dump(context_data, f, indent=2)

## Process Cell Methods CSV

In [None]:
def process_cell_methods(df):
    """Convert rows of the Cell Methods CSV into WCRP-universe JSON records.

    Makes sure the ``cell-method`` directory exists under ``OUTPUT_PATH`` and
    builds one record per row that has a usable ``label``.

    Args:
        df: DataFrame read from the Cell Methods CSV. The columns ``label``,
            ``title``, ``Cell Methods`` and ``Mask`` are read.

    Returns:
        list of ``(filename, json_data)`` tuples, where ``filename`` is
        ``"<id>.json"`` and ``json_data`` is the dict to serialise.
    """
    json_files = []
    try:
        # parents=True so the function also works before OUTPUT_PATH itself exists
        (OUTPUT_PATH / 'cell-method').mkdir(parents=True)
    except FileExistsError:
        print("Directory 'cell-method' already exists, skipping creation.")

    for idx, row in df.iterrows():
        label = row['label']
        # Missing cells arrive as NaN (a float); blank labels would yield an empty id
        if not isinstance(label, str) or not label.strip():
            print(f"  Warning: Skipping row {idx} with invalid label: {label}")
            continue
        id_value = label.strip().replace(' ', '_').replace(':', '').lower()

        # Clean first so an empty (NaN) cell becomes "" instead of crashing on
        # .split(); only the text before "(mask" is kept as the description.
        cell_methods = str(clean_value(row['Cell Methods']))
        description = cell_methods.split('(mask')[0].strip()

        # Create JSON structure matching WCRP-universe format
        json_data = {
            "id": id_value,
            "validation-key": id_value,
            "ui-label": clean_value(row['title']),
            "description": description,
            "mask": clean_value(row['Mask']),
            "@context": "_context_",
            "type": ["wcrp:cell-method", "universal"]
        }

        json_files.append((f"{id_value}.json", json_data))

    return json_files

## Process Coordinates and Dimensions CSV
This CSV contains both coordinates and dimensions, which need to be separated based on the CF Category column.

In [None]:
def process_coordinates_and_dimensions(df):
    """Split the Coordinates and Dimensions CSV into coordinate and dimension records.

    Makes sure the ``coordinate`` and ``dimension`` directories exist under
    ``OUTPUT_PATH``, then routes each row by its ``CF Category`` value
    (compared case-insensitively). Rows without a usable ``Name`` or with any
    other category are reported and skipped.

    Args:
        df: DataFrame read from the CSV. The columns ``Name``, ``CF Category``,
            ``Title``, ``Description``, ``CF Standard Name`` and ``Axis Flag``
            are read for every kept row; ``Spatial Shape`` is read for
            dimension rows only.

    Returns:
        ``(coordinate_files, dimension_files)``: two lists of
        ``(filename, json_data)`` tuples, with ``filename`` being ``"<id>.json"``.
    """
    coordinate_files = []
    dimension_files = []

    # Create each directory on its own: with both calls in a single try block,
    # an existing 'coordinate' directory prevented 'dimension' from being created.
    for dirname in ('coordinate', 'dimension'):
        try:
            (OUTPUT_PATH / dirname).mkdir(parents=True)
        except FileExistsError:
            print(f"Directory '{dirname}' already exists, skipping creation.")

    # CF Category -> (type IRI, list that collects the records)
    targets = {
        'coordinate': ("wcrp:coordinate", coordinate_files),
        'dimension': ("wcrp:dimension", dimension_files),
    }

    for idx, row in df.iterrows():
        # Use the Name field as ID; missing cells arrive as NaN (a float)
        name = row['Name']
        if not isinstance(name, str) or not name.strip():
            print(f"  Warning: Skipping row {idx} with invalid Name: {name}")
            continue
        id_value = clean_value(name).lower().replace(' ', '_').replace(':', '')

        # str() guards against a non-string category value before .lower()
        cf_category = str(clean_value(row.get('CF Category', ''))).lower()
        if cf_category not in targets:
            # Skip if CF Category is not coordinate or dimension
            print(f"  Warning: Skipping {id_value} with unknown CF Category: {cf_category}")
            continue
        type_iri, output_list = targets[cf_category]

        # Keys are inserted in the order they should appear in the JSON file
        json_data = {
            "id": id_value,
            "validation-key": id_value,
            "ui-label": clean_value(row['Title']),
            "description": clean_value(row['Description']),
            "cf-standard-name": clean_value(row['CF Standard Name']),
        }
        if cf_category == 'dimension':
            # Only dimension records carry a spatial shape
            json_data["spatial-shape"] = clean_value(row['Spatial Shape'])
        json_data["axis-flag"] = clean_value(row['Axis Flag'])
        json_data["@context"] = "_context_"
        json_data["type"] = [type_iri, "universal"]

        output_list.append((f"{id_value}.json", json_data))

    return coordinate_files, dimension_files

## Main Processing Loop

In [62]:
# Ensure the root output directory is present before any per-type folder is written
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Gather every CSV export in the registry source directory
csv_files = list(VARIABLE_REGISTRY_PATH.glob('*.csv'))

print(f"Found {len(csv_files)} CSV files in {VARIABLE_REGISTRY_PATH}")

for csv_path in csv_files:
    csv_file = csv_path.name

    # Hidden files and temporary files are not registry exports
    if csv_file.startswith(('.', '~')):
        continue

    print(f"\nProcessing {csv_file}...")

    # Load the table; an unreadable file is reported and skipped
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"  Error reading {csv_file}: {e}")
        continue
    else:
        print(f"  Found {len(df)} rows")

    # The filename decides which converter handles the table
    if 'Cell Methods' in csv_file:
        output_dirname = 'cell-method'
        json_files = process_cell_methods(df)
        directories_to_process = [(output_dirname, json_files)]
    elif 'Coordinates and Dimensions' in csv_file:
        # One CSV feeds two output directories
        coord_files, dim_files = process_coordinates_and_dimensions(df)
        directories_to_process = list(
            zip(('coordinate', 'dimension'), (coord_files, dim_files))
        )
    else:
        print(f"  Unknown file type: {csv_file} - skipping")
        continue

    # Write out each (directory, records) pair produced above
    for output_dirname, json_files in directories_to_process:
        if json_files:
            # Target directory for this record type
            output_dir = OUTPUT_PATH / output_dirname
            output_dir.mkdir(parents=True, exist_ok=True)

            # One JSON file per record
            for filename, data in json_files:
                filepath = output_dir / filename
                with open(filepath, 'w') as f:
                    json.dump(data, f, indent=2)

            print(f"  Created {len(json_files)} JSON files in {output_dirname}/")

            # Every directory gets its own _context_ file
            create_context_file(output_dir)
            print("  Created _context_ file")
        else:
            print(f"  No files to create for {output_dirname}")

Found 3 CSV files in /Users/daniel.ellis/WIPwork/Variable-Registry/.src

Processing Coordinates and Dimensions-Grid view.csv...
  Found 113 rows
Directories 'coordinates' or 'dimensions' already exist, skipping creation.
  Created 66 JSON files in coordinate/
  Created _context_ file
  Created 47 JSON files in dimension/
  Created _context_ file

Processing Temporal Shape-Grid view.csv...
  Found 7 rows
  Unknown file type: Temporal Shape-Grid view.csv - skipping

Processing Cell Methods-Grid view.csv...
  Found 89 rows
Directory 'cell-method' already exists, skipping creation.
  Created 83 JSON files in cell-method/
  Created _context_ file


## Validate JSON Files

In [None]:

# Run the validate_json command-line tool on the directory written above.
# subprocess.run replaces os.popen so the pipe is closed and a failing exit
# status is reported instead of being dropped; a missing executable now raises
# FileNotFoundError rather than silently returning an empty string.
# Only stdout is captured, so anything the tool writes to stderr is not part of
# the displayed value.
validate_result = subprocess.run(
    ["validate_json", str(OUTPUT_PATH)],
    stdout=subprocess.PIPE,
    text=True,
    check=False,
)
if validate_result.returncode != 0:
    print(f"validate_json exited with status {validate_result.returncode}")
validate_result.stdout

Processing JSON files: 100%|██████████| 195/195 [00:00<00:00, 3233.57file/s, Modified=195, Errors=0]




## Create README Files

In [None]:

# Run the create_readme command-line tool on the directory written above.
# subprocess.run replaces os.popen so the pipe is closed and a failing exit
# status is reported instead of being dropped; a missing executable now raises
# FileNotFoundError rather than silently returning an empty string.
# Only stdout is captured, so anything the tool writes to stderr is not part of
# the displayed value.
readme_result = subprocess.run(
    ["create_readme", str(OUTPUT_PATH)],
    stdout=subprocess.PIPE,
    text=True,
    check=False,
)
if readme_result.returncode != 0:
    print(f"create_readme exited with status {readme_result.returncode}")
readme_result.stdout



## Summary