In [None]:
# #!/usr/bin/env python3
# """
# JSON-LD Link Validator

# This script validates JSON-LD files and checks if all URLs in the document
# and its context are resolvable.
# """

# import json
# import sys
# import re
# import requests
# from urllib.parse import urlparse
# from typing import Set, List, Dict, Any, Union
# import argparse


# class JSONLDValidator:
#     def __init__(self, timeout: int = 10):
#         self.timeout = timeout
#         self.session = requests.Session()
#         self.session.headers.update({
#             'User-Agent': 'JSON-LD-Validator/1.0'
#         })
    
#     def load_jsonld(self, file_path: str) -> Dict[Any, Any]:
#         """Load and parse JSON-LD file"""
#         try:
#             with open(file_path, 'r', encoding='utf-8') as f:
#                 data = json.load(f)
#             return data
#         except FileNotFoundError:
#             print(f"Error: File '{file_path}' not found")
#             sys.exit(1)
#         except json.JSONDecodeError as e:
#             print(f"Error: Invalid JSON in '{file_path}': {e}")
#             sys.exit(1)
    
#     def extract_urls_from_value(self, value: Any) -> Set[str]:
#         """Recursively extract URLs from any JSON value"""
#         urls = set()
        
#         if isinstance(value, str):
#             # Check if string is a URL
#             if self.is_url(value):
#                 urls.add(value)
#         elif isinstance(value, dict):
#             # Recursively check dictionary values
#             for v in value.values():
#                 urls.update(self.extract_urls_from_value(v))
#         elif isinstance(value, list):
#             # Recursively check list items
#             for item in value:
#                 urls.update(self.extract_urls_from_value(item))
        
#         return urls
    
#     def is_url(self, string: str) -> bool:
#         """Check if string is a valid HTTP/HTTPS URL"""
#         url_pattern = re.compile(
#             r'^https?://'  # http:// or https://
#             r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
#             r'localhost|'  # localhost...
#             r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
#             r'(?::\d+)?'  # optional port
#             r'(?:/?|[/?]\S+)$', re.IGNORECASE)
#         return url_pattern.match(string) is not None
    
#     def extract_context_urls(self, data: Dict[Any, Any]) -> Set[str]:
#         """Extract URLs specifically from @context"""
#         context_urls = set()
        
#         if '@context' not in data:
#             return context_urls
        
#         context = data['@context']
        
#         if isinstance(context, str):
#             if self.is_url(context):
#                 context_urls.add(context)
#         elif isinstance(context, list):
#             for item in context:
#                 if isinstance(item, str) and self.is_url(item):
#                     context_urls.add(item)
#                 elif isinstance(item, dict):
#                     # For embedded contexts, extract URLs from values
#                     context_urls.update(self.extract_urls_from_value(item))
#         elif isinstance(context, dict):
#             # Extract URLs from context object values
#             context_urls.update(self.extract_urls_from_value(context))
        
#         return context_urls
    
#     def extract_all_urls(self, data: Dict[Any, Any]) -> Set[str]:
#         """Extract all URLs from the JSON-LD document"""
#         return self.extract_urls_from_value(data)
    
#     def check_url(self, url: str, context: str = "") -> bool:
#         """Check if URL is resolvable"""
#         print(f"  Checking {url} ... ", end="", flush=True)
        
#         try:
#             response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
#             if response.status_code in [200, 301, 302, 303, 307, 308]:
#                 print("✓ OK")
#                 return True
#             else:
#                 print(f"✗ FAILED (HTTP {response.status_code}) [{context}]")
#                 return False
#         except requests.exceptions.Timeout:
#             print(f"✗ FAILED (Timeout) [{context}]")
#             return False
#         except requests.exceptions.ConnectionError:
#             print(f"✗ FAILED (Connection Error) [{context}]")
#             return False
#         except requests.exceptions.RequestException as e:
#             print(f"✗ FAILED ({str(e)}) [{context}]")
#             return False
    
#     def validate_file(self, file_path: str) -> bool:
#         """Main validation function"""
#         print(f"Analyzing JSON-LD file: {file_path}")
#         print("=" * 50)
        
#         # Load JSON-LD file
#         data = self.load_jsonld(file_path)
        
#         # Check if it has @context
#         if '@context' not in data:
#             print("Warning: No @context found in JSON-LD file")
        
#         failed_urls = 0
#         total_urls = 0
        
#         # Extract and check context URLs
#         print("\nChecking @context URLs:")
#         print("-" * 30)
        
#         context_urls = self.extract_context_urls(data)
        
#         if context_urls:
#             for url in sorted(context_urls):
#                 total_urls += 1
#                 if not self.check_url(url, "@context"):
#                     failed_urls += 1
#         else:
#             print("  No context URLs found")
        
#         # Extract and check all other URLs
#         print("\nChecking all other URLs in document:")
#         print("-" * 40)
        
#         all_urls = self.extract_all_urls(data)
#         other_urls = all_urls - context_urls
        
#         if other_urls:
#             for url in sorted(other_urls):
#                 total_urls += 1
#                 if not self.check_url(url, "document content"):
#                     failed_urls += 1
#         else:
#             print("  No additional URLs found in document")
        
#         # Print summary
#         print(f"\nSUMMARY:")
#         print("=" * 20)
#         print(f"Total URLs checked: {total_urls}")
#         print(f"Failed URLs: {failed_urls}")
#         success_rate = ((total_urls - failed_urls) * 100 // total_urls) if total_urls > 0 else 100
#         print(f"Success rate: {success_rate}%")
        
#         if failed_urls == 0:
#             print("✓ All URLs are resolvable!")
#             return True
#         else:
#             print("✗ Some URLs failed to resolve")
#             return False


# def main():
#     parser = argparse.ArgumentParser(
#         description="Validate JSON-LD files and check if all links are resolvable"
#     )
#     parser.add_argument("file", help="JSON-LD file to validate")
#     parser.add_argument(
#         "--timeout", 
#         type=int, 
#         default=10, 
#         help="Timeout in seconds for URL checks (default: 10)"
#     )
    
#     args = parser.parse_args()
    
#     validator = JSONLDValidator(timeout=args.timeout)
#     success = validator.validate_file(args.file)
    
#     sys.exit(0 if success else 1)


# if __name__ == "__main__":
#     main()

In [2]:
import cmipld
from pyld import jsonld

In [10]:
# Fetch the amip experiment record through cmipld using its prefixed
# 'cmip7:' address; the recorded cell output is the returned dict.
# NOTE(review): depth=0 presumably stops linked records (e.g. 'activity')
# from being followed/embedded - confirm against the cmipld documentation.
cmipld.get('cmip7:experiment/amip.json',depth=0)

{'@context': 'cmip7:experiment/amip.json',
 '@id': 'amip',
 '@type': ['wcrp:experiment', 'esgvoc:Experiment', 'cmip7'],
 'activity': ['cmip'],
 'alias': [],
 'description': 'DECK: AMIP \n AMIP (Atmospheric Model Intercomparison Project) experiment with prescribed SSTs and sea ice',
 'end': 2021,
 'min_number_yrs_per_sim': 43,
 'minimum_number_of_years': 42,
 'model_realms': [{'@id': 'agcm', 'is_required': True},
  {'@id': 'aer', 'is_required': False},
  {'@id': 'chem', 'is_required': False},
  {'@id': 'bgc', 'is_required': False}],
 'parent_experiment': ['none'],
 'start': 1979,
 'start_date': 'none',
 'tier': 0,
 'ui_label': 'Describe amip in one sentence.',
 'validation_key': 'amip'}

In [8]:
import rdflib
from rdflib import Graph

# Parse the local amip experiment record as JSON-LD into an RDF graph.
g = Graph()
g.parse('../experiment/amip.json', format='json-ld')

# Keep the string form of every triple subject or object that is a URIRef;
# predicates are not collected.
uris = {
    str(node)
    for s, _, o in g
    for node in (s, o)
    if isinstance(node, rdflib.URIRef)
}

# Print one URI per line in sorted order.
for uri in sorted(uris):
    print(uri)

esgvoc:Experiment
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/aer
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/agcm
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/amip
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/bgc
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/chem
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/cmip
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/none
https://esgf.github.io/esgf-vocab/api_documentation/data_descriptors.html#esgvoc.api.data_descriptors.Experiment.cmip7
wcrp:experiment


In [5]:
from rdflib import Graph, URIRef

# Parse the local amip experiment record as JSON-LD into an RDF graph.
g = Graph()
g.parse('../experiment/amip.json', format='json-ld')

# Walk every node reported by the graph and keep the string form of the
# ones that are URIRef instances.
uris = set()
for node in g.all_nodes():
    if isinstance(node, URIRef):
        uris.add(str(node))

# Print one URI per line in sorted order.
for uri in sorted(uris):
    print(uri)

esgvoc:Experiment
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/aer
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/agcm
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/amip
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/bgc
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/chem
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/cmip
file:///Users/daniel.ellis/WIPwork/CMIP7-CVs/experiment/none
https://esgf.github.io/esgf-vocab/api_documentation/data_descriptors.html#esgvoc.api.data_descriptors.Experiment.cmip7
wcrp:experiment


dict_values(['https://wcrp-cmip.github.io/CMIP7-CVs/experiment/1pctco2-bgc', ['wcrp:experiment', 'esgvoc:Experiment', 'https://esgf.github.io/esgf-vocab/api_documentation/data_descriptors.html#esgvoc.api.data_descriptors.Experiment.cmip7', 'cmip:fast-track'], [{'@id': 'https://wcrp-cmip.github.io/WCRP-universe/activity/damip'}], [], [{'@value': '1pctCO2-bgcisretainedfromCMIP6.ItisasimulationwhichbranchesfrompiControlwitha1%peryearincreaseinCO2concentrationfrompre-industriallevels,withtheCO2“biogeochemically-coupled”.Thismeansthatonlythemodel’scarboncyclecomponents(bothlandandocean)respondtotheincreaseinCO2,whereasthemodel’sradiationcodeusesaconstant,preindustrialconcentrationofCO2.The1pctCO2DECKsimulationisrequiredforcomparison.Thisexperimentisdesignedtoisolatecarbon-concentrationandcarbonclimateelementsoftheglobalcarbonfeedbacks.Itwillalsoenablecalibrationofclimateemulators.SeeJonesetal2016:https://gmd.copernicus.org/articles/9/2853/2016/,especiallysection3.2.3Exampleofuse/analysisisA

In [77]:



def check(file):
    """Report which linked ``@id`` values of each node fail to resolve.

    The input is expanded with ``jsonld.expand``. For every node in the
    expanded result, the ``@id`` of each dict found directly inside a
    list-valued property is collected (nested nodes are not descended into),
    and each collected link is probed by expanding it in turn. A link counts
    as broken when that expansion raises.

    Each distinct link is probed at most once per call, so a link shared by
    many nodes does not trigger repeated fetches.

    Args:
        file: Whatever ``jsonld.expand`` accepts as its input. The notebook
            passes a prefixed address such as ``'cmip7:experiment/graph.json'``
            (presumably resolved by a document loader that cmipld installs -
            confirm).

    Returns:
        A dict keyed by each node's ``@id`` (``None`` when the node has
        none, so such nodes overwrite one another). Each value is
        ``{'broken_links': [...], 'all_links': {...}}`` where
        ``broken_links`` is a sorted list and ``all_links`` is a set (and
        therefore not directly JSON-serialisable).

    Raises:
        Whatever ``jsonld.expand`` raises for the top-level ``file`` itself;
        only failures of the individual links are caught.
    """
    expanded = jsonld.expand(file)

    # Outcome of probing each link (True = resolved), shared by all nodes.
    resolved = {}

    def _is_broken(link):
        """Return True if expanding ``link`` raises, probing it only once."""
        if link not in resolved:
            try:
                jsonld.expand(link)
            except Exception:
                # Deliberately broad: any failure to load or expand the
                # target (network, parsing, JSON-LD error) means "broken".
                resolved[link] = False
            else:
                resolved[link] = True
        return not resolved[link]

    results = {}

    for entry in expanded:
        ids = {
            item['@id']
            for elem in entry.values() if isinstance(elem, list)
            for item in elem if isinstance(item, dict) and '@id' in item
        }

        # Sorted so the report does not depend on set iteration order.
        broken = [link for link in sorted(ids) if _is_broken(link)]

        results[entry.get('@id')] = {'broken_links': broken, 'all_links': ids}

    return results

In [80]:
# Run the link check over the experiment graph document, addressed with
# the 'cmip7:' prefix.
file = 'cmip7:experiment/graph.json'
log = check(file)
# Bare expression so the notebook displays the per-node report
# ({node_id: {'broken_links': [...], 'all_links': {...}}}).
log

{'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/1pctco2-bgc': {'broken_links': [],
  'all_links': {'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/picontrol',
   'https://wcrp-cmip.github.io/WCRP-universe/activity/damip',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/aer',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/aogcm',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/bgc',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/chem',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/ism'}},
 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/1pctco2-rad': {'broken_links': [],
  'all_links': {'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/picontrol',
   'https://wcrp-cmip.github.io/WCRP-universe/activity/damip',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/aer',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/aogcm',
   'https://wcrp-cmip.github.io/WCRP-universe/source_type/bgc',
   'https://wcrp-cmip.githu

In [None]:
# Invert the check() report: map each broken link to the list of node ids
# whose 'broken_links' contain it, in the order the nodes appear in `log`.
reverse = {}
for node_id, report in log.items():
    for link in report['broken_links']:
        reverse.setdefault(link, []).append(node_id)
reverse


{'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/none': ['https://wcrp-cmip.github.io/CMIP7-CVs/experiment/abrupt-0p5co2',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/abrupt-4xco2-1950',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-m4k',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k-sst-rad',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k-sst-turb',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-piforcing',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/dcppb-forecast-cmip6',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/esm-picontrol',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/g7-1p5k-sai',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/highres-yr2020',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/highressst-p2k-pat',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/hig

In [86]:
# Pass the string form of the broken-link map through cmipld's URL
# compaction helper.
# NOTE(review): the recorded output of this cell still contains full
# https:// URLs, so the helper may not have shortened anything - confirm.
cmipld.compact_direct_url(str(reverse))

"{'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/none': ['https://wcrp-cmip.github.io/CMIP7-CVs/experiment/abrupt-0p5co2', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/abrupt-4xco2-1950', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-m4k', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k-sst-rad', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k-sst-turb', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-piforcing', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/dcppb-forecast-cmip6', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/esm-picontrol', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/g7-1p5k-sai', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/highres-yr2020', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/highressst-p2k-pat', 'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/highressst-p4k-pat', 'https:

In [None]:
import json
import re

# Rewrite every "<j>:" token found in `reverse` using the pairs in
# cmipld.direct, returning the same kind of object that went in
# (dict in -> dict out, anything else -> string out).
data = reverse
is_mapping = isinstance(data, dict)

# Dicts are serialised to JSON so one substitution pass covers both keys
# and values; any other input is handled through its str() form.
quick_compact = json.dumps(data) if is_mapping else str(data)

# NOTE(review): as written this replaces "<j>:" with i. Whether that
# compacts or expands depends on which way round cmipld.direct stores
# (url, prefix) - confirm the direction matches the "compact" intent.
for i, j in cmipld.direct.items():
    quick_compact = re.sub(
        r'\b' + re.escape(j) + r':\b',
        # A callable inserts i literally; a plain replacement string would
        # have its backslashes and group references interpreted by re.sub.
        lambda _match, replacement=i: replacement,
        quick_compact,
    )

# Only the JSON-serialised (dict) case can be parsed back. The previous
# guard was inverted, leaving dict input as a JSON string and sending
# arbitrary str() text to json.loads.
if is_mapping:
    quick_compact = json.loads(quick_compact)

quick_compact

{'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/none': ['https://wcrp-cmip.github.io/CMIP7-CVs/experiment/abrupt-0p5co2',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/abrupt-4xco2-1950',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-m4k',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k-sst-rad',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k-sst-turb',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-p4k',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip-piforcing',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/amip',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/dcppb-forecast-cmip6',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/esm-picontrol',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/g7-1p5k-sai',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/highres-yr2020',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/highressst-p2k-pat',
  'https://wcrp-cmip.github.io/CMIP7-CVs/experiment/hig