# Register new input4MIPs source_id entry

## imports

In [1]:
%%time
import json
import os
import pdb
import sys

import requests
from IPython.display import clear_output

CPU times: user 38.3 ms, sys: 15 ms, total: 53.3 ms
Wall time: 59.4 ms


## function defs

In [135]:
def make_srcId(keyId, stdDict, dataProviderDict, dataProviderFileDict):
    '''
    Generate source_id from user-provided inputs

    keyId: key under which the new entry is stored in the returned dictionary
    stdDict: mapping providing a value for every standard key in stdKeys
    dataProviderDict: mapping of optional data provider entries; every
        key/value pair is copied under "|dataProviderExtra"
    dataProviderFileDict: mapping providing "Conventions", "creation_date"
        and "tracking_id"; any other keys it holds are ignored

    Returns a dictionary {keyId: entry}, where entry holds the standard keys
    plus the "|dataProviderExtra", "|dataProviderFile" and "|esgfIndex"
    sub-dictionaries. Values are stored by reference, not deep-copied.

    Raises KeyError naming every required key that is absent from stdDict
    or dataProviderFileDict.
    '''
    # standard keys
    stdKeys = ["_status", "contact", "dataset_category", "datetime_start",
               "datetime_stop", "frequency", "further_info_url",
               "grid_label", "institution_id", "license", "mip_era",
               "nominal_resolution", "realm", "region", "source",
               "source_id", "source_version", "target_mip", "title"]
    # data provider file keys
    dataProviderFileKeys = ["Conventions", "creation_date", "tracking_id"]
    # ESGF default entries (once data is published)
    esgfKeys = ["_timestamp", "data_node", "latest", "replica",
                "version", "xlink"]

    def _pick(label, source, keys):
        # copy the required keys, collecting every missing one so the caller
        # sees the complete list in a single error rather than one at a time
        picked = {}
        missing = []
        for key in keys:
            try:
                picked[key] = source[key]
            except KeyError:
                missing.append(key)
        if missing:
            raise KeyError("%s is missing required key(s): %s"
                           % (label, ", ".join(missing)))
        return picked

    # fill standard entries
    entry = _pick("stdDict", stdDict, stdKeys)
    # fill data provider extra entries (all user-supplied keys are kept)
    entry["|dataProviderExtra"] = {key: dataProviderDict[key]
                                   for key in dataProviderDict.keys()}
    # fill data provider file entries
    entry["|dataProviderFile"] = _pick("dataProviderFileDict",
                                       dataProviderFileDict,
                                       dataProviderFileKeys)
    # fill ESGF entries with empty placeholders
    entry["|esgfIndex"] = {key: "" for key in esgfKeys}

    return {keyId: entry}



## Data provider to complete form entries below

In [136]:
# required keys - SOLARIS-HEPPA-CMIP-4-1
stdDict = {
    "_status": "Registered",
    "contact": "bernd@iaa.es",
    "dataset_category": "solar",
    "datetime_start": "1850-01-01",
    "datetime_stop": "2023-12-31",
    "frequency": ["day", "mon", "yr"],
    "further_info_url": "http://solarisheppa.geomar.de/cmip7",
    "grid_label": "gn",
    "institution_id": "SOLARIS-HEPPA",
    "license": "CC BY 4.0",
    "mip_era": "CMIP6Plus",
    "nominal_resolution": "250 km",
    "realm": "atmos",
    "region": ["global"],
    # fragments are joined with single spaces into one description string
    "source": " ".join([
        "SOLARIS-HEPPA CMIP 4.1 solar forcing based on SSI,",
        "TSI, and F10.7 from ssi_v03r00_preliminary",
        "(Odele Coddington et al., pers. comm.); Ap and Kp",
        "from ftp.ngdc.noaa.gov until 2014, afterwards from",
        "GFZ Potsdam (https://kp.gfz-potsdam.de), P-IPR from",
        "SEP-II (Ilya Usoskin et al., pers. comm.), MEE-IPR",
        "from FMI APEEP v2024b_cmip7 (Max van de Kamp et al.,",
        "pers. comm.), GCR-IPR from CRII v2024-02 (Ilya",
        "Usoskin et al., pers. comm.)",
    ]),
    "source_id": "SOLARIS-HEPPA-CMIP-4-1",
    "source_version": "4.1",
    "target_mip": "CMIP",
    "title": "SOLARIS-HEPPA 4.1 CMIP7 solar forcing",
}
# echo the identifier being registered
print(stdDict["source_id"])

# optional entries
# dataProviderExtra
dataProviderDict = {
    "source_variables": ["multiple"],
    # URL fragments are concatenated without separators
    "metadata_url": "".join([
        "see http://solarisheppa.geomar.de/solarisheppa/",
        "sites/default/files/data/cmip7/CMIP7_metadata_",
        "description_4.1.pdf",
    ]),
    # NOTE(review): "Illaria Ermolli" and "Sergey Koldoboskiy" look like
    # possible misspellings - confirm with the data provider
    "contributor_names": " ".join([
        "Bernd Funke, Timo Asikainen, Stefan Bender,",
        "Odele Coddington, Thierry Dudok de Wit,",
        "Illaria Ermolli, Margit Haberreiter,",
        "Doug Kinnison, Judith Lean,",
        "Sergey Koldoboskiy, Daniel R. Marsh,",
        "Hilde Nesse, Annika Seppaelae,",
        "Miriam Sinnhuber, Ilya Usoskin,",
        "Max van de Kamp, Pekka T. Verronen",
    ]),
    "reference": " ".join([
        "Funke et al., 2024: Towards the definition of a solar forcing dataset for CMIP7,",
        "Geosci. Model Dev., 17 (3), pp 1217--1227. doi: https://doi.org/10.5194/gmd-17-1217-2024",
    ]),
}

# dataProviderFile
dataProviderFileDict = {
    "Conventions": "CF-1.6",
    "creation_date": "",
    "tracking_id": "",
}

# call function to populate; the entry is keyed by its own source_id
newId = make_srcId(stdDict["source_id"], stdDict, dataProviderDict, dataProviderFileDict)

SOLARIS-HEPPA-CMIP-4-1


In [137]:
# display the assembled entry for visual inspection
newId

{'SOLARIS-HEPPA-CMIP-4-1': {'_status': 'Registered',
  'contact': 'bernd@iaa.es',
  'dataset_category': 'solar',
  'datetime_start': '1850-01-01',
  'datetime_stop': '2023-12-31',
  'frequency': ['day', 'mon', 'yr'],
  'further_info_url': 'http://solarisheppa.geomar.de/cmip7',
  'grid_label': 'gn',
  'institution_id': 'SOLARIS-HEPPA',
  'license': 'CC BY 4.0',
  'mip_era': 'CMIP6Plus',
  'nominal_resolution': '250 km',
  'realm': 'atmos',
  'region': ['global'],
  'source': 'SOLARIS-HEPPA CMIP 4.1 solar forcing based on SSI, TSI, and F10.7 from ssi_v03r00_preliminary (Odele Coddington et al., pers. comm.); Ap and Kp from ftp.ngdc.noaa.gov until 2014, afterwards from GFZ Potsdam (https://kp.gfz-potsdam.de), P-IPR from SEP-II (Ilya Usoskin et al., pers. comm.), MEE-IPR from FMI APEEP v2024b_cmip7 (Max van de Kamp et al., pers. comm.), GCR-IPR from CRII v2024-02 (Ilya Usoskin et al., pers. comm.)',
  'source_id': 'SOLARIS-HEPPA-CMIP-4-1',
  'source_version': '4.1',
  'target_mip': 'CMIP',

## sync with repo - input4MIPs_source_id.json

In [153]:
%%time
# read input4mips_cvs source_id
# remote
srcIdGithub = "https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_source_id.json"
# the timeout keeps the cell from hanging on an unresponsive server;
# raise_for_status stops an HTTP error page from reaching the JSON parser
js = requests.get(srcIdGithub, timeout=60)
js.raise_for_status()
srcIds = json.loads(js.text)
# local
#srcIdGithub = "../input4MIPs_source_id.json"
#with open(srcIdGithub, 'r') as f:
#    srcIds = json.load(f)

# add new source_id (replaces any existing entry of the same name)
srcIds["source_id"][stdDict["source_id"]] = newId[stdDict["source_id"]]

# Write all out
oF = "../input4MIPs_source_id.json"
if os.path.exists(oF):
    os.remove(oF)
# the context manager closes the handle even if json.dump raises
with open(oF, "w") as fH:
    json.dump(srcIds, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)

CPU times: user 16.5 ms, sys: 3.64 ms, total: 20.1 ms
Wall time: 32.6 ms


## validate all source_id's have entries

In [157]:
# re-read the published source_id registry from the repository
srcIdGithub = "https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_source_id.json"
# the timeout keeps the cell from hanging on an unresponsive server;
# raise_for_status stops an HTTP error page from reaching the JSON parser
js = requests.get(srcIdGithub, timeout=60)
js.raise_for_status()
srcIds = json.loads(js.text)

In [167]:
# list the source_id names present in the fetched registry
srcIds["source_id"].keys()

dict_keys(['ACCESS1-3-rcp85-1-0', 'CCMI-hist-nat-1-0', 'CCMI-hist-nat-1-1', 'CCMI-hist-sol-1-0', 'CCMI-hist-sol-1-1', 'CCMI-hist-stratO3-1-0', 'CCMI-hist-volc-1-0', 'CCMI-hist-volc-1-1', 'CCMI-ssp245-nat-1-0', 'CCMI-ssp245-sol-1-0', 'CCMI-ssp245-stratO3-1-0', 'CCMI-ssp245-volc-1-0', 'CCSM4-rcp26-1-0', 'CCSM4-rcp85-1-0', 'CEDS-2016-06-18', 'CEDS-2016-06-18-sectorDimV2', 'CEDS-2016-06-18-supplemental-data', 'CEDS-2016-07-26', 'CEDS-2016-07-26-sectorDim', 'CEDS-2016-07-26-sectorDim-supplemental-data', 'CEDS-2017-05-18', 'CEDS-2017-05-18-supplemental-data', 'CEDS-2017-08-30', 'CEDS-2017-08-30-supplemental-data', 'CEDS-2017-10-05', 'CESM2-ssp585-1-0', 'CNRM-CM6-1-ssp126-1-0', 'CNRM-CM6-1-ssp585-1-0', 'CNRM-ESM2-1-ssp585-1-0', 'CR-CMIP-0-2-0', 'CSIRO-MK3-6-0-rcp85-1-0', 'DCPP-C-amv-1-1', 'DCPP-C-ipv-1-1', 'DRES-CMIP-BB4CMIP7-1-0', 'HadGEM2-ES-rcp85-1-0', 'IACETH-SAGE3lambda-2-1-0', 'IACETH-SAGE3lambda-3-0-0', 'IAMC-AIM-ssp370-1-0', 'IAMC-AIM-ssp370-1-1', 'IAMC-AIM-ssp370-1-1-supplemental-dat

In [175]:
# keys every source_id entry is expected to carry
stdKeys = [
    "_status", "contact", "dataset_category", "datetime_start",
    "datetime_stop", "frequency", "further_info_url",
    "grid_label", "institution_id", "license", "mip_era",
    "nominal_resolution", "realm", "region", "source",
    "source_id", "source_version", "target_mip", "title",
]
# keys found on one sample entry (alternative sample: PCMDI-AMIP-1-1-9)
stdKeyList = list(srcIds["source_id"]["IAMC-IMAGE-ssp126-1-0"])
# drop the nested sub-dictionaries so only top-level standard keys remain;
# list.remove raises ValueError if one of them is absent from the entry
for nestedKey in ("|dataProviderExtra", "|dataProviderFile", "|esgfIndex"):
    stdKeyList.remove(nestedKey)
s = set(stdKeys)
# keys present on the entry that are not part of the standard set
diff = list(filter(lambda x: x not in s, stdKeyList))
diff

[]

In [5]:
%%time
# read input4mips_cvs source_id
# remote
srcIdGithub = "https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_source_id.json"
# the timeout keeps the cell from hanging on an unresponsive server;
# raise_for_status stops an HTTP error page from reaching the JSON parser
js = requests.get(srcIdGithub, timeout=60)
js.raise_for_status()
srcIds = json.loads(js.text)

# title missing: PCMDI new, CR, DRES
srcIds["source_id"]["CR-CMIP-0-2-0"]["title"] = "Climate Resource CMIP 0.2.0 dataset prepared for input4MIPs"
srcIds["source_id"]["DRES-CMIP-BB4CMIP7-1-0"]["title"] = "Deltares CMIP BB4CMIP7 1.0 global fire emissions"
srcIds["source_id"]["PCMDI-AMIP-ERSST5-1-0"]["title"] = "PCMDI-AMIP ERSST5 1.0 dataset prepared for input4MIPs"
srcIds["source_id"]["PCMDI-AMIP-Had1p1-1-0"]["title"] = "PCMDI-AMIP Had-1.1 1.0 dataset prepared for input4MIPs"
srcIds["source_id"]["PCMDI-AMIP-OI2p1-1-0"]["title"] = "PCMDI-AMIP OI-2.1 1.0 dataset prepared for input4MIPs"
srcIds["source_id"]["SOLARIS-HEPPA-CMIP-4-1"]["title"] = "SOLARIS HEPPA CMIP 4.1 dataset prepared for input4MIPs"

# entries to check
stdKeys = ["_status","contact", "dataset_category", "datetime_start",
           "datetime_stop", "frequency", "further_info_url",
           "grid_label", "institution_id", "license", "mip_era",
           "nominal_resolution", "realm", "region", "source",
           "source_id", "source_version", "target_mip", "title"]
esgfKeys = ["_timestamp", "data_node", "latest", "replica",
            "version", "xlink"]

# walk every registered source_id and stop at the first entry that lacks a
# required key; on success only the last iteration's output stays visible
for cnt, (srcIdName, srcId) in enumerate(srcIds["source_id"].items()):
    print("srcId:", cnt, srcIdName)
    print(srcId)
    # check stdKeys
    stdKeyList = sorted(srcId.keys())
    print("stdKeyList:", stdKeyList)
    # iterate the list (not a set) so missing keys are reported in a stable order
    diff = [x for x in stdKeys if x not in stdKeyList]
    if diff:
        print()
        print("diff:", diff)
        sys.exit()
        #pdb.set_trace()
    # check esgfKeys; a missing "|esgfIndex" sub-dictionary is reported as
    # every ESGF key missing rather than surfacing as a bare KeyError
    esgfKeyList = sorted(srcId.get("|esgfIndex", {}).keys())
    print("esgfKeyList:", esgfKeyList)
    diff = [x for x in esgfKeys if x not in esgfKeyList]
    if diff:
        print()
        print("diff:", diff)
        sys.exit()
        #pdb.set_trace()
    print("catch")
    #pdb.set_trace()
    # wipe this iteration's output so the cell does not accumulate one dump per entry
    clear_output(wait=False)

# Write all out
oF = "../input4MIPs_source_id.json"
if os.path.exists(oF):
    os.remove(oF)
# the context manager closes the handle even if json.dump raises
with open(oF, "w") as fH:
    json.dump(srcIds, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)

CPU times: user 72.1 ms, sys: 27.3 ms, total: 99.4 ms
Wall time: 114 ms
