# Query ESGF input4MIPs index and create source_id entries

## notes

## imports

In [5]:
%%time
import json
import numpy as np
import os
import pdb
import requests
from IPython.display import clear_output

CPU times: user 18 μs, sys: 1 μs, total: 19 μs
Wall time: 21.9 μs


## example source_id queries

## get SOLR source_id entries

In [6]:
%%time
# Facet-only queries against the ESGF SOLR index: rows=0 suppresses the documents,
# so each reply carries just the per-source_id facet counts.
# NOTE(review): the original comment here read "both input4MIPs and input4mips", but only
# the spelling below is sent to the index - confirm that is intended.
actId = "input4MIPs"
solrTimeout = 60  # seconds to wait on the index before giving up


def fetchSolrJson(url):
    """Fetch `url` and return the decoded JSON reply.

    Raises requests.HTTPError on a non-2xx reply, so a server-side failure is
    reported at the request rather than as a JSON/KeyError further down.
    """
    reply = requests.get(url, timeout=solrTimeout)
    reply.raise_for_status()
    return json.loads(reply.text)


# Dataset search
inputsD = "https://esgf-node.llnl.gov/solr/datasets/select?q=*:*&rows=0&wt=json&facet=true&" \
          "fq=type:Dataset&fq=replica:false&fq=activity_id:" + actId + "&facet.field=source_id"
js_mipsD = fetchSolrJson(inputsD)
print("inputsD:", inputsD)

# File search
inputsF = "https://esgf-node.llnl.gov/solr/files/select?q=*:*&rows=0&wt=json&facet=true&" \
          "fq=type:File&fq=replica:false&fq=activity_id:" + actId + "&facet.field=source_id"
js_mipsF = fetchSolrJson(inputsF)
print("inputsF:", inputsF)

inputsD: https://esgf-node.llnl.gov/solr/datasets/select?q=*:*&rows=0&wt=json&facet=true&fq=type:Dataset&fq=replica:false&fq=activity_id:input4MIPs&facet.field=source_id
inputsF: https://esgf-node.llnl.gov/solr/files/select?q=*:*&rows=0&wt=json&facet=true&fq=type:File&fq=replica:false&fq=activity_id:input4MIPs&facet.field=source_id
CPU times: user 32.8 ms, sys: 7.21 ms, total: 40 ms
Wall time: 264 ms


## extract SOLR source_id entries and composite

In [7]:
# Pull the source_id facet listing out of each SOLR reply and report its length
facetFieldsD = js_mipsD["facet_counts"]["facet_fields"]
dicInpmD = facetFieldsD["source_id"]
print("len(dicInpmD):", len(dicInpmD))

facetFieldsF = js_mipsF["facet_counts"]["facet_fields"]
dicInpmF = facetFieldsF["source_id"]
print("len(dicInpmF):", len(dicInpmF))

len(dicInpmD): 372
len(dicInpmF): 350


In [8]:
def pairFacets(facetList):
    """Split a flat facet list into a lookup and a grand total.

    The facet listing is consumed as alternating entries
    [source_id, count, source_id, count, ...]: even positions are the keys and
    the following odd positions their counts.

    Returns a tuple (mapping, total) where mapping is {source_id: count} in
    listing order and total is the sum of every count converted with int().
    A trailing unpaired element, if any, is ignored.
    """
    names = facetList[0::2]
    hits = facetList[1::2]
    return dict(zip(names, hits)), sum(int(hit) for hit in hits)


# datasets
srcIdLen = len(dicInpmD)
print("srcIdDLen:", srcIdLen)
srcIdDDict, counts = pairFacets(dicInpmD)
print("len(srcIdDDict.keys()):", len(srcIdDDict.keys()))
print("dataset counts:", counts)
# sorted list of the source_id names
srcIdDDictList = sorted(srcIdDDict.keys())

# files
srcIdLen = len(dicInpmF)
print("srcIdFLen:", srcIdLen)
srcIdFDict, counts = pairFacets(dicInpmF)
print("len(srcIdFDict.keys()):", len(srcIdFDict.keys()))
# this total comes from the File search, so label it as such
print("file counts:", counts)
# sorted list of the source_id names
srcIdFDictList = sorted(srcIdFDict.keys())

# determine missing: source_ids the Dataset search returned but the File search did not
print("Search results: Dataset includes, excluded from File searches (likely latest:false):")
set(srcIdDDictList).difference(srcIdFDictList)

srcIdDLen: 372
len(srcIdDDict.keys()): 186
dataset counts: 5900
srcIdFLen: 350
len(srcIdFDict.keys()): 175
dataset counts: 10387
Search results: Dataset includes, excluded from File searches (likely latest:false):


{'CCMI-hist-nat-1-0',
 'CCMI-hist-sol-1-0',
 'CCMI-hist-volc-1-0',
 'IAMC-AIM-ssp370-1-0',
 'IAMC-GCAM4-ssp434-1-0',
 'IAMC-GCAM4-ssp460-1-0',
 'IAMC-IMAGE-ssp119-1-0',
 'IAMC-IMAGE-ssp126-1-0',
 'IAMC-MESSAGE-GLOBIOM-ssp245-1-0',
 'IAMC-REMIND-MAGPIE-ssp534-over-1-0',
 'IAMC-REMIND-MAGPIE-ssp585-1-0'}

## example esg-search source_id queries

## using source_id entries from Dataset search build a library

In [9]:
# Harvest every index record for each source_id found by the Dataset facet search.
# PLACEHOLDER in the template is swapped for the source_id on each pass.
solrQry = "https://esgf-node.llnl.gov/esg-search/search/?limit=1000&format=application%2Fsolr%2Bjson&source_id=" \
          "PLACEHOLDER" + "&project=input4mips&project=input4MIPs&distrib=false&fields=*"  # all fields
mstrJson = {}  # create catch dictionary: instance_id -> full index record
truncated = []  # source_ids whose reply held fewer records than the index reports
oF = "tmp.json"
for count, srcId in enumerate(srcIdDDict.keys()):
    print(count, srcId)
    qryStr = solrQry.replace("PLACEHOLDER", srcId)
    js = requests.get(qryStr, timeout=120)
    js.raise_for_status()  # stop on an HTTP error instead of failing on the JSON decode
    js_srcId = json.loads(js.text)
    docs = js_srcId["response"]["docs"]
    srcIdLen = len(docs)
    # the query is capped by limit=1000; remember any source_id that did not fit
    if js_srcId["response"].get("numFound", srcIdLen) > srcIdLen:
        truncated.append(srcId)
    clear_output(wait=False)
    # https://stackoverflow.com/questions/24816237/ipython-notebook-clear-cell-output-in-code
    print("len(js_srcId):", srcIdLen)
    for a in docs:
        instId = a["instance_id"]
        mstrJson[instId] = a
        # write to placeholder to test; mode "w" truncates, so no prior delete is needed
        with open(oF, "w") as fH:
            json.dump(a, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)
print("All done")
if truncated:
    print("Incomplete (more records than the query limit):", truncated)
# cleanup - the placeholder only exists if at least one record was returned
if os.path.exists(oF):
    os.remove(oF)
# Write all out
oF = "comp.json"
with open(oF, "w") as fH:
    json.dump(mstrJson, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)

len(js_srcId): 247
All done


## create input4MIPs_source_id.json from the harvested SOLR records

In [6]:
%%time
# Build the source_id CV: one entry per source_id, populated from the first
# harvested index record that carries that source_id.
srcIdDict = {}
srcIdDict["source_id"] = {}
# Look up info
# Implied by project: activity_id=input4MIPs, license="CC BY 4.0"
# Not already here and
# source from input4mips-cmor-tables: region, title (ESGF version), variable_id (per file in SOLR)
# needs validating: datetime_start, datetime_stop  # Adding for dataset/source_id temporal coverage
# file: Conventions, creation_date, tracking_id
# irrelevant: institution, table_id  # need for lookup

# Define all keys
stdKeys = [
    "contact", "dataset_category", "datetime_start",
    "datetime_stop", "frequency", "further_info_url", "grid_label", "institution_id",
    "mip_era", "nominal_resolution", "realm", "source", "source_id",
    "source_version", "target_mip",
       ]
extraKeys = ["license", "region", "title"]
dataProviderFileKeys = ["Conventions", "creation_date", "tracking_id"]
dataProviderExtraKeys = ["source_variables"]
esgfIndexKeys = ["_timestamp", "data_node", "latest", "replica", "version", "xlink"]


def firstValue(value):
    """Return the first element of a list value (None for an empty list); pass anything else through."""
    if isinstance(value, list):
        return value[0] if value else None
    return value


# loop through entries
for a in mstrJson:
    rec = mstrJson[a]
    srcId = rec["source_id"][0]
    print(srcId)
    # Test against the source_id table itself. The outer dict only ever holds the
    # single key "source_id", so checking it would let every later record overwrite
    # the entry built from an earlier one.
    if srcId in srcIdDict["source_id"]:
        continue
    # add "published" status
    entry = {"_status": "Published"}
    # do stdKeys entries
    for key in stdKeys:
        c = firstValue(rec.get(key))
        if isinstance(c, str) and "datetime_" in key:
            # keep the part before "T"; a value without "T" is left unchanged
            c = c.split("T", 1)[0]
        entry[key] = c
    # do extraKeys entries: blank placeholders, with license preset
    for key in extraKeys:
        entry[key] = ""
    entry["license"] = "CC BY 4.0"
    # do dataProviderFileKeys entries
    entry["|dataProviderFile"] = {key: firstValue(rec.get(key)) for key in dataProviderFileKeys}
    # do dataProviderExtraKeys entries; the blank default is replaced by the record's
    # value (None when the record lacks the field) for every key listed above
    extra = {"source_variables": ""}
    for key in dataProviderExtraKeys:
        extra[key] = firstValue(rec.get(key))
    entry["|dataProviderExtra"] = extra
    # do esgfIndexKeys entries
    entry["|esgfIndex"] = {key: firstValue(rec.get(key)) for key in esgfIndexKeys}
    srcIdDict["source_id"][srcId] = entry

clear_output(wait=False)
print("len(srcIdDict[\"source_id\"].keys()):", len(srcIdDict["source_id"].keys()))
# Write all out; mode "w" truncates any existing file
oF = "../input4MIPs_source_id.json"
with open(oF, "w") as fH:
    json.dump(srcIdDict, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)

len(srcIdDict["source_id"].keys()): 185
CPU times: user 105 ms, sys: 4.68 ms, total: 110 ms
Wall time: 110 ms


## augment source_id with input4mips-cmor-tables values

In [7]:
%%time
# read input4mips-cmor-tables holdings
srcIdGithub = "https://raw.githubusercontent.com/PCMDI/input4MIPs-cmor-tables/master/input4MIPs_source_id.json"
js = requests.get(srcIdGithub, timeout=60)
js.raise_for_status()  # a 404 body is not JSON; report the HTTP error instead
srcIds = json.loads(js.text)

# source from input4mips-cmor-tables: region, title (ESGF version), variable_id (per file in SOLR)
unmatched = []  # GitHub source_ids with no entry in srcIdDict
for count, srcId in enumerate(srcIds["source_id"].keys()):
    print("srcId:", srcId)
    ghEntry = srcIds["source_id"][srcId]
    target = srcIdDict["source_id"].get(srcId)
    if target is None:
        # nothing to augment for this source_id; remember it rather than raising KeyError
        unmatched.append(srcId)
        clear_output(wait=False)
        continue
    if "region" in ghEntry:
        a = ghEntry["region"]
        print("source_id:region", a)
        target["region"] = a
    if "title" in ghEntry:
        a = ghEntry["title"]
        print("source_id:title", a)
        target["title"] = a
    if "source_variables" in ghEntry:
        a = ghEntry["source_variables"]
        print("source_id:source_variables", a)
        target["|dataProviderExtra"]["source_variables"] = a
    clear_output(wait=False)
if unmatched:
    print("source_id entries not in srcIdDict (skipped):", unmatched)
# Write all out; mode "w" truncates any existing file
oF = "../input4MIPs_source_id.json"
with open(oF, "w") as fH:
    json.dump(srcIdDict, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)

CPU times: user 42.2 ms, sys: 18.6 ms, total: 60.8 ms
Wall time: 377 ms


## Check institution_id entries in source_id

In [70]:
# Gather each distinct institution_id held in the in-memory source_id table
instIds = []
for srcEntry in srcIdDict["source_id"].values():
    instId = srcEntry["institution_id"]
    if instId in instIds:
        continue  # already recorded
    instIds.append(instId)

instIds.sort()

# Create other CV entries

## create other *.json entries - lifted directly from the repo

In [21]:
%%time
# read input4mips_CVs: fetch each CV JSON file and bind it to a notebook-level
# variable named after the CV (DRS, activity_id, ...)
# NOTE(review): the saved cell output lists an institution_id fetch, but it is not in
# CVList below - confirm whether it was dropped on purpose.
rawPath = "https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/"
CVList = ["DRS", "activity_id", "dataset_category", "license", "mip_era",
          "product", "required_global_attributes", "target_mip", "tracking_id"]
# loop through entries
for count, cv in enumerate(CVList):
    path = "".join([rawPath, "input4MIPs_", cv, ".json"])
    print(path)
    js = requests.get(path, timeout=60)
    js.raise_for_status()  # a missing file should fail here, not in the JSON decoder
    # explicit globals() instead of bare vars(): same namespace at cell level, clearer intent
    globals()[cv] = json.loads(js.text)

https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_DRS.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_activity_id.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_dataset_category.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_institution_id.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_license.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_mip_era.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_product.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_required_global_attributes.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_target_mip.json
https://raw.githubusercontent.com/PCMDI/input4MIPs_CVs/main/input4MIPs_tracking_id.json
CPU times: user 77.2 ms, sys: 21.2 ms, total: 98.4 ms
Wall time: 2.07 s


## create institution_id from source_id entries

In [14]:
# Read the source_id CV from disk and decode it
srcIdPath = "../input4MIPs_source_id.json"
with open(srcIdPath, mode="r") as fH:
    rawText = fH.read()
    js_srcId = json.loads(rawText)

#js_srcId["source_id"].keys()
#js_srcId["source_id"]["ACCESS1-3-rcp85-1-0"]["institution_id"]

In [25]:
# Collect the distinct institution_id values from the source_id CV read from disk
instId = []
for srcEntry in js_srcId["source_id"].values():
    instIdTmp = srcEntry["institution_id"]
    if instIdTmp in instId:
        continue  # seen before
    instId.append(instIdTmp)
instId.sort()
instId

['CCCma',
 'CNRM-Cerfacs',
 'IACETH',
 'IAMC',
 'ImperialCollege',
 'MOHC',
 'MPI-B',
 'MPI-M',
 'MRI',
 'NASA-GSFC',
 'NCAR',
 'NCAS',
 'PCMDI',
 'PNNL-JGCRI',
 'SOLARIS-HEPPA',
 'UCI',
 'UColorado',
 'UReading',
 'UoM',
 'UofMD',
 'VUA']

In [26]:
# write to input4MIPs_institution_id.json
tmp = {}
tmp["institution_id"] = instId
# Write all out; mode "w" truncates any existing file, and the context manager
# closes the handle even if json.dump raises
oF = "../input4MIPs_institution_id.json"
with open(oF, "w") as fH:
    json.dump(tmp, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)

## create composite CV

In [41]:
# CV sections to collate; each is read from ../input4MIPs_<name>.json
CVExportList = ["activity_id", "dataset_category", "DRS", "institution_id", "license", "mip_era",
                "product", "required_global_attributes", "source_id", "target_mip", "tracking_id"]
CVExportList.sort()

# Collate all CVs in master
tmp = {}
tmp["CV"] = {}
for key in CVExportList:
    fileName = "".join(["input4MIPs_", key, ".json"])
    filePath = os.path.join("..", fileName)
    with open(filePath, 'r') as fH:
        keyDict = json.load(fH)
    # each file is read for the top-level key matching its CV name
    tmp["CV"][key] = keyDict[key]

# Write collated CVs to file
os.makedirs("../CVs", exist_ok=True)  # no error if the directory is already there
oF = "../CVs/input4MIPs_CV.json"
# mode "w" truncates any existing file; the context manager closes the handle on error too
with open(oF, "w") as fH:
    json.dump(tmp, fH, ensure_ascii=True, sort_keys=True, indent=4, separators=(",", ":"),)

In [34]:
# Display the top-level sections held under "CV" in the collated dictionary
tmp["CV"].keys()

dict_keys(['DRS', 'activity_id', 'dataset_category', 'institution_id', 'license', 'mip_era', 'product', 'required_global_attributes', 'source_id', 'target_mip', 'tracking_id'])