# Attempt to remap variables to MIP tables

Imports

In [1]:
import json, os
from collections import defaultdict
from copy import copy, deepcopy
import glob
import shutil
import hashlib

# Add dreqPy v01.00.33 to path
import sys
sys.path.insert(0, os.path.join(os.environ['HOME'], 'CDDS/dreq/dreq_versions/01.00.33/'))
from dreqPy import dreq

Specify directories

In [2]:
CMIP6_LOCATION = os.path.join(os.environ['HOME'], 'CDDS/github/cmip6-cmor-tables/Tables')
OUTPUT_LOCATION = '../../Tables'

Load miptables into a dictionary

In [3]:
mip_tables_list = os.listdir(CMIP6_LOCATION)
# Ignore non-mip table files
for i in ['CMIP6_CV.json', 'CMIP6_coordinate.json', 'CMIP6_formula_terms.json',
          'CMIP6_grids.json', 'CMIP6_input_example.json']:
    mip_tables_list.remove(i)

    
# Read variable data into a dictionary structure ignore header for the moment.
mip_tables_data = {}

for mip_table_name in mip_tables_list:
    with open(os.path.join(CMIP6_LOCATION, mip_table_name)) as handle:
        table_data = json.load(handle)
        table_name = table_data['Header']['table_id'].split()[-1]
        
        mip_tables_data[table_name] = table_data['variable_entry']


In [4]:
mip_tables_data['Amon']['tas']

{'frequency': 'mon',
 'modeling_realm': 'atmos',
 'standard_name': 'air_temperature',
 'units': 'K',
 'cell_methods': 'area: time: mean',
 'cell_measures': 'area: areacella',
 'long_name': 'Near-Surface Air Temperature',
 'comment': 'near-surface (usually, 2 meter) air temperature',
 'dimensions': 'longitude latitude time height2m',
 'out_name': 'tas',
 'type': 'real',
 'positive': '',
 'valid_min': '',
 'valid_max': '',
 'ok_min_mean_abs': '',
 'ok_max_mean_abs': ''}

Integrate data from the CMIP6 data request. Add unique identifiers for provenance information and insert QC ranges held in the data request, but not propagated to the mip tables previously

In [5]:
# get dictionary of uids corresponding to CMOR variables from data request
dq = dreq.loadDreq()
dreq_uids = {(i.mipTable, i.label): i.uid for i in dq.inx.CMORvar.uid.values()}
QC_FIELDS = ['valid_min', 'valid_max', 'ok_min_mean_abs', 'ok_max_mean_abs']

# loop over tables
for table in mip_tables_data:
    for variable in mip_tables_data[table]:
        # add data request uid to mip tables, to be used in provenance data later.
        uid = dreq_uids[(table, variable)]
        mip_tables_data[table][variable]['dreq_uid'] = uid
        
        # add QC information held in data request if available
        qc_key = '{}-{}'.format(table, variable)
        if qc_key in dq.inx.qcranges.label:
            
            qc_info_uid = dq.inx.qcranges.label[qc_key]
            qc_info_obj = dq.inx.uid[qc_info_uid[0]]
            qc_info = {}
            for field in QC_FIELDS:
                value = getattr(qc_info_obj, field)
                # Some qc entries have another object as one of their fields.
                # Filter these out as we can't handle this.
                if isinstance(value, (int, float)):
                    qc_info[field] = value
            # Update MIP table QC information        
            mip_tables_data[table][variable].update(qc_info)




MIP table entries now look like:

In [6]:
mip_tables_data['Amon']['tas']

{'frequency': 'mon',
 'modeling_realm': 'atmos',
 'standard_name': 'air_temperature',
 'units': 'K',
 'cell_methods': 'area: time: mean',
 'cell_measures': 'area: areacella',
 'long_name': 'Near-Surface Air Temperature',
 'comment': 'near-surface (usually, 2 meter) air temperature',
 'dimensions': 'longitude latitude time height2m',
 'out_name': 'tas',
 'type': 'real',
 'positive': '',
 'valid_min': 170.0,
 'valid_max': 350.0,
 'ok_min_mean_abs': 255.0,
 'ok_max_mean_abs': 295.0,
 'dreq_uid': 'bab9237c-e5dd-11e5-8482-ac72891c3257'}

Assign default MIP table prefix based on realm

In [7]:
realms = {
    "aerosol": "AE", 
    "atmos": "AP", 
    "atmosChem": "AC", 
    "land": "LP", 
    "landIce": "LI", 
    "ocean":"OP", 
    "ocnBgchem": "OB", 
    "seaIce": "SI", 
}

Reassign variables to new MIP tables. For clarity I've replaced 1hrCM with monDiurnal -- this affects variables in the E1hrClimMon table which are reported as monthly means sampled at each hour of the day, e.g. average of values sampled at 00Z, 01Z, ... 22Z, 23Z.

In [8]:
new_tables = defaultdict(lambda: defaultdict(list))
new_table_frequency = {}
for mip_table_name, data in mip_tables_data.items():
    
    for var_name, entry in data.items():
        dimensions = entry['dimensions'].split()
        prefix = realms[entry['modeling_realm'].split()[0]]
        frequency = entry['frequency']
        cell_methods = entry['cell_methods']
        
        # replace monC (monthly climatology) with monClim for clarity
        if frequency == 'monC':
            frequency = 'monClim'
        # replace 1hrCM (monthly mean sampled at defined time of day) with monDiurnal 
        elif frequency == '1hrCM':
            frequency = 'monDiurnal'
        
        # Construct suffixes
        suffix = ''
        
        if 'site' in dimensions:
            # Site specific (CFsites)
            suffix = 'Site'
        elif mip_table_name in ['IyrAnt', 'IfxAnt', 'ImonAnt']:
            # Glacier-Ice-Antarctica    
            prefix = 'GIA'
        elif mip_table_name in ['IyrGre', 'IfxGre', 'ImonGre']:
            # Glacier-Ice-Greenland    
            prefix = 'GIG' 
        elif frequency == 'fx':
            # Don't separate fixed fields
            pass
        elif 'longitude' not in dimensions and 'latitude' in dimensions:
            # zonal means
            suffix = 'Z'
        elif any([i in dimensions for i in ['alevel', 'alevhalf', 'olevel', 'olevhalf']]) and frequency != 'fx':
            # atmosphere & ocean levels
            suffix = 'Lev'
        
        new_table = '{}{}{}'.format(prefix, frequency, suffix)

        new_tables[new_table][var_name].append((mip_table_name, var_name))

List new tables

In [9]:
len(new_tables), sorted(list(new_tables.keys()))

(78,
 ['ACmon',
  'ACmonZ',
  'AE1hr',
  'AE3hrPt',
  'AE3hrPtLev',
  'AE6hr',
  'AE6hrPt',
  'AE6hrPtLev',
  'AEday',
  'AEmon',
  'AEmonLev',
  'AEmonZ',
  'AEsubhrPt',
  'AEsubhrPtSite',
  'AP1hr',
  'AP1hrPt',
  'AP3hr',
  'AP3hrPt',
  'AP3hrPtLev',
  'AP6hr',
  'AP6hrPt',
  'AP6hrPtLev',
  'AP6hrPtZ',
  'APday',
  'APdayLev',
  'APdayZ',
  'APfx',
  'APfxSite',
  'APmon',
  'APmonClim',
  'APmonClimLev',
  'APmonDiurnal',
  'APmonLev',
  'APmonZ',
  'APsubhrPt',
  'APsubhrPtLev',
  'APsubhrPtSite',
  'GIAfx',
  'GIAmon',
  'GIAyr',
  'GIGfx',
  'GIGmon',
  'GIGyr',
  'LI3hrPt',
  'LI6hrPt',
  'LIday',
  'LIfx',
  'LImon',
  'LIsubhrPtSite',
  'LP3hr',
  'LP3hrPt',
  'LP6hrPt',
  'LPday',
  'LPfx',
  'LPmon',
  'LPyr',
  'LPyrPt',
  'OBday',
  'OBmon',
  'OBmonLev',
  'OByr',
  'OByrLev',
  'OP3hrPt',
  'OPday',
  'OPdec',
  'OPdecLev',
  'OPdecZ',
  'OPfx',
  'OPmon',
  'OPmonClim',
  'OPmonClimLev',
  'OPmonLev',
  'OPmonZ',
  'OPyr',
  'OPyrLev',
  'SIday',
  'SImon',
  'SImonPt

Example of variables in a particular table

In [10]:
new_tables['APsubhrPt']

defaultdict(list,
            {'hfls': [('Esubhr', 'hfls')],
             'hfss': [('Esubhr', 'hfss')],
             'huss': [('Esubhr', 'huss')],
             'pr': [('Esubhr', 'pr')],
             'prc': [('Esubhr', 'prc')],
             'prw': [('Esubhr', 'prw')],
             'ps': [('Esubhr', 'ps')],
             'rlut': [('Esubhr', 'rlut')],
             'rsdt': [('Esubhr', 'rsdt')],
             'rsut': [('Esubhr', 'rsut')],
             'tas': [('Esubhr', 'tas')]})

In [11]:
new_tables['OPmonLev']

defaultdict(list,
            {'ocontempdiff': [('Emon', 'ocontempdiff')],
             'ocontemppadvect': [('Emon', 'ocontemppadvect')],
             'ocontemppmdiff': [('Emon', 'ocontemppmdiff')],
             'ocontemppsmadvect': [('Emon', 'ocontemppsmadvect')],
             'ocontemprmadvect': [('Emon', 'ocontemprmadvect')],
             'ocontemptend': [('Emon', 'ocontemptend')],
             'opottempdiff': [('Emon', 'opottempdiff')],
             'opottemppadvect': [('Emon', 'opottemppadvect')],
             'opottemppmdiff': [('Emon', 'opottemppmdiff')],
             'opottemppsmadvect': [('Emon', 'opottemppsmadvect')],
             'opottemprmadvect': [('Emon', 'opottemprmadvect')],
             'opottemptend': [('Emon', 'opottemptend')],
             'osaltdiff': [('Emon', 'osaltdiff')],
             'osaltpadvect': [('Emon', 'osaltpadvect')],
             'osaltpmdiff': [('Emon', 'osaltpmdiff')],
             'osaltpsmadvect': [('Emon', 'osaltpsmadvect')],
             'osal

Fixed tables:

In [12]:
{i:j for i,j in new_tables.items() if 'fx' in i}

{'APfxSite': defaultdict(list,
             {'latitude': [('CFsubhr', 'latitude')],
              'longitude': [('CFsubhr', 'longitude')]}),
 'LPfx': defaultdict(list,
             {'clayfrac': [('Efx', 'clayfrac')],
              'fldcapacity': [('Efx', 'fldcapacity')],
              'ksat': [('Efx', 'ksat')],
              'rootdsl': [('Efx', 'rootdsl')],
              'sandfrac': [('Efx', 'sandfrac')],
              'slthick': [('Efx', 'slthick')],
              'vegHeight': [('Efx', 'vegHeight')],
              'wilt': [('Efx', 'wilt')],
              'areacellr': [('fx', 'areacellr')],
              'mrsofc': [('fx', 'mrsofc')],
              'orog': [('fx', 'orog')],
              'rootd': [('fx', 'rootd')],
              'sftgif': [('fx', 'sftgif')]}),
 'APfx': defaultdict(list,
             {'ps': [('Efx', 'ps')],
              'rld': [('Efx', 'rld')],
              'rlu': [('Efx', 'rlu')],
              'rsd': [('Efx', 'rsd')],
              'rsu': [('Efx', 'rsu')],
          

The fixed variables could do with some re-arrangement, e.g. siltfrac should be in LPfx, APfxSite should be removed. Leave this for after first draft

## Branded variable assignment

Add branded variable name as a variable attribute, but this methodology likely needs updating

In [13]:
INTERVAL_LOOKUP = {
    'subhrPt': 'subhr',
    '1hr': '1hr', 
    '1hrCM': '1hrCM', 
    '1hrPt': '1hr',
    '3hr': '3hr', 
    '3hrPt': '3hr',
    '6hr': '6hr', 
    '6hrPt': '6hr',
    'day': 'day',
    'mon': 'mon', 
    'monC': 'mon', 
    'monPt': 'mon',
    'yr': 'yr', 
    'yrPt': 'yr',
    'dec':'dec',
    'fx': 'fx',
}


def interval_label(variable):
    return INTERVAL_LOOKUP[variable['frequency']]


REALM_LOOKUP = {
    'atmos': 'ap',
    'atmosChem': 'ac',
    'aerosol': 'ae',
    'land': 'ld',
    'landIce': 'li',
    'ocean': 'op',
    'ocnBgchem': 'oc',
    'seaIce': 'si',
}


def realm_label(variable):
    # not taking into account secondary realms, will have errors
    return REALM_LOOKUP[variable['modeling_realm'].split(' ')[0]]


TIME_LOOKUP = {
    'time': 'tav',
    'time1': 'tpt',
    'time2': 'tcla',
    'time3': 'tcld',
}


def temporal_label(variable):
    # default of "none" to cover fixed field case
    label = 'none'
    for dim in variable['dimensions'].split(' '):
        if dim in TIME_LOOKUP:
            label = TIME_LOOKUP[dim]
            break
    return label

VERTICAL_LOOKUP = {
    'sdepth': 'l',
    'olevel': 'l',
    'alevel': 'l',
    'alevhalf': 'l',
    'olevhalf': 'l',
    'rho': 'rhon',
    'height2m': 'h02',
    'height10m': 'h010',
    'height100m': 'h0100',
    'sdepth1': 'z01s',
    'sdepth10': 'z010',
    'depth0m': 'z00',
    'depth100m': 'z0100',
    'depth300m': 'z0300',
    'depth700m': 'z0700',
    'depth2000m': 'z02000',
    'olayer100m': 'z0100',
    'p10': 'p010',
    'p100': 'p0100',
    'p220': 'p0220',
    'p500': 'p0500',
    'p560': 'p0560',
    'pl700': 'p0700',
    'p840': 'p0840',
    'p850': 'p0850',
    'p1000': 'p01000',
    'alt16': 'z16',
    'alt40': 'z40',
    'plev3': 'p3',
    'plev4': 'p4',
    'plev8': 'p8',
    'plev7c': 'p7c',
    'plev7h': 'p7h',
    'plev19': 'p19',
    'plev27': 'p27',
    'plev39': 'p39',
}

def vertical_label(variable):
    for dim in variable['dimensions'].split(' '):
        if dim in VERTICAL_LOOKUP:
            return VERTICAL_LOOKUP[dim]
    # default:
    return 'z0'
        

def horizontal_label(variable):
    dimensions = set(variable['dimensions'].split(' '))
    latlon = {'latitude', 'longitude'}
    ant = {'xant', 'yant'}
    gre = {'xgre', 'ygre'}
    latbas = {'latitude', 'basin'}
    
    if (latlon.intersection(dimensions) == latlon or
        ant.intersection(dimensions) == ant or
        gre.intersection(dimensions) == gre):
        # simple lat lon
        result = 'hxy'
    elif ('latitude' in dimensions and 
        'longitude' not in dimensions and 
        'basin' not in dimensions):
        # zonal means, but not by basin
        result = 'hy'
    elif not {'latitude', 'yant', 'ygre', 'gridLatitude', 'site', 'oline', 'oline'}.intersection(dimensions):
        # spatial means means
        result = 'hm'
    elif latbas.intersection(dimensions) == latbas:
        # basin means
        result = 'hys'
    elif 'site' in dimensions:
        # CF sites
        result = 'hxys'
    elif 'oline' in dimensions or 'siline' in dimensions:
        # transports
        result = 'ht'
    else:
        raise KeyError('Could not determine label for "{}"'.format(variable['dimensions']))
    
    return result


def branded_variable_name(variable):
    # Order of terms after variable label
    functions = [interval_label, realm_label, temporal_label, vertical_label, horizontal_label]
    return '{}_{}'.format(variable['out_name'], '-'.join([f(variable) for f in functions]))

Examples

In [14]:
branded_variable_name(mip_tables_data['Emon']['hus27']), branded_variable_name(mip_tables_data['Emon']['hus'])

('hus_mon-ap-tav-p27-hxy', 'hus_mon-ap-tav-p7h-hxy')

In [15]:
branded_variable_name(mip_tables_data['E1hrClimMon']['rlut'])

'rlut_1hrCM-ap-tcld-z0-hxy'

Put together a list of branded variable names and add the branded variable name to the MIP table entry.
There are a lot of "collisions", i.e. multiple variables with the same branded variable name, most of which relate to the icesheet variables over Greenland and Antarctica. For now add suffix "A" or "G"

In [16]:
branded_vns = defaultdict(list)

for table in mip_tables_data:
    for variable, entry in mip_tables_data[table].items():
        bvn = branded_variable_name(entry)
        # allow for the Glacier ice to be distinguished by appending 'G' for greenland and 'A' for antarctica
        if 'Gre' in table:
            bvn += 'G'
        if 'Ant' in table:
            bvn += 'A'
        entry['branded_variable_name'] = bvn
        branded_vns[bvn].append((table, variable))

for bvn in branded_vns:
    if len(branded_vns[bvn]) != 1:
        print(bvn, branded_vns[bvn])

ps_3hr-ap-tpt-z0-hxy [('3hr', 'ps'), ('CF3hr', 'ps'), ('E3hrPt', 'ps')]
ps_mon-ap-tav-z0-hxy [('AERmon', 'ps'), ('Amon', 'ps'), ('CFmon', 'ps'), ('Emon', 'ps')]
prsn_mon-ap-tav-z0-hxy [('Amon', 'prsn'), ('Omon', 'prsn')]
sbl_mon-li-tav-z0-hxy [('Amon', 'sbl'), ('LImon', 'sbl')]
clt_day-ap-tav-z0-hxy [('Eday', 'clt'), ('day', 'clt')]
hfls_day-ap-tav-z0-hxy [('Eday', 'hfls'), ('day', 'hfls')]
hfss_day-ap-tav-z0-hxy [('Eday', 'hfss'), ('day', 'hfss')]
rls_day-ap-tav-z0-hxy [('Eday', 'rls'), ('day', 'rls')]
rss_day-ap-tav-z0-hxy [('Eday', 'rss'), ('day', 'rss')]


Some of the above are dealt with later.

## Check for conflicts

Build lists of new MIP table, variable and where the variable came from

In [17]:
new_table_mapping = defaultdict(lambda: defaultdict(list))
new_table_mapping_with_conflicts = defaultdict(lambda: defaultdict(list))
for table in new_tables:
    for variable in new_tables[table]:
        new_table_mapping[table][variable]+= new_tables[table][variable]
        if len(new_tables[table][variable]) > 1:
            new_table_mapping_with_conflicts[table][variable] += new_tables[table][variable]


Sample of the entries:

In [18]:
new_table_mapping['AP6hrPtLev']

defaultdict(list,
            {'hus': [('6hrLev', 'hus')],
             'pfull': [('6hrLev', 'pfull')],
             'ta': [('6hrLev', 'ta')],
             'ua': [('6hrLev', 'ua')],
             'va': [('6hrLev', 'va')]})

List variables with table conflicts, i.e. where a new variable has multiple corresponding original variables, and report whether/how the variables differ.

In [19]:
new_table_mapping_with_conflicts

defaultdict(<function __main__.<lambda>()>,
            {'AP3hrPt': defaultdict(list,
                         {'ps': [('3hr', 'ps'),
                           ('CF3hr', 'ps'),
                           ('E3hrPt', 'ps')]}),
             'APday': defaultdict(list,
                         {'clt': [('Eday', 'clt'), ('day', 'clt')],
                          'hfls': [('Eday', 'hfls'), ('day', 'hfls')],
                          'hfss': [('Eday', 'hfss'), ('day', 'hfss')],
                          'hus': [('Eday', 'hus'), ('day', 'hus')],
                          'rls': [('Eday', 'rls'), ('day', 'rls')],
                          'rss': [('Eday', 'rss'), ('day', 'rss')],
                          'ta': [('Eday', 'ta'), ('day', 'ta')],
                          'ua': [('Eday', 'ua'), ('day', 'ua')],
                          'va': [('Eday', 'va'), ('day', 'va')],
                          'wap': [('Eday', 'wap'), ('day', 'wap')],
                          'zg': [('Eday', 'zg'), ('day', 

In [20]:
for table, variables in new_table_mapping_with_conflicts.items():
    
    for variable, mappings in variables.items():
        print('{}/{}'.format(table, variable))
        ref_mapping = mappings[0]
        ref_entry = mip_tables_data[ref_mapping[0]][ref_mapping[1]]
        differences = False
        for mapping in mappings[1:]:
            test_entry = mip_tables_data[mapping[0]][mapping[1]]
            for field in ref_entry:
                if field == 'dreq_uid' or field in QC_FIELDS:
                    # Don't report data request identifier differences or qc information
                    continue
                if ref_entry[field] != test_entry[field]:
                    differences = True
                    print('\t{}'.format(field))
                    print('\t\t{}/{} {}={}'.format(ref_mapping[0], ref_mapping[1], field, ref_entry[field]))
                    print('\t\t{}/{} {}={}'.format(mapping[0], mapping[1], field, test_entry[field]))
            if not differences:
                print('\t{}/{}, {}/{}: no differences'.format(ref_mapping[0], ref_mapping[1], mapping[0], mapping[1]))

AP3hrPt/ps
	3hr/ps, CF3hr/ps: no differences
	3hr/ps, E3hrPt/ps: no differences
APday/clt
	cell_methods
		Eday/clt cell_methods=area: mean where land time: mean
		day/clt cell_methods=area: time: mean
APday/hfls
	cell_methods
		Eday/hfls cell_methods=area: mean where land time: mean
		day/hfls cell_methods=area: time: mean
APday/hfss
	cell_methods
		Eday/hfss cell_methods=area: mean where land time: mean
		day/hfss cell_methods=area: time: mean
APday/hus
	dimensions
		Eday/hus dimensions=longitude latitude plev19 time
		day/hus dimensions=longitude latitude plev8 time
	branded_variable_name
		Eday/hus branded_variable_name=hus_day-ap-tav-p19-hxy
		day/hus branded_variable_name=hus_day-ap-tav-p8-hxy
APday/rls
	cell_methods
		Eday/rls cell_methods=area: mean where land time: mean
		day/rls cell_methods=area: time: mean
APday/rss
	cell_methods
		Eday/rss cell_methods=area: mean where land time: mean
		day/rss cell_methods=area: time: mean
APday/ta
	dimensions
		Eday/ta dimensions=longitud

### Manual changes required:

Merge entries:

* AP3hrPt ps ['3hr', 'CF3hr', 'E3hrPt']
* APmon ps ['AERmon', 'Amon', 'CFmon', 'Emon']   

In [21]:
print(new_table_mapping['AP3hrPt']['ps']) 
new_table_mapping['AP3hrPt']['ps'] = new_table_mapping['AP3hrPt']['ps'][:1]
print(new_table_mapping['AP3hrPt']['ps']) 
print(new_table_mapping['APmon']['ps']) 
new_table_mapping['APmon']['ps'] = new_table_mapping['APmon']['ps'][1:2] # choose Amon
print(new_table_mapping['APmon']['ps'])

[('3hr', 'ps'), ('CF3hr', 'ps'), ('E3hrPt', 'ps')]
[('3hr', 'ps')]
[('AERmon', 'ps'), ('Amon', 'ps'), ('CFmon', 'ps'), ('Emon', 'ps')]
[('Amon', 'ps')]


Retain entry with cell_methods "area: time: mean" (MIP table "day"). Entries marked with "area: mean where land time: mean" should be reconstructable using land sea mask.
* APday clt ['Eday', 'day']
* APday hfls ['Eday', 'day']
* APday hfss ['Eday', 'day']
* APday rls ['Eday', 'day']
* APday rss ['Eday', 'day']

In [22]:
for variable in ['clt', 'hfls', 'hfss', 'rls', 'rss']:
    print(variable)
    print(new_table_mapping['APday'][variable])
    new_table_mapping['APday'][variable] = [('day', variable)]
    print(new_table_mapping['APday'][variable])

clt
[('Eday', 'clt'), ('day', 'clt')]
[('day', 'clt')]
hfls
[('Eday', 'hfls'), ('day', 'hfls')]
[('day', 'hfls')]
hfss
[('Eday', 'hfss'), ('day', 'hfss')]
[('day', 'hfss')]
rls
[('Eday', 'rls'), ('day', 'rls')]
[('day', 'rls')]
rss
[('Eday', 'rss'), ('day', 'rss')]
[('day', 'rss')]


Retain entry from Amon with cell_methods "area: time: mean" rather than entry from LImon "area: mean where land time: mean"
* LImon sbl ['Amon', 'LImon'] 

In [23]:
print(new_table_mapping['LImon']['sbl']) 
new_table_mapping['LImon']['sbl'] = [('Amon', 'sbl')]
print(new_table_mapping['LImon']['sbl']) 

[('Amon', 'sbl'), ('LImon', 'sbl')]
[('Amon', 'sbl')]


Entries with different pressure level sets (plev19 and plev8). Simplest option would be to suffix variable with number of pressure levels, e.g. add APday/hus19 (from Eday/hus) and APday/hus8 (from day/hus).  
* APday hus ['Eday', 'day'] -> APday/hus19, APday/hus8
* APday ta ['Eday', 'day'] -> APday/ta19, APday/ta8
* APday ua ['Eday', 'day'] -> APday/ua19, APday/ua8
* APday va ['Eday', 'day'] -> APday/va19, APday/va8
* APday wap ['Eday', 'day'] -> APday/wap19, APday/wap8
* APday zg ['Eday', 'day'] -> APday/zg19, APday/zg8
* APmon hus ['Amon', 'Emon'] -> APmon/hus19, APmon/hus7h (***Note 1**)
* APmon ua ['Amon', 'Emon'] -> APmon/ua19, APmon/ua7h (***Note 1**)
* APmon va ['Amon', 'Emon'] -> APmon/va19, APmon/va7h (***Note 1**)
	
***Note 1**: The Amon entry hadd cell methods "time: mean", whereas the Emon entry had "area: time: mean". Propose to keep the "area: time: mean" entry for consistency.

In [24]:
remappings = [
    (('Eday', 'hus'), ('APday', 'hus19')),
    (('day', 'hus'), ('APday', 'hus8')),
    (('Eday', 'ta'), ('APday', 'ta19')),
    (('day', 'ta'), ('APday', 'ta8')),
    (('Eday', 'ua'), ('APday', 'ua19')),
    (('day', 'ua'), ('APday', 'ua8')),
    (('Eday', 'va'), ('APday', 'va19')),
    (('day', 'va'), ('APday', 'va8')),
    (('Eday', 'wap'), ('APday', 'wap19')),
    (('day', 'wap'), ('APday', 'wap8')),
    (('Eday', 'zg'), ('APday', 'zg19')),
    (('day', 'zg'), ('APday', 'zg8')),
    (('Amon', 'hus'), ('APmon', 'hus19')),
    (('Emon', 'hus'), ('APmon', 'hus7h')),
    (('Amon', 'ua'), ('APmon', 'ua19')),
    (('Emon', 'ua'), ('APmon', 'ua7h')),
    (('Amon', 'va'), ('APmon', 'va19')),
    (('Emon', 'va'), ('APmon', 'va7h'))
]

for (original_table, original_variable), (new_table, new_variable) in remappings:
    new_table_mapping[new_table][original_variable] = []
    new_table_mapping[new_table][new_variable] = [(original_table, original_variable)]



These entries have different cell methods/measures and/or grids. Propose placing appropriate entries in two MIP tables
* APmon prsn ['Amon', 'Omon'] : 
  * Amon/prsn -> APmon/prsn (atmosphere grid, "area: time: mean") and 
  * Omon/prsn -> OPmon/prsn (ocean grid, "area:mean where ice_free_sea over sea time:mean")
    * Set modeling_realm to ocean

	

In [25]:
new_table_mapping['OPmon']['prsn'] = [('Omon', 'prsn')]
mip_tables_data['Omon']['prsn']['modeling_realm'] = 'ocean'
mip_tables_data['Omon']['prsn']['branded_variable_name'] = branded_variable_name(mip_tables_data['Omon']['prsn'])
new_table_mapping['APmon']['prsn'] = [('Amon', 'prsn')]



Sanity check: the following should give no output

In [26]:
for table in new_tables:
    for variable in new_tables[table]:
        if len(new_table_mapping[table][variable]) > 1:
            print(table, variable, new_table_mapping[table][variable])

### Additional manual changes

APfxSite table doesn't seem necessary

In [27]:
new_table_mapping['APfxSite']

defaultdict(list,
            {'latitude': [('CFsubhr', 'latitude')],
             'longitude': [('CFsubhr', 'longitude')]})

In [28]:
del new_table_mapping['APfxSite']

## Write out to files


In [29]:
for filename in os.listdir(OUTPUT_LOCATION):
    try:
        os.unlink(os.path.join(OUTPUT_LOCATION, filename))
    except IsADirectoryError:
        pass

In [30]:
from datetime import datetime

header = {'data_specs_version': '6.3.0.0',
 'cmor_version': '4.0',
 'table_id': '<to be set>',
 'table_date': datetime.now().strftime('%Y-%m-%d'),
 'missing_value': '1e20',
 'int_missing_value': '-999',
 'product': 'model-output',
 'approx_interval': '<NEEDS WORK>',
 'generic_levels': '',  # overwritten if necessary
# 'mip_era': 'CMIP6Plus',  # not to be included
 'Conventions': 'CF-1.7 CMIP-6.3'}

Routines to set and validate checksums in JSON documents. Checksum creation and validation 

In [31]:
def calculate_checksum(dictionary, overwrite=True, checksum_location='Header'):
    """
    Calculate the checksum for dictionary and add it to the Header

    Parameters
    ----------
    dictionary: dict
        The dictionary to set the checksum for.
    overwrite: bool
        Overwrite the existing checksum (default True).
    checksum_location: str
        sub-dictionary to look for in /add the checksum to.

    Raises
    ------
    RuntimeError
        If the ``checksum`` key already exists and ``overwrite`` is
        False.
    """
    if 'checksum' in dictionary[checksum_location]:
        if not overwrite:
            raise RuntimeError('Checksum already exists.')
        del dictionary[checksum_location]['checksum']
    checksum = _checksum(dictionary)
    dictionary[checksum_location]['checksum'] = checksum


def validate_checksum(dictionary, checksum_location='Header'):
    """
    Validate the checksum in the ``dictionary``.

    Parameters
    ----------
    dictionary: dict
        The dictionary containing the ``checksum`` to validate.
    checksum_location: str
        sub-dictionary to look for in /add the checksum to.

    Raises
    ------
    KeyError
        If the ``checksum`` key does not exist.
    RuntimeError
        If the ``checksum`` value is invalid.
    """
    if 'checksum' not in dictionary[checksum_location]:
        raise KeyError('No checksum to validate')
    dictionary_copy = deepcopy(dictionary)
    del dictionary_copy[checksum_location]['checksum']
    checksum = _checksum(dictionary_copy)
    if dictionary[checksum_location]['checksum'] != checksum:
        msg = ('Expected checksum   "{}"\n'
               'Calculated checksum "{}"').format(dictionary[checksum_location]['checksum'],
                                                  checksum)
        raise RuntimeError(msg)


def _checksum(obj):
    obj_str = json.dumps(obj, sort_keys=True)
    checksum_hex = hashlib.md5(obj_str.encode('utf8')).hexdigest()
    return 'md5: {}'.format(checksum_hex)


In [32]:
# construct new tables

for table_name in new_table_mapping:
    # header information
    table_header = copy(header)
    table_header['table_id'] = table_name
    if table_name.endswith('Lev'):
        if table_name.startswith('A'):
            table_header['generic_levels'] = ['alevel', 'alevhalf']
        elif table_name.startswith('O'):
            table_header['generic_levels'] = ['olevel', 'olevhalf']
        else:
            # shouldn't happen
            raise RuntimeError(table)
    
    # variable entries
    variable_entry = {}
    for variable, original_variable_list in new_table_mapping[table_name].items():
        if len(original_variable_list) == 0:
            # Variables that have been renamed due to conflicts, e.g. APmon/hus -> APmon/hus19 and APmon/hus7h
            continue
        # get name of CMIP6 mip table and variable name
        original_mip_table, original_variable = original_variable_list[0]
        
        # get variable entry from original mip_tables
        entry = copy(mip_tables_data[original_mip_table][original_variable])
        
        # change dimensions into a list of strings
        entry['dimensions'] = entry['dimensions'].split()
        # add a provenance field to indicate where this variable came from in CMIP6 (and before)
        entry['provenance'] = {
            'CMIP6': {
                'mip_table': original_mip_table, 
                'variable_name': original_variable, 
                'dreq_uid': entry['dreq_uid']}}
        # remove data request uid as independent field
        del entry['dreq_uid']
        # Move validation fields into a subsection
        entry['validation'] = {}
        for i in ["ok_max_mean_abs", "ok_min_mean_abs", "valid_max", "valid_min"]:
            entry['validation'][i] = entry[i]
            del entry[i]
        
        # overwrite outnames with variable name 
        if entry['out_name'] != variable:
            print('Overwriting out_name "{}" with "{}"'.format(entry['out_name'], variable))
            entry['out_name'] = variable
                
        variable_entry[variable] = entry
        
    # write out
    new_table = {
        'Header': table_header,
        'variable_entry': variable_entry,
    }
    # Add MD5 checksum to table header
    calculate_checksum(new_table)
    
    validate_checksum(new_table)
    output_file = os.path.join(OUTPUT_LOCATION, '{}.json'.format(table_name))
    
    with open(output_file, 'w') as file_handle:
        json.dump(new_table, file_handle, indent=2, sort_keys=True)
    

Overwriting out_name "hus" with "hus7h"
Overwriting out_name "ta" with "ta7h"
Overwriting out_name "ua" with "ua7h"
Overwriting out_name "va" with "va7h"
Overwriting out_name "wap" with "wap7h"
Overwriting out_name "cldicemxrat" with "cldicemxrat27"
Overwriting out_name "cldwatmxrat" with "cldwatmxrat27"
Overwriting out_name "grplmxrat" with "grplmxrat27"
Overwriting out_name "hus" with "hus27"
Overwriting out_name "hus" with "hus7h"
Overwriting out_name "rainmxrat" with "rainmxrat27"
Overwriting out_name "snowmxrat" with "snowmxrat27"
Overwriting out_name "ta" with "ta27"
Overwriting out_name "ta" with "ta7h"
Overwriting out_name "ua" with "ua7h"
Overwriting out_name "va" with "va7h"
Overwriting out_name "wbptemp" with "wbptemp7h"
Overwriting out_name "zg" with "zg27"
Overwriting out_name "zg" with "zg7h"
Overwriting out_name "hus" with "hus4"
Overwriting out_name "wap" with "wap4"
Overwriting out_name "hus" with "hus19"
Overwriting out_name "hus" with "hus8"
Overwriting out_name "ta"

In [33]:
!cat ../../Tables/ACmon.json


{
  "Header": {
    "Conventions": "CF-1.7 CMIP-6.3",
    "approx_interval": "<NEEDS WORK>",
    "checksum": "md5: 6a2da2bd3b93327ca0cc89b3543294a8",
    "cmor_version": "4.0",
    "data_specs_version": "6.3.0.0",
    "generic_levels": "",
    "int_missing_value": "-999",
    "missing_value": "1e20",
    "product": "model-output",
    "table_date": "2022-09-05",
    "table_id": "ACmon"
  },
  "variable_entry": {
    "flashrate": {
      "branded_variable_name": "flashrate_mon-ac-tav-z0-hxy",
      "cell_measures": "area: areacella",
      "cell_methods": "area: time: mean",
      "comment": "proposed name: lightning_flash_rate (units to be interpreted as 'counts km-2 s-1)",
      "dimensions": [
        "longitude",
        "latitude",
        "time"
      ],
      "frequency": "mon",
      "long_name": "Lightning Flash Rate",
      "modeling_realm": "atmosChem",
      "out_name": "flashrate",
      "positive": "",
      "provenance": {
        "CMIP6": {
          "dreq_uid": "6f691c5

Copy over formula terms and coordinates
*Question: is there value in adding MD5 sums here?*

In [34]:

shutil.copy(os.path.join(CMIP6_LOCATION, 'CMIP6_formula_terms.json'), os.path.join(OUTPUT_LOCATION, 'formula_terms.json'))
shutil.copy(os.path.join(CMIP6_LOCATION, 'CMIP6_coordinate.json'), os.path.join(OUTPUT_LOCATION, 'coordinate.json'))



'../../Tables/coordinate.json'

In [35]:
# Grids file

# Load grid file
grids_file = os.path.join(CMIP6_LOCATION, 'CMIP6_grids.json')
with open(grids_file) as grids_fh:
    grids_data = json.load(grids_fh)

# tweak header used for MIP table
grids_data['Header'] = header
grids_data['Header']['table_id'] = 'grids'
for f in ['generic_levels', 'approx_interval']:
    if f in grids_data:
        del grids_data['Header'][f]

calculate_checksum(grids_data)
        
# write out
output_grids_file = os.path.join(OUTPUT_LOCATION, 'grids.json')
with open(output_grids_file, 'w') as grids_fh:
    json.dump(grids_data, grids_fh, indent=2, sort_keys=True)

In [36]:
grids_data

{'Header': {'data_specs_version': '6.3.0.0',
  'cmor_version': '4.0',
  'table_id': 'grids',
  'table_date': '2022-09-05',
  'missing_value': '1e20',
  'int_missing_value': '-999',
  'product': 'model-output',
  'approx_interval': '<NEEDS WORK>',
  'generic_levels': '',
  'Conventions': 'CF-1.7 CMIP-6.3',
  'checksum': 'md5: 39e7a038c812a390fc0e412653363015'},
 'mapping_entry': {'sample_user_mapping': {'parameter1': 'false_easting',
   'parameter2': 'false_northing',
   'coordinates': 'rlon rlat'}},
 'axis_entry': {'grid_latitude': {'standard_name': 'grid_latitude',
   'units': 'degrees',
   'axis': 'Y',
   'long_name': 'latitude in rotated pole grid',
   'out_name': 'rlat',
   'type': 'double'},
  'grid_longitude': {'standard_name': 'grid_longitude',
   'units': 'degrees',
   'axis': 'X',
   'long_name': 'longitude in rotated pole grid',
   'out_name': 'rlon',
   'type': 'double'},
  'i_index': {'standard_name': '',
   'units': '1',
   'axis': '',
   'long_name': 'first spatial index 

In [37]:
# Pick required pieces out of CV file
cv_file = os.path.join(CMIP6_LOCATION, 'CMIP6_CV.json')

# load data, but drop the top level 'CV' section
with open(cv_file) as cf_handle:
    cv_data = json.load(cf_handle)['CV']

keys_to_keep_in_general_cvs = [
    # 'required_global_attributes',
    'version_metadata',
    'institution_id',  
    'source_type',
    'frequency',
    'grid_label',
    'nominal_resolution',
    'realm',
    'table_id',
    'product',
    # 'tracking_id',     
    # 'further_info_url', # ?
    'realization_index',
    'variant_label',
    'data_specs_version',
    'Conventions',
    'forcing_index',
    'initialization_index',
    'physics_index'
]
# construct dictionary with required keys
generic_cv_data = {
    i: copy(cv_data[i]) for i in keys_to_keep_in_general_cvs
}
project_cv_data = {
    i: copy(cv_data[i]) for i in cv_data if i not in keys_to_keep_in_general_cvs
}   

# Update version metadata
generic_cv_data['version_metadata'] = {
    'CV_collection_modified': datetime.now().strftime('%Y-%m-%d'),
    'CV_collection_version': '6.3.0.0',
    'author': 'Matt Mizielinski <matthew.mizielinski@metoffice.gov.uk>',
    'institution_id': 'MOHC',
    'previous_commit': 'To be added',
    'specs_doc': 'v6.3.0 (link TBC)'
}
project_cv_data['version_metadata'] = {
    'CV_collection_modified': datetime.now().strftime('%Y-%m-%d'),
    'CV_collection_version': '6.3.0.0',
    'author': 'Matt Mizielinski <matthew.mizielinski@metoffice.gov.uk>',
    'institution_id': 'MOHC',
    'previous_commit': 'To be added',
    'specs_doc': 'v6.3.0 (link TBC)'
}

# update list of tables
generic_cv_data['table_id'] = sorted(list(new_tables.keys()))

# Repurpose data_specs_version
# Allow (1 or 2 digits).(1 or 2 digits).(1-3 digits).(1-3 digits), e.g. 6.3.20.119
# use 6.3 to indicate that this is post CMIP6 (6.2) use third index for new groups of variables and fourth for variable updates
generic_cv_data['data_specs_version'] = ['^[[:digit:]]\\{1,2\\}\\.[[:digit:]]\\{1,2\\}\\.[[:digit:]]\\{1,3\\}\\.[[:digit:]]\\{1,3\\}$']

generic_cv_data['Conventions'] = ['^CF-1.7 CMIP-6.[0-3]\\( UGRID-1.0\\)\\{0,\\}$']

# update CMIP6Plus CVs

project_cv_data['DRS']['directory_path_example'] = \
    project_cv_data['DRS']['directory_path_example'].replace('CMIP6', 'CMIP6Plus')
project_cv_data['DRS']['directory_path_sub_experiment_example'] = \
    project_cv_data['DRS']['directory_path_sub_experiment_example'].replace('CMIP6', 'CMIP6Plus')
project_cv_data['license'][0] = \
    project_cv_data['license'][0].replace('CMIP6', 'CMIP6Plus')
project_cv_data['mip_era'] = 'CMIP6Plus'

# Write out

calculate_checksum(generic_cv_data, checksum_location='version_metadata')
calculate_checksum(project_cv_data, checksum_location='version_metadata')

with open(os.path.join(OUTPUT_LOCATION, 'generic_CV.json'), 'w') as cv_fh:
    json.dump(generic_cv_data, cv_fh, indent=2, sort_keys=True)
with open(os.path.join(OUTPUT_LOCATION, 'CMIP6Plus_CV.json'), 'w') as cv_fh:
    json.dump(project_cv_data, cv_fh, indent=2, sort_keys=True)

In [38]:
[i for i in sorted(os.listdir(OUTPUT_LOCATION)) if i.endswith('.json')]

['ACmon.json',
 'ACmonZ.json',
 'AE1hr.json',
 'AE3hrPt.json',
 'AE3hrPtLev.json',
 'AE6hr.json',
 'AE6hrPt.json',
 'AE6hrPtLev.json',
 'AEday.json',
 'AEmon.json',
 'AEmonLev.json',
 'AEmonZ.json',
 'AEsubhrPt.json',
 'AEsubhrPtSite.json',
 'AP1hr.json',
 'AP1hrPt.json',
 'AP3hr.json',
 'AP3hrPt.json',
 'AP3hrPtLev.json',
 'AP6hr.json',
 'AP6hrPt.json',
 'AP6hrPtLev.json',
 'AP6hrPtZ.json',
 'APday.json',
 'APdayLev.json',
 'APdayZ.json',
 'APfx.json',
 'APmon.json',
 'APmonClim.json',
 'APmonClimLev.json',
 'APmonDiurnal.json',
 'APmonLev.json',
 'APmonZ.json',
 'APsubhrPt.json',
 'APsubhrPtLev.json',
 'APsubhrPtSite.json',
 'CMIP6Plus_CV.json',
 'GIAfx.json',
 'GIAmon.json',
 'GIAyr.json',
 'GIGfx.json',
 'GIGmon.json',
 'GIGyr.json',
 'LI3hrPt.json',
 'LI6hrPt.json',
 'LIday.json',
 'LIfx.json',
 'LImon.json',
 'LIsubhrPtSite.json',
 'LP3hr.json',
 'LP3hrPt.json',
 'LP6hrPt.json',
 'LPday.json',
 'LPfx.json',
 'LPmon.json',
 'LPyr.json',
 'LPyrPt.json',
 'OBday.json',
 'OBmon.json'