In [1]:
import requests
import numpy
import pydap.parsers.dmr
import pydap.model
import pydap.client
from xml.etree import ElementTree as ET
import re
import copy
import collections

# Loading DMRs or DAPs

In [2]:
file_path = 'data/20220102090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1_subset.dap'
file_path = 'data/MY1DQND1.sst.ADD2005001.040.2006011070802.hdf.dap'
dmr_len = pydap.client.get_dmr_length(file_path)
with open(file_path, "rb") as f:    
    #clfr = f.read(4)
    dmr = f.read(dmr_len)
dmr = dmr.decode('ascii')

In [3]:
#fname = 'data/ATL03_20181228015957_13810110_003_01.2var.h5.dmrpp.dmr'
fname = 'data/20220531090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.dmr'
             
#fname = 'data/coads_climatology.nc.dmr'
#fname = 'data/MOD05_L2.A2019336.2315.061.2019337071952.hdf.dmr'

with open(fname, 'r') as dmr_file:
    dmr = dmr_file.read()

In [4]:
dmr = re.sub(' xmlns="[^"]+"', '', dmr, count=1)
dom_et = ET.fromstring(dmr)

# Define functions

In [5]:
dmr_atomic_types = ['Int8', 'UInt8', 'Byte', 'Char', 'Int16', 
                    'UInt16', 'Int32', 'UInt32',
                    'Int64', 'UInt64', 'Float32', 'Float64']

def dap4_to_numpy_typemap(type_string):
    """
    This function takes a numpy dtype object
    and returns a dtype object that is compatible with
    the DAP2 specification.
    """
    dtype_str = pydap.lib.DAP4_TO_NUMPY_PARSER_TYPEMAP[type_string]
    return numpy.dtype(dtype_str)

def get_attributes(element):
    attributes = {}
    attribute_elements = element.findall('Attribute')
    for attribute_element in attribute_elements:
        name = attribute_element.get('name')
        value = attribute_element.find('Value').text
        attributes[name] = value
    return attributes

def get_dtype(element):
    dtype = element.tag
    dtype = dap4_to_numpy_typemap(dtype)
    return dtype

def get_dim_names(element):
    n_unnamed = 0
    # Not to be confused with dimensions
    dimension_elements = element.findall('Dim')
    dimensions = []
    for dimension_element in dimension_elements:
        name = dimension_element.get('name')
        if name is None:
            # We might have unnamed dimensions
            return dimensions
        if name.find('/', 1) == -1:
            # If this is a root Dimension, we remove the leading slash
            name = name.replace('/', '')
        dimensions.append(name)
    return dimensions

def has_map(element):
    maps = element.findall('Map')    
    if len(maps) > 0:        
        return True
    else:
        return False
    
def get_shape(dimensions, variable):
    shape = []
    for dim_name in variable['dims']:
        shape.append(dimensions[dim_name]['size'])
    return shape

def get_dim_sizes(element):
    dimension_elements = element.findall('Dim')
    dimension_sizes = ()
    for dimension_element in dimension_elements:
        name = dimension_element.get('name')
        if name is None:
            size = int(dimension_element.get('size'))
            dimension_sizes += (size,)
    return dimension_sizes

In [6]:
class DummyData(object):
    def __init__(self, dtype, shape):
        self.dtype = dtype
        self.shape = shape

# Ordered Approach

In [7]:
def get_variables(node, prefix=''):
    variables = collections.OrderedDict()
    group_name = node.get('name')
    if group_name is None:
        return variables
    if node.tag != 'Dataset':
        prefix = prefix + '/' + group_name
    for subnode in node:
        if subnode.tag in dmr_atomic_types:
            name = subnode.get('name')
            if prefix != '':
                name = prefix + '/' + name
            variables[name] = {'element': subnode}
        variables.update(get_variables(subnode, prefix))
    return variables

In [8]:
def get_named_dimensions(node, prefix=''):
    dimensions = {}
    group_name = node.get('name')
    if group_name is None:
        return dimensions
    if node.tag != 'Dataset':
        prefix = prefix + '/' + group_name
    for subnode in node:
        if subnode.tag == 'Dimension':
            name = subnode.get('name')
            if prefix != '':
                name = prefix + '/' + name
            dimensions[name] = int(subnode.attrib['size'])
        dimensions.update(get_named_dimensions(subnode, prefix))
    return dimensions

In [9]:
variables = get_variables(dom_et)
named_dimensions = get_named_dimensions(dom_et)

In [10]:
variables

OrderedDict([('mask', {'element': <Element 'Int8' at 0x7fc7083d2ca0>}),
             ('analysed_sst',
              {'element': <Element 'Int16' at 0x7fc7083d3510>}),
             ('lon', {'element': <Element 'Float32' at 0x7fc7083d3d80>}),
             ('time', {'element': <Element 'Int32' at 0x7fc7083ec2c0>}),
             ('sea_ice_fraction',
              {'element': <Element 'Int8' at 0x7fc7083ec770>}),
             ('dt_1km_data', {'element': <Element 'Int8' at 0x7fc7083ecf90>}),
             ('lat', {'element': <Element 'Float32' at 0x7fc7083ed620>}),
             ('analysis_error',
              {'element': <Element 'Int16' at 0x7fc7083edb20>}),
             ('sst_anomaly',
              {'element': <Element 'Int16' at 0x7fc7083ee200>})])

## Now converting variable and named dimensions to dataset

In [11]:
for name, size in named_dimensions.items():
    if name in variables:
        variables[name]['size'] = size

In [12]:
for name, variable in variables.items():
    variable['name'] = name
    variable['attributes'] = get_attributes(variable['element'])
    variable['dtype'] = get_dtype(variable['element'])
    variable['dims'] = get_dim_names(variable['element'])
    variable['has_map'] = has_map(variable['element'])
    #variable['size'] = None
    variable['shape'] = get_dim_sizes(variable['element'])

In [13]:
for name, size in named_dimensions.items():
    # We might have dimensions that only have a declaration, so we add them to the variables
    variables[name] = {'name': name, 'size': size, 'dims': [name], 'element': None, 'dtype': 'int', 'has_map': False, 'attributes': {}, 'shape': ()}

In [14]:
for name, variable in variables.items():
    dims = variable['dims']
    for dim in dims:
        variable['shape'] += (variables[dim]['size'],) 

In [15]:
dataset = pydap.model.DatasetType()
for name, variable in variables.items():
    data = DummyData(dtype=variable['dtype'], shape=variable['shape'])
    array = pydap.model.BaseType(name=variable['name'], data=data, dimensions=variable['dims'])
    if variable['has_map']:
        var = pydap.model.GridType(name=variable['name'])
        var[name] = array
        for dim in variable['dims']:
            var[dim] = copy.copy(dataset[dim])
    else:
        var = array
    var.attributes = variable['attributes']
    dataset[var.name] = var

# Unordered

In [16]:
def get_group_variables(element, parent_name=''):
    variables = {}  
    group_elements = element.findall('Group')
    for group_element in group_elements:
        group_name = parent_name + '/' +  group_element.get('name')               
        group_variables = get_variables(group_element, group_name)        
        subgroup_variables = get_group_variables(group_element, group_name)                        
        variables = {**variables, **group_variables, **subgroup_variables}
    return variables
        
def get_variables(element, parent_name=''):
    variables = {}
    for atomic_type in dmr_atomic_types:              
        for variable in element.findall(atomic_type):            
            name = variable.attrib['name']
            if parent_name == '':           
                # The FQN of root variables does not have leading slash
                fqn = name
            else:
                fqn = parent_name + '/' + name            
            variables[fqn] = {'name': fqn, 'element': variable}
    return variables

def get_group_dimensions(element, parent_name=''):
    dimensions = {}
    group_elements = element.findall('Group')
    for group_element in group_elements:
        group_name = parent_name + '/' +  group_element.get('name')               
        group_dimensions = get_dimensions(group_element, group_name)        
        subgroup_dimensions = get_group_dimensions(group_element, group_name)
        dimensions = {**dimensions, **group_dimensions, **subgroup_dimensions}
    return dimensions

def get_dimensions(element, parent_name=''):
    dimensions = {}
    dimensions_elements = element.findall('Dimension')
    for dimensions_element in dimensions_elements:
        name = dimensions_element.attrib['name']
        if parent_name == '':           
            # The FQN of root variables does not have leading slash
            fqn = name
        else:
            fqn = parent_name + '/' + name
        size = dimensions_element.attrib['size']
        dimensions[fqn] = {'name': fqn, 'size': int(size)}                
    return dimensions

In [17]:
group_variables = get_group_variables(dom_et)
group_dimensions = get_group_dimensions(dom_et)
root_variables = get_variables(dom_et, '')
root_dimensions = get_dimensions(dom_et, '')
dimensions = {**root_dimensions,  **group_dimensions}
variables = {**root_variables,  **group_variables}

In [18]:
dataset = pydap.model.DatasetType('')

for dimension in dimensions.values():       
    dimension_name = dimension['name']    
    dimension_variable = variables[dimension_name]['element']        
    dimension['element'] = dimension_variable
    dimension['attributes'] = get_attributes(dimension_variable)    
    dimension['dtype'] = get_dtype(dimension_variable)    
        
    dim_data = DummyData(dimension['dtype'], shape=(dimension['size'],))    
    var = pydap.model.BaseType(dimension_name, dim_data)
    var.attributes = dimension['attributes']
    dataset[var.name] = var

for variable in variables.values():        
    if variable['name'] in dimensions.keys():
        continue
    variable['attributes'] = get_attributes(variable['element'])
    variable['dtype'] = get_dtype(variable['element'])
    variable['dims'] = get_dims(variable['element'])        
    variable['shape'] = get_shape(dimensions, variable)
    data = DummyData(dtype=variable['dtype'], shape=variable['shape'])
    if has_map(variable['element']):        
        var = pydap.model.GridType(name=variable['name'])
        var[variable['name']] = pydap.model.GridType(name=variable['name'], data=data, dimensions=variable['dims'])
        for dim in variable['dims']:
            var[dim] = dataset[dim]
    else:        
        var = pydap.model.BaseType(name=variable['name'], dimensions=variable['dims'])
    var.attributes = variable['attributes']
    dataset[var.name] = var
        
#var_name = '/gt1r/bckgrd_atlas/bckgrd_int_height'
#var_element = variables[var_name]
#process_variable_element(var_element)
#dataset['/gt1r/bckgrd_atlas/delta_time']
#dimensions
dataset

NameError: name 'get_dims' is not defined

In [None]:
# Sorting the variables

In [None]:
def get_variable_order(node, prefix=''):
    variables = []
    group_name = node.get('name')
    if group_name is None:
        return variables        
    if node.tag != 'Dataset':        
        prefix = prefix + '/' + group_name 
    
    for subnode in node:                        
        if subnode.tag in dmr_atomic_types:
            name = subnode.get('name')
            if prefix != '':
                name = prefix + '/' + name            
            variables.append(name)
        variables += get_variable_order(subnode, prefix)
    return variables

# Live

In [None]:
url = 'http://test.opendap.org/opendap/hyrax/data/stare/MOD05_L2.A2019336.2315.061.2019337071952.hdf.dmr'
ret = requests.get(url)
