# Generate manifest file

In [114]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import h5py
import shutil

from dateutil.parser import isoparse

In [120]:
def _nullable_frame(data, index, columns=None):
    """Build a DataFrame as nullable ``Int64``, falling back to ``Float64``.

    ``Int64`` behaves like ``int`` but can hold missing values. pandas refuses
    to cast non-integral data to it, in which case the nullable float dtype is
    used instead. Errors raised by the ``Float64`` attempt propagate.
    """
    try:
        return pd.DataFrame(data=data, index=index, columns=columns, dtype='Int64')
    except (TypeError, ValueError):
        return pd.DataFrame(data=data, index=index, columns=columns, dtype='Float64')


def to_df(f, key):
    """Convert one array group of a data-product file into a DataFrame.

    Parameters
    ----------
    f : mapping
        Open ``h5py.File`` (or any mapping with the same layout).
    key : str
        Name of the group to read. The group must contain ``array``,
        ``Dimension_1_names``, ``Dimension_1_title``, ``Dimension_2_names``
        and ``Dimension_2_title``; ``Dimension_3_names`` is optional.

    Returns
    -------
    pandas.DataFrame
        Nullable ``Int64`` frame (``Float64`` when the values are not
        integral). If the first dimension-2 name parses as an ISO-8601 date,
        dimension 2 becomes the index and dimension 1 the columns; otherwise
        dimension 1 is the index and dimension 2 the columns. With a third
        dimension, columns are named ``<dim3 name>___<column name>``.
    """
    print('processing', key)
    group = f[key]
    values = np.array(group['array'])
    names_1 = [d.decode() for d in group['Dimension_1_names']]
    title_1 = list(group['Dimension_1_title'])[0].decode()
    names_2 = [d.decode() for d in group['Dimension_2_names']]
    title_2 = list(group['Dimension_2_title'])[0].decode()

    index_is_date = False
    try:
        isoparse(names_2[0])
        # Succeeded, names_2 are dates and names_1 are columns.
        index_is_date = True
        indices = names_2
        index_name = title_2
        columns = names_1
    except Exception:
        # Any failure (non-ISO text, empty name list) means names_2 are
        # columns and names_1 are normal indices.
        indices = names_1
        index_name = title_1
        columns = names_2

    if 'Dimension_3_names' in group:
        extra_columns = [d.decode() for d in group['Dimension_3_names']]
        data_dict = {}
        for i1, c1 in enumerate(extra_columns):
            for i2, c2 in enumerate(columns):
                # The array is indexed [extra, index, column] for date-indexed
                # data and [extra, column, index] otherwise.
                data_dict[c1 + '___' + c2] = values[i1, :, i2] if index_is_date else values[i1, i2, :]

        # Same Int64 -> Float64 fallback as the 2-D case, so non-integral
        # 3-D data no longer raises.
        df = _nullable_frame(data_dict, indices)
    else:
        # The values are also transposed in case of 2 dimensions and non-date.
        if not index_is_date:
            values = values.T

        df = _nullable_frame(values, indices, columns)

    df.index.name = index_name
    assert df.values.shape == (len(df), len(df.columns))

    return df

# Open read-only and close the handle as soon as the frame is built, so the
# HDF5 file is not left open (or opened writable on older h5py versions).
with h5py.File('/Users/sbkr014/Downloads/dataproducts/records:SARS-CoV-2:scotland:cases-and-management:carehomes.h5', 'r') as f:
    df = to_df(f, 'date-country-adult_carehomes_which_submitted_a_return')
df.info()

processing date-country-adult_carehomes_which_submitted_a_return
True
<class 'pandas.core.frame.DataFrame'>
Index: 37 entries, 2020-04-21 to 2021-01-05
Data columns (total 1 columns):
 #   Column                                                        Non-Null Count  Dtype
---  ------                                                        --------------  -----
 0   Adult care homes - Adult care homes which submitted a return  37 non-null     Int64
dtypes: Int64(1)
memory usage: 629.0+ bytes


In [134]:
# Read the shape from the dataset metadata instead of loading the whole array,
# and close the read-only handle once it has been read.
with h5py.File('/Users/sbkr014/Downloads/1.0.2.h5', 'r') as f:
    persons_shape = f['health board/age/persons/array'].shape
persons_shape

(91, 14)

In [31]:
def h5_to_product(path):
    """Build the manifest entry for one data-product HDF5 file.

    Every group whose first child is a dataset, and whose name does not
    contain ``'archived'``, is listed as a component.

    Parameters
    ----------
    path : pathlib.Path
        Location of the ``.h5`` file.

    Returns
    -------
    dict
        ``{"product": <name>, "components": [...]}``. The product name is the
        file name without its ``.h5`` suffix and with ``':'`` replaced by
        ``'/'``. Each component is a dict with ``name``, ``location`` and
        ``type`` keys.
    """
    components = []

    def add_component(name, node):
        # visititems also visits datasets; only groups can be components.
        if not isinstance(node, h5py.Group):
            return
        if 'archived' in name:
            return
        # An empty group has no children to inspect and is not a component.
        first_key = next(iter(node), None)
        if first_key is None:
            return
        # If children are datasets, this is the leaf node: record it.
        if isinstance(node[first_key], h5py.Dataset):
            components.append({
                "name": name,
                "location": "scotland",
                "type": "raw"
            })

    # Read-only, and closed as soon as the traversal is finished.
    with h5py.File(path, 'r') as f:
        f.visititems(add_component)

    print(path.name, len(components))

    # Strip only a trailing '.h5', not occurrences elsewhere in the name.
    file_name = path.name
    if file_name.endswith('.h5'):
        file_name = file_name[:-len('.h5')]

    return {
        "product": file_name.replace(':', '/'),
        "components": components
    }

In [12]:
# rglob returns paths in no particular order; sort them so the generated
# manifest is identical from run to run.
product_paths = sorted(Path('/Users/sbkr014/Downloads/dataproducts/').rglob('*.h5'))
products = [h5_to_product(path) for path in product_paths]

# A dedicated name for the output handle avoids shadowing the HDF5 handle `f`
# used by other cells.
with open('auto_manifest.json', 'w') as manifest_file:
    json.dump(products, manifest_file, indent=4)

records:SARS-CoV-2:scotland:cases-and-management:carehomes.h5 5
records:SARS-CoV-2:scotland:cases-and-management:schools.h5 4
records:SARS-CoV-2:scotland:cases-and-management:hospital.h5 6
records:SARS-CoV-2:scotland:human-mortality.h5 15
records:SARS-CoV-2:scotland:cases-and-management:mortality.h5 1
records:SARS-CoV-2:scotland:cases-and-management:nhsworkforce.h5 4
records:SARS-CoV-2:scotland:cases-and-management:testing.h5 15


In [35]:
# Build the manifest entry for a single file and show it as indented JSON.
single_product = h5_to_product(Path('/Users/sbkr014/Downloads/1.0.2.h5'))
single_product_json = json.dumps(single_product, indent=4)
print(single_product_json)

1.0.2.h5 18
{
    "product": "1.0.2",
    "components": [
        {
            "name": "datazone/age/genders",
            "location": "scotland",
            "type": "raw"
        },
        {
            "name": "datazone/age/persons",
            "location": "scotland",
            "type": "raw"
        },
        {
            "name": "grid area/age/genders",
            "location": "scotland",
            "type": "raw"
        },
        {
            "name": "grid area/age/persons",
            "location": "scotland",
            "type": "raw"
        },
        {
            "name": "health board/age/genders",
            "location": "scotland",
            "type": "raw"
        },
        {
            "name": "health board/age/persons",
            "location": "scotland",
            "type": "raw"
        },
        {
            "name": "intermediate zone/age/genders",
            "location": "scotland",
            "type": "raw"
        },
        {
            "name": "int