In [1]:
# default_exp extraction

In [2]:
#exports
import numpy as np
import pandas as pd
import json

import os
from warnings import warn
from urllib.request import urlopen, urlretrieve 

from frictionless import Package

In [3]:
from IPython.display import JSON

<br>

### Data Loading

In [5]:
# Open the dictionary datapackage and pick out the resource named `ids`
# NOTE(review): the filename `datapackage-.json` differs from the `datapackage.json`
# name used elsewhere in this notebook - confirm this is the intended file
package = Package('../data/dictionary/datapackage-.json', profile='tabular-data-package')
ids_resource = package.get_resource('ids')

# Read the ids resource into a dataframe
df_ids = ids_resource.to_pandas()

df_ids.head(2)

Unnamed: 0_level_0,gppd_idnr,esail_id,name,sett_bmu_id,ngc_bmu_id,4c_offshore_id,windpowernet_id,wikidata_id,wikipedia_id,power_technology_id,eutl_id
osuked_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10000,,MARK,Rothes Bio-Plant CHP,"[E_MARK-1, E_MARK-2]","[MARK-1, MARK-2]",,,,,,
10001,"[GBR1000377, GBR1000369]",DIDC,Didcot,"[T_DIDC1, T_DIDC2, T_DIDC4, T_DIDC3, T_DIDC1G,...","[DIDC1, DIDC2, DIDC4, DIDC3, DIDC1G, DIDC2G, D...",,,,,,[97165]


In [6]:
JSON(ids_resource.schema)

<IPython.core.display.JSON object>

<br>

### Site ID Extraction

In [7]:
#exports
def get_field_name_tags(
    schema: dict,
    ids_resource_name: str='ids'
):
    """
    Maps every field name in a table schema to its `hierarchy` tag.

    Parameters:
        schema: Table schema with a `fields` list, each field descriptor
                must contain both a `name` and a `hierarchy` entry
        ids_resource_name: Not used by this function

    Returns:
        field_name_tags: Dictionary of field name -> hierarchy tag
    """
    field_name_tags = {}

    for field_descriptor in schema['fields']:
        field_name_tags[field_descriptor['name']] = field_descriptor['hierarchy']

    return field_name_tags

def field_hierarchies_to_root(field_hierarchies: dict):
    """
    Returns the name of the single field whose hierarchy tag is `root`.

    Parameters:
        field_hierarchies: Dictionary of field name -> hierarchy tag

    Returns:
        root_field: Name of the field tagged as `root`

    Raises:
        AssertionError: If zero or more than one field is tagged as `root`
    """
    root_fields = []

    for field_name, hierarchy in field_hierarchies.items():
        if hierarchy == 'root':
            root_fields.append(field_name)

    assert len(root_fields) == 1, 'There can be only 1 field with the hierarchy: `root`'

    return root_fields[0]

def set_primary_field_as_idx(df, primary_key_field, root_field_type):
    """
    Returns `df` indexed by its primary key field, casting the index to
    integers when the schema type of that field is `integer`.

    The input dataframe is never modified, a new dataframe is returned.

    Parameters:
        df: Dataframe with the primary key held either as a column or as the index
        primary_key_field: Name of the primary key field
        root_field_type: Schema type of the primary key field (e.g. 'integer')

    Returns:
        df: Dataframe indexed by the primary key field

    Raises:
        AssertionError: If the primary key column contains duplicates, or the
                        field is neither a column nor the name of the index
    """
    if primary_key_field in df.columns:
        assert df[primary_key_field].unique().size == df.shape[0], 'There were duplicated values in the primary key field'
        df = df.set_index(primary_key_field)
    else:
        assert df.index.name == primary_key_field, f'The expected primary key field, {primary_key_field}, is not within the dataset'

    if root_field_type in ['integer']:
        # When the key was already the index `df` is still the caller's object,
        # the index is therefore replaced on a shallow copy rather than in-place
        df = df.copy(deep=False)
        df.index = df.index.astype(int)

    return df

In [8]:
# Field name -> hierarchy tag, as declared in the ids resource schema
field_hierarchies = get_field_name_tags(ids_resource.schema)
# The single field tagged `root`, together with its schema type
root_field = field_hierarchies_to_root(field_hierarchies)
root_field_type = [field['type'] for field in ids_resource.schema['fields'] if field['name']==root_field][0]

# Index the ids dataframe by the root field
df_ids = set_primary_field_as_idx(df_ids, root_field, root_field_type)

print(f'root_field: `{root_field}`')
field_hierarchies

root_field: `osuked_id`


{'osuked_id': 'root',
 'gppd_idnr': 'equivalent/child',
 'esail_id': 'equivalent',
 'name': 'equivalent',
 'sett_bmu_id': 'child',
 'ngc_bmu_id': 'child',
 '4c_offshore_id': 'equivalent/child',
 'windpowernet_id': 'equivalent/child',
 'wikidata_id': 'equivalent/child',
 'wikipedia_id': 'equivalent/child',
 'power_technology_id': 'equivalent/child',
 'eutl_id': 'equivalent/child'}

In [9]:
#exports
def get_dict_head(dict_, n=5):
    """Returns a dictionary holding only the first `n` items of `dict_` (built via a pandas Series)."""
    s_head = pd.Series(dict_).head(n)

    return s_head.to_dict()

def initialise_site_data_with_ids(df_ids, field_hierarchies: dict=None):
    """
    Builds the initial site data dictionary, grouping each site's ids under
    the `parent`, `child` or `equivalent` hierarchy of the field they came from.

    Fields tagged `equivalent/<other>` are resolved per value: a list with
    several ids is stored under `<other>`, whereas a single id (a scalar or
    a one-item list) is stored under `equivalent`. Empty lists are skipped.

    Parameters:
        df_ids: Dataframe indexed by the root id with one column per id type
        field_hierarchies: Dictionary of field name -> hierarchy tag. When
                           omitted the module-level `field_hierarchies`
                           variable is used (the original behaviour)

    Returns:
        site_data: Dictionary of root id -> {'id_hierarchies': {'parent': {...},
                   'child': {...}, 'equivalent': {...}}}

    Raises:
        NameError: If `field_hierarchies` is neither passed nor defined globally
        AssertionError: If a field has an unrecognised hierarchy tag
    """
    if field_hierarchies is None:
        # Backwards compatibility with callers relying on the notebook global
        field_hierarchies = globals().get('field_hierarchies')

        if field_hierarchies is None:
            raise NameError('`field_hierarchies` must be passed as an argument or defined as a global variable')

    valid_hierarchy_types = ['root', 'parent', 'child', 'equivalent', 'equivalent/parent', 'equivalent/child']

    site_data = {}
    id_cols = df_ids.columns

    for idx, *ids in df_ids.itertuples():
        id_hierarchies = {'parent': {}, 'child': {}, 'equivalent': {}}
        site_data[idx] = {'id_hierarchies': id_hierarchies}

        # `dropna` removes the id types which have no value for this site
        for id_type, id_value in pd.Series(dict(zip(id_cols, ids))).dropna().items():
            field_hierarchy = field_hierarchies[id_type]
            assert field_hierarchy in valid_hierarchy_types, f'The {id_type} field did not have a valid hierarchical attribute: {field_hierarchy}'

            if 'equivalent/' in field_hierarchy:
                hierarchy_if_not_array, hierarchy_if_array = field_hierarchy.split('/')

                if not isinstance(id_value, list):
                    # a scalar is a single id, previously these were silently dropped
                    id_hierarchies[hierarchy_if_not_array][id_type] = id_value
                elif len(id_value) > 1:
                    id_hierarchies[hierarchy_if_array][id_type] = id_value
                elif len(id_value) == 1:
                    id_hierarchies[hierarchy_if_not_array][id_type] = id_value[0]
                # an empty list carries no ids and is skipped
            else:
                id_hierarchies[field_hierarchy][id_type] = id_value

    return site_data

In [10]:
site_data = initialise_site_data_with_ids(df_ids)

JSON(get_dict_head(site_data))

<IPython.core.display.JSON object>

<br>

### Site Datasets Extraction

In [11]:
#exports
def datapackage_ref_to_ds_schema(datapackage_ref):
    """
    Fetches a datapackage descriptor and returns the descriptor of the
    resource named in the reference.

    Parameters:
        datapackage_ref: Dictionary with a `package` entry (url of the
                         datapackage json) and a `resource` entry (name
                         of the resource within that datapackage)

    Returns:
        ds_schema: The first entry in the datapackage's `resources` list
                   whose `name` matches the referenced resource

    Raises:
        IndexError: If the datapackage has no resource with that name
    """
    dp_url = datapackage_ref['package']
    resource_name = datapackage_ref['resource']

    # the response is closed as soon as the json has been parsed
    with urlopen(dp_url) as response:
        dp_schema = json.load(response)

    matching_resources = [
        resource
        for resource
        in dp_schema['resources']
        if resource['name'] == resource_name
    ]

    if len(matching_resources) == 0:
        raise IndexError(f'No resource named `{resource_name}` was found in the datapackage at {dp_url}')

    return matching_resources[0]

def extract_external_foreignkey_datapackage_refs(resource, primary_key_field):
    """
    Collects the foreign keys of a resource which point at another
    datapackage, and attaches the field descriptors of each referenced
    resource.

    Parameters:
        resource: Resource whose `schema` may contain a `foreignKeys` list
        primary_key_field: Not used by this function

    Returns:
        fk_external_datapackage_refs: One dictionary per external foreign key
            with the entries `package`, `resource`, `attributes`,
            `dictionary_pk_field`, `external_fk_field` and `attribute_fields`
            (field name -> field descriptor of the referenced resource).
            An empty list is returned when the schema has no foreign keys
    """
    fk_external_datapackage_refs = []

    # a schema without any foreign keys simply yields no references
    for fk in resource.schema.get('foreignKeys', []):
        reference = fk['reference']

        # only references which name a `package` point outside of this datapackage
        if 'package' not in reference:
            continue

        fk_external_datapackage_refs.append({
            'package': reference['package'],
            'resource': reference['resource'],
            'attributes': reference['attributes'],
            'dictionary_pk_field': fk['fields'],
            'external_fk_field': reference['fields']
        })

    for datapackage_ref in fk_external_datapackage_refs:
        ds_schema = datapackage_ref_to_ds_schema(datapackage_ref)
        datapackage_ref['attribute_fields'] = {field['name']: field for field in ds_schema['schema']['fields']}

    return fk_external_datapackage_refs

In [12]:
%%time

fk_external_datapackage_refs = extract_external_foreignkey_datapackage_refs(ids_resource, primary_key_field=root_field)

JSON(fk_external_datapackage_refs)

Wall time: 1.28 s


<IPython.core.display.JSON object>

In [13]:
#exports
def add_resource_locs_to_external_datapackage_refs(fk_external_datapackage_refs: list) -> list:
    """
    Adds the resource file location and the datapackage name to each
    external datapackage reference.

    The references are updated in-place and the same list is returned.

    Parameters:
        fk_external_datapackage_refs: List of reference dictionaries, each with
            a `package` entry (url of the datapackage json) and a `resource`
            entry (name of the resource within that datapackage)

    Returns:
        fk_external_datapackage_refs: The input list, where each reference has
            gained a `resource_loc` entry (the resource `path` joined onto the
            url of the directory containing the datapackage json) and a `name`
            entry (the `name` of the datapackage)

    Raises:
        IndexError: If a datapackage has no resource with the referenced name
    """
    for datapackage_ref in fk_external_datapackage_refs:
        package_url = datapackage_ref['package']
        external_datapackage_basepath = '/'.join(package_url.split('/')[:-1])

        # the response is closed as soon as the json has been parsed
        with urlopen(package_url) as response:
            external_datapackage_json = json.load(response)

        resource_locs = [
            f"{external_datapackage_basepath}/{resource['path']}"
            for resource
            in external_datapackage_json['resources']
            if resource['name'] == datapackage_ref['resource']
        ]

        if len(resource_locs) == 0:
            raise IndexError(f"No resource named `{datapackage_ref['resource']}` was found in the datapackage at {package_url}")

        datapackage_ref['resource_loc'] = resource_locs[0]
        datapackage_ref['name'] = external_datapackage_json['name']

    return fk_external_datapackage_refs

In [14]:
fk_external_datapackage_refs = add_resource_locs_to_external_datapackage_refs(fk_external_datapackage_refs)

JSON(fk_external_datapackage_refs)

<IPython.core.display.JSON object>

In [15]:
#exports
def create_dir(dir_loc: str='./temp'):
    """
    Creates the directory `dir_loc`, including any missing parent
    directories, and warns if the directory already exists.

    Parameters:
        dir_loc: Path of the directory to create

    Returns:
        None
    """
    if os.path.isdir(dir_loc):
        warn(f'The directory `{dir_loc}` already exists')
    else:
        # `makedirs` also creates the intermediate directories, `mkdir` would fail without them
        os.makedirs(dir_loc)

    return None

def download_attribute_data_to_temp_dir(
    fk_external_datapackage_refs: dict,
    temp_dir_loc: str='./temp'
):
    """
    Downloads the resource file and the datapackage json of every external
    datapackage reference into its own sub-directory of `temp_dir_loc`.

    Parameters:
        fk_external_datapackage_refs: Iterated over as a collection of
            reference dictionaries, each providing the entries `name`,
            `resource_loc` and `package`
        temp_dir_loc: Directory under which one sub-directory per
                      datapackage name is created

    Returns:
        None
    """
    create_dir(temp_dir_loc)

    for datapackage_ref in fk_external_datapackage_refs:
        package_dir = f"{temp_dir_loc}/{datapackage_ref['name']}"
        # the resource file is retrieved first, followed by the datapackage json
        urls_to_download = (datapackage_ref['resource_loc'], datapackage_ref['package'])

        create_dir(package_dir)

        for url in urls_to_download:
            # files keep the final path segment of their url as filename
            save_fp = f"{package_dir}/{url.rsplit('/', 1)[-1]}"
            urlretrieve(url, save_fp)

    return

In [16]:
# Directory the external datapackages are downloaded into, the loading calls further down read from it as well
temp_dir_loc = '../data/attribute_sources'

# A warning is raised for each directory that already exists (see `create_dir`)
download_attribute_data_to_temp_dir(fk_external_datapackage_refs, temp_dir_loc=temp_dir_loc)

  warn(f'The directory `{dir_loc}` already exists')
  warn(f'The directory `{dir_loc}` already exists')
  warn(f'The directory `{dir_loc}` already exists')
  warn(f'The directory `{dir_loc}` already exists')
  warn(f'The directory `{dir_loc}` already exists')


In [17]:
#exports
def load_datapackage(datapackage_ref, dir_loc='./temp', return_type='df', set_index=True):
    """
    Loads a previously downloaded external datapackage and returns either
    the package, the referenced resource, or the resource as a dataframe.

    The datapackage is read from `<dir_loc>/<folder>/datapackage.json`, where
    `<folder>` is the second to last segment of the datapackage url.
    NOTE(review): the download step names the folder after the datapackage
    `name` and the file after the last url segment - confirm these always match.

    Parameters:
        datapackage_ref: Reference dictionary with the entries `package`
                         (datapackage url), `resource` (resource name) and,
                         when a dataframe is indexed, `external_fk_field`
        dir_loc: Directory containing the downloaded datapackages
        return_type: One of 'df', 'resource' or 'package'
        set_index: Whether to index the dataframe by `external_fk_field`
                   (only relevant when `return_type` is 'df')

    Returns:
        The package, the resource, or the resource dataframe

    Raises:
        ValueError: If `return_type` is not one of the accepted values
        AssertionError: If more than one external key field was provided
    """
    # validated up-front so that a bad argument fails before any file is read
    if return_type not in ['df', 'resource', 'package']:
        raise ValueError('`return_type` must be one of ["df", "resource", "package"]')

    # `dir_loc` is used here, previously a global `temp_dir_loc` was read instead
    datapackage_fp = f"{dir_loc}/{datapackage_ref['package'].split('/')[-2]}/datapackage.json"
    datapackage_resource = datapackage_ref['resource']

    external_datapackage = Package(datapackage_fp)
    resource = external_datapackage.get_resource(datapackage_resource)

    if return_type == 'package':
        return external_datapackage

    if return_type == 'resource':
        return resource

    df_resource = resource.to_pandas()

    if set_index:
        external_fk_field = datapackage_ref['external_fk_field']

        if not isinstance(external_fk_field, str):
            # a one-item list is unwrapped so it can be compared against the field names
            assert len(external_fk_field) == 1, 'Only one primary key was expected to be matched on in the external datapackage'
            external_fk_field = external_fk_field[0]

        field_type = [field['type'] for field in resource.schema['fields'] if field['name']==external_fk_field][0]
        df_resource = set_primary_field_as_idx(df_resource, external_fk_field, field_type)

    return df_resource

def load_resource_attr_dfs(fk_external_datapackage_refs, temp_dir_loc):
    """
    Loads the attribute columns of every referenced external resource.

    Parameters:
        fk_external_datapackage_refs: Collection of reference dictionaries,
            each with an `attributes` entry (columns to keep) and a `package`
            entry (datapackage url), plus the entries `load_datapackage` reads
        temp_dir_loc: Directory containing the downloaded datapackages

    Returns:
        resource_attr_dfs: List with one dataframe per reference, restricted
            to its `attributes` columns. The datapackage url is available as
            the `name` attribute of each dataframe
    """
    resource_attr_dfs = []

    for datapackage_ref in fk_external_datapackage_refs:
        df_external_resource_attrs = load_datapackage(datapackage_ref, dir_loc=temp_dir_loc)

        attrs_to_extract = datapackage_ref['attributes']
        df_external_resource_attrs = df_external_resource_attrs[attrs_to_extract]

        # A plain `df.name = ...` is routed by pandas into the data when a column
        # called `name` exists, overwriting that column with the url. Setting the
        # attribute on the object itself keeps `df.name` returning the url
        object.__setattr__(df_external_resource_attrs, 'name', datapackage_ref['package'])
        resource_attr_dfs.append(df_external_resource_attrs)

    return resource_attr_dfs

In [18]:
%%time

resource_attr_dfs = load_resource_attr_dfs(fk_external_datapackage_refs, temp_dir_loc)

resource_attr_dfs[2].head(3)

Wall time: 10 s


Unnamed: 0_level_0,capacity_mw,longitude,latitude,primary_fuel,other_fuel1,other_fuel2,other_fuel3,commissioning_year,owner,source,...,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_gwh_2019,generation_data_source,estimated_generation_gwh_2013,estimated_generation_gwh_2014,estimated_generation_gwh_2015,estimated_generation_gwh_2016,estimated_generation_gwh_2017
gppd_idnr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GEODB0040538,33.0,65.119,32.322,Hydro,,,,,,GEODB,...,,,,,,123.77,162.9,97.39,137.76,119.5
WKS0070144,10.0,65.795,31.67,Solar,,,,,,Wiki-Solar,...,,,,,,18.43,17.48,18.25,17.7,18.29
WKS0071196,10.0,65.792,31.623,Solar,,,,,,Wiki-Solar,...,,,,,,18.64,17.58,19.1,17.62,18.72


In [19]:
#exports
def get_datapackage_ref(datapackage_refs, datapackage_url):
    """
    Returns the first reference whose `package` entry equals `datapackage_url`.

    Only the datapackage url is compared, when several references share
    the same datapackage the later ones are never returned.

    Parameters:
        datapackage_refs: Collection of reference dictionaries with a `package` entry
        datapackage_url: Url of the datapackage to look up

    Returns:
        dp_ref: The first matching reference dictionary

    Raises:
        IndexError: If no reference matches `datapackage_url`
    """
    matching_refs = [dp_ref for dp_ref in datapackage_refs if dp_ref['package']==datapackage_url]

    return matching_refs[0]

def load_dataset_ref(datapackage_url, datapackage_refs, dir_loc='./temp'):
    """
    Builds the dataset reference describing an external datapackage and
    the resource within it that the dictionary is linked to.

    Parameters:
        datapackage_url: Url of the datapackage to describe
        datapackage_refs: Collection of reference dictionaries to search
        dir_loc: Directory containing the downloaded datapackages

    Returns:
        dataset_ref: Dictionary with the datapackage url, name (second to
                     last url segment) and description, alongside a
                     one-item `related_resources` list
    """
    dp_ref = get_datapackage_ref(datapackage_refs, datapackage_url)
    package = load_datapackage(dp_ref, dir_loc=dir_loc, return_type='package')

    package_url = dp_ref['package']

    dataset_ref = {}
    dataset_ref["datapackage_json_url"] = package_url
    dataset_ref["datapackage_name"] = package_url.split('/')[-2]
    dataset_ref["datapackage_description"] = package['description']

    # details of the single resource this reference links to
    related_resource = {}
    related_resource["resource_url"] = dp_ref['resource_loc']
    related_resource["resource_name"] = dp_ref['resource']
    related_resource["dictionary_pk_field"] = dp_ref['dictionary_pk_field']
    related_resource["external_fk_field"] = dp_ref['external_fk_field']
    related_resource["extracted_attributes"] = dp_ref['attributes']

    dataset_ref["related_resources"] = [related_resource]

    return dataset_ref

In [20]:
datapackage_url = resource_attr_dfs[2].name
dataset_ref = load_dataset_ref(datapackage_url, fk_external_datapackage_refs, dir_loc=temp_dir_loc)

JSON(dataset_ref)

<IPython.core.display.JSON object>

In [21]:
# from fd array.py load the type_to_class dict
# extract the field_descriptor from the resource schema (load from json and look for the field)
# field = array_item_class(field_descriptor)
# field.read_cell()

In [22]:
#exports
def flatten_list(list_):
    """Joins the items of every sublist within `list_` into one flat list."""
    flattened = []

    for sublist in list_:
        flattened.extend(sublist)

    return flattened

def drop_duplicates_attrs(attrs, subset=None):
    """
    Removes duplicated records from `attrs`, keeping the first occurrence.

    Parameters:
        attrs: Records (e.g. a list of dictionaries) accepted by `pd.DataFrame`
        subset: Keys used to identify duplicates, all keys are used when None

    Returns:
        List of the de-duplicated records as dictionaries
    """
    df_attrs = pd.DataFrame(attrs)
    df_deduplicated = df_attrs.drop_duplicates(subset=subset)

    return df_deduplicated.to_dict(orient='records')

def load_full_id_map(single_site_data):
    """
    Merges the ids from every hierarchy of a site into a single dictionary.

    Parameters:
        single_site_data: Site dictionary with an `id_hierarchies` entry,
                          itself a dictionary of hierarchy -> {id type: id value}

    Returns:
        full_id_map: Dictionary of id type -> id value, where an id type
                     found in several hierarchies takes its last value
    """
    id_hierarchies = single_site_data['id_hierarchies']

    full_id_map = {
        id_type: id_value
        for hierarchy_ids in id_hierarchies.values()
        for id_type, id_value in hierarchy_ids.items()
    }

    return full_id_map

def extract_attrs_from_resource_dfs(site_data, datapackage_refs, temp_dir_loc, root_id='osuked_id'):
    """
    Adds the linked datasets and their extracted attributes to every site.

    For each site and each external resource, the site's ids for the
    resource's `dictionary_pk_field` are matched against the resource index.
    When rows are matched the dataset reference is stored under
    `datasets[<datapackage url>]` and one record per non-empty column of each
    matched row is appended to `attributes`, which is then de-duplicated on
    the `source`, `id`, `attribute` and `value` entries.

    `site_data` is updated in-place and also returned.

    Parameters:
        site_data: Dictionary of site id -> site dictionary (with `id_hierarchies`)
        datapackage_refs: Collection of external datapackage reference dictionaries
        temp_dir_loc: Directory containing the downloaded datapackages
        root_id: Name of the root id field, mapped to the site id itself

    Returns:
        site_data: The input dictionary, where every site has gained a
                   `datasets` entry and, when any rows matched, an `attributes` entry
    """
    from copy import deepcopy

    resource_attr_dfs = load_resource_attr_dfs(datapackage_refs, temp_dir_loc)

    # The reference and dataset metadata depend only on the resource, they are
    # therefore resolved once up-front instead of being reloaded for every site
    resources = []

    for df_resource_attrs in resource_attr_dfs:
        dp_url = df_resource_attrs.name
        datapackage_ref = get_datapackage_ref(datapackage_refs, dp_url)
        dataset_ref = load_dataset_ref(dp_url, datapackage_refs, dir_loc=temp_dir_loc)
        resources.append((dp_url, df_resource_attrs, datapackage_ref, dataset_ref))

    dedup_subset = ['source', 'id', 'attribute', 'value'] # every attribute key apart from `field_schema`

    for site_id in site_data.keys():
        site_data[site_id]['datasets'] = {}
        full_id_map = load_full_id_map(site_data[site_id])
        full_id_map[root_id] = site_id

        for dp_url, df_resource_attrs, datapackage_ref, dataset_ref in resources:
            if datapackage_ref['dictionary_pk_field'] not in full_id_map.keys():
                continue

            dict_ids = full_id_map[datapackage_ref['dictionary_pk_field']]

            if not isinstance(dict_ids, list):
                dict_ids = [dict_ids]

            matched_dict_ids = sorted(list(set(df_resource_attrs.index).intersection(set(dict_ids))))

            if len(matched_dict_ids) == 0:
                continue

            # datasets - each site receives its own copy so that sites never share mutable state
            site_dataset_ref = deepcopy(dataset_ref)

            if dp_url not in site_data[site_id]['datasets'].keys():
                site_data[site_id]['datasets'][dp_url] = site_dataset_ref
            else:
                site_data[site_id]['datasets'][dp_url]['related_resources'] += site_dataset_ref['related_resources']

            # attributes - columns that are empty for all of the matched rows are dropped,
            # individual missing values within the remaining columns are retained
            df_matched_attrs = df_resource_attrs.loc[matched_dict_ids].dropna(how='all', axis=1)
            site_attrs_from_resource = df_matched_attrs.to_dict(orient='records')

            # ids are read from the selected rows so that they stay aligned with
            # the records even if an id appears more than once in the index
            reshaped_site_attrs = [
                {
                    'source': dp_url,
                    'id': dict_id,
                    # the field name is used when the schema provides no title
                    'attribute': datapackage_ref['attribute_fields'][k].get('title', k),
                    'field_schema': datapackage_ref['attribute_fields'][k],
                    'value': v
                }
                for dict_id, dict_
                in zip(df_matched_attrs.index, site_attrs_from_resource)
                for k, v
                in dict_.items()
            ]

            # nothing to add when every column was empty for the matched rows
            if len(reshaped_site_attrs) >= 1:
                if 'attributes' not in site_data[site_id].keys():
                    site_data[site_id]['attributes'] = []

                site_data[site_id]['attributes'] += reshaped_site_attrs
                site_data[site_id]['attributes'] = drop_duplicates_attrs(site_data[site_id]['attributes'], subset=dedup_subset)

    return site_data

In [23]:
%%time

site_data = extract_attrs_from_resource_dfs(site_data, fk_external_datapackage_refs, temp_dir_loc)

pd.DataFrame(site_data[10001]['attributes']).drop(columns=['field_schema']).head()

Wall time: 16.3 s


Unnamed: 0,source,id,attribute,value
0,https://raw.githubusercontent.com/OSUKED/Dicti...,10001,Longitude,-1.26757
1,https://raw.githubusercontent.com/OSUKED/Dicti...,10001,Latitude,51.62363
2,https://raw.githubusercontent.com/OSUKED/Dicti...,GBR1000369,Installed Capacity (MW),1470.0
3,https://raw.githubusercontent.com/OSUKED/Dicti...,GBR1000369,Longitude,-1.2683
4,https://raw.githubusercontent.com/OSUKED/Dicti...,GBR1000369,Latitude,51.6246


In [24]:
JSON(get_dict_head(site_data))

<IPython.core.display.JSON object>

In [25]:
# Saving is switched off by default, set `save_site_data` to True to write the site data to disk
save_site_data = False
site_data_fp = '../data/intermediate/site_data.json'

if save_site_data == True:
    # serialise the site data dictionary as json
    with open(site_data_fp, 'w') as f:
        json.dump(site_data, f)

In [26]:
# need to create a JSON for the datasets where the id (.json url) and name are passed as well as the id column they're matched on and the relevant attributes
# then start work on the html templates for site pages
# then create the dictionary template
# add in the links between the dictionary and sites
# create dataset pages (and link to them as well)

In [27]:
#hide
from nbdev.export import *
notebook2script()

Converted 00-documentation.ipynb.
Converted 01-combining power station data.ipynb.
Converted 02-attribute extraction.ipynb.
Converted 03-page-population.ipynb.
