In [1]:
# default_exp population

In [5]:
#exports
import json
import numpy as np
import pandas as pd
from frictionless import Package

from powerdict import extraction

import os
from tqdm import tqdm
from warnings import warn

from jinja2 import Template

In [6]:
from IPython.display import JSON, Markdown

<br>

### Data Formatting

In [7]:
# Path to the intermediate JSON file holding the per-site data
site_data_fp = '../data/intermediate/site_data.json'

with open(site_data_fp, 'r') as f:
    site_data = json.load(f)
    
# Preview of the loaded dictionary (presumably only its first entries — confirm
# against `extraction.get_dict_head`)
JSON(extraction.get_dict_head(site_data))

<IPython.core.display.JSON object>

In [8]:
#exports
def get_dp_field_to_url_format_str(datapackage_json_fp):
    """Map ID field names to their URL format strings.

    Loads the datapackage at `datapackage_json_fp`, reads the schema of its
    'ids' resource and keeps every field that declares a 'url_format'.

    Returns:
        dict of field name -> 'url_format' value, in schema order.
    """
    dp = Package(datapackage_json_fp, profile='tabular-data-package')
    schema_fields = dp.get_resource('ids')['schema']['fields']

    url_formats = {}

    for field in schema_fields:
        # Fields without a 'url_format' entry are simply left out of the mapping
        if 'url_format' in field.keys():
            url_formats[field['name']] = field['url_format']

    return url_formats

In [9]:
# Datapackage describing the dictionary; its 'ids' resource schema supplies the URL formats
datapackage_json_fp = '../data/dictionary/datapackage.json'

id_field_to_url_format_str = get_dp_field_to_url_format_str(datapackage_json_fp)

# Display the field name -> URL format mapping
id_field_to_url_format_str

{'4c_offshore_id': 'https://www.4coffshore.com/windfarms/united-kingdom/{value}.html',
 'windpowernet_id': 'https://www.thewindpower.net/{value}.php',
 'wikidata_id': 'https://www.wikidata.org/wiki/{value}',
 'wikipedia_id': 'https://en.wikipedia.org/wiki/{value}',
 'power_technology_id': 'https://www.power-technology.com/projects/{value}'}

In [10]:
# load in the datapackage.json
# create a dict mapping from the id column name to the url format string
# format the id values to be url links in the markdown string

# Pick one site to explore the structure of its ID hierarchies
root_id = '10001'
single_site_data = site_data[root_id]

# The loop body is intentionally a no-op: iterating only leaves `id_type` and
# `id_values` bound to the last (type, values) pair visited, for inspection below
for hierarchy_level, ids in single_site_data['id_hierarchies'].items():
    for id_type, id_values in ids.items():
        pass
    
id_type, id_values

('eutl_id', 97165)

In [11]:
#exports
def format_id_values(id_values, id_type, id_field_to_url_format_str):
    """Render ID values as strings, as markdown links when a URL format is known.

    Parameters:
        id_values: An iterable of ID values, or a single scalar value
            (e.g. one string or integer), which is treated as a
            one-element list.
        id_type: Name of the ID field the values belong to.
        id_field_to_url_format_str: Mapping from ID field name to a format
            string containing a `{value}` placeholder.

    Returns:
        A list of strings: `[value](url)` markdown links when `id_type` has
        a URL format, otherwise the plain `str()` of each value.
    """
    # A bare string would otherwise be iterated character by character and a
    # bare number would raise a TypeError, so scalars are wrapped in a list.
    if isinstance(id_values, (str, bytes)) or not hasattr(id_values, '__iter__'):
        id_values = [id_values]

    if id_type not in id_field_to_url_format_str:
        return [str(id_value) for id_value in id_values]

    url_format_str = id_field_to_url_format_str[id_type]

    return [
        f'[{id_value}]({url_format_str.format(value=id_value)})'
        for id_value in id_values
    ]

In [12]:
# Example inputs for `format_id_values`
id_type = 'power_technology_id'
id_values = ['hornsea-project-one-north-sea', 'hornsea-project-two-north-sea']
# NOTE(review): `url_format_str` is not passed to the call below — the format
# string actually used is looked up in `id_field_to_url_format_str`
url_format_str = 'https://www.power-technology.com/projects/{value}'

id_values_strs = format_id_values(id_values, id_type, id_field_to_url_format_str)

id_values_strs

['[hornsea-project-one-north-sea](https://www.power-technology.com/projects/hornsea-project-one-north-sea)',
 '[hornsea-project-two-north-sea](https://www.power-technology.com/projects/hornsea-project-two-north-sea)']

In [13]:
#exports
def single_site_data_to_ids_df(single_site_data, root_id, datapackage_json_fp, root_id_type='osuked_id'):
    """Build a table of the identifiers linked to a single site.

    Parameters:
        single_site_data: Site record containing an 'id_hierarchies' mapping of
            hierarchy level ('parent', 'child', 'equivalent') -> {id type: id value(s)}.
        root_id: Identifier of the site itself, shown as the 'root' row.
        datapackage_json_fp: Path of the datapackage whose 'ids' schema provides
            the URL formats used to hyperlink ID values.
        root_id_type: ID type label used for the root row.

    Returns:
        DataFrame indexed by ('Relationship', 'ID Type') with a single 'ID(s)'
        column. List values are joined into one comma-separated string (as
        markdown links where a URL format exists); non-list values are kept as-is.

    Raises:
        KeyError: If a non-empty hierarchy level is not one of the known levels.
    """
    id_field_to_url_format_str = get_dp_field_to_url_format_str(datapackage_json_fp)

    hierarchy_level_to_relationship = {
        'parent': 'parent',
        'child': 'element-of',
        'equivalent': 'same-as'
    }

    # Rows are collected as plain records and turned into a DataFrame once:
    # `DataFrame.append` was removed in pandas 2.0 and re-copied the frame on every call.
    records = [{'Relationship': 'root', 'ID Type': root_id_type, 'ID(s)': root_id}]

    for hierarchy_level, ids in single_site_data['id_hierarchies'].items():
        if len(ids) == 0:
            continue

        relationship = hierarchy_level_to_relationship[hierarchy_level]

        for id_type, id_values in ids.items():
            # Only list values are formatted and joined; scalars pass through unchanged
            if isinstance(id_values, list):
                id_values = ', '.join(
                    str(id_) for id_ in format_id_values(id_values, id_type, id_field_to_url_format_str)
                )

            records.append({'Relationship': relationship, 'ID Type': id_type, 'ID(s)': id_values})

    # The root record is always present, so the frame is never empty here
    df_site_ids = (
        pd.DataFrame(records, columns=['Relationship', 'ID Type', 'ID(s)'])
        .set_index(['Relationship', 'ID Type'])
    )

    return df_site_ids

def single_site_data_to_ids_md_str(single_site_data, root_id, datapackage_json_fp):
    """Render a site's identifiers as a markdown section.

    Returns:
        A string made of an '### Identifiers' heading, a blank line, and the
        identifiers table in markdown form.
    """
    ids_table = (
        single_site_data_to_ids_df(single_site_data, root_id, datapackage_json_fp)
        .reset_index()  # bring 'Relationship' and 'ID Type' back as regular columns
        .to_markdown(index=False)
    )

    return '\n\n'.join(['### Identifiers', ids_table])

In [14]:
# Render the identifiers section for a single example site
root_id = '10001'
single_site_data = site_data[root_id]

site_ids_md_str = single_site_data_to_ids_md_str(single_site_data, root_id, datapackage_json_fp)

# Display the generated markdown as rich output
Markdown(site_ids_md_str)

### Identifiers

| Relationship   | ID Type     | ID(s)                                                                                                                                  |
|:---------------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
| root           | osuked_id   | 10001                                                                                                                                  |
| element-of     | gppd_idnr   | GBR1000377, GBR1000369                                                                                                                 |
| element-of     | sett_bmu_id | T_DIDC1, T_DIDC2, T_DIDC4, T_DIDC3, T_DIDC1G, T_DIDC2G, T_DIDC3G, T_DIDC4G, E_DIDC1G, E_DIDC2G, E_DIDC3G, E_DIDC4G, T_DIDCB5, T_DIDCB6 |
| element-of     | ngc_bmu_id  | DIDC1, DIDC2, DIDC4, DIDC3, DIDC1G, DIDC2G, DIDC3G, DIDC4G, DIDC01G, DIDC02G, DIDC03G, DIDC04G, DIDCB5, DIDCB6                         |
| same-as        | esail_id    | DIDC                                                                                                                                   |
| same-as        | name        | Didcot                                                                                                                                 |
| same-as        | eutl_id     | 97165                                                                                                                                  |

In [15]:
#exports
from frictionless.types.array import type_to_class
from frictionless.field import Field

def filter_dict(dict_, keys_to_select):
    """Return a new dict holding only `keys_to_select`, in the order given.

    Raises a KeyError if a requested key is missing from `dict_`.
    """
    selected = {}

    for key in keys_to_select:
        selected[key] = dict_[key]

    return selected

def attrs_df_to_md_str(df_attrs):
    """Render an attributes table as markdown.

    Parameters:
        df_attrs: DataFrame with 'Attribute', 'Value' and 'Id' columns.

    Returns:
        A markdown table. When more than one distinct 'Id' is present the
        table is pivoted to one row per attribute and one column per id;
        otherwise the 'Id' column is dropped and the rows are listed as-is.
    """
    if df_attrs['Id'].unique().size > 1:
        # Keyword arguments are required here: pandas 2.x no longer accepts
        # positional arguments for `DataFrame.pivot`
        df_wide = df_attrs.pivot(index='Attribute', columns='Id', values='Value')
        attrs_md_str = df_wide.to_markdown(index=True)
    else:
        attrs_md_str = df_attrs.drop(columns=['Id']).to_markdown(index=False)

    return attrs_md_str

def format_attribute_value_types(df_attributes, attr_to_field_schema):
    """Re-read each row's 'Value' through the field type declared in its schema.

    Parameters:
        df_attributes: DataFrame with at least 'Attribute' and 'Value' columns.
        attr_to_field_schema: Lookup from attribute name to the field schema
            passed to frictionless' `Field`; the schema must expose a 'type'.

    Returns:
        The same `df_attributes` object — its 'Value' column is overwritten in
        place with the values returned by `read_cell`.

    NOTE(review): `read_cell` presumably casts the raw value to the declared
    field type; its exact return shape depends on the installed frictionless
    version — confirm.
    """
    # Object dtype so that values of differing Python types can share one column
    s_values = pd.Series(index=df_attributes.index, dtype='object')

    for idx, row in df_attributes.iterrows():
        # The schema is looked up by the attribute's name
        field_schema = Field(attr_to_field_schema[row['Attribute']])
        field_type = field_schema['type']

        # `type_to_class` selects the frictionless type class for this field type
        field_class = type_to_class[field_type](field_schema)
        s_values[idx] = field_class.read_cell(row['Value'])

    # Overwrites the caller's frame rather than working on a copy
    df_attributes['Value'] = s_values
    
    return df_attributes

def get_datapackage_url_to_attributes(single_site_data):
    """Group a site's attribute records by the datapackage they came from.

    Returns:
        dict of attribute 'source' -> list of {'attribute', 'value', 'id'}
        dicts, in the order the attributes appear. Empty when the site has no
        'attributes' entry.
    """
    attrs_by_url = {}

    # Sites without any extracted attributes yield an empty mapping
    if 'attributes' not in single_site_data.keys():
        return attrs_by_url

    for attr in single_site_data['attributes']:
        source_url = attr['source']
        record = filter_dict(attr, ['attribute', 'value', 'id'])

        attrs_by_url.setdefault(source_url, []).append(record)

    return attrs_by_url

def get_datapackage_url_to_attrs_md_str(single_site_data):
    """Render a site's attributes as one markdown table per source datapackage.

    Parameters:
        single_site_data: Site record; its optional 'attributes' entry is a list
            of dicts with 'attribute', 'value', 'id', 'source' and 'field_schema' keys.

    Returns:
        dict of datapackage URL -> markdown table string. Empty when the site
        has no 'attributes' entry.
    """
    def capitalize_df_columns(df):
        # 'attribute' / 'value' / 'id' -> 'Attribute' / 'Value' / 'Id'
        df.columns = df.columns.str.capitalize()
        return df

    # `.get` keeps this consistent with `get_datapackage_url_to_attributes`, which
    # tolerates a missing 'attributes' key instead of raising a KeyError
    site_attributes = single_site_data.get('attributes', [])
    attr_to_field_schema = {attr['attribute']: attr['field_schema'] for attr in site_attributes}
    datapackage_url_to_attrs = get_datapackage_url_to_attributes(single_site_data)

    datapackage_url_to_attrs_md_str = {}

    for datapackage_url, attributes in datapackage_url_to_attrs.items():
        df_attrs = (
            pd.DataFrame(attributes)
            .pipe(capitalize_df_columns)
            .pipe(format_attribute_value_types, attr_to_field_schema)
        )
        datapackage_url_to_attrs_md_str[datapackage_url] = attrs_df_to_md_str(df_attrs)

    return datapackage_url_to_attrs_md_str

In [16]:
# Build the per-datapackage attribute tables for the example site
datapackage_url_to_attrs_md_str = get_datapackage_url_to_attrs_md_str(single_site_data)

# Print each raw markdown table followed by a blank line
for attrs_md_str in list(datapackage_url_to_attrs_md_str.values()):
    print(attrs_md_str, '\n')

| Attribute   |    Value |
|:------------|---------:|
| Longitude   | -1.26757 |
| Latitude    | 51.6236  | 

| Attribute                           | GBR1000369                                                                     | GBR1000377                                                                     |
|:------------------------------------|:-------------------------------------------------------------------------------|:-------------------------------------------------------------------------------|
| Estimated Annual Generation in 2017 | 6820.88                                                                        | 464.0                                                                          |
| Geolocation Source                  | GEODB                                                                          | GEODB                                                                          |
| Installed Capacity (MW)             | 1470.0                                    

In [17]:
#exports
def clean_dp_name(dp_name):
    """Turn a hyphenated datapackage name into a space-separated, title-cased label."""
    return ' '.join(dp_name.split('-')).title()

def construct_dataset_md_str(dataset_metadata, dataset_attributes):
    """Build the markdown block describing one dataset linked to a site.

    Parameters:
        dataset_metadata: dict with 'datapackage_name', 'datapackage_json_url',
            'datapackage_description' and a non-empty 'related_resources' list
            whose first item names the matched fields.
        dataset_attributes: Markdown table of the attributes taken from the dataset.

    Returns:
        A linked heading, the dataset description, a sentence naming the fields
        used for matching, and the attributes table, ending with a newline.
    """
    heading = clean_dp_name(dataset_metadata['datapackage_name'])
    link = dataset_metadata['datapackage_json_url']
    summary = dataset_metadata['datapackage_description']

    # Only the first related resource is used to describe the match
    first_resource = dataset_metadata['related_resources'][0]
    dictionary_field = first_resource['dictionary_pk_field']
    dataset_field = first_resource['external_fk_field']

    return (
        f'##### <a href="{link}">{heading}</a>\n\n'
        f'{summary}\n\n'
        f'The "{dictionary_field}" field was used to match from the dictionary '
        f'to the "{dataset_field}" field in this dataset.\n\n'
        f'{dataset_attributes}\n'
    )

def single_site_data_to_datasets_md_str(single_site_data):
    """Render the '### Datasets' markdown section for a single site.

    Every entry of `single_site_data['datasets']` becomes one dataset block;
    blocks are separated by a '<br><br>' line.
    """
    attrs_md_by_url = get_datapackage_url_to_attrs_md_str(single_site_data)

    # Keyed by URL, so a datapackage listed more than once is rendered only once
    sections_by_url = {}

    for metadata in single_site_data['datasets'].values():
        url = metadata['datapackage_json_url']
        sections_by_url[url] = construct_dataset_md_str(metadata, attrs_md_by_url[url])

    return '### Datasets\n' + '\n<br><br>\n'.join(sections_by_url.values())

In [18]:
# Render the datasets section for the example site and display it as rich markdown
datasets_md_str = single_site_data_to_datasets_md_str(single_site_data)

Markdown(datasets_md_str)

### Datasets
##### <a href="https://raw.githubusercontent.com/OSUKED/Dictionary-Datasets/main/datasets/plant-locations/datapackage.json">Plant Locations</a>

Dataset listing the locations of power plants

The "osuked_id" field was used to match from the dictionary to the "osuked_id" field in this dataset.

| Attribute   |    Value |
|:------------|---------:|
| Longitude   | -1.26757 |
| Latitude    | 51.6236  |

<br><br>
##### <a href="https://raw.githubusercontent.com/OSUKED/Dictionary-Datasets/main/datasets/global-power-plant-database/datapackage.json">Global Power Plant Database</a>

The Global Power Plant Database is a comprehensive, open source database of power plants around the world. It centralizes power plant data to make it easier to navigate, compare and draw insights for one’s own analysis. The database covers approximately 35,000 power plants from 167 countries and includes thermal plants (e.g. coal, gas, oil, nuclear, biomass, waste, geothermal) and renewables (e.g. hydro, wind, solar). Each power plant is geolocated and entries contain information on plant capacity, generation, ownership, and fuel type. It will be continuously updated as data becomes available. 

The methodology for the dataset creation is given in the World Resources Institute publication ["A Global Database of Power Plants"](https://www.wri.org/research/global-database-power-plants). Data updates may occur without associated updates to this manuscript.

The "gppd_idnr" field was used to match from the dictionary to the "gppd_idnr" field in this dataset.

| Attribute                           | GBR1000369                                                                     | GBR1000377                                                                     |
|:------------------------------------|:-------------------------------------------------------------------------------|:-------------------------------------------------------------------------------|
| Estimated Annual Generation in 2017 | 6820.88                                                                        | 464.0                                                                          |
| Geolocation Source                  | GEODB                                                                          | GEODB                                                                          |
| Installed Capacity (MW)             | 1470.0                                                                         | 100.0                                                                          |
| Latitude                            | 51.6246                                                                        | 51.6246                                                                        |
| Longitude                           | -1.2683                                                                        | -1.2683                                                                        |
| Owner                               | RWE Npower Plc                                                                 | RWE Npower Plc                                                                 |
| PLATTS-WEPP ID                      | 1023591.0                                                                      | nan                                                                            |
| Primary Fuel Type                   | Gas                                                                            | Gas                                                                            |
| Source                              | Department for Business Energy & Industrial Strategy                           | Department for Business Energy & Industrial Strategy                           |
| URL                                 | https://www.gov.uk/government/collections/digest-of-uk-energy-statistics-dukes | https://www.gov.uk/government/collections/digest-of-uk-energy-statistics-dukes |


<br>

### Populating the Templates

In [19]:
#exports
def extract_name_from_single_site_data(single_site_data):
    """Return the first 'name' found across the site's ID hierarchy levels.

    Levels are scanned in the order they appear in
    `single_site_data['id_hierarchies']`; `None` is returned when no level
    carries a 'name' entry.
    """
    found_names = []

    for level_ids in single_site_data['id_hierarchies'].values():
        if 'name' in level_ids.keys():
            found_names.append(level_ids['name'])

    return found_names[0] if found_names else None

def single_site_data_to_md_str(single_site_data, root_id, datapackage_json_fp):
    """Build the full markdown body of a site page.

    The identifiers section comes first and the datasets section second,
    separated by a '<br>' line.
    """
    sections = [
        single_site_data_to_ids_md_str(single_site_data, root_id, datapackage_json_fp),
        single_site_data_to_datasets_md_str(single_site_data),
    ]

    return '\n\n<br>\n'.join(sections)

def populate_and_save_template(template_fp, save_fp, render_kwargs):
    """Render a Jinja2 template file and write the result to disk.

    Parameters:
        template_fp: Path of the template file to read.
        save_fp: Path the rendered text is written to (overwritten if it exists).
        render_kwargs: dict of variables passed to the template's `render`.

    Returns:
        None. Errors raised while reading, rendering or writing propagate
        unchanged to the caller.
    """
    # Read with a context manager so the handle is closed, and with the same
    # encoding used for the output instead of the platform default
    with open(template_fp, 'r', encoding='utf-8') as f:
        template_str = f.read()

    # Rendering happens before the output file is opened, so a failing template
    # does not truncate an existing page
    rendered_str = Template(template_str).render(**render_kwargs)

    # The previous `except e as exc` referenced an undefined name `e`, which
    # turned any write error into a NameError; letting errors propagate is the
    # intended behaviour
    with open(save_fp, 'w', encoding='utf-8') as f:
        f.write(rendered_str)

    return None

def clean_object_names(object_names):
    """Sort names so those starting with a letter come before all others.

    Parameters:
        object_names: Iterable of name strings.

    Returns:
        A new list: names whose first character is alphabetic, sorted, followed
        by the remaining names (including empty strings), also sorted.
    """
    # `name[:1]` is '' for an empty name, whereas `name[0]` would raise an
    # IndexError; '' is not alphabetic so empty names join the second group
    return sorted(object_names, key=lambda name: (not name[:1].isalpha(), name))

In [20]:
template_fp = '../templates/objects_page.md'
object_names = []

# Write one markdown page per site; sites without extracted attributes are skipped
for osuked_id, single_site_data in tqdm(site_data.items()):
    if 'attributes' not in single_site_data.keys():
        continue

    name = extract_name_from_single_site_data(single_site_data)
    # Unnamed sites fall back to their ID; '/' is replaced as it would otherwise
    # act as a directory separator in the save path
    name = osuked_id if name is None else name.replace('/', '-').strip()

    object_names.append(name)
    save_fp = f'../docs/objects/{name}.md'

    render_kwargs = {'site_ids_md_string': single_site_data_to_md_str(single_site_data, osuked_id, datapackage_json_fp)}
    populate_and_save_template(template_fp, save_fp, render_kwargs)

# Order the page names: alphabetic names first, then the rest
object_names = clean_object_names(object_names)

100%|████████████████████████████████████████████████████████████████████████████████| 273/273 [00:07<00:00, 34.72it/s]


In [21]:
# Render the mkdocs configuration template with the list of generated object names
template_fp = '../templates/mkdocs.yml'
save_fp = f'../mkdocs.yml'
render_kwargs = {'object_names': object_names}

populate_and_save_template(template_fp, save_fp, render_kwargs)

In [22]:
# start work on the html templates for site pages
# then create the dictionary template
# add in the links between the dictionary and sites
# create dataset pages (and link to them as well)

In [20]:
#hide
# nbdev export step: `notebook2script` converts the project's notebooks
# (see the "Converted ..." lines it prints)
from nbdev.export import *
notebook2script()

Converted 00-documentation.ipynb.
Converted 01-combining power station data.ipynb.
Converted 02-attribute extraction.ipynb.
Converted 03-page-population.ipynb.
