### Installing necessary packages

In [1]:
!pip install Jinja2
!pip install jsonpath_rw

Collecting jsonpath_rw
  Downloading https://files.pythonhosted.org/packages/71/7c/45001b1f19af8c4478489fbae4fc657b21c4c669d7a5a036a86882581d85/jsonpath-rw-1.4.0.tar.gz
Building wheels for collected packages: jsonpath-rw
  Building wheel for jsonpath-rw (setup.py): started
  Building wheel for jsonpath-rw (setup.py): finished with status 'done'
  Stored in directory: C:\Users\shuto\AppData\Local\pip\Cache\wheels\5c\00\9a\82822db383c2d96dcebf839786665a185f92d37e5026f9806f
Successfully built jsonpath-rw
Installing collected packages: jsonpath-rw
Successfully installed jsonpath-rw-1.4.0


### Loading the template

In [2]:
from jinja2 import Template

In [3]:
from jinja2 import Environment, FileSystemLoader

In [4]:
# Templates are looked up relative to the current working directory ('.').
template_loader = FileSystemLoader('.')
env = Environment(loader=template_loader)
template = env.get_template('sdoToISO-ISOTemplate_test6.xml')

### Demo: Automated jsonpath parsing from table

`template.render()` takes its variable mapping as a dict. We keep that mapping in a table, which is in essence a dict, because a table is easy to update and makes the mapping values easy to keep track of. The workflow is: for each piece of information needed in the template, record a variable name and its JSON path in the table; then transform the table into the desired mapping and pass it into `render()`.

In [5]:
from jsonpath_rw import jsonpath, parse
import pandas as pd
import numpy as np
import json

In [6]:
# Load the example JSON document that every JSONPath lookup below runs against.
# The context manager guarantees the file handle is closed once parsing is done,
# and the explicit encoding avoids depending on the platform default (JSON
# interchange files are UTF-8).
with open('AS-BCODMO-SDOexample.json', encoding='utf-8') as fh:
    json_dict = json.load(fh)
def get_value(expression, source = json_dict):
    """Return the value of the first match of a JSONPath expression.

    Parameters
    ----------
    expression : str
        JSONPath expression; it is compiled with ``parse`` on every call.
    source : object, optional
        Data structure to search. Defaults to the ``json_dict`` object that
        was bound when this function was defined.

    Returns
    -------
    object
        The ``value`` of the first match, or ``''`` when indexing the first
        match raises ``IndexError`` (i.e. the expression matched nothing).
    """
    compiled_path = parse(expression)
    try:
        matches = compiled_path.find(source)
        return matches[0].value
    except IndexError:
        return ''

In [8]:
# Load the variable -> JSONPath table, keep only the rows whose 'exception'
# flag is 0, and replace missing '@ of' entries with '' so they can be
# compared against the empty string later on.
template_df = (
    pd.read_csv('template_vars.csv')
    .loc[lambda frame: frame['exception'] == 0]
    .drop(columns='exception')
    .assign(**{'@ of': lambda frame: frame['@ of'].fillna('')})
)
template_df.head()

Unnamed: 0,variable,json_path,@ of
0,md_creators,$.creator,
1,creator_profile_url,`this`.creator.url,md_creators
2,creator_profile_type,`this`.creator.@type,md_creators
3,creator_profile_name,`this`.creator.name,md_creators
4,creator_type,`this`.@type,md_creators


In [9]:
# Variables with an empty '@ of' are resolved against the JSON document right
# away: each JSONPath string is replaced by whatever get_value() returns for it.
global_variable_mapping = (
    template_df[template_df['@ of'] == '']
    .drop(columns='@ of')
    .set_index('variable')['json_path']
    .apply(get_value)
    .to_dict()
)

In [10]:
# Variables with a non-empty '@ of' keep their raw JSONPath strings (they are
# not evaluated here); presumably the template evaluates them per parent item
# using the helpers passed to render() -- verify against the template file.
local_variable_mapping = (
    template_df[template_df['@ of'] != '']
    .drop(columns='@ of')
    .set_index('variable')['json_path']
    .to_dict()
)

In [16]:
# Inspect the already-resolved values for the variables with an empty '@ of'.
global_variable_mapping

{'md_creators': [{'@type': 'Role',
   'additionalType': 'http://schema.geolink.org/1.0/base/main#Participant',
   'roleName': 'Principal Investigator',
   'creator': {'@type': 'Person',
    'additionalType': 'http://schema.geolink.org/1.0/base/main#Person',
    '@id': 'https://www.bco-dmo.org/person/51317',
    'name': 'Dr Uta Passow',
    'url': 'https://www.bco-dmo.org/person/51317'}},
  {'@type': 'Role',
   'additionalType': 'http://schema.geolink.org/1.0/base/main#Participant',
   'roleName': 'Co-Principal Investigator',
   'creator': {'@type': 'Person',
    'additionalType': 'http://schema.geolink.org/1.0/base/main#Person',
    '@id': 'https://www.bco-dmo.org/person/50663',
    'name': 'Dr Mark Brzezinski',
    'url': 'https://www.bco-dmo.org/person/50663',
    'identifier': {'@type': 'PropertyValue',
     'additionalType': ['http://schema.geolink.org/1.0/base/main#Identifier',
      'http://purl.org/spar/datacite/Identifier'],
     'propertyID': 'http://purl.org/spar/datacite/orc

In [14]:
# Inspect the unevaluated JSONPath strings kept for the '@ of' variables.
local_variable_mapping

{'creator_profile_name': '`this`.creator.name',
 'creator_profile_type': '`this`.creator.@type',
 'creator_profile_url': '`this`.creator.url',
 'creator_rolename': '`this`.roleName',
 'creator_type': '`this`.@type',
 'identifier_pid': '`this`.propertyID',
 'identifier_type': '`this`.@type',
 'identifier_value': '`this`.value'}

In [15]:
# Callables exposed to the template under their own names.
function_mapping = dict(get_value=get_value, parse=parse)

In [16]:
def merge_dicts(*dicts):
    """Combine any number of mappings into one new dict.

    Mappings are processed left to right, so when a key occurs in more than
    one of them the value from the right-most mapping is kept. The inputs
    are not modified.

    Parameters
    ----------
    *dicts : mapping
        Mappings to merge; calling with no arguments returns ``{}``.

    Returns
    -------
    dict
        A new dict holding every key of every input mapping.
    """
    return {key: mapping[key] for mapping in dicts for key in mapping}

In [17]:
# Build the single context dict handed to template.render(). On a key clash
# the later argument wins (merge_dicts processes its inputs left to right).
render_mapping = merge_dicts(
    global_variable_mapping,
    local_variable_mapping,
    function_mapping,
)

In [18]:
# Render the template with the combined context and print the resulting text.
rendered_xml = template.render(render_mapping)
print(rendered_xml)

<?xml version="1.0" encoding="UTF-8"?>
<gmd:MD_Metadata xmlns:gmd="http://www.isotc211.org/2005/gmd"
    xmlns:gco="http://www.isotc211.org/2005/gco" xmlns:srv="http://www.isotc211.org/2005/srv"
    xmlns:gml="http://www.opengis.net/gml" xmlns:xlink="http://www.w3.org/1999/xlink">
    <!-- template mapping schema.org JSON-LD to ISO19115 (2006).  
        JSON paths follow https://restfulapi.net/json-jsonpath/
    
    java implemenation at https://github.com/json-path/JsonPath adds additional operators for expressions
    https://github.com/kennknowles/python-jsonpath-rw is python implementation that is also somewhat different
    flow communications php implementation https://github.com/FlowCommunications/JSONPath
    see https://pypi.org/search/?q=jsonpath for a bunch of python implementations...
    
    comparison of different implementations: https://cburgmer.github.io/json-path-comparison/
    -->
    
    <!-- start with assumption that JSON-LD schema.org docs will follow 
    h

The rendered string can easily be parsed into an XML object, provided that it is well-formed.

### Not part of the demo: checking that each expression makes sense and how it behaves in jsonpath

In [None]:
from jsonpath_rw import jsonpath, parse

In [None]:
# Compile a path that selects the 'baz' field of every element of the 'foo' array.
test_jsonpath_expr = parse('foo[*].baz')

In [None]:
# Sample input: two elements carry a 'baz' key, the third one does not.
test_json_dict = {'foo': [{'baz': 1}, {'baz': 2}, {'adsadas':3}]}

In [None]:
# Collect the value of every match; the element without 'baz' is expected to
# contribute nothing.
[match.value for match in test_jsonpath_expr.find(test_json_dict)]

In [None]:
# Each match also records the full path at which it was found in the input.
[str(match.full_path) for match in test_jsonpath_expr.find(test_json_dict)]

In [None]:
import json

In [None]:
# Load the example JSON document used by the checks in this section.
# The context manager guarantees the file handle is closed once parsing is done,
# and the explicit encoding avoids depending on the platform default (JSON
# interchange files are UTF-8).
with open('AS-BCODMO-SDOexample.json', encoding='utf-8') as fh:
    json_dict = json.load(fh)
def get_value(expression, source = json_dict):
    """Return the value of the first match of a JSONPath expression.

    This repeats the ``get_value`` defined in the demo section, presumably so
    that this section can be run on its own.

    Parameters
    ----------
    expression : str
        JSONPath expression; it is compiled with ``parse`` on every call.
    source : object, optional
        Data structure to search. Defaults to the ``json_dict`` object that
        was bound when this function was defined.

    Returns
    -------
    object
        The ``value`` of the first match, or ``''`` when indexing the first
        match raises ``IndexError`` (i.e. the expression matched nothing).
    """
    compiled_path = parse(expression)
    try:
        matches = compiled_path.find(source)
        return matches[0].value
    except IndexError:
        return ''

In [None]:
# '$.@id' selects the '@id' key at the document root.
id_expression = '$.@id'
md_id = get_value(id_expression)
md_id

In [None]:
# '$.@type' selects the '@type' key at the document root.
type_expression = '$.@type'
md_type = get_value(type_expression)
md_type

In [None]:
# Root-level 'name' of the record.
name_expression = '$.name'
md_name = get_value(name_expression)
md_name

In [None]:
# Root-level 'alternateName'; get_value() yields '' if the key is absent.
alter_expression = '$.alternateName'
md_alter = get_value(alter_expression)
md_alter

In [None]:
# Root-level 'datePublished'.
date_pub_expr = '$.datePublished'
md_date_pub = get_value(date_pub_expr)
md_date_pub

In [None]:
# Root-level 'version'.
version_expr = '$.version'
md_version = get_value(version_expr)
md_version

In [None]:
# Root-level 'identifier' entry. A later cell iterates over the result, so it
# is presumably a list of identifier objects -- TODO confirm against the data.
identifiers_expr = '$.identifier'
md_identifiers = get_value(identifiers_expr)
md_identifiers

Paths evaluated under each entry of `identifier`:

In [None]:
# Paths meant to be evaluated against a single identifier entry rather than
# the whole document (`this` refers to the object being searched).
id_type_expr = '`this`.@type'
id_value_expr = '`this`.value'
id_propertyID_expr = '`this`.propertyID'

In [None]:
# Compile the propertyID path once, then print the raw list of match objects
# (not their values) found in each element of md_identifiers.
property_id_path = parse(id_propertyID_expr)
for item in md_identifiers:
    print(property_id_path.find(item))

In [None]:
# The 'keywords' value is treated as one comma-separated string: split it on
# ',' and trim the surrounding whitespace of every piece.
keywords_expr = '$.keywords'
md_keywords = [
    piece.strip()
    for piece in get_value(keywords_expr).split(',')
]
md_keywords

In [None]:
# Root-level 'license'.
license_expr = '$.license'
md_license = get_value(license_expr)
md_license

In [None]:
# Root-level 'publishingPrinciples'.
pp_expr = '$.publishingPrinciples'
md_pp = get_value(pp_expr)
md_pp

In [None]:
# Root-level 'temporalCoverage'; the following cell splits this value on '/'.
temp_cov_expr = '$.temporalCoverage'
md_temp_cov = get_value(temp_cov_expr)
md_temp_cov

In [None]:
# Split the temporal coverage into its '<begin>/<end>' parts. get_value()
# returns '' for a missing key, and a value without '/' has no second part,
# so `end` falls back to '' in those cases instead of raising IndexError.
begin_end = md_temp_cov.split('/')
begin = begin_end[0]
end = begin_end[1] if len(begin_end) > 1 else ''

In [None]:
# Root-level 'spatialCoverage' entry, returned as-is.
spatial_cov_expr = '$.spatialCoverage'
md_spatial = get_value(spatial_cov_expr)
md_spatial

In [None]:
# '..' searches all descendants, so this picks the first 'box' key found at any
# depth below 'spatialCoverage'.
spatial_box_expr = '$.spatialCoverage..box'
spatial_cov_box = get_value(spatial_box_expr)
spatial_cov_box

In [None]:
# The box string is read as two space-separated corners, each a comma-separated
# pair; the first element of a pair is stored as *_lat and the second as *_long.
coords = spatial_cov_box.split(' ')

min_coords = coords[0]
min_lat, min_long = min_coords.split(',')[0], min_coords.split(',')[1]

max_coords = coords[1]
max_lat, max_long = max_coords.split(',')[0], max_coords.split(',')[1]

In [None]:
# Root-level 'url'.
url_expr = '$.url'
md_url = get_value(url_expr)
md_url

In [None]:
# Root-level 'creator' entry; the cells below iterate over it and apply
# `this`-relative paths to each element.
creators_expr = '$.creator'
md_creators = get_value(creators_expr)
md_creators

In [None]:
# JSONPath expressions evaluated against one element of md_creators
# (`this` refers to the element being searched).
creator_url = '`this`.creator.url'
creator_profile_type = '`this`.creator.@type'
creator_profile_name = '`this`.creator.name'
# 'roleName' is a key of the role entry itself, not of its nested 'creator'
# object, so it is addressed directly on `this`.
creator_role_name = '`this`.roleName'

In [None]:
# Print the profile URL of every creator entry. get_value() is called with the
# entry as its source, so an entry without a URL prints '' instead of raising
# IndexError, and the path is taken from creator_url rather than repeated here.
for item in md_creators:
    print(get_value(creator_url, item))

In [None]:
# Root-level 'citation'.
citation_expr = '$.citation'
md_cite = get_value(citation_expr)
md_cite

In [None]:
# Root-level 'description'.
desc_expr = '$.description'
md_desc = get_value(desc_expr)
md_desc