# DSSAT Data to MINT Data Catalog
This notebook is used to register MINT inputs and data with the MINT Data Catalog.

In [1]:
# Prerequisites: python 3.6 or later
import requests
import json
import uuid
import pprint
import datetime
pp = pprint.PrettyPrinter(indent=1)
import re

# using rdflib to traverse the GSN ontology
from rdflib import Graph

In [2]:
# This is a convenience method to handle api responses. The main portion of the notebook starts in the the next cell
def handle_api_response(response, print_response=False):
    parsed_response = response.json()

    if print_response:
        pp.pprint({"API Response": parsed_response})
    
    if response.status_code == 200:
        return parsed_response
    elif response.status_code == 400:
        raise Exception("Bad request ^")
    elif response.status_code == 403:
        msg = "Please make sure your request headers include X-Api-Key and that you are using correct url"
        raise Exception(msg)
    else:
        now = datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        msg = f"""\n\n
        ------------------------------------- BEGIN ERROR MESSAGE -----------------------------------------
    
        Automatically generated summary:
        - Time of occurrence: {now}
        - Request method + url: {response.request.method} - {response.request.url}
        - Request headers: {response.request.headers}
        - Request body: {response.request.body}
        - Response: {parsed_response}

        --------------------------------------- END ERROR MESSAGE ------------------------------------------
        \n\n
        """
        raise Exception(msg)

## Main Portion

In [53]:
# For real interactions with the data catalog, use api.mint-data-catalog.org
url = "https://sandbox.mint-data-catalog.org"

In [54]:
# When you register datasets or resources, we require you to pass a "provenance_id". This a unique id associated
# with your account so that we can keep track of who is adding things to the data catalog. For sandboxed interactions
# with the data catalog api, please use this provenance_id:
provenance_id = "e8287ea4-e6f2-47aa-8bfc-0c22852735c8"

In [55]:
# Step 1: Get session token to use the API
resp = requests.get(f"{url}/get_session_token").json()
print(resp)
api_key = resp['X-Api-Key']

request_headers = {
    'Content-Type': "application/json",
    'X-Api-Key': api_key
}

{'X-Api-Key': 'mint-data-catalog:a1f80744-12a8-4494-9c72-43befa499eba:882bce49-8fd8-4d2f-98c5-c7c5e8da7e51'}


## Registering `historical_nbg_maiz.json`
We can start with the input file for DSSAT.



In [6]:
pp.pprint(json.loads(open('historical_nbg_maiz.json').read()))

{'fractionalAW': 0.25,
 'incorporationDepth': 5,
 'incorporationRate': 100,
 'name': 'historical_nbg_maiz',
 'plantingDayOfMonth': 1,
 'plantingWindow': 45,
 'rasters': {'dataLayers': {'elevation': 'rasters/nbg_elevation.tif',
                            'harvestedArea': 'rasters/nbg_harvest_maiz.tif',
                            'initialNitrogen': 'rasters/nbg_initial_n.tif',
                            'plantingMonth': 'rasters/nbg_planting.tif',
                            'rootMass': 'rasters/nbg_root_mass.tif',
                            'soilProfile': 'rasters/nbg_hc27.tif',
                            'soilResidue': 'rasters/nbg_surface_residue.tif',
                            'weatherFile': 'rasters/nbg_cellid.tif'}},
 'runYears': 33,
 'soils': ['base/HC.SOL'],
 'startYear': 1984,
 'templateDir': 'templates',
 'weatherDir': 'weather/nbg',
 'workDir': 'out/nbg_maiz/historical',
 'wstID': 'SSUD',
 'xFileTemplate': 'MAIZ8433.SNX'}


In [7]:
to_register = ['plantingWindow',
               'plantingDayOfMonth',
               'fractionalAW',
               'incorporationDepth',
               'incorporationRate',
               'startYear',
               'runYears']

to_register_sn = {}    

### Load properties from GSN ontology:

In [8]:
g = Graph()
g.parse("svo-properties.rdf", format="xml")

<Graph identifier=N86738a01069a40f0b7a3034085c586f7 (<class 'rdflib.graph.Graph'>)>

In [9]:
print(f"There are {len(g)} properties in GSN")

There are 9606 properties in GSN


Find the triples that map to the variables in our input file

In [10]:
def find_triples(search_list, exact=False):
    '''
    Takes in a list of search terms and returns any triples whose name
    contains in any order the terms.
    
    if `exact=True` then the specific words in `search_list` and only those
    words should appear in the name
    '''
    triples = []
    for subj, pred, obj in g:
        if '#' in subj:
            name = subj.split('#')[1]
            if exact:
                if set(search_list) == set(re.split('_|-',name.lower())):
                    triples.append((subj, pred, obj))
            else:
                if all(to_search in name.lower() for to_search in search_list):
                    triples.append((subj, pred, obj))
    for subj,pred,obj in triples:
        print(subj)
        print(pred)
        print(obj)
        print()
    return triples

Below are all triples associated with `http://www.geoscienceontology.org/svo/svl/property#year`. We can learn:

- year's type is a quantitative property
- year's type could also be a named individual (?)
- year's preferred lable is `year`
- year has an associated wikipedia page
- year has units `none` (a year is a year)

In [11]:
find_triples(['year'], exact=True)

http://www.geoscienceontology.org/svo/svl/property#year
http://www.w3.org/2004/02/skos/core#prefLabel
year

http://www.geoscienceontology.org/svo/svl/property#year
http://www.geoscienceontology.org/svo/svu#hasUnits
none

http://www.geoscienceontology.org/svo/svl/property#year
http://www.geoscienceontology.org/svo/svu#hasAssociatedWikipediaPage
http://en.wikipedia.org/wiki/Year

http://www.geoscienceontology.org/svo/svl/property#year
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.geoscienceontology.org/svo/svu#QuantitativeProperty

http://www.geoscienceontology.org/svo/svl/property#year
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.w3.org/2002/07/owl#NamedIndividual



[(rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svl/property#year'),
  rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#prefLabel'),
  rdflib.term.Literal('year', lang='en')),
 (rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svl/property#year'),
  rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svu#hasUnits'),
  rdflib.term.Literal('none')),
 (rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svl/property#year'),
  rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svu#hasAssociatedWikipediaPage'),
  rdflib.term.Literal('http://en.wikipedia.org/wiki/Year')),
 (rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svl/property#year'),
  rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
  rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svu#QuantitativeProperty')),
 (rdflib.term.URIRef('http://www.geoscienceontology.org/svo/svl/property#year'),
  rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-s

In [13]:
to_register

['plantingWindow',
 'plantingDayOfMonth',
 'fractionalAW',
 'incorporationDepth',
 'incorporationRate',
 'startYear',
 'runYears']

In [29]:
to_register_sn['runYears'] = 'http://www.geoscienceontology.org/svo/svl/property#year'
to_register_sn['startYear'] = 'http://www.geoscienceontology.org/svo/svl/property#year'
to_register_sn['plantingWindow'] = 'http://www.geoscienceontology.org/svo/svl/property#planting_date'
to_register_sn['plantingDayOfMonth'] = 'http://www.geoscienceontology.org/svo/svl/property#planting_date'
to_register_sn['incorporationDepth'] = 'http://www.geoscienceontology.org/svo/svl/variable#crop_planting__planting_depth'
to_register_sn['incorporationRate'] = 'http://www.geoscienceontology.org/svo/svl/variable#__planting_separation_distance'
to_register_sn['fractionalAW'] = 'http://www.geoscienceontology.org/svo/svl/attribute#moisture'

### Step 0: Registering Standard Variables

In [66]:
ontology = 'Scientific Variables Ontology'

In [67]:
standard_variable_defs = {}
standard_variable_defs['standard_variables'] = []
for kk, vv in to_register_sn.items():
    sn = {'name': vv.split('#')[1], 'ontology': ontology, 'uri': vv}
    standard_variable_defs['standard_variables'].append(sn)

In [68]:
pp.pprint(standard_variable_defs)

{'standard_variables': [{'name': 'year',
                         'ontology': 'Scientific Variables Ontology',
                         'uri': 'http://www.geoscienceontology.org/svo/svl/property#year'},
                        {'name': 'year',
                         'ontology': 'Scientific Variables Ontology',
                         'uri': 'http://www.geoscienceontology.org/svo/svl/property#year'},
                        {'name': 'planting_date',
                         'ontology': 'Scientific Variables Ontology',
                         'uri': 'http://www.geoscienceontology.org/svo/svl/property#planting_date'},
                        {'name': 'planting_date',
                         'ontology': 'Scientific Variables Ontology',
                         'uri': 'http://www.geoscienceontology.org/svo/svl/property#planting_date'},
                        {'name': 'crop_planting__planting_depth',
                         'ontology': 'Scientific Variables Ontology',
                

In [38]:
resp = requests.post(f"{url}/knowledge_graph/register_standard_variables", 
                    headers=request_headers, 
                    json=standard_variable_defs)

In [59]:
parsed_response = handle_api_response(resp)
standard_name_records = parsed_response['standard_variables']

In [69]:
if len(standard_name_records) == 0:
    # this means the standard names are already registered;
    # so, let's query for them instead
    search_query = {
    "name__in": [i['name'] for i in standard_variable_defs['standard_variables']]
    }

    resp = requests.post(f"{url}/knowledge_graph/find_standard_variables", 
                                            headers=request_headers,
                                            json=search_query)
    parsed_response = handle_api_response(resp)
    standard_name_records = parsed_response['standard_variables']
    standard_name_records = [i for i in standard_name_records if i['ontology'] == ontology]

In [71]:
standard_name_records

[{'id': '18d2bc6c-c4cd-51a2-a03e-5cffe4dcef02',
  'ontology': 'Scientific Variables Ontology',
  'name': '__planting_separation_distance',
  'uri': 'http://www.geoscienceontology.org/svo/svl/variable#__planting_separation_distance',
  'description': ''},
 {'id': '3276f43e-82a1-5caf-a627-598e6bc04503',
  'ontology': 'Scientific Variables Ontology',
  'name': 'planting_date',
  'uri': 'http://www.geoscienceontology.org/svo/svl/property#planting_date',
  'description': ''},
 {'id': '69a7996f-e953-56cb-bb9f-213456b1efab',
  'ontology': 'Scientific Variables Ontology',
  'name': 'crop_planting__planting_depth',
  'uri': 'http://www.geoscienceontology.org/svo/svl/variable#crop_planting__planting_depth',
  'description': ''},
 {'id': 'df1daca4-d727-5dc8-bfa4-fb20c717a32b',
  'ontology': 'Scientific Variables Ontology',
  'name': 'year',
  'uri': 'http://www.geoscienceontology.org/svo/svl/property#year',
  'description': ''},
 {'id': 'f7d62db8-a470-503a-80d3-c987181c6ca8',
  'ontology': 'Scien

### Step 1: REGISTER DATASETS

In [45]:
dataset_id = str(uuid.uuid4())
print(dataset_id)

00ee1157-bbff-4625-852d-e010f44679e4


#### Build datasets definition

In [97]:
dataset_name = "DSSAT Simplified Input Data"

In [72]:
dataset_defs = {
    "datasets": [
        {
            "record_id": dataset_id, # Remove this line if you want to create a new dataset
            "provenance_id": provenance_id,
            "metadata": {},
            "description": "Simplified input data to be used for running serialized Docker version of DSSAT \
                            (Peerless, Pythia, DSSAT).",
            "name": dataset_name
        }
    ]
}

resp = requests.post(f"{url}/datasets/register_datasets", 
                                        headers=request_headers,
                                        json=dataset_defs)


parsed_response = handle_api_response(resp, print_response=True)

datasets = parsed_response["datasets"]

# Iterate through the list of returned datasets objects and save the one whose name matches our name 
# to a Python variable
dataset_record = next(record for record in datasets if record["name"] == name)
# Extract dataset record_id and store it in a variable
dataset_record_id = dataset_record["record_id"]
    

{'API Response': {'datasets': [{'description': 'Simplified input data to be '
                                               'used for running serialized '
                                               'Docker version of '
                                               'DSSAT                             '
                                               '(Peerless, Pythia, DSSAT).',
                                'json_metadata': {},
                                'name': 'DSSAT Simplified Input Data',
                                'provenance_id': 'e8287ea4-e6f2-47aa-8bfc-0c22852735c8',
                                'record_id': '00ee1157-bbff-4625-852d-e010f44679e4'}],
                  'result': 'success'}}


In [73]:
dataset_record_id

'00ee1157-bbff-4625-852d-e010f44679e4'

### Step 2: Register variables

We have to merge the standard name information (record ids) with the variable names we initially associated with standard names:

In [80]:
variable_defs = {}
variable_defs['variables'] = []
for kk, vv in to_register_sn.items():
    for e in standard_name_records:
        if vv == e['uri']:
            variable = {
                'dataset_id': dataset_id,
                'name': kk,
                'metadata': {},
                'standard_variable_ids': [e['id']]
            }
            variable_defs['variables'].append(variable)

In [82]:
pp.pprint(variable_defs)

{'variables': [{'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                'metadata': {},
                'name': 'runYears',
                'standard_variable_ids': ['df1daca4-d727-5dc8-bfa4-fb20c717a32b']},
               {'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                'metadata': {},
                'name': 'startYear',
                'standard_variable_ids': ['df1daca4-d727-5dc8-bfa4-fb20c717a32b']},
               {'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                'metadata': {},
                'name': 'plantingWindow',
                'standard_variable_ids': ['3276f43e-82a1-5caf-a627-598e6bc04503']},
               {'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                'metadata': {},
                'name': 'plantingDayOfMonth',
                'standard_variable_ids': ['3276f43e-82a1-5caf-a627-598e6bc04503']},
               {'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                'metadata

In [83]:
resp = requests.post(f"{url}/datasets/register_variables", 
                                        headers=request_headers,
                                        json=variable_defs)

parsed_response = handle_api_response(resp, print_response=True)
variables = parsed_response["variables"]

{'API Response': {'result': 'success',
                  'variables': [{'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                                 'json_metadata': {},
                                 'name': 'runYears',
                                 'record_id': '8bfff40c-ba75-4a70-87f4-8aa254c8227c'},
                                {'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                                 'json_metadata': {},
                                 'name': 'startYear',
                                 'record_id': '0fd06920-e19f-49ad-bdf4-b278bf246bd6'},
                                {'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                                 'json_metadata': {},
                                 'name': 'plantingWindow',
                                 'record_id': 'b35db5a5-ff8d-4819-969f-77779387fa90'},
                                {'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                        

### Step 3: Register resources

In [85]:
data_storage_url = "https://s3.amazonaws.com/world-modelers/data/"

In [86]:
file_1_name = "historical_nbg_maiz.json"

In [87]:
file_1_data_url = f"{data_storage_url}/{file_1_name}"

In [88]:
file_1_record_id = str(uuid.uuid4())

In [89]:
file_1_record_id

'e601e666-475b-434f-93e0-6e5252d71995'

#### Finally, we can build our resource definitions and register them (in bulk)

In [94]:
resource_defs = {
    "resources": [
        {
            "record_id": file_1_record_id,
            "dataset_id": dataset_record_id,
            "provenance_id": provenance_id,
            "variable_ids": [i['record_id'] for i in variables],
            "name": file_1_name,
            "resource_type": ".json",
            "data_url": file_1_data_url,
            "metadata": {},
            "layout": {}
        }
    ]
}

resp = requests.post(f"{url}/datasets/register_resources", 
                                        headers=request_headers,
                                        json=resource_defs)


parsed_response = handle_api_response(resp, print_response=True)


resources = parsed_response["resources"]

{'API Response': {'resources': [{'data_url': 'https://s3.amazonaws.com/world-modelers/data//historical_nbg_maiz.json',
                                 'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
                                 'json_metadata': {},
                                 'layout': {},
                                 'name': 'historical_nbg_maiz.json',
                                 'provenance_id': 'e8287ea4-e6f2-47aa-8bfc-0c22852735c8',
                                 'record_id': 'e601e666-475b-434f-93e0-6e5252d71995',
                                 'resource_type': '.json'}],
                  'result': 'success'}}


In [98]:
search_query_4 = {
    "dataset_names__in": [dataset_name]
}

resp = requests.post(f"{url}/datasets/find",
                     headers=request_headers,
                     json=search_query_4).json()

if resp['result'] == 'success':
    found_resources = resp['resources']
    print(f"Found {len(found_resources)} resources")
    pp.pprint(found_resources)

Found 1 resources
[{'dataset_id': '00ee1157-bbff-4625-852d-e010f44679e4',
  'dataset_metadata': {},
  'dataset_name': 'DSSAT Simplified Input Data',
  'resource_data_url': 'https://s3.amazonaws.com/world-modelers/data//historical_nbg_maiz.json',
  'resource_id': 'e601e666-475b-434f-93e0-6e5252d71995',
  'resource_metadata': {},
  'resource_name': 'historical_nbg_maiz.json'}]
