# Create maDMP from RDMO XML Export

This Jupyter Notebook shows how to create a machine-actionable DMP (maDMP)
that follows the [RDA DMP Common Standard](https://github.com/RDA-DMP-Common/RDA-DMP-Common-Standard)
from the XML export of a project in RDMO.

In [1]:
# import required libraries
import json
import xml.etree.ElementTree as ET

In [24]:
# Input: path of the XML file to read
SOURCE_XML = 'test-project.xml'
# Output: path of the target JSON file
TARGET_JSON = 'test-project.json'
# Key of the 'uri' XML attribute, written in ElementTree's
# '{namespace}name' notation
XML_URI = '{http://purl.org/dc/elements/1.1/}uri'
# Common prefix of every attribute URI that is accepted
ATTR_START = 'https://rdmorganiser.github.io/terms/domain/project'

### Read in XML file

In [3]:
# Parse the XML file at SOURCE_XML and keep a handle on its root element,
# which the following cells search with find()
tree = ET.parse(SOURCE_XML)
root = tree.getroot()

In [44]:
# Flatten the XML export into nested dictionaries, which are easier
# to traverse than the element tree. Resulting layout:
#   rdmo[attribute][set_index][collection_index] -> value details
rdmo = {
    'title': root.find('title').text,
    'description': root.find('description').text,
    'created': root.find('created').text,
}
# start from the project's own timestamp; newer values raise it below
max_updated = root.find('updated').text
values = root.find('values')
for node in values.iter('value'):
    set_index_text = node.find('set_index').text
    uri = node.find('attribute').attrib[XML_URI]
    if not uri.startswith(ATTR_START):
        raise AttributeError('Attribute "%s" not starting with "%s"' %
                             (uri, ATTR_START))
    # keep only the part after the common prefix, e.g. '/dataset/id'
    attribute = uri[len(ATTR_START):]
    collection_index_text = node.find('collection_index').text
    answer_text = node.find('text').text
    answer_type = node.find('value_type').text
    answer_unit = node.find('unit').text
    # an option is referenced by URI; only its last path segment is kept
    option_uri = node.find('option').attrib.get(XML_URI)
    if option_uri is None:
        option_name = None
    else:
        option_name = option_uri.split('/')[-1]
    changed = node.find('updated').text
    entry = {
        'text': answer_text,
        'value_type': answer_type,
        'unit': answer_unit,
        'option': option_name,
        'updated': changed
    }
    per_attribute = rdmo.setdefault(attribute, {})
    set_index = int(set_index_text)
    collection_index = int(collection_index_text)
    per_attribute.setdefault(set_index, {})[collection_index] = entry
    # track the most recent modification date
    # NOTE(review): timestamps are compared as strings, which orders them
    # chronologically only if all share the same format and UTC offset
    max_updated = max(max_updated, changed)
rdmo['updated'] = max_updated

In [45]:
# Collect every set index that occurs under any '/dataset/...' attribute;
# each index stands for one dataset of the DMP
datasets = set()
for attribute, per_set in rdmo.items():
    if not attribute.startswith('/dataset/'):
        continue
    # updating a set from a dict adds the dict's keys (the set indexes)
    datasets.update(per_set)
print(datasets)

{0, 1}


In [54]:
# helper function to access the elements and return None if not here
def get_val(d, *args):
    """Walk nested dictionaries along the keys in *args*.

    Returns the value reached after following every key, or None as soon
    as the current position is not a dict or lacks the next key. Called
    without keys it returns *d* itself.
    """
    current = d
    for key in args:
        if isinstance(current, dict) and key in current:
            current = current[key]
        else:
            return None
    return current

In [57]:
# Construct the maDMP as a plain dictionary, ready to be serialized as JSON
maDMP = {}
maDMP['title'] = rdmo['title']
if rdmo['description'] is not None:
    maDMP['description'] = rdmo['description']
# TODO: no language in the RDMO data model?
maDMP['language'] = 'en'
maDMP['created'] = rdmo['created']
maDMP['modified'] = rdmo['updated']
# TODO: check for ethical issues (loop through sensitive data)
# Extract name and email from the combined name / email field.
# get_val is used (as for the dataset fields below) so that a project
# without this answer yields None instead of raising KeyError.
name_email = get_val(rdmo, '/coordination/name', 0, 0, 'text')
if name_email is not None:
    words = name_email.split()
    # whitespace-separated tokens containing '@' are treated as
    # addresses, all remaining tokens form the name
    name = ' '.join(w for w in words if '@' not in w)
    email = ', '.join(w for w in words if '@' in w)
    maDMP['contact'] = {'name': name, 'mbox': email}
# NOTE(review): when the answer is missing no 'contact' entry is written;
# confirm whether the target schema requires one
maDMP['dataset'] = []
# Create one entry per dataset; sorting makes the output order
# independent of set iteration order
for i in sorted(datasets):
    title = get_val(rdmo, '/dataset/id', i, 0, 'text')
    description = get_val(rdmo, '/dataset/description', i, 0, 'text')
    dataset_type = get_val(rdmo, '/dataset/format', i, 0, 'text')
    qa = get_val(rdmo, '/dataset/quality_assurance', i, 0, 'text')
    # title and type are always emitted, even when None
    dataset = {'title': title, 'type': dataset_type}
    # TODO: find if personal / sensitive data is in data set
    # TODO: find preservation statement
    # optional fields are only emitted when an answer exists
    if description is not None:
        dataset['description'] = description
    if qa is not None:
        dataset['dataQualityAssurance'] = qa
    # append dataset to list of datasets
    maDMP['dataset'].append(dataset)

In [58]:
# Pretty-print the generated maDMP first, then the intermediate
# RDMO structure it was built from
for document in (maDMP, rdmo):
    print(json.dumps(document, indent=4))

{
    "title": "Second test project",
    "language": "en",
    "created": "2019-06-29T09:30:43.529158+02:00",
    "modified": "2019-06-30T11:01:54.837899+02:00",
    "contact": {
        "name": "Helmuth Breitenfellner",
        "mbox": "helmuth.breitenfellner@student.tuwien.ac.at"
    },
    "dataset": [
        {
            "title": "Test",
            "type": "CSV",
            "description": "CSV files of marriages and wheat combined",
            "dataQualityAssurance": "ISO9001 will be implemented"
        },
        {
            "title": "Test2",
            "type": null
        }
    ]
}
{
    "title": "Second test project",
    "description": null,
    "created": "2019-06-29T09:30:43.529158+02:00",
    "/coordination/name": {
        "0": {
            "0": {
                "text": "Helmuth Breitenfellner\nhelmuth.breitenfellner@student.tuwien.ac.at",
                "value_type": "text",
                "unit": null,
                "option": null,
                "update