In [1]:
from django_for_jupyter import init_django
init_django("arches")

import json
import pprint 
import uuid

import pandas as pd
from lxml import etree
from collections import OrderedDict

import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt

from arches.app.utils.data_management.resource_graphs.exporter import get_graphs_for_export as get_graph
from arches.app.utils.data_management.resource_graphs.exporter import create_mapping_configuration_file as get_mapping
from arches.app.utils.data_management.resource_graphs.exporter import export as export_gephi


from django.core.serializers.json import DjangoJSONEncoder

# Identifier handed to get_graph_as_json() and get_concepts_as_json() further down
# to select what gets exported.
dataset_uuid = "fa952d08-dd27-11ed-9655-00163e71351b"

# Alternative identifier kept for quick switching (labelled "Dataset" by the author).
#dataset_uuid = "b36ccfad-0686-4213-9511-1481b86a9e21" # Dataset

In [9]:
def get_graph_as_json(_uuid):
    """Fetch the export structure for a single graph id.

    The id is wrapped in a one-element list, passed to ``get_graph`` and the
    result of that call is returned untouched.
    """
    return get_graph([_uuid])

def make_cards_df(graph):
    """Tabulate the cards of the first exported graph.

    Reads ``graph['graph'][0]['cards']``, orders the rows by ``sortorder``,
    turns every ``cardid`` into its string form, writes the table to
    ``sipc/out/01_cards.csv`` (no index column) and returns it.
    """
    card_records = graph['graph'][0]['cards']
    cards = pd.DataFrame(card_records).sort_values(by=['sortorder'])

    # Stringify the ids one cell at a time, addressing rows by index label.
    for label in cards.index:
        cards.loc[label, 'cardid'] = str(cards.loc[label, 'cardid'])

    cards.to_csv('sipc/out/01_cards.csv', index=False)
    return cards

def make_edges_df(graph):
    """Tabulate the edges of the first exported graph.

    Builds a DataFrame from ``graph['graph'][0]['edges']``, saves it as
    ``sipc/out/02_edges.csv`` without the index column and returns it.
    """
    edge_records = graph['graph'][0]['edges']
    edges = pd.DataFrame(edge_records)
    edges.to_csv('sipc/out/02_edges.csv', index=False)
    return edges

def make_nodes_df(graph):
    """Tabulate the nodes of the first exported graph.

    Builds a DataFrame from ``graph['graph'][0]['nodes']``, saves it as
    ``sipc/out/03_nodes.csv`` without the index column and returns it.
    """
    node_records = graph['graph'][0]['nodes']
    nodes = pd.DataFrame(node_records)
    nodes.to_csv('sipc/out/03_nodes.csv', index=False)
    return nodes

def make_xx_df(graph):
    """Tabulate the cards-x-nodes-x-widgets records of the first exported graph.

    Reads ``graph['graph'][0]['cards_x_nodes_x_widgets']``, rewrites the
    ``sortorder`` column as text (a float such as ``3.0`` becomes ``"3"``),
    writes the table to ``sipc/out/04_xx.csv`` and returns it.

    Missing sort orders keep the textual form ``str()`` gives them (``"nan"``).
    """
    def _sortorder_text(value):
        text = str(value)
        # Strip only a *trailing* ".0"; a blanket replace would also mangle
        # values such as 10.05 into "105".
        return text[:-2] if text.endswith('.0') else text

    df = pd.DataFrame(graph['graph'][0]['cards_x_nodes_x_widgets'])
    if not df.empty:
        # Replace the whole column in one assignment: writing strings cell by
        # cell into a numeric column relies on silent dtype upcasting.
        df['sortorder'] = [_sortorder_text(value) for value in df['sortorder']]
    df.to_csv('sipc/out/04_xx.csv', index=False)
    return df
                    
# ---------------------------
def add_edges_to_nodes(edges, nodes):
    """Attach each node's incoming edge and its parent's name to the node table.

    For every node that is the range of an edge, the columns
    ``domainnode_id``, ``rangenode_id``, ``edgeid`` (all as strings) and
    ``parent`` (the ``name`` of the domain node) are filled in. If several
    edges point at the same node, the last one in ``edges`` wins. Nodes with
    no incoming edge keep missing values in those columns.

    ``nodes`` is modified in place, written to ``sipc/out/05_nodes_edges.csv``
    and returned.

    Raises:
        IndexError: if an edge's domain node is not present in ``nodes``.
    """
    # Make sure the link columns exist even when no edge matches, so callers
    # can read ``parent`` unconditionally.
    for column in ('domainnode_id', 'rangenode_id', 'edgeid', 'parent'):
        if column not in nodes.columns:
            nodes[column] = pd.Series(index=nodes.index, dtype=object)

    # Incoming edge per range node; later edges overwrite earlier ones.
    incoming = {}
    for _, e_row in edges.iterrows():
        incoming[str(e_row['rangenode_id'])] = (str(e_row['domainnode_id']), str(e_row['edgeid']))

    # Node names keyed by the *stringified* id (first occurrence wins), so the
    # parent lookup uses the same id representation as the range match.
    name_by_id = {}
    for _, n_row in nodes.iterrows():
        name_by_id.setdefault(str(n_row['nodeid']), n_row['name'])

    for n_idx, n_row in nodes.iterrows():
        n_node_id = str(n_row['nodeid'])
        if n_node_id not in incoming:
            continue
        e_domainnode_id, e_edge_id = incoming[n_node_id]
        if e_domainnode_id not in name_by_id:
            raise IndexError(
                "edge %s refers to domain node %s, which is not in the node table"
                % (e_edge_id, e_domainnode_id)
            )
        nodes.loc[n_idx, 'domainnode_id'] = e_domainnode_id
        nodes.loc[n_idx, 'rangenode_id'] = n_node_id
        nodes.loc[n_idx, 'edgeid'] = e_edge_id
        nodes.loc[n_idx, 'parent'] = name_by_id[e_domainnode_id]

    nodes.to_csv('sipc/out/05_nodes_edges.csv', index=False)
    return nodes

def add_cards_to_nodes_edges(cards, node_edges):
    """Copy card attributes onto every node that shares the card's nodegroup.

    For each row of ``node_edges`` and each card in ``cards`` with the same
    ``nodegroup_id`` (compared as strings), the columns ``card_id``,
    ``node_card_sortorder``, ``instructions``, ``helptitle``, ``helptext`` and
    ``visible`` are written; when several cards match, the last one wins.

    Rows without a parent are marked as the root: ``parent`` becomes
    ``'root'``, ``node_card_sortorder`` 0, ``domainnode_id`` ``'RDF'`` and
    ``rangenode_id`` the node's own id.

    ``node_edges`` is modified in place, written to
    ``sipc/out/06_nodes_edges_cards.csv`` and returned.
    """
    # Group the cards once by nodegroup, keeping their original order.
    # (This reads the ``cards`` argument; the earlier code accidentally
    # iterated a notebook-global ``cards_df`` instead.)
    cards_by_group = {}
    for _, c_row in cards.iterrows():
        cards_by_group.setdefault(str(c_row['nodegroup_id']), []).append(c_row)

    for n_idx, n_row in node_edges.iterrows():
        for c_row in cards_by_group.get(str(n_row['nodegroup_id']), []):
            node_edges.loc[n_idx, 'card_id'] = str(c_row['cardid'])

            # Best effort: a card without a usable numeric sort order simply
            # leaves the node's sort order untouched.
            try:
                sortorder = int(c_row['sortorder'])
            except (TypeError, ValueError, OverflowError):
                sortorder = None
            if sortorder is not None:
                node_edges.loc[n_idx, 'node_card_sortorder'] = sortorder

            # add the rest that's needed
            node_edges.loc[n_idx, 'instructions'] = str(c_row['instructions'])
            node_edges.loc[n_idx, 'helptitle'] = str(c_row['helptitle'])
            node_edges.loc[n_idx, 'helptext'] = str(c_row['helptext'])
            node_edges.loc[n_idx, 'visible'] = str(c_row['visible'])

        # A node with no parent (or a table that has no parent column at all)
        # is treated as the root of the graph.
        if 'parent' not in node_edges.columns or pd.isnull(node_edges.loc[n_idx, 'parent']):
            node_edges.loc[n_idx, 'parent'] = 'root'
            node_edges.loc[n_idx, 'node_card_sortorder'] = 0
            node_edges.loc[n_idx, 'domainnode_id'] = 'RDF'
            node_edges.loc[n_idx, 'rangenode_id'] = str(n_row['nodeid'])

    node_edges.to_csv('sipc/out/06_nodes_edges_cards.csv', index=False)
    return node_edges

def add_xx_to_nodes_edges_cards(xx, edges_node_cards):
    """Copy the widget sort order from ``xx`` onto the matching node rows.

    A row of ``edges_node_cards`` matches a row of ``xx`` when
    ``str(nodeid) == str(node_id)``; its ``card_sortorder`` is then set to the
    string form of that ``sortorder``. With several matches the last row of
    ``xx`` decides. The table is changed in place, written to
    ``sipc/out/07_nodes_edges_cards_xx.csv`` and returned.
    """
    # Last widget row seen for every node id; built only when there is at
    # least one node row to annotate.
    latest_widget_row = {}
    if len(edges_node_cards.index):
        for _, widget_row in xx.iterrows():
            latest_widget_row[str(widget_row['node_id'])] = widget_row

    for row_label, record in edges_node_cards.iterrows():
        hit = latest_widget_row.get(str(record['nodeid']))
        if hit is None:
            continue
        edges_node_cards.loc[row_label, 'card_sortorder'] = str(hit['sortorder'])

    edges_node_cards.to_csv('sipc/out/07_nodes_edges_cards_xx.csv', index=False)
    return edges_node_cards


# ---------------------------


# Lookup table used by add_namespace_suffix(): the key is a namespace URI up to and
# including its final '/' or '#', the value is the short prefix placed before the
# class/property name (e.g. ".../cidoc-crm/E55_Type" -> "crm:E55_Type").
# A URI whose namespace is not listed here gets the prefix "None", because the
# lookup is done with dict.get() and the result is formatted as-is.
ONTOLOGY_NAMESPACES = {
    #'http://my_namespace_here/': 'some_ns',
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "RDF",
    "http://www.w3.org/2001/XMLSchema#": "xsd",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
    "http://www.cidoc-crm.org/cidoc-crm/": "crm",
    "http://www.ics.forth.gr/isl/CRMarchaeo/": "CRMarchaeo",
    "http://www.ics.forth.gr/isl/CRMdig/": "CRMdig",
    "http://www.ics.forth.gr/isl/CRMgeo/": "CRMgeo",
    "http://www.ics.forth.gr/isl/CRMinf/": "CRMinf",
    "http://www.ics.forth.gr/isl/CRMsci/": "CRMsci",
    "http://parthenos.d4science.org/CRMext/CRMpe.rdfs/": "CRMpe",
    "https://takin.solutions/ontologies/CRMsurv/": "CRMsurv"
}



def add_namespace_suffix(graph):
    """Add prefixed (``ns:Name``) forms of each row's property and class URI.

    Two columns are written row by row: ``sip_property`` (from
    ``parentproperty``, or the literal ``'RDF'`` for rows whose ``parent`` is
    ``'root'``) and ``sip_class`` (from ``ontologyclass``). The table is
    changed in place, saved to ``sipc/out/08_graph_ns.csv`` and returned.

    A URI whose namespace is missing from ``ONTOLOGY_NAMESPACES`` is rendered
    with the prefix ``None``.
    """

    def make_namespace_suffix(url):
        # '#'-style URIs: namespace is the text before the first '#',
        # the local name is the text after the last one.
        if '#' in url:
            pieces = url.split('#')
            namespace_prefix = pieces[0] + '#'
            local_name = pieces[-1]
        else:
            # '/'-style URIs: cut at the last slash.
            head, _, local_name = url.rpartition('/')
            namespace_prefix = head + '/'
        short = ONTOLOGY_NAMESPACES.get(namespace_prefix)
        return '%s:%s' % (short, local_name)

    for label, record in graph.iterrows():
        # The property is always resolved first, then overridden for the root.
        sip_property = make_namespace_suffix(record['parentproperty'])
        if record['parent'] == 'root':
            sip_property = 'RDF'
        graph.loc[label, 'sip_property'] = sip_property
        graph.loc[label, 'sip_class'] = make_namespace_suffix(record['ontologyclass'])

    graph.to_csv('sipc/out/08_graph_ns.csv', index=False)
    return graph


def build_paths(graph):
    """Record, for every node, the chain of node ids leading down to it.

    Works on a copy of ``graph`` sorted by ``node_card_sortorder``,
    ``card_sortorder`` and ``parent``. Each row whose ``nodeid`` appears as a
    ``rangenode_id`` receives ``path`` (the ``str()`` of the id list, topmost
    ancestor first) and ``path_length`` (its length, as a string). Rows whose
    ``domainnode_id`` is ``'RDF'`` get that column cleared and their
    ``rangenode_id`` set to their own id. The result is written to
    ``sipc/out/09_graph_path.csv`` and returned.
    """
    ordered = graph.sort_values(by=['node_card_sortorder', 'card_sortorder', 'parent'])

    # range node -> domain node, i.e. child -> parent
    parent_of = {}
    for _, record in ordered.iterrows():
        parent_of[record['rangenode_id']] = record['domainnode_id']

    def lineage(node):
        # Walk upwards until an id is reached that has no recorded parent.
        if node in parent_of:
            return lineage(parent_of[node]) + [node]
        return [node]

    # One chain per known child, addressed by the id the chain ends on.
    lineage_by_leaf = {leaf: lineage(leaf) for leaf in set(parent_of)}

    for label, record in ordered.iterrows():
        own_id = str(record['nodeid'])

        if own_id in lineage_by_leaf:
            chain = lineage_by_leaf[own_id]
            ordered.loc[label, 'path'] = str(chain)
            ordered.loc[label, 'path_length'] = str(len(chain))

        if record['domainnode_id'] == 'RDF':
            ordered.loc[label, 'domainnode_id'] = None
            ordered.loc[label, 'rangenode_id'] = own_id

    ordered.to_csv('sipc/out/09_graph_path.csv', index=False)
    return ordered
                

def make_ns_paths(graph):
    """Translate every node's id path into a property/class path.

    For each row the stringified id list in ``path`` is parsed back into ids;
    every id except ``'RDF'`` contributes that node's ``sip_property`` followed
    by its ``sip_class``. The result is stored in a new ``ns_path`` column as a
    one-element list holding ``{"nodeid", "name", "path"}``, where ``name`` is
    the name of the last node on the path.

    The column is added to ``graph`` itself; a copy sorted by
    ``node_card_sortorder``, ``path_length``, ``card_sortorder`` and ``parent``
    is written to ``sipc/out/10_graph_ns_path.csv`` and returned.

    Raises:
        IndexError: if a path mentions an id that is not a ``nodeid`` in ``graph``.
    """
    # Property, class and name per node id (first occurrence wins). Ids are
    # keyed by their string form because that is what the path text contains.
    info_by_id = {}
    for _, row in graph.iterrows():
        info_by_id.setdefault(str(row['nodeid']), (row['sip_property'], row['sip_class'], row['name']))

    ns_values = []
    for _, p_row in graph.iterrows():
        # Undo str(list): "['RDF', 'a', 'b']" -> ['RDF', 'a', 'b']
        paths = p_row['path'].strip('][').replace("'", "").split(', ')
        path_list = []
        _name = None  # reset per row so a name can never leak from the previous row
        for item in paths:
            if item == 'RDF':
                continue
            if item not in info_by_id:
                raise IndexError("path of node %s refers to unknown node id %r" % (p_row['nodeid'], item))
            _property, _class, _name = info_by_id[item]
            path_list.append(str(_property))
            path_list.append(str(_class))
        ns_values.append([{"nodeid": p_row['nodeid'], "name": _name, "path": path_list}])

    # Assign the whole column at once via an object Series: writing one-element
    # lists cell by cell with .loc can unpack the list on the write that creates
    # the column, leaving a bare dict in that cell and lists in the others.
    graph['ns_path'] = pd.Series(ns_values, index=graph.index, dtype=object)

    graph = graph.sort_values(by=['node_card_sortorder', 'path_length', 'card_sortorder', 'parent'])
    graph.to_csv('sipc/out/10_graph_ns_path.csv', index=False)
    return graph


In [10]:
# Raw export structure for the configured graph id.
graph = get_graph_as_json(dataset_uuid)

# One table per section of the export; each call also writes a numbered CSV
# under sipc/out/.
edges_df = make_edges_df(graph)
nodes_df = make_nodes_df(graph)
cards_df = make_cards_df(graph)
xx_df = make_xx_df(graph)

# Enrich the node table step by step. Each add_* function changes the table it
# receives in place and returns that same object, so these three names (and
# nodes_df) all refer to one DataFrame.
node_edges_df = add_edges_to_nodes(edges_df, nodes_df)
node_edges_cards_df = add_cards_to_nodes_edges(cards_df, node_edges_df)
node_edges_cards_xx_df = add_xx_to_nodes_edges_cards(xx_df, node_edges_cards_df)

# Prefixed class/property names, then id paths (build_paths returns a sorted copy).
graph_ns = add_namespace_suffix(node_edges_cards_xx_df)
graph_path = build_paths(graph_ns)

# NOTE: `graph` is rebound here from the raw export to the final DataFrame;
# the cells below expect the DataFrame.
graph = make_ns_paths(graph_path)

print('done')


done


In [11]:
def make_doc_paths(ns_graph):
    """Build one documentation record per row of the namespaced graph table.

    Each record is a dict with the keys ``nodeid``, ``path`` (taken from the
    row's ``ns_path``), ``name``, ``parent``, ``node_card_sortorder``,
    ``datatype``, ``isrequired``, ``isvisible``, ``description``,
    ``instructions``, ``helptitle``, ``helptext`` and ``relation``
    (``{name: parent}``). Missing ``visible`` / ``instructions`` /
    ``helptitle`` / ``helptext`` values become empty strings.

    Side effects: a ``doc_path`` column (one-element list holding the record)
    is added to ``ns_graph`` and the table is written to
    ``sipc/out/20_graph_nsa_path.csv``.

    Returns:
        list of the records, in row order.

    Raises:
        TypeError: if a row's ``ns_path`` is neither a dict nor a list.
    """
    def _blank_if_missing(value):
        return "" if pd.isna(value) else value

    doc_list = []

    for _, row in ns_graph.iterrows():
        # ns_path may hold either the dict itself or a one-element list
        # wrapping it; accept both.
        raw_ns_path = row['ns_path']
        if isinstance(raw_ns_path, dict):
            ns_path = raw_ns_path
        elif isinstance(raw_ns_path, list):
            ns_path = raw_ns_path[0]
        else:
            # Fail loudly instead of silently reusing the previous row's path.
            raise TypeError(
                "ns_path of node %s must be a dict or a list, got %s"
                % (row['nodeid'], type(raw_ns_path).__name__)
            )

        name = row['name']
        parent = row['parent']

        doc_list.append({
            "nodeid": row['nodeid'],
            "path": ns_path['path'],
            "name": name,
            "parent": parent,
            "node_card_sortorder": row['node_card_sortorder'],
            "datatype": row['datatype'],
            "isrequired": row['isrequired'],
            "isvisible": _blank_if_missing(row['visible']),
            "description": row['description'],
            "instructions": _blank_if_missing(row['instructions']),
            "helptitle": _blank_if_missing(row['helptitle']),
            "helptext": _blank_if_missing(row['helptext']),
            "relation": {name: parent},
        })

    # Whole-column assignment keeps every cell a one-element list; cell-wise
    # .loc writes of a list can be unpacked inconsistently.
    ns_graph['doc_path'] = pd.Series([[doc] for doc in doc_list], index=ns_graph.index, dtype=object)

    ns_graph.to_csv('sipc/out/20_graph_nsa_path.csv', index=False)
    return doc_list

print('done')

done


## Make rec_defs

In [13]:
def make_rec_defs(_graph):
    """Render the graph as a nested record-definition XML document.

    The records from ``make_doc_paths`` are processed in order. A record's
    depth is the length of its name chain up to (and including) ``'root'``:

    * depth 2 starts the document: ``<root tag="RDF">`` with a
      ``<node-mapping>`` child and the top-level ``<elem>`` (tagged with the
      last entry of the record's path) carrying an ``rdf:about`` ``<attr>``;
    * depth 3 and deeper add an ``<elem>`` under the most recently created
      element one level up, with ``tag``, ``label``, ``nodeid``,
      ``isrequired`` and ``isvisible`` attributes. ``semantic`` nodes are
      flagged ``unmappable``; ``resource-instance`` nodes at depth 3 also get
      ``uriCheck`` / ``attrs``.

    Nesting depth is not limited (earlier versions silently dropped anything
    deeper than 7 levels). Parent/child links are resolved by node *name*, so
    names are presumably unique within a graph; verify for new graphs.

    Returns:
        the pretty-printed XML as a string.

    Raises:
        ValueError: if a record appears before any element one level above it
            exists, or if no depth-2 (root) record is present.
    """
    nodes = make_doc_paths(_graph)

    # name -> parent name; later records overwrite earlier ones with the same name
    parent_of = {item['name']: item['parent'] for item in nodes}

    def tree(p):
        return (tree(parent_of[p]) if p in parent_of else []) + [p]

    root = None
    latest_at_depth = {}  # depth -> most recently created <elem> at that depth

    for node in nodes:
        depth = len(tree(list(node['relation'])[0]))
        _tag = node['path'][-1]

        if depth == 2:
            root = etree.Element("root", tag="RDF")
            etree.SubElement(root, "node-mapping", inputPath="/input")
            top = etree.SubElement(root, "elem", tag=_tag)
            etree.SubElement(top, "attr", uriCheck="True", tag="rdf:about")
            latest_at_depth[2] = top
        elif depth >= 3:
            parent_elem = latest_at_depth.get(depth - 1)
            if parent_elem is None:
                raise ValueError(
                    "node %r (depth %d) appears before any element at depth %d"
                    % (node['name'], depth, depth - 1)
                )
            elem = etree.SubElement(
                parent_elem, "elem",
                tag=_tag,
                label=node['name'],
                nodeid=node['nodeid'],
                isrequired=str(node['isrequired']),
                isvisible=str(node['isvisible']),
            )
            # Only first-level children carried the resource-instance
            # attributes originally; that scope is kept unchanged.
            if depth == 3 and node['datatype'] == 'resource-instance':
                elem.attrib['uriCheck'] = 'True'
                elem.attrib['attrs'] = 'rdf:resource'
            if node['datatype'] == 'semantic':
                elem.attrib['unmappable'] = 'True'
            latest_at_depth[depth] = elem

    if root is None:
        raise ValueError("no root record (depth 2) found; cannot build rec_defs")

    return etree.tostring(root, encoding='utf-8', method='xml', pretty_print=True).decode()

# TMP for checking things
#import ast
#generic = lambda x: ast.literal_eval(x)
#conv = {'ns_path': generic}
#tmp_graph = pd.read_csv('sip/out/10_graph_ns_path.csv', converters=conv)    

# Build the record-definition XML from the final graph table and show it.
rec_def = make_rec_defs(graph)
#rec_def = make_rec_defs(graph)
print(rec_def)    

<root tag="RDF">
  <node-mapping inputPath="/input"/>
  <elem tag="crm:E22_Human-Made_Object">
    <attr uriCheck="True" tag="rdf:about"/>
    <elem tag="crm:E42_Identifier" label="Identifier" nodeid="d0d661c8-eda1-11ed-9655-00163e71351b" isrequired="False" isvisible="True" unmappable="True">
      <elem tag="rdfs:Literal" label="Identifier Content" nodeid="d0d6692a-eda1-11ed-9655-00163e71351b" isrequired="False" isvisible="True"/>
      <elem tag="crm:E55_Type" label="Identifier Type" nodeid="d0d66aba-eda1-11ed-9655-00163e71351b" isrequired="False" isvisible="True">
        <elem tag="xsd:string" label="Identifier Type Label" nodeid="d0d66754-eda1-11ed-9655-00163e71351b" isrequired="False" isvisible="True"/>
      </elem>
    </elem>
    <elem tag="crm:E35_Title" label="Title" nodeid="a97fe66c-eda1-11ed-a1e6-00163e71351b" isrequired="False" isvisible="True" unmappable="True">
      <elem tag="rdfs:Literal" label="Title Content" nodeid="a97feac2-eda1-11ed-a1e6-00163e71351b" isrequired=

## Make documentation

In [None]:
def make_docs_xml(_graph):
    """Render per-node documentation as XML.

    For every record returned by ``make_doc_paths`` a ``<doc>`` element is
    created (attributes: ``nodeid`` and ``path``, the record's path joined
    with ``/``) containing one ``<para name="...">`` each for Label,
    Description, Instructions, Helptitle, Helptext, Datatype and Required.

    Returns:
        the pretty-printed ``<docs>`` document as a string.
    """
    # (para name, record key) for the paragraphs whose value is stringified.
    stringified_paras = (
        ("Description", "description"),
        ("Instructions", "instructions"),
        ("Helptitle", "helptitle"),
        ("Helptext", "helptext"),  # was filled from 'helptitle' by mistake
        ("Datatype", "datatype"),
        ("Required", "isrequired"),
    )

    doc_list = make_doc_paths(_graph)
    root = etree.Element("docs")
    for item in doc_list:
        doc_path = '/'.join(item['path'])
        doc = etree.SubElement(root, "doc", nodeid=item['nodeid'], path=doc_path)

        # The label is written as-is (not passed through str()).
        para = etree.SubElement(doc, "para", name="Label")
        para.text = item['name']

        for para_name, key in stringified_paras:
            para = etree.SubElement(doc, "para", name=para_name)
            para.text = str(item[key])

    item_doc = etree.tostring(root, encoding='utf-8', method='xml', pretty_print=True).decode()
    return item_doc

# Build the documentation XML from the final graph table and show it.
docss = make_docs_xml(graph)
print(docss)

## Make Opts

In [4]:
def make_opts(graph, concepts):
    """Render option lists (``<opts>``) for nodes that have a concept list.

    ``concepts`` is a JSON string mapping a list name to a ``{id: label}``
    object. For every record from ``make_doc_paths`` whose ``name`` equals one
    of those list names, an ``<opt-list>`` (attributes ``dictionary``,
    ``path`` and ``displayName``) is added with one ``<opt value="...">`` per
    label.

    The XML is printed, as before, and now also returned so callers can keep it.

    Returns:
        the pretty-printed ``<opts>`` document as a string.
    """
    ns_list = make_doc_paths(graph)
    # Parse the concept JSON once instead of once per node.
    concept_lists = json.loads(concepts)
    root = etree.Element("opts")

    for item in ns_list:
        # NOT OK yet: this is plain name matching. It should match on UUIDs,
        # since dictionary and displayName can differ. That information is not
        # available in the built-in concept export, so it would have to be
        # pulled directly from the database.
        # Use just one language! Otherwise the same option shows up once per
        # language:
        #   <opt value="Anmerkung"/>
        #   <opt value="annotaties"/>
        #   <opt value="annotations"/>
        if item['name'] not in concept_lists:
            continue

        opt_path = '/'.join(item['path'])
        opt_list = etree.SubElement(root, "opt-list", dictionary=item['name'], path=opt_path, displayName=item['name'])
        for _opt in concept_lists[item['name']].values():
            etree.SubElement(opt_list, "opt", value=_opt)

    _opts = etree.tostring(root, encoding='utf-8', method='xml', pretty_print=True).decode()
    print(_opts)
    return _opts


def get_concepts_as_json(_uuid):
    """Return the content of the first mapping file for ``_uuid``.

    ``get_mapping`` is called with the id; the ``'outputfile'`` entry of its
    first result is read with ``getvalue()`` and handed back inside a
    one-element list.
    """
    mapping_files = get_mapping(_uuid)
    first_output = mapping_files[0]['outputfile']
    return [first_output.getvalue()]

# Fetch the concept mapping for the configured id; the result is a one-element list.
_concepts = get_concepts_as_json(dataset_uuid)
#_graph = pd.read_csv('mapper/out/10_graph_ns_path.csv', dtype={'sip_path':str })

# Show the raw mapping content.
print(_concepts[0])

# Option-list generation is currently switched off.
#all_opts = make_opts(graph, _concepts[0])
#print(all_opts)




{
    "Resource to Resource Relationship Types": {
        "ac41d9be-79db-4256-b368-2f4559cfbe55": "is related to"
    }
}


## Make mermaid graph

In [5]:
#A[Square Rect] -- Link text --> B((Circle))

def make_graph(graph):
    """Draw the parent -> child node relations as a Mermaid diagram.

    Builds a left-to-right Mermaid graph with one ``parent--> child;`` line per
    record from ``make_doc_paths`` (links to ``'root'`` are skipped, spaces in
    names become underscores) and displays the image rendered by mermaid.ink.

    Nothing is returned; the diagram is shown via ``IPython.display``.
    """

    def render_graph(m_str):
        # UTF-8 so node names with non-ASCII characters do not raise, and the
        # URL-safe alphabet so the payload never contains '+' or '/', which
        # are not safe inside a URL path segment.
        graphbytes = m_str.encode("utf8")
        base64_string = base64.urlsafe_b64encode(graphbytes).decode("ascii")
        display(Image(url="https://mermaid.ink/img/" + base64_string))

    g_list = make_doc_paths(graph)

    m_list = ['graph LR;']
    for item in g_list:
        for k, v in item['relation'].items():
            if v != 'root':
                m_list.append('%s--> %s;' % (v.replace(' ', '_'), k.replace(' ', '_')))

    render_graph('\n'.join(m_list))
    
# Render the diagram. make_graph() has no return statement, so mermaid_graph is None;
# the image appears as cell output.
mermaid_graph = make_graph(graph)    
#mermaid_graph = render_graph(n_graph)    
    

NameError: name 'make_doc_paths' is not defined