In [1]:
from django_for_jupyter import init_django
init_django("arches")

import json
import pprint 
import uuid
import pandas as pd
from lxml import etree
from collections import defaultdict
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt
from urllib.parse import urlparse

from arches.app.models.graph import Graph
from arches.app.models.concept import Concept
from arches.app.models.models import CardXNodeXWidget, Node, Resource2ResourceConstraint, FunctionXGraph, Value, GraphXPublishedGraph
from arches.app.utils.betterJSONSerializer import JSONSerializer, JSONDeserializer
#from operator import itemgetter
from arches.app.models import models

#from arches.app.models.system_settings import settings
#from arches.app.utils.data_management.resource_graphs.exporter import get_graphs_for_export as get_graph
#from arches.app.utils.data_management.resource_graphs.exporter import create_mapping_configuration_file as get_mapping
#from arches.app.utils.data_management.resource_graphs.exporter import export as export_gephi
#from collections import OrderedDict
from django.core.serializers.json import DjangoJSONEncoder


# Resource models known to this notebook: one dict per model holding its
# display name and the UUID of its graph.
# NOTE(review): this list has exactly the same content as `model_uuid_list`
# defined further down in this cell and is not referenced by any visible code;
# presumably a leftover -- confirm which name is the intended one.
_model_uuid_list = [{"model_name": "Museological Item", "uuid": "fa952d08-dd27-11ed-9655-00163e71351b"}]


# Maps an ontology namespace URI to the short prefix that replaces it when
# class/property URIs are abbreviated (e.g. ".../cidoc-crm/" -> "crm").
# Caution: the "crm" URI is a leading part of the "CRMsci" URI, so any
# prefix/substring matching against this table has to prefer the longer URI.
ONTOLOGY_NAMESPACES = {#'http://my_namespace_here/': 'some_ns',
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "RDF",
                    "http://www.w3.org/2001/XMLSchema#": "xsd",
                    "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
                    "http://www.cidoc-crm.org/cidoc-crm/": "crm",
                    "http://www.ics.forth.gr/isl/CRMarchaeo/": "CRMarchaeo",
                    "http://www.ics.forth.gr/isl/CRMdig/": "CRMdig",
                    "http://www.ics.forth.gr/isl/CRMgeo/": "CRMgeo",
                    "http://www.ics.forth.gr/isl/CRMinf/": "CRMinf",
                    "http://www.cidoc-crm.org/cidoc-crm/CRMsci/": "CRMsci",
                    "http://parthenos.d4science.org/CRMext/CRMpe.rdfs/": "CRMpe",
                    "https://takin.solutions/ontologies/CRMsurv/": "CRMsurv"
}

# Models iterated by the "Main" cell at the end of the notebook.
model_uuid_list = [{"model_name": "Museological Item", "uuid": "fa952d08-dd27-11ed-9655-00163e71351b"}]

In [2]:
def get_card_x_node_x_widget_data_for_export(resource_graph):
    """Return the CardXNodeXWidget rows that belong to a serialized graph.

    Args:
        resource_graph: mapping with a ``"nodes"`` entry; every node in it
            must provide a ``"nodeid"``.

    Returns:
        The result of ``CardXNodeXWidget.objects.filter`` restricted to the
        node ids found in ``resource_graph``.
    """
    node_ids = []
    for node in resource_graph["nodes"]:
        node_ids.append(node["nodeid"])
    return CardXNodeXWidget.objects.filter(node_id__in=node_ids)

def get_graph(graphid):
    """Load one graph and flatten it into a single per-node DataFrame.

    The graph is fetched with ``Graph.objects.get`` and serialized with
    ``force_recalculation=True``. Its ``nodes`` list becomes the base table;
    each node row is then enriched with data from the matching edge, card and
    card-x-node-x-widget record.

    Args:
        graphid: value passed to ``Graph.objects.get(graphid=...)``.

    Returns:
        The nodes DataFrame sorted by ``card_sortorder``, with these columns
        added where a match exists: ``rangenode_id``, ``domainnode_id``,
        ``ontologyproperty`` (from the edge whose range is the node),
        ``card_label`` and ``card_sortorder`` (from the card of the node's
        nodegroup), ``x_sortorder``, ``x_card_label`` and ``x_card_id`` (from
        the widget record of the node).
    """

    #graph = get_graph([graphid])
    graph = Graph.objects.get(graphid=graphid)
    serialized_graph = graph.serialize(force_recalculation=True)

    nodes_df = pd.DataFrame(serialized_graph['nodes'])
    edges_df = pd.DataFrame(serialized_graph['edges'])
    cards_df = pd.DataFrame(serialized_graph['cards'])
    # NOTE(review): nodegroups_df is built but never read in this function.
    nodegroups_df = pd.DataFrame(serialized_graph['nodegroups'])
    cards_x_nodes_x_widgets_df = pd.DataFrame(JSONSerializer().serializeToPython(
            get_card_x_node_x_widget_data_for_export(serialized_graph), use_raw_i18n_json=True
    ))

    for idx, row in nodes_df.iterrows():
        # The edge/widget lookups below compare against a uuid.UUID object,
        # while the card lookup uses the raw nodegroup_id value.
        # NOTE(review): this assumes nodes carry their id as a string and that
        # edges/widget records carry UUID objects -- TODO confirm.
        node_id = uuid.UUID(row['nodeid'])
        nodegroup_id = row['nodegroup_id']

        # top node
        # The top node has no incoming edge, so placeholder values are filled
        # in: 'RDF' as property, 'root' as domain, and -1 so it sorts first.
        if row['istopnode']:
            nodes_df.loc[idx, 'parentproperty'] = 'RDF'
            nodes_df.loc[idx, 'ontologyproperty'] = 'RDF'
            nodes_df.loc[idx, 'card_sortorder'] = -1
            nodes_df.loc[idx, 'domainnode_id'] = 'root'
            nodes_df.loc[idx, 'rangenode_id'] = row['nodeid']

        # match edges
        # Only the first edge pointing at this node is used.
        matching_edge_row = edges_df[edges_df['rangenode_id'] == node_id]
        if not matching_edge_row.empty:
            nodes_df.loc[idx, 'rangenode_id'] = str(matching_edge_row['rangenode_id'].values[0])
            nodes_df.loc[idx, 'domainnode_id'] = str(matching_edge_row['domainnode_id'].values[0])
            nodes_df.loc[idx, 'ontologyproperty'] = matching_edge_row['ontologyproperty'].values[0]

        # match cards
        # Only the first card of the node's nodegroup is used; for the top
        # node this overwrites the -1 sort order set above when a card exists.
        matching_card_row = cards_df[cards_df['nodegroup_id'] == nodegroup_id]
        if not matching_card_row.empty:
            nodes_df.loc[idx, 'card_label'] = matching_card_row['name'].values[0]
            nodes_df.loc[idx, 'card_sortorder'] = matching_card_row['sortorder'].values[0]

        # match cards_x_nodes_x_widgets
        matching_cards_x_nodes_x_widgets_row = cards_x_nodes_x_widgets_df[cards_x_nodes_x_widgets_df['node_id'] == node_id]
        if not matching_cards_x_nodes_x_widgets_row.empty:
            nodes_df.loc[idx, 'x_sortorder'] = matching_cards_x_nodes_x_widgets_row['sortorder'].values[0]
            # The label is indexed with 'en', i.e. only the English entry of
            # the (raw i18n) label is kept.
            nodes_df.loc[idx, 'x_card_label'] = str(matching_cards_x_nodes_x_widgets_row['label'].tolist()[0]['en'])
            nodes_df.loc[idx, 'x_card_id'] = matching_cards_x_nodes_x_widgets_row['card_id'].values[0]

    return nodes_df.sort_values(by=['card_sortorder'])


In [3]:
def make_uuid_paths(df, out_path='sipc/out/sorted_path_graph.csv'):
    """Order graph rows depth-first and attach each node's root-to-node path.

    Every row is read as an edge ``domainnode_id -> rangenode_id``. The root
    is the domain id that never occurs as a range id. Nodes are numbered in
    depth-first pre-order (children in row order) starting with the root at 0.

    Args:
        df: DataFrame with ``nodeid``, ``domainnode_id`` and ``rangenode_id``
            columns. It is not modified.
        out_path: CSV file the result is written to; pass ``None`` to skip
            writing.

    Returns:
        A new DataFrame sorted by the added ``order`` column (index reset),
        with a ``path`` column holding the list of ids from the root down to
        the row's ``nodeid``.

    Raises:
        ValueError: if no root can be determined or the edges contain a cycle.
    """
    children = defaultdict(list)
    for parent, child in zip(df['domainnode_id'], df['rangenode_id']):
        children[parent].append(child)

    root_candidates = set(df['domainnode_id']).difference(df['rangenode_id'])
    if not root_candidates:
        raise ValueError(
            "cannot determine the root: every domainnode_id also occurs as a rangenode_id"
        )
    root = root_candidates.pop()

    # Iterative pre-order walk: an explicit stack avoids Python's recursion
    # limit on deep graphs.
    order = []
    path_dict = {}
    stack = [(root, [root])]
    while stack:
        node, path = stack.pop()
        order.append(node)
        path_dict[node] = path
        # Push children reversed so they are popped in their original order.
        for child in reversed(children.get(node, [])):
            if child in path:
                raise ValueError(f"cycle detected in graph edges at node {child!r}")
            stack.append((child, path + [child]))

    id_to_order = {node_id: position for position, node_id in enumerate(order)}

    # Work on a copy so the caller's frame does not silently gain columns.
    result = df.copy()
    result['order'] = result['nodeid'].map(id_to_order)
    result['path'] = result['nodeid'].map(path_dict)
    result = result.sort_values('order').reset_index(drop=True)

    if out_path is not None:
        result.to_csv(out_path, index=False)
    return result

In [4]:
def add_namespace(path, namespaces=None):
    """Abbreviate a full ontology URI with its namespace prefix.

    The first namespace URI found in ``path`` is replaced by ``"<prefix>:"``.
    Longer namespace URIs are tried first, so a namespace nested under another
    one (e.g. ``.../cidoc-crm/CRMsci/`` under ``.../cidoc-crm/``) gets its own
    prefix instead of the parent's.

    Args:
        path: URI string to abbreviate.
        namespaces: mapping of namespace URI -> prefix. Defaults to the
            module-level ``ONTOLOGY_NAMESPACES``.

    Returns:
        The abbreviated string. ``path`` is returned unchanged when it is not
        a string (e.g. ``None``/NaN) or when no namespace matches (e.g. the
        ``'RDF'`` placeholder).
    """
    if namespaces is None:
        namespaces = ONTOLOGY_NAMESPACES

    if not isinstance(path, str):
        return path

    # sorted() is stable, so namespaces of equal length keep mapping order.
    for uri in sorted(namespaces, key=len, reverse=True):
        if uri in path:
            return path.replace(uri, f'{namespaces[uri]}:')
    return path
    
def make_ns_paths(df):
    """Add an ``ns_path`` column with a namespaced rendering of each row's path.

    For every row, the ids in ``row['path']`` are looked up by ``nodeid`` in
    ``df``; for each id that is found, either the node's ``parentproperty`` or
    its ``ontologyclass`` is passed through ``add_namespace`` and collected.
    The collected list is stored as a string in ``ns_path``.

    NOTE(review): another function with this same name is defined later in
    the notebook and replaces this one once that cell has run.

    Args:
        df: DataFrame with ``nodeid``, ``path``, ``parentproperty`` and
            ``ontologyclass`` columns; ``path`` must be iterable per row.

    Returns:
        The same DataFrame object, modified in place.
    """
    path_list = []
    # NOTE(review): path_dict is never used.
    path_dict = {}
    # n decides between property (odd) and class (even). It is NOT reset per
    # row and only advances for ids that are found in df.
    # NOTE(review): this means one node contributes only its property or only
    # its class, and the phase carries over between rows -- confirm intent.
    n=1
    for idx, row in df.iterrows():
        full_path = row['path']
        for path in full_path:
            matching_path_row = df[df['nodeid'] == path]
            if not matching_path_row.empty:
                if n % 2:
                    #if row['parentproperty'] == 'RDF':
                    #   ns_path = 'RDF'     
                    #else:
                    ns_path = add_namespace(matching_path_row['parentproperty'].values[0])
                else:
                    ns_path = add_namespace(matching_path_row['ontologyclass'].values[0])
                n+=1    
                path_list.append(ns_path)                
            # Written on every inner iteration; the last write holds the
            # complete list for this row.
            df.loc[idx, 'ns_path'] = str(path_list)
        path_list = []
    return df
    

In [5]:
def make_rec_defs(graph):
    """Print the ``ns_path`` value of every row of ``graph``, one per line."""
    records = (record for _, record in graph.iterrows())
    for record in records:
        print(record['ns_path'])
    

    

    #return etree.tostring(root, encoding='utf-8', method='xml', pretty_print=True).decode() 
    

In [7]:
# Flatten the graph with this UUID (listed as "Museological Item" in the
# configuration cell) and keep a CSV snapshot of the result.
graph = get_graph('fa952d08-dd27-11ed-9655-00163e71351b')
graph.to_csv('sipc/out/old_graph.csv', index=False)
# Remaining pipeline steps, currently disabled:
#path_graph = make_uuid_paths(graph)
#ns_path_graph = make_ns_paths(path_graph[:5])
#ns_path_graph.to_csv('sipc/out/path_graph.csv', index=False)
#rec_def = make_rec_defs(ns_path_graph)



In [None]:
def make_rec_def(graph, out_path='sipc/out/09_graph_path.csv'):
    """Attach the ancestor chain of every node to ``graph``.

    Each row is read as a child -> parent link
    (``rangenode_id`` -> ``domainnode_id``). For every row whose ``nodeid``
    occurs as a ``rangenode_id``, the chain from the topmost ancestor down to
    the node is stored as a string in ``path`` and its length, also as a
    string, in ``path_length``.

    Args:
        graph: DataFrame with ``nodeid``, ``domainnode_id`` and
            ``rangenode_id`` columns. It is modified in place.
        out_path: CSV file the result is written to; pass ``None`` to skip
            writing.

    Returns:
        The same ``graph`` object with the added columns.

    Raises:
        ValueError: if the child -> parent links contain a cycle.
    """
    # A plain dict is enough here (the former OrderedDict was never imported);
    # later rows overwrite earlier ones for the same child.
    parent_of = dict(zip(graph['rangenode_id'], graph['domainnode_id']))

    def ancestors(node):
        # Walk upwards iteratively, then flip so the topmost ancestor is first.
        chain = [node]
        seen = {node}
        while chain[-1] in parent_of:
            parent = parent_of[chain[-1]]
            if parent in seen:
                raise ValueError(f"cycle detected in graph edges at node {parent!r}")
            seen.add(parent)
            chain.append(parent)
        chain.reverse()
        return chain

    # Snapshot the (label, nodeid) pairs first because columns are added to
    # `graph` inside the loop.
    for idx, node_id in list(graph['nodeid'].items()):
        if node_id in parent_of:
            path = ancestors(node_id)
            graph.loc[idx, 'path'] = str(path)
            graph.loc[idx, 'path_length'] = str(len(path))

    if out_path is not None:
        graph.to_csv(out_path, index=False)
    return graph


# `graph` is the flattened node table produced by get_graph() in an earlier
# cell; the previous argument `_graph` is not defined anywhere in the notebook
# and raised NameError on a fresh kernel.
rec_def = make_rec_def(graph)
rec_def

In [None]:
def make_cards_df(graph, out_path='sipc/out/01_cards.csv'):
    """Build the cards table of the first exported graph.

    Args:
        graph: mapping shaped like ``{'graph': [{'cards': [...]}, ...]}``.
        out_path: CSV file the table is written to; pass ``None`` to skip
            writing.

    Returns:
        DataFrame of the cards sorted by their sort order, with ``cardid``
        converted to ``str`` and ``sortorder`` renamed to ``card_sortorder``.
    """
    cards_df = (
        pd.DataFrame(graph['graph'][0]['cards'])
        .sort_values(by=['sortorder'])
        .assign(cardid=lambda frame: frame['cardid'].map(str))
        .rename(columns={"sortorder": "card_sortorder"})
    )
    if out_path is not None:
        cards_df.to_csv(out_path, index=False)
    # The original returned the undefined name `c_df` (NameError).
    return cards_df

def make_edges_df(graph):
    """Build the edges table of the first exported graph and dump it to CSV.

    Args:
        graph: mapping shaped like ``{'graph': [{'edges': [...]}, ...]}``.

    Returns:
        DataFrame built from the ``edges`` records; the same table is written
        to ``sipc/out/02_edges.csv``.
    """
    edge_records = graph['graph'][0]['edges']
    edge_table = pd.DataFrame(edge_records)
    edge_table.to_csv('sipc/out/02_edges.csv', index=False)
    return edge_table

def make_nodes_df(graph, out_path='sipc/out/03_nodes.csv'):
    """Build the nodes table of the first exported graph.

    Args:
        graph: mapping shaped like ``{'graph': [{'nodes': [...]}, ...]}``.
        out_path: CSV file the table is written to; pass ``None`` to skip
            writing.

    Returns:
        DataFrame of the nodes with ``sortorder`` renamed to
        ``node_sortorder``.
    """
    # The original built `nodes_df` but then operated on the undefined `df`.
    nodes_df = pd.DataFrame(graph['graph'][0]['nodes']).rename(
        columns={"sortorder": "node_sortorder"}
    )
    if out_path is not None:
        nodes_df.to_csv(out_path, index=False)
    return nodes_df

def make_xx_df(graph, out_path='sipc/out/04_xx.csv'):
    """Build the cards-x-nodes-x-widgets table of the first exported graph.

    Args:
        graph: mapping shaped like
            ``{'graph': [{'cards_x_nodes_x_widgets': [...]}, ...]}``.
        out_path: CSV file the table is written to; pass ``None`` to skip
            writing.

    Returns:
        DataFrame of the widget records with an added ``xx_sortorder`` column:
        the ``sortorder`` rendered as text, with a trailing ``.0`` removed
        (``1.0`` -> ``'1'``).
    """
    def _sortorder_text(value):
        # Strip only a trailing ".0"; a blanket replace('.0', '') would also
        # mangle values such as 10.05.
        text = str(value)
        return text[:-2] if text.endswith('.0') else text

    # The original bound the frame to `nodes_df` but then used the undefined `df`.
    xx_df = pd.DataFrame(graph['graph'][0]['cards_x_nodes_x_widgets'])
    if not xx_df.empty:
        xx_df['xx_sortorder'] = xx_df['sortorder'].map(_sortorder_text)
    if out_path is not None:
        xx_df.to_csv(out_path, index=False)
    return xx_df
                    
# ---------------------------
def add_edges_to_nodes(edges, nodes, out_path='sipc/out/05_nodes_edges.csv'):
    """Copy the incoming edge of every node onto the nodes table.

    For each node that is the range of an edge, the columns
    ``domainnode_id``, ``rangenode_id``, ``edgeid`` (all as ``str``) and
    ``parent`` (the ``name`` of the domain node) are filled in. When several
    edges point at the same node the last one in ``edges`` wins.

    Args:
        edges: DataFrame with ``edgeid``, ``domainnode_id`` and
            ``rangenode_id`` columns.
        nodes: DataFrame with ``nodeid`` and ``name`` columns. It is modified
            in place.
        out_path: CSV file the result is written to; pass ``None`` to skip
            writing.

    Returns:
        The same ``nodes`` object with the added columns.

    Raises:
        KeyError: if an edge's domain node is not present in ``nodes``.
    """
    # Index edges by their range node once instead of rescanning all edges
    # for every node; dict assignment keeps the last edge per range node.
    edge_by_range = {}
    for _, e_row in edges.iterrows():
        edge_by_range[str(e_row['rangenode_id'])] = e_row

    # First occurrence wins, matching a "first matching row" lookup.
    name_by_nodeid = {}
    for node_id, name in zip(nodes['nodeid'], nodes['name']):
        name_by_nodeid.setdefault(node_id, name)

    # Snapshot the (label, nodeid) pairs first because columns are added to
    # `nodes` inside the loop.
    for n_idx, node_id in list(nodes['nodeid'].items()):
        e_row = edge_by_range.get(str(node_id))
        if e_row is None:
            continue
        domain_id = str(e_row['domainnode_id'])
        if domain_id not in name_by_nodeid:
            raise KeyError(
                f"edge {e_row['edgeid']!r} refers to domain node {domain_id!r} "
                "which is not in the nodes table"
            )
        nodes.loc[n_idx, 'domainnode_id'] = domain_id
        nodes.loc[n_idx, 'rangenode_id'] = str(e_row['rangenode_id'])
        nodes.loc[n_idx, 'edgeid'] = str(e_row['edgeid'])
        nodes.loc[n_idx, 'parent'] = name_by_nodeid[domain_id]

    if out_path is not None:
        nodes.to_csv(out_path, index=False)
    return nodes

def add_cards_to_nodes_edges(cards, node_edges, out_path='sipc/out/06_nodes_edges_cards.csv'):
    """Copy card attributes onto the node rows and mark the root node.

    A node is matched to the card that shares its ``nodegroup_id`` (compared
    as strings; the last matching card wins). The card's id, sort order,
    instructions, help title, help text and visibility are stored as strings.
    Rows whose ``parent`` is null are treated as the root: ``parent`` becomes
    ``'root'``, ``domainnode_id`` becomes ``'RDF'`` and ``rangenode_id``
    becomes the node's own id.

    Args:
        cards: DataFrame with ``cardid``, ``nodegroup_id``,
            ``card_sortorder``, ``instructions``, ``helptitle``, ``helptext``
            and ``visible`` columns.
        node_edges: DataFrame with ``nodeid``, ``nodegroup_id`` and ``parent``
            columns. It is modified in place.
        out_path: CSV file the result is written to; pass ``None`` to skip
            writing.

    Returns:
        The same ``node_edges`` object with the added columns.
    """
    # Use the `cards` argument: the original looped over a global `cards_df`,
    # which only worked when a variable of that name happened to exist.
    card_by_nodegroup = {}
    for _, c_row in cards.iterrows():
        card_by_nodegroup[str(c_row['nodegroup_id'])] = c_row

    for n_idx, n_row in node_edges.iterrows():
        c_row = card_by_nodegroup.get(str(n_row['nodegroup_id']))
        if c_row is not None:
            node_edges.loc[n_idx, 'card_id'] = str(c_row['cardid'])
            node_edges.loc[n_idx, 'card_sortorder'] = str(c_row['card_sortorder'])
            # add the rest that's needed
            node_edges.loc[n_idx, 'instructions'] = str(c_row['instructions'])
            node_edges.loc[n_idx, 'helptitle'] = str(c_row['helptitle'])
            node_edges.loc[n_idx, 'helptext'] = str(c_row['helptext'])
            node_edges.loc[n_idx, 'visible'] = str(c_row['visible'])

        # A node without a parent has no incoming edge: it is the root.
        if pd.isnull(node_edges.loc[n_idx, 'parent']):
            node_edges.loc[n_idx, 'parent'] = 'root'
            node_edges.loc[n_idx, 'domainnode_id'] = 'RDF'
            node_edges.loc[n_idx, 'rangenode_id'] = str(n_row['nodeid'])

    if out_path is not None:
        node_edges.to_csv(out_path, index=False)
    return node_edges

def add_xx_to_nodes_edges_cards(xx, edges_node_cards):
    """Copy the widget sort order onto the matching node rows.

    A widget record matches a node when ``str(node_id)`` equals
    ``str(nodeid)``. The record's ``sortorder`` is stored as a string in
    ``xx_sortorder``; with several matches the last record wins. The result
    is written to ``sipc/out/07_nodes_edges_cards_xx.csv``.

    Args:
        xx: DataFrame with ``node_id`` and ``sortorder`` columns.
        edges_node_cards: DataFrame with a ``nodeid`` column; modified in
            place.

    Returns:
        The same ``edges_node_cards`` object.
    """
    for row_label, node_row in edges_node_cards.iterrows():
        wanted_id = str(node_row['nodeid'])
        matching_orders = [
            str(widget_row['sortorder'])
            for _, widget_row in xx.iterrows()
            if str(widget_row['node_id']) == wanted_id
        ]
        if matching_orders:
            edges_node_cards.loc[row_label, 'xx_sortorder'] = matching_orders[-1]

    edges_node_cards.to_csv('sipc/out/07_nodes_edges_cards_xx.csv', index=False)
    return edges_node_cards


# ---------------------------

def add_namespace_suffix(graph, out_path='sipc/out/08_graph_ns.csv', namespaces=None):
    """Add prefixed (``prefix:LocalName``) property and class columns.

    ``sip_class`` is derived from ``ontologyclass`` for every row.
    ``sip_property`` is derived from ``parentproperty``, except for rows whose
    ``parent`` is ``'root'``, which get the fixed value ``'RDF'``.

    Args:
        graph: DataFrame with ``parent``, ``parentproperty`` and
            ``ontologyclass`` columns. It is modified in place.
        out_path: CSV file the result is written to; pass ``None`` to skip
            writing.
        namespaces: mapping of namespace URI -> prefix. Defaults to the
            module-level ``ONTOLOGY_NAMESPACES``.

    Returns:
        The same ``graph`` object with the added columns.
    """
    if namespaces is None:
        namespaces = ONTOLOGY_NAMESPACES

    def make_namespace_suffix(url):
        # Missing values (None/NaN) cannot be split; pass them through.
        if not isinstance(url, str):
            return url
        if '#' in url:
            namespace_uri = url.split('#')[0] + '#'
            local_name = url.rsplit('#', 1)[-1]
        else:
            head, _, local_name = url.rpartition('/')
            namespace_uri = head + '/'
        prefix = namespaces.get(namespace_uri)
        if prefix is None:
            # Unknown namespace: keep the full URI rather than emitting a
            # misleading "None:<name>".
            return url
        return '%s:%s' % (prefix, local_name)

    for idx, row in graph.iterrows():
        # Check for the root first: its parentproperty is not parsed at all,
        # so a missing value there cannot break the run.
        if row['parent'] == 'root':
            graph.loc[idx, 'sip_property'] = 'RDF'
        else:
            graph.loc[idx, 'sip_property'] = make_namespace_suffix(row['parentproperty'])
        graph.loc[idx, 'sip_class'] = make_namespace_suffix(row['ontologyclass'])

    if out_path is not None:
        graph.to_csv(out_path, index=False)
    return graph


def build_paths(graph):
    """Sort the graph rows and attach each node's ancestor chain.

    The rows are sorted by ``node_sortorder``, ``card_sortorder`` and
    ``parent``. Every row is read as a child -> parent link
    (``rangenode_id`` -> ``domainnode_id``); for rows whose ``nodeid`` (as a
    string) is a known child, the chain from the topmost ancestor to the node
    is stored as a string in ``path`` and its length in ``path_length``.
    Rows whose ``domainnode_id`` is ``'RDF'`` afterwards get
    ``domainnode_id`` set to ``None`` and ``rangenode_id`` set to their own
    id. The result is written to ``sipc/out/09_graph_path.csv``.

    Args:
        graph: DataFrame with ``nodeid``, ``domainnode_id``,
            ``rangenode_id``, ``node_sortorder``, ``card_sortorder`` and
            ``parent`` columns.

    Returns:
        The sorted copy of ``graph`` with the added columns.
    """
    ordered = graph.sort_values(by=['node_sortorder', 'card_sortorder', 'parent'])

    # child id -> parent id
    parent_of = {}
    for _, link in ordered.iterrows():
        parent_of[link['rangenode_id']] = link['domainnode_id']

    def lineage(node):
        if node not in parent_of:
            return [node]
        return lineage(parent_of[node]) + [node]

    # One chain per known child, keyed by the child it ends with.
    chain_by_leaf = {leaf: lineage(leaf) for leaf in parent_of}

    for label, record in ordered.iterrows():
        own_id = str(record['nodeid'])

        if own_id in chain_by_leaf:
            chain = chain_by_leaf[own_id]
            ordered.loc[label, 'path'] = str(chain)
            ordered.loc[label, 'path_length'] = str(len(chain))
        if record['domainnode_id'] == 'RDF':
            ordered.loc[label, 'domainnode_id'] = None
            ordered.loc[label, 'rangenode_id'] = str(record['nodeid'])

    ordered.to_csv('sipc/out/09_graph_path.csv', index=False)
    return ordered
                

def make_ns_paths(graph):
    """Add an ``ns_path`` column describing each node's path in prefixed form.

    For every row the stringified ``path`` is parsed back into node ids. For
    each id other than ``'RDF'`` the ``sip_property`` and ``sip_class`` of
    that node are appended (in this order) to the row's path list. The result
    per row is a dict ``{"nodeid", "name", "path"}`` which is then assigned to
    ``ns_path`` wrapped in a one-element list.

    NOTE(review): this definition has the same name as an earlier function in
    the notebook and replaces it once this cell has run.

    Args:
        graph: DataFrame with ``nodeid``, ``name``, ``path`` (a list rendered
            with ``str``), ``sip_property``, ``sip_class``,
            ``node_sortorder``, ``path_length``, ``card_sortorder`` and
            ``parent`` columns.

    Returns:
        A copy of ``graph`` sorted by ``node_sortorder``, ``path_length``,
        ``card_sortorder`` and ``parent``; the ``ns_path`` column itself is
        added to the passed-in frame before sorting.
    """
    
    path_list = []
    path_dict = {}
    doc_paths = []
        
    path_df = graph[['nodeid', 'path']]
    # NOTE(review): the three re-initialisations below repeat the ones above;
    # `doc_path` (singular) is never used.
    path_list = []
    path_dict = {}
    doc_path = []
        
    for p_idx, p_row in path_df.iterrows():
        # Undo str(list): drop the brackets and quotes, then split on ', '.
        # This only works while the ids contain neither quotes nor ', '.
        paths = p_row['path'].strip('][').replace("'", "").split(', ')           
        for item in paths:
            if item != 'RDF':
                # First row with this nodeid; raises IndexError if none exists.
                path_row = graph.loc[graph['nodeid'] == item]
                _property = path_row['sip_property'].values.tolist()[0]
                _class = path_row['sip_class'].values.tolist()[0]
                path_list.append(str(_property))
                path_list.append(str(_class))
                # After the loop _name holds the name of the LAST id in the
                # path; it stays unset if the path contains only 'RDF'.
                _name = path_row['name'].values.tolist()[0]
        
        path_dict = {"nodeid": p_row['nodeid'], "name": _name, "path": path_list}
        path_list = []
        doc_paths.append(path_dict)
        path_dict = {}

    for idx, row in graph.iterrows():
        for item in doc_paths:            
            if row['nodeid'] == item['nodeid']:
#                print(json.dumps(item, indent=2))
#                print('_-')
                # NOTE(review): downstream code expects this cell to come back
                # either as a dict or as a one-element list, i.e. the stored
                # shape is not consistent across rows -- confirm and consider
                # storing the dict directly.
                graph.loc[idx, 'ns_path'] = [item]

    doc_paths = []
    graph = graph.sort_values(by=['node_sortorder', 'path_length', 'card_sortorder', 'parent'])
    graph.to_csv('sipc/out/10_graph_ns_path.csv', index=False)    
    return graph



## Make Paths

In [None]:
def make_rec_def_paths(ns_graph):
    """Add a ``doc_path`` column with a documentation record for every node.

    Each record is a dict combining the node's id, its prefixed path (taken
    from ``ns_path``), name, parent, datatype, required/visible flags,
    description, the card help texts and a ``relation`` dict
    ``{name: parent}``. Missing ``visible``, ``instructions``, ``helptitle``
    and ``helptext`` values are replaced by an empty string.

    Args:
        ns_graph: DataFrame with ``nodeid``, ``ns_path``, ``name``,
            ``parent``, ``description``, ``datatype``, ``isrequired``,
            ``visible``, ``instructions``, ``helptitle``, ``helptext``,
            ``card_sortorder`` and ``xx_sortorder`` columns. It is modified
            and sorted in place.

    Returns:
        The same ``ns_graph`` object, sorted by ``card_sortorder`` and
        ``xx_sortorder``; it is also written to
        ``sipc/out/20_rec_def_path.csv``.
    """
    
    doc_dict = {}
    # NOTE(review): doc_list collects every record but is never returned or read.
    doc_list = []

    for idx, row in ns_graph.iterrows():        
        node_id = row['nodeid']
        
        # For some weird reason the first row stores ns_path as dict, not list
        # NOTE(review): if ns_path is neither a dict nor a list (e.g. NaN),
        # `ns_path` keeps the value from the previous row, or is unbound on
        # the first row.
        if isinstance(row['ns_path'], dict):
            ns_path = row['ns_path']
        elif isinstance(row['ns_path'], list):   
            ns_path = row['ns_path'][0]
        name = row['name']
        parent = row['parent']
        description = row['description']
        datatype = row['datatype']
        isrequired = row['isrequired']
        isvisible = "" if pd.isna(ns_graph.loc[idx, 'visible']) else row['visible']
        instructions = "" if pd.isna(ns_graph.loc[idx, 'instructions']) else row['instructions']
        helptitle = "" if pd.isna(ns_graph.loc[idx, 'helptitle']) else row['helptitle']
        helptext = "" if pd.isna(ns_graph.loc[idx, 'helptext']) else row['helptext']
        
        # The 'datatype' key appears twice in this literal; both entries carry
        # the same value, so the duplicate is harmless.
        doc_dict = {"nodeid": node_id, "path": ns_path['path'], "name": name, "parent": parent,
                    "datatype": datatype, "isrequired": isrequired, "isvisible": isvisible, "description": description,
                    "instructions": instructions, "helptitle": helptitle, "helptext": helptext, 
                    "relation": {name: parent}, 'datatype': datatype}
                    
        doc_list.append(doc_dict)
        ns_graph.loc[idx, 'doc_path'] = [doc_dict]
        doc_dict = {}
    ns_graph.sort_values(by=['card_sortorder', 'xx_sortorder'], inplace=True)
    ns_graph.to_csv('sipc/out/20_rec_def_path.csv', index=False)        
    return ns_graph

# Marker that this cell finished defining the function; it runs at definition
# time, not when make_rec_def_paths is called.
print('done')

## Make rec_defs

In [None]:
def make_rec_defs(_graph):
    """Walk the documented node table in card/widget order (work in progress).

    The table is prepared with ``make_rec_def_paths`` and sorted by
    ``card_sortorder`` and ``xx_sortorder``. For every node the prefixed
    property, class, path length and path are read; building the XML record
    definition from them is not implemented yet, so nothing is returned.

    NOTE(review): an earlier function in the notebook has the same name; this
    definition replaces it once this cell has run.

    Args:
        _graph: DataFrame accepted by ``make_rec_def_paths`` that also has
            ``sip_property``, ``sip_class`` and ``path_length`` columns.

    Returns:
        None.
    """
    nodes_df = make_rec_def_paths(_graph)
    nodes_df.sort_values(by=['card_sortorder', 'xx_sortorder'], inplace=True)
    item_list = []

    for idx, node in nodes_df.iterrows():
        crm_property = node['sip_property']
        crm_class = node['sip_class']
        path_length = node['path_length']
        #print(path_length, crm_property, crm_class)
        # ns_path is stored either as a one-element list holding the record
        # or as the record dict itself. Dispatch on the type explicitly
        # instead of using a bare `except`, which would also hide unrelated
        # errors (and KeyboardInterrupt).
        ns_path = node['ns_path']
        if isinstance(ns_path, list):
            node_path = ns_path[0]['path']
        else:
            node_path = ns_path['path']
        #print(node_path)
        
        #if len(node['path']) == 2:
        #    root = etree.Element("root", tag="RDF") 
        #    node_map = etree.SubElement(root, "node-mapping", inputPath="/input")
        #    elem2 = etree.SubElement(root, "elem", tag = crm_class)
        #    attr = etree.SubElement(elem2, "attr", uriCheck="True", tag = "rdf:about")                   
        
        #elif len(node['path']) == 4:
        #    #print(node['path'])
        #    _property = node['path'][2]
        #    _class = node['path'][3]
        #    elem_prop4 = etree.SubElement(elem2, "elem", tag = _property)
        #    elem_class4 = etree.SubElement(elem_prop4, "elem", tag = _class)

        #elif len(node['path']) == 6:
        #    #print(node['path'])
        #    _property = node['path'][3]
        #    _class = node['path'][5]
        #    elem_prop6 = etree.SubElement(elem_class4, "elem", tag = _property)
        #    elem_class6 = etree.SubElement(elem_prop6, "elem", tag = _class)
            
            
            #elem3 = etree.SubElement(elem2, "elem", tag = _tag, label = node['name'], nodeid= node['nodeid'], isrequired = str(node['isrequired']), isvisible = str(node['isvisible']))
            #if node['datatype'] == 'resource-instance':
            #    elem3.attrib['uriCheck']= 'True'
            #    elem3.attrib['attrs']= 'rdf:resource'
            #if node['datatype'] == 'semantic':
            #    elem3.attrib['unmappable']= 'True'        
        #else:
        #    pass
        
        
    #return etree.tostring(root, encoding='utf-8', method='xml', pretty_print=True).decode() 


#nodes_df

## Make mermaid graph

In [None]:
def make_mermaid_graph(graph):
    """Render a Mermaid diagram via the mermaid.ink image service.

    A Mermaid ``graph LR`` description is assembled from the ``relation``
    entries (``{child_name: parent_name}``) of the records returned by
    ``make_doc_paths``, skipping relations whose parent is ``'root'`` and
    replacing spaces in names by underscores.

    NOTE(review): ``make_doc_paths`` is not defined in the visible notebook,
    and the assembled ``mermaid_str`` is never rendered -- the hard-coded
    ``test`` diagram is displayed instead. Presumably leftover debugging;
    confirm before relying on this function.

    Args:
        graph: passed through to ``make_doc_paths``.

    Returns:
        None. The image is shown with ``IPython.display.display``.
    """
    
    def render_graph(m_str):
        # mermaid.ink takes the diagram source base64-encoded in the URL.
        # Encoding as ASCII raises UnicodeEncodeError for non-ASCII names.
        graphbytes = m_str.encode("ascii")
        base64_bytes = base64.b64encode(graphbytes)
        base64_string = base64_bytes.decode("ascii")
        display(Image(url="https://mermaid.ink/img/" + base64_string))
    
    m_list = []
    g_list = make_doc_paths(graph)
    #print(json.dumps(g_list, indent=2))
    m_list.append('graph LR;')
    
    for item in g_list:        
        for k, v in item['relation'].items():
            if v != 'root':
                # Edge parent --> child.
                m_list.append('%s--> %s;' % (v.replace(' ', '_'), k.replace(' ', '_')))
    mermaid_str = '\n'.join(m_list) 

    test = "graph LR; A--> B & C & D; B--> A & E;C--> A & E; D--> A & E;E--> B & C & D;"
    
    render_graph(test)                          
        

## Main

In [None]:
# Run the whole pipeline once per configured model: split the exported graph
# into edge/node/card/widget tables, merge them onto the nodes, add prefixed
# class/property names and paths, then build the record definitions.
# NOTE(review): get_graph is called with a one-element list and the make_*_df
# helpers index graph['graph'][0][...], whereas the get_graph defined earlier
# in this notebook takes a single graph id and returns a DataFrame. Presumably
# this cell was written against the exporter function whose import is
# commented out in the first cell -- confirm before running.
for item in model_uuid_list:
    graph = get_graph([item['uuid']])

    # Raw tables
    edges_df = make_edges_df(graph)
    nodes_df = make_nodes_df(graph)
    cards_df = make_cards_df(graph)
    xx_df = make_xx_df(graph)

    # Merge edges, cards and widget sort order onto the nodes
    node_edges_df = add_edges_to_nodes(edges_df, nodes_df)
    node_edges_cards_df = add_cards_to_nodes_edges(cards_df, node_edges_df)
    node_edges_cards_xx_df = add_xx_to_nodes_edges_cards(xx_df, node_edges_cards_df)

    # Prefixed names and ancestor paths
    graph_ns = add_namespace_suffix(node_edges_cards_xx_df)
    graph_path = build_paths(graph_ns)

    graph_final = make_ns_paths(graph_path)
    
    rec_def = make_rec_defs(graph_final)
    #print(rec_def) 
    
    #mermaid_graph = make_mermaid_graph(graph)    

    print('done')

        # NOTE(review): orphaned fragment. It is indented deeper than the
        # statement before it and uses `_tree`, `_tag` and `node`, none of
        # which are defined in this cell, so the cell cannot run as written
        # if this text belongs to it. It looks like an earlier draft of the
        # XML building that is commented out in make_rec_defs: one <elem> is
        # nested per path depth (2..8), each level attaching to the element
        # created for the previous depth.
        if len(_tree) == 2:
            # Depth 2: create the document root and the element of the top class.
            root = etree.Element("root", tag="RDF") 
            node_map = etree.SubElement(root, "node-mapping", inputPath="/input")
            elem2 = etree.SubElement(root, "elem", tag = _tag)
            attr = etree.SubElement(elem2, "attr", uriCheck="True", tag = "rdf:about")                   
        elif len(_tree) == 3:
            elem3 = etree.SubElement(elem2, "elem", tag = _tag, label = node['name'], nodeid= node['nodeid'], isrequired = str(node['isrequired']), isvisible = str(node['isvisible']))
            # Only this level marks resource-instance nodes as URI references.
            if node['datatype'] == 'resource-instance':
                elem3.attrib['uriCheck']= 'True'
                elem3.attrib['attrs']= 'rdf:resource'
            if node['datatype'] == 'semantic':
                elem3.attrib['unmappable']= 'True'        
        elif len(_tree) == 4:
            elem4 = etree.SubElement(elem3, "elem", tag = _tag, label = node['name'], nodeid= node['nodeid'], isrequired = str(node['isrequired']), isvisible = str(node['isvisible']))
            if node['datatype'] == 'semantic':
                elem4.attrib['unmappable']= 'True'
        elif len(_tree) == 5:
            elem5 = etree.SubElement(elem4, "elem", tag = _tag, label = node['name'], nodeid= node['nodeid'], isrequired = str(node['isrequired']), isvisible = str(node['isvisible']))
            if node['datatype'] == 'semantic':
                elem5.attrib['unmappable']= 'True'
        elif len(_tree) == 6:
            elem6 = etree.SubElement(elem5, "elem", tag = _tag, label = node['name'], nodeid= node['nodeid'], isrequired = str(node['isrequired']), isvisible = str(node['isvisible']))
            if node['datatype'] == 'semantic':
                elem6.attrib['unmappable']= 'True'
        elif len(_tree) == 7:
            elem7 = etree.SubElement(elem6, "elem", tag = _tag, label = node['name'], nodeid= node['nodeid'], isrequired = str(node['isrequired']), isvisible = str(node['isvisible']))
            if node['datatype'] == 'semantic':
                elem7.attrib['unmappable']= 'True'
        elif len(_tree) == 8:
            elem8 = etree.SubElement(elem7, "elem", tag = _tag, label = node['name'], nodeid= node['nodeid'], isrequired = str(node['isrequired']), isvisible = str(node['isvisible']))
            if node['datatype'] == 'semantic':
                elem8.attrib['unmappable']= 'True'
        else:
            pass
