In [1]:
from django_for_jupyter import init_django
init_django("arches")

import json
import pprint 
import uuid
import pandas as pd
from lxml import etree
import xmltodict
from collections import defaultdict
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt
from urllib.parse import urlparse

from arches.app.models.graph import Graph
from arches.app.models.concept import Concept
from arches.app.models.models import CardXNodeXWidget, Node, Resource2ResourceConstraint, FunctionXGraph, Value, GraphXPublishedGraph
from arches.app.utils.betterJSONSerializer import JSONSerializer, JSONDeserializer#from operator import itemgetter
from arches.app.models import models

from arches.app.utils.data_management.resource_graphs.exporter import get_graphs_for_export as get_json_graph
from django.core.serializers.json import DjangoJSONEncoder


#_model_uuid_list = [{"model_name": "Museological Item", "uuid": "fa952d08-dd27-11ed-9655-00163e71351b"}]


# Maps ontology namespace URIs to the short prefixes that add_namespace()
# substitutes into element names.
# Keep more specific URIs ahead of the URIs they contain (".../cidoc-crm/CRMsci/"
# before ".../cidoc-crm/"): the lookup walks these entries in insertion order.
ONTOLOGY_NAMESPACES = {#'http://my_namespace_here/': 'some_ns',
                    "http://purl.org/dc/elements/1.1/": "dc",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "RDF",
                    "http://www.w3.org/2001/XMLSchema#": "xsd",
                    "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
                    "http://www.cidoc-crm.org/cidoc-crm/CRMsci/": "CRMsci",
                    "http://www.cidoc-crm.org/cidoc-crm/": "crm",
                    "http://www.ics.forth.gr/isl/CRMarchaeo/": "CRMarchaeo",
                    "http://www.ics.forth.gr/isl/CRMdig/": "CRMdig",
                    "http://www.ics.forth.gr/isl/CRMgeo/": "CRMgeo",
                    "http://www.ics.forth.gr/isl/CRMinf/": "CRMinf",
                    "http://parthenos.d4science.org/CRMext/CRMpe.rdfs/": "CRMpe",
                    "https://takin.solutions/ontologies/CRMsurv/": "CRMsurv"
}

# The resource model processed by this notebook. Despite the "_list" suffix this
# is a single dict: 'uuid' is passed to get_graph() and 'model_name' is read by
# make_rec_defs() for the tag of the top-level <elem>.
model_uuid_list = {"model_name": "SEM.01_Survey Unit", "uuid": "af04eac2-a131-11ed-a102-9cf387da2c40"}

In [2]:
def make_classes(class_doc):
    """Flatten one parsed ``rdfs:Class`` element into a plain record.

    Args:
        class_doc: Mapping for a single ``rdfs:Class`` element. The keys read
            here are ``@rdf:about`` (required), ``rdfs:comment`` and
            ``rdfs:subClassOf`` (both optional).

    Returns:
        dict with ``tag`` and ``type`` (always ``'class'``), plus
        ``scope_note`` when a comment is present and ``sub_class_of`` (a list
        of parent class names with the CIDOC-CRM base URI stripped) when the
        element declares any ``rdfs:subClassOf``.

    Raises:
        KeyError: if ``@rdf:about`` is missing, or a parent entry has no
            ``@rdf:resource``.
    """
    crm_base_uri = 'http://www.cidoc-crm.org/cidoc-crm/'

    # Read from the argument, not from a same-named global in the calling cell.
    class_dict = {'tag': class_doc['@rdf:about'], 'type': 'class'}

    if 'rdfs:comment' in class_doc:
        class_dict['scope_note'] = class_doc['rdfs:comment']

    if 'rdfs:subClassOf' in class_doc:
        parents = class_doc['rdfs:subClassOf']
        # A single parent arrives as a dict, several parents as a list of
        # dicts; any other shape yields an empty parent list.
        if isinstance(parents, dict):
            parents = [parents]
        elif not isinstance(parents, list):
            parents = []
        class_dict['sub_class_of'] = [
            parent['@rdf:resource'].replace(crm_base_uri, '') for parent in parents
        ]

    return class_dict
    

In [3]:
def make_properties(property_doc):
    """Flatten one parsed ``rdf:Property`` element into a plain record.

    Args:
        property_doc: Mapping for a single ``rdf:Property`` element. Only
            ``@rdf:about`` is required; the optional keys are copied through
            unchanged when present.

    Returns:
        dict with ``tag`` and ``type`` (always ``'property'``), plus any of
        ``scope_note``, ``domain``, ``range``, ``inverseOf`` and
        ``subPropertyOf`` that the element provides.

    Raises:
        KeyError: if ``@rdf:about`` is missing.
    """
    # (source key, output key) pairs, in the order they appear in the record.
    optional_fields = (
        ('rdfs:comment', 'scope_note'),
        ('rdfs:domain', 'domain'),
        ('rdfs:range', 'range'),
        ('owl:inverseOf', 'inverseOf'),
        ('rdfs:subPropertyOf', 'subPropertyOf'),
    )

    # Read from the argument, not from a same-named global in the calling cell.
    property_dict = {'tag': property_doc['@rdf:about'], 'type': 'property'}

    for source_key, output_key in optional_fields:
        if source_key in property_doc:
            property_dict[output_key] = property_doc[source_key]

    return property_dict


In [4]:
# Collect every class (and, if enabled, property) declared in the configured
# ontology files into one table and export it as CSV.
ontology_path = '../arches_datahub/pkg/ontologies/cidoc_crm'
ontology_list = []
all_ontologies_list = []
doc_list = []
file_check = True
# Not used by this cell any more; kept defined in case other cells read them.
doc_dict = {}
sub_class_list = []


def _as_list(value):
    """Wrap a lone parsed element in a list.

    xmltodict yields a bare mapping (not a one-item list) when an element
    occurs exactly once; iterating that mapping would walk its keys.
    """
    return value if isinstance(value, list) else [value]


## Read ontology_config.json to get all used ontologies
with open(f'{ontology_path}/ontology_config.json', 'r') as f:
    ontologies = json.load(f)

# Base ontology first, then its extensions, in configured order.
ontology_list.append(ontologies['base'])
ontology_list.extend(ontologies['extensions'])

## Parse every ontology file and remember which file each document came from
for file in ontology_list:
    with open(f'{ontology_path}/{file}', encoding='utf-8') as xml_file:
        data_dict = xmltodict.parse(xml_file.read())
    data_dict['file'] = file
    all_ontologies_list.append(data_dict)

# Save as json
#with open('sipc/out/all_ontos.json', "w") as fp:
#    json.dump(all_ontologies_list, fp)

for doc in all_ontologies_list:
    #if doc['file'] == 'CIDOC_CRM_v7.1.2.rdf':
    if not file_check:
        continue

    ontology_name = doc['file'].replace('.xml', '').replace('.rdf', '')
    rdf_root = doc['rdf:RDF']

    # The loop variables keep the names crm_class / crm_property on purpose:
    # they are module-level names in a notebook and may be read elsewhere.

    ## Classes
    if 'rdfs:Class' in rdf_root:
        for crm_class in _as_list(rdf_root['rdfs:Class']):
            doc_list.append({'ontology': ontology_name, **make_classes(crm_class)})

    ## Properties
    # NOTE(review): the guard tests 'rdf:Property_' (trailing underscore) while
    # the loop reads 'rdf:Property', so this branch never runs for a file that
    # only has 'rdf:Property'. Left as found because the saved output of this
    # cell contains classes only -- confirm whether properties are meant to be
    # excluded before changing the key.
    if 'rdf:Property_' in rdf_root:
        for crm_property in _as_list(rdf_root['rdf:Property']):
            doc_list.append({'ontology': ontology_name, **make_properties(crm_property)})

doc_df = pd.DataFrame(doc_list)
doc_df.to_csv('sipc/out/docs.csv', index=False)
#print(json.dumps(doc_list, indent=2))
doc_df

Unnamed: 0,ontology,tag,type,scope_note,sub_class_of
0,CIDOC_CRM_v7.1.2,E1_CRM_Entity,class,This class comprises all things in the univers...,
1,CIDOC_CRM_v7.1.2,E2_Temporal_Entity,class,"This class comprises all phenomena, such as th...",[E1_CRM_Entity]
2,CIDOC_CRM_v7.1.2,E3_Condition_State,class,This class comprises the states of objects cha...,[E2_Temporal_Entity]
3,CIDOC_CRM_v7.1.2,E4_Period,class,This class comprises sets of coherent phenomen...,"[E2_Temporal_Entity, E92_Spacetime_Volume]"
4,CIDOC_CRM_v7.1.2,E5_Event,class,"This class comprises distinct, delimited and c...",[E4_Period]
...,...,...,...,...,...
223,linkedart,https://linked.art/ns/terms/Set,class,,"[E89_Propositional_Object, E72_Legal_Object]"
224,linkedart,https://linked.art/ns/terms/Addition,class,The addition of some entity to a Set,[E7_Activity]
225,linkedart,https://linked.art/ns/terms/Removal,class,The removal of some entity from a Set,[E7_Activity]
226,linkedart,https://linked.art/ns/terms/Relationship,class,,[E28_Conceptual_Object]


In [5]:
def get_card_x_node_x_widget_data_for_export(resource_graph):
    """Look up the CardXNodeXWidget rows belonging to a serialized graph.

    Args:
        resource_graph: Mapping with a ``"nodes"`` sequence whose items each
            carry a ``"nodeid"``.

    Returns:
        The result of ``CardXNodeXWidget.objects.filter`` restricted to those
        node ids.
    """
    wanted_node_ids = []
    for node in resource_graph["nodes"]:
        wanted_node_ids.append(node["nodeid"])
    return CardXNodeXWidget.objects.filter(node_id__in=wanted_node_ids)

def get_graph(graphid):
    """Build one flat table of nodes and edges for a single exported graph.

    The graph is fetched with ``get_json_graph([graphid])`` and the first entry
    of its ``'graph'`` list is used. Every node row is annotated with tree
    columns (``id``, ``parent_id``, ``child_id``), the connecting edge id and
    the matching card's attributes. For every node that has an incoming edge an
    extra row with ``elem_type == 'property'`` is appended, so the resulting
    parent/child links alternate node -> edge -> node.

    Args:
        graphid: Identifier passed to ``get_json_graph`` inside a one-item list.

    Returns:
        pandas.DataFrame of node rows plus appended property rows, sorted by
        ``card_sortorder``. The appended rows all carry index label 0, so the
        returned index is not unique.

    NOTE(review): ``graph['graph'][0]`` raises ``IndexError`` when the export
    returns an empty list -- presumably the case when no graph matches
    ``graphid``; confirm against the exporter.
    """

    graph = get_json_graph([graphid])
    # source_graph is not read again in this function.
    source_graph = pd.DataFrame(graph['graph'])

    nodes_df = pd.DataFrame(graph['graph'][0]['nodes'])
    edges_df = pd.DataFrame(graph['graph'][0]['edges'])
    cards_df = pd.DataFrame(graph['graph'][0]['cards'])
    # nodegroups_df and cards_x_nodes_x_widgets_df are built but only referenced
    # from the commented-out block further down.
    nodegroups_df = pd.DataFrame(graph['graph'][0]['nodegroups'])
    cards_x_nodes_x_widgets_df = pd.DataFrame(graph['graph'][0]['cards_x_nodes_x_widgets'])

    # `row` is a snapshot of the node as loaded; all annotations are written
    # through nodes_df.loc, which also creates the new columns on first use.
    for idx, row in nodes_df.iterrows():
        node_id = row['nodeid']
        nodegroup_id = row['nodegroup_id']
        nodes_df.loc[idx, 'id'] = node_id
        nodes_df.loc[idx, 'elem_name'] = row['ontologyclass']
        nodes_df.loc[idx, 'elem_type'] = 'class'

        # top node: hangs directly below the synthetic 'root' parent
        if row['istopnode']:
            nodes_df.loc[idx, 'parentproperty'] = 'RDF'
            nodes_df.loc[idx, 'ontologyproperty'] = 'RDF'
            # -1 sorts the top node ahead of every card-derived sort order
            nodes_df.loc[idx, 'card_sortorder'] = -1
            nodes_df.loc[idx, 'parent_id'] = 'root'
            nodes_df.loc[idx, 'child_id'] = row['nodeid']
            nodes_df.loc[idx, 'visible'] = True

        # match edges to nodes: the incoming edge is the one whose range is this node
        matching_edge_row = edges_df[edges_df['rangenode_id'] == node_id]
        if not matching_edge_row.empty:
            nodes_df.loc[idx, 'edgeid'] = str(matching_edge_row['edgeid'].values[0])
            # Same filter as matching_edge_row; the node's parent is the edge itself.
            add_parent_id = edges_df[edges_df['rangenode_id'] == node_id]
            if not add_parent_id.empty:
                nodes_df.loc[idx, 'parent_id'] = str(add_parent_id['edgeid'].values[0])
                #nodes_df.loc[idx, 'child_id'] = str(matching_edge_row['domainnode_id'].values[0])
            nodes_df.loc[idx, 'child_id'] = node_id

        # match cards
        if not row['istopnode']: ## the top node has no nodegroup_id
            matching_card_row = cards_df[cards_df['nodegroup_id'] == nodegroup_id]
            if not matching_card_row.empty:
                # Only the first matching card is used; its values are stored
                # as strings, except sortorder which is copied as-is.
                name = matching_card_row['name'].values[0]
                nodes_df.loc[idx, 'card_label'] = str(name)
                nodes_df.loc[idx, 'card_sortorder'] = matching_card_row['sortorder'].values[0]
                nodes_df.loc[idx, 'instructions'] = str(matching_card_row['instructions'].values[0])
                nodes_df.loc[idx, 'helptitle'] = str(matching_card_row['helptitle'].values[0])
                nodes_df.loc[idx, 'helptext'] = str(matching_card_row['helptext'].values[0])
                # 'visible' becomes a string here, but stays a bool on the top
                # node and on appended property rows.
                nodes_df.loc[idx, 'visible'] = str(matching_card_row['visible'].values[0])



        # match cards_x_nodes_x_widgets
        #matching_cards_x_nodes_x_widgets_row = cards_x_nodes_x_widgets_df[cards_x_nodes_x_widgets_df['node_id'] == node_id]
        #print(matching_cards_x_nodes_x_widgets_row)
        #if not matching_cards_x_nodes_x_widgets_row.empty:
        #    nodes_df.loc[idx, 'x_sortorder'] = matching_cards_x_nodes_x_widgets_row['sortorder'].values[0]
        #    nodes_df.loc[idx, 'x_card_label'] = str(matching_cards_x_nodes_x_widgets_row['label'].tolist()[0]['en'])
        #    nodes_df.loc[idx, 'x_card_id'] = matching_cards_x_nodes_x_widgets_row['card_id'].values[0]
        #    nodes_df.loc[idx, 'visible'] = matching_cards_x_nodes_x_widgets_row['visible'].values[0]

        # add edges to nodes: one extra 'property' row per incoming edge.
        # add_edge_row repeats the matching_edge_row filter, so the guard below
        # (which tests matching_edge_row) covers both.
        add_edge_row = edges_df[edges_df['rangenode_id'] == node_id]
        if not matching_edge_row.empty:
            new_row_df = pd.DataFrame({
                'elem_type': 'property',
                'elem_name': str(matching_edge_row['ontologyproperty'].values[0]),
                'is_collector': False,
                'nodeid': '',
                'name': '',
                'description': '',
                'istopnode': False,
                'ontologyclass': '',
                'datatype': 'property',
                'nodegroup_id': '',
                'graph_id': row['graph_id'],
                'config': '',
                'issearchable': '',
                'isrequired': True,
                'sortorder': '',
                'fieldname': '',
                'exportable': '',
                'alias': '',
                'hascustomalias': '',
                'parentproperty': '',
                # The edge row is identified by the edge id, sits below the
                # edge's domain node, and is the parent of the range node
                # (whose parent_id was set to this edge id above).
                'id': str(add_edge_row['edgeid'].values[0]),
                'edgeid': str(add_edge_row['edgeid'].values[0]),
                'parent_id': str(add_edge_row['domainnode_id'].values[0]),
                'child_id': str(add_edge_row['edgeid'].values[0]), #str(add_edge_row['rangenode_id'].values[0]),
                'card_label': '',
                'card_sortorder': 0,
                'visible': True,
                'ontologyproperty': ''
            }, index=[0])

            #nodes_df.loc[len(nodes_df.index)] = new_row_df
            # Rebinds nodes_df to a new, longer frame; the loop keeps iterating
            # the rows it started with, while later .loc writes go to the new
            # frame. Every appended row has index label 0 (no ignore_index).
            nodes_df = pd.concat([nodes_df, new_row_df])
            #print(len(new_row))


    return nodes_df.sort_values(by=['card_sortorder'])
    #return new_row_df
#graph = get_graph('af04eac2-a131-11ed-a102-9cf387da2c40')
#graph.to_csv('sipc/out/temp.csv', index=False)
#graph

In [6]:
from collections import defaultdict
from typing import List, Dict


# Function to generate paths
def generate_paths(tree: Dict, node: str, path: List = []):
    """List every root-to-node path in ``tree``, parents before descendants.

    Args:
        tree: Mapping of a parent key to the list of its child keys.
        node: Key to start from.
        path: Keys already walked to reach ``node``. It is never modified; a
            new list is built for each step.

    Returns:
        List of paths in depth-first pre-order. The first entry is the path
        ending at ``node`` itself, followed by the paths of each subtree.
    """
    here = path + [node]
    collected = [here]
    if node not in tree:
        return collected
    for child in tree[node]:
        collected += generate_paths(tree, child, here)
    return collected

def make_uuid_paths(df):
    """Attach the root-to-row id path and its traversal order to every row.

    A parent -> children mapping is built from the ``parent_id`` and
    ``child_id`` columns, walked from the ``'root'`` key with
    ``generate_paths``, and the resulting paths are left-joined onto ``df`` by
    ``id``.

    Args:
        df: DataFrame with ``id``, ``parent_id`` and ``child_id`` columns.

    Returns:
        ``df`` with two extra columns: ``order`` (position of the path in the
        traversal) and ``path`` (list of ids from ``'root'`` down to the row).
        Rows whose id is not reached from ``'root'`` get missing values in
        both.

    Side effects:
        Writes the merged table to ``sipc/out/merged.csv``.
    """
    tree = df.groupby('parent_id')['child_id'].apply(list).to_dict()

    # Generate paths
    paths = generate_paths(tree, 'root')

    # One record per path: the id it ends on, its traversal position, the path.
    paths_df = pd.DataFrame(
        [(path[-1], order, path) for order, path in enumerate(paths)],
        columns=['id', 'order', 'path'],
    )

    # Merge onto the frame that was passed in, not onto the notebook-level
    # `graph` variable, so the function works for any input table.
    merged_df = pd.merge(df, paths_df, on='id', how='left')
    merged_df.to_csv('sipc/out/merged.csv', index=False)
    return merged_df


In [7]:
def add_namespace(path, namespaces=None):
    """Replace a known namespace URI inside ``path`` with its ``prefix:`` form.

    Args:
        path: String that may contain one of the namespace URIs.
        namespaces: Optional mapping of namespace URI -> prefix. Defaults to
            the notebook-level ``ONTOLOGY_NAMESPACES``.

    Returns:
        ``path`` with the longest matching URI replaced by ``'<prefix>:'``.
        When no URI matches, ``path`` is returned unchanged rather than
        ``None``, so callers always get a usable string back.
    """
    if namespaces is None:
        namespaces = ONTOLOGY_NAMESPACES

    matching_uris = [uri for uri in namespaces if uri in path]
    if not matching_uris:
        return path

    # Prefer the most specific URI so the result does not depend on mapping
    # order when one namespace URI is a prefix of another.
    best_uri = max(matching_uris, key=len)
    return path.replace(best_uri, f'{namespaces[best_uri]}:')
    
def make_ns_paths(df):
    """Add an ``ns_path`` column holding each row's path as namespaced names.

    Rows are sorted by ``order`` and re-indexed. For every id in a row's
    ``path`` the first row with that ``id`` supplies the ``elem_name``, which
    is passed through ``add_namespace``; the literal id ``'root'`` becomes
    ``'RDF'``. The resulting list is stored as its ``str()`` form.

    Args:
        df: DataFrame with ``order``, ``path``, ``id`` and ``elem_name``
            columns.

    Returns:
        The sorted, re-indexed copy of ``df`` with the ``ns_path`` column set.
    """
    ordered = df.sort_values('order').reset_index(drop=True)

    for row_label, record in ordered.iterrows():
        labels = []
        for step_id in record['path']:
            if step_id == 'root':
                labels.append('RDF')
                continue
            hits = ordered[ordered['id'] == step_id]
            labels.append(add_namespace(hits['elem_name'].values[0]))
        # Stored as text; consumers parse it back into a list.
        ordered.loc[row_label, 'ns_path'] = str(labels)

    return ordered

In [8]:
def make_rec_defs(graph):
    """Render the namespaced path table as a nested record-definition XML string.

    Rows are processed in ``order``. The length of a row's ``ns_path`` list is
    its depth:

    * depth 2 creates the ``<root tag="RDF">`` document with a
      ``<node-mapping>`` and the model-level ``<elem>`` (tag taken from
      ``model_uuid_list['model_name']``) carrying an ``rdf:about`` ``<attr>``;
    * odd depths (3, 5, ...) create an ``<elem>`` labelled with its own tag
      and marked ``unmappable``;
    * even depths (4, 6, ...) create an ``<elem>`` labelled with the row's
      ``name`` and flagged according to its ``datatype``.

    Each element is appended to the most recently created element one level
    up. Nesting depth is unbounded.

    Args:
        graph: DataFrame with ``order``, ``ns_path`` (a list stored as text),
            ``name``, ``datatype``, ``isrequired`` and ``visible`` columns.

    Returns:
        The pretty-printed XML document as a ``str``.

    Raises:
        ValueError: if a row appears before any element exists one level above
            it, or if no depth-2 row is present (so there is no document root).
    """
    import ast  # local import: parses the stored list text without eval()

    # Extra attributes for node-level (even depth) elements, by datatype.
    datatype_flags = {
        'semantic': (('unmappable', 'true'),),
        'resource-instance': (
            ('uriCheck', 'true'),
            ('attrs', 'rdf:resource'),
            ('simple', 'true'),
        ),
        'concept': (('attrs', 'rdf:resource'), ('simple', 'true')),
    }
    default_flags = (('simple', 'true'),)

    graph = graph.sort_values('order').reset_index(drop=True)

    root = None
    # depth -> element most recently created at that depth
    latest_by_depth = {}

    for _, row in graph.iterrows():
        # ns_path is the str() of a list of names; literal_eval reads it back
        # without executing arbitrary expressions.
        path_list = ast.literal_eval(row['ns_path'])
        depth = len(path_list)
        tag = str(path_list[-1])

        if depth < 2:
            # No element is defined for these rows; report the depth as before.
            print(depth)
            continue

        if depth == 2:
            root = etree.Element("root", tag="RDF")
            etree.SubElement(root, "node-mapping", inputPath="/input")
            model_elem = etree.SubElement(root, "elem", tag=model_uuid_list['model_name'])
            etree.SubElement(model_elem, "attr", uriCheck="True", tag="rdf:about")
            latest_by_depth[2] = model_elem
            continue

        parent = latest_by_depth.get(depth - 1)
        if parent is None:
            raise ValueError(
                f"cannot place {tag!r} at depth {depth}: no element exists at depth {depth - 1}"
            )

        is_property_level = depth % 2 == 1

        attrs = {
            'tag': tag,
            'label': tag if is_property_level else row['name'],
            'datatype': row['datatype'],
        }
        # Elements directly below the model element carry no isrequired flag.
        if depth > 3:
            attrs['isrequired'] = str(row['isrequired']).lower()
        attrs['isvisible'] = str(row['visible']).lower()

        elem = etree.SubElement(parent, "elem", **attrs)

        # Flags always go on the element just created, at every depth.
        if is_property_level:
            elem.set('unmappable', 'true')
        else:
            for key, value in datatype_flags.get(row['datatype'], default_flags):
                elem.set(key, value)

        latest_by_depth[depth] = elem

    if root is None:
        raise ValueError("no depth-2 row found: the record definition has no root element")

    return etree.tostring(root, encoding='utf-8', method='xml', pretty_print=True).decode()



#rec_def = make_rec_defs(ns_path_graph)
#print(rec_def)

In [9]:
#model_uuid_list = [{"model_name": "Museological Item tw", "uuid": "7886ae5e-009a-11ee-93d6-96a6d2455259"}]

# End-to-end run for the model configured in model_uuid_list:
# graph table -> id paths -> namespaced paths -> record-definition XML.
# Each intermediate table is also written to sipc/out/ for inspection.
# NOTE(review): the saved output of this cell is
# "IndexError: list index out of range"; the failing call is not visible in the
# saved output -- re-run to get the traceback.
graph = get_graph(model_uuid_list['uuid'])
graph.to_csv('sipc/out/new_graph.csv', index=False)
# make_uuid_paths() also writes sipc/out/merged.csv as a side effect.
path_graph = make_uuid_paths(graph)
ns_path_graph = make_ns_paths(path_graph)
ns_path_graph.to_csv('sipc/out/ns_path_graph.csv', index=False)
rec_def = make_rec_defs(ns_path_graph)
# print() rather than a bare expression so the XML is shown with real line breaks.
print(rec_def)
#graph




IndexError: list index out of range