    ## Calculating the sub- and super-class aspect measures

In [None]:
## select all triples containing rdfs:subClassOf for ontology A

## Group all subjects of the created triple list --> subclass aspect

## Group all objects of the created triple list --> superclass aspect

## repeat for ontology B

## calculate size of intersection set between A and B on subclass aspect

## calculate size of union set between A and B on subclass aspect

## divide intersection over union

## repeat of superclass aspect

In [2]:

import rdflib
from rdflib import URIRef
from pprint import pprint
import pandas as pd

## Cleaning NCIT ontology

In [29]:
ncit = rdflib.Graph()
ncit.parse("./data/semi_cleaned_NCIT.rdf",format='xml') #Load the ontology with all datatype properties already removed

properties = list(ncit.query(
    'SELECT ?p (COUNT(?o) as ?occurrences) WHERE { ?s ?p ?o. } GROUP BY ?p ORDER BY ?occurrences'
))
for p in properties: # list all the remaining
    print(f'property: {p[0].toPython()} --> occurrences: {p[1].toPython()}')

property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P378 --> occurrences: 1
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A4 --> occurrences: 1
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A12 --> occurrences: 10
property: http://www.w3.org/2000/01/rdf-schema#subPropertyOf --> occurrences: 11
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A3 --> occurrences: 11
property: http://www.w3.org/2002/07/owl#oneOf --> occurrences: 19
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A9 --> occurrences: 28
property: http://www.w3.org/2000/01/rdf-schema#domain --> occurrences: 86
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A1 --> occurrences: 97
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A2 --> occurrences: 97
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A14 --> occurrences: 105
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A30 --> occurrences: 124
property: http://w

In [30]:
# For now, we are only interested in the rdfs:subClassOf relation and ncit:A8, which has the 'Concept_In_Subset' label

pprint(properties[-3])
pprint(properties[-1])

(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
 rdflib.term.Literal('258855', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8'),
 rdflib.term.Literal('345099', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))


In [32]:
# extract the taxonomy structure from the semi-cleaned ontology. Making sure to exclude all blank nodes

query = f'CONSTRUCT {{?s1 <{properties[-3][0]}> ?o1 .}} WHERE {{ ?s1 <{properties[-3][0]}> ?o1. FILTER isIRI(?o1) FILTER isIRI(?s1)}} '

taxonomy = ncit.query(
    query
)

print(len(taxonomy))

205453


In [33]:
# extract the subset structure from the semi-cleaned ontology. Making sure to exclude all blank nodes

query = f'CONSTRUCT {{?s1 <{properties[-1][0]}> ?o1 .}} WHERE {{ ?s1 <{properties[-1][0]}> ?o1. FILTER isIRI(?o1) FILTER isIRI(?s1)}} '

subset = ncit.query(
    query
)

print(len(subset))

345099


In [38]:
# combine the taxonomy and the subset structure into a new graph.

tax_and_subset_KG = rdflib.Graph()

for t in list(taxonomy):
    tax_and_subset_KG.add(t)

# At this point only the taxonomy is stored within the graph
tax_and_subset_KG.serialize('./data/tax_NCIT.ttl', format='ttl')

for t in list(subset):
    tax_and_subset_KG.add(t)


In [36]:
# Save the graph for later usage

tax_and_subset_KG.serialize('./data/tax_and_subset_NCIT.ttl', format='ttl')

<Graph identifier=Nffaac6ca329c467bbf20bd70202950a1 (<class 'rdflib.graph.Graph'>)>

In [12]:
# verifying that the graph has been saved correctly

tax_sub_NCIT = rdflib.Graph()
tax_sub_NCIT.parse("./data/tax_and_subset_NCIT.ttl",format='ttl')

print(len(tax_sub_NCIT))

603954


In [3]:
tax_sub_NCIT.serialize("./data/tax_and_subset_NCIT.owl", format='xml')

<Graph identifier=Na5f7f55ce9d14a748e307bfc34d8e625 (<class 'rdflib.graph.Graph'>)>

In [41]:
# A function for determining the max depth of the ontology. It also shows the top node, which is needed further down the line

def find_max_taxonomy_level(g):
    print(f'current size graph: {len(g)}')
    if len(g) == 1:
        for t in g:
            print('found top node: ')
            pprint(t[2])
    if len(g) == 0:
        return 0
    else:
        subClass = URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')
        query = f'CONSTRUCT {{?o <{subClass}> ?o2}} WHERE {{ ?s <{subClass}> ?o. ?o <{subClass}> ?o2 . }}'
        rest_g = list(g.query(
            query
        ))
        new_g = rdflib.Graph()
        for t in rest_g:
            new_g.add(t)
        return 1 + find_max_taxonomy_level(new_g)


In [42]:
# Run the find depth function on the bare taxonomy

g = rdflib.Graph()
g.parse("./data/tax_NCIT.ttl")

max_level = find_max_taxonomy_level(g)
print(max_level)

current size graph: 205453
current size graph: 39718
current size graph: 13164
current size graph: 6393
current size graph: 3397
current size graph: 1853
current size graph: 1053
current size graph: 604
current size graph: 365
current size graph: 221
current size graph: 127
current size graph: 72
current size graph: 42
current size graph: 26
current size graph: 17
current size graph: 11
current size graph: 8
current size graph: 6
current size graph: 4
current size graph: 3
current size graph: 2
current size graph: 1
found top node: 
rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Thing')
current size graph: 0
22


In [117]:
# The main research task requires determining the taxonomy rank.

tax = rdflib.Graph()
tax.parse("./data/tax_NCIT.ttl")

rank_overview = pd.DataFrame(columns=['rank', 'count', 'tax_remaining'])
node_set = {'http://www.w3.org/2002/07/owl#Thing'} # initialize the set with the top node of the ontology
rank = 0
while (len(node_set) != 0 ):
    print(f'currently at rank: {rank}, analyzing {len(node_set)} nodes. Remaining size taxonomy: {len(tax)}')
    triples_at_rank = []
    for o in node_set: # for all nodes in the current node list, query which subclasses it has. The inheritance structure is maintained by storing the entire triple.
        # print(o)
        triples = list(tax.query(
        f'CONSTRUCT {{?s <http://www.w3.org/2000/01/rdf-schema#subClassOf> <{o}>}} WHERE {{ ?s <http://www.w3.org/2000/01/rdf-schema#subClassOf> <{o}>. }}'
        ))
        triples_at_rank += triples # add all found triples to a list
    df = pd.DataFrame(triples_at_rank, columns=['s','p','o']) # create pandas dataframe of the triple list
    df.to_csv(f'./data/taxonomy_rank_trip_store/rank_{rank}_to_{rank+1}.csv', sep=',', index=False)
    rank_overview = pd.concat([rank_overview, pd.DataFrame(data={'rank':[rank], 'count':[len(df)], 'tax_remaining':[len(tax)]})]) # keep track of some statistics for easier validation
    rank += 1
    node_set = {t[0] for t in triples_at_rank} # create new node list based on the subject of all stored triples within triples_at_rank
    for t in triples_at_rank: # remove the checked triples to prevent loops
        tax.remove(t)

rank_overview.to_csv(f'./data/taxonomy_ranks/rank_overview.csv', sep=',', index=False)

currently at rank: 0, analyzing 1 nodes. Remaining size taxonomy: 205453
currently at rank: 1, analyzing 19 nodes. Remaining size taxonomy: 205434
currently at rank: 2, analyzing 776 nodes. Remaining size taxonomy: 204658
currently at rank: 3, analyzing 13154 nodes. Remaining size taxonomy: 191457
currently at rank: 4, analyzing 23698 nodes. Remaining size taxonomy: 166918
currently at rank: 5, analyzing 38164 nodes. Remaining size taxonomy: 127037
currently at rank: 6, analyzing 29814 nodes. Remaining size taxonomy: 94823
currently at rank: 7, analyzing 22939 nodes. Remaining size taxonomy: 70430
currently at rank: 8, analyzing 21440 nodes. Remaining size taxonomy: 47095
currently at rank: 9, analyzing 28933 nodes. Remaining size taxonomy: 15452
currently at rank: 10, analyzing 8374 nodes. Remaining size taxonomy: 5710
currently at rank: 11, analyzing 3545 nodes. Remaining size taxonomy: 1628
currently at rank: 12, analyzing 1118 nodes. Remaining size taxonomy: 326
currently at rank: 

## Function for inducing changes at ranks

In [203]:
import random
import sys


def pick_alternative_o(options, current, enforce_change):
    """
    Selects a random alternative object from a given set

    :param options: The set of possible options
    :param current: The current object
    :param enforce_change: If true, remove the current object from the options set
    :return: A valid alternative object
    """
    if enforce_change:
        options.remove(current)
    return random.sample(list(options), 1)[0]

def update_tax(tax, changes, enforce_change):
    """
    Function for making adjustments to taxonomy
    :param tax: The taxonomy that is to be altered
    :param changes: Pandas Dataframe containing the columns ['s', 'p', 'old_o', 'new_o']
    :param enforce_change: if true, the changes are guaranteed to make an actual change to the ontology
    :return:
    """
    for index, t in changes.iterrows():
        if enforce_change:
            contains_trip = list(tax.query(
                f'ASK {{ <{t.s}> <{t.p}> <{t.new_o}>}}'
            ))[0]
            if contains_trip:
                print(f'{t.old_o}, {t.new_o}')
            assert  contains_trip != True, "The newly formed triple should not be within the taxonomy"

        tax.add((URIRef(t.s), URIRef(t.p), URIRef(t.new_o)))
        tax.remove((URIRef(t.s), URIRef(t.p), URIRef(t.old_o)))


def change_tax(tax, rank, p, enforce_change = True, rand_seed = 42):
    """
    Function for determining the changes that are to be made to the taxonomy.
    :param tax: the taxonomy that is to be changed
    :param rank: The rank to which the changes should be applied
    :param p: The probability that a subclass relation is changed
    :param enforce_change: Enforce that the intended actually make a change (removes the current object from the set of possible new options)
    :param rand_seed: Set the seed of the sampler
    :return:
    """
    random.seed(rand_seed)

    trips = pd.read_csv(f'./data/taxonomy_rank_trip_store/rank_{rank-1}_to_{rank}.csv', sep=',')

    unique_o_nodes = set(trips['o'])

    mod_trips = trips.sample(frac=p, random_state=random.randint(0, 2**32 - 1))

    mod_trips.rename(columns={'o':'old_o'}, inplace=True)

    print(mod_trips.head())

    mod_trips['new_o'] = [pick_alternative_o(unique_o_nodes.copy(), o, enforce_change) for o in mod_trips['old_o']]

    mod_trips.to_csv(f'./data/change_log/rank_{rank}_to_{rank+1}.csv', sep=',', index=False)

    update_tax(tax, mod_trips, enforce_change)



In [130]:
tax = rdflib.Graph()
tax.parse('./data/tax_NCIT.ttl')



<Graph identifier=N1975672450894e4cb3b72f3901cc4fe2 (<class 'rdflib.graph.Graph'>)>

In [204]:

print(len(tax))
change_tax(tax, 2, 0.01)
print(len(tax))

205446
                                                     s  \
146  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   
343  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   
223  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   
349  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   
597  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...   

                                                   p  \
146  http://www.w3.org/2000/01/rdf-schema#subClassOf   
343  http://www.w3.org/2000/01/rdf-schema#subClassOf   
223  http://www.w3.org/2000/01/rdf-schema#subClassOf   
349  http://www.w3.org/2000/01/rdf-schema#subClassOf   
597  http://www.w3.org/2000/01/rdf-schema#subClassOf   

                                                 old_o  
146  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
343  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
223  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
349  http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus...  
597  http://ncicb.nci.

AssertionError: The newly formed triple should not be within the taxonomy

# Redundant code. clean up later

In [13]:
properties = list(tax_sub_NCIT.query(
    'SELECT ?p (COUNT(?o) as ?occurrences) WHERE { ?s ?p ?o. } GROUP BY ?p'
))
for p in properties:
    print(f'property: {p[0].toPython()} --> occurrences: {p[1].toPython()}')

property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8 --> occurrences: 345099
property: http://www.w3.org/2000/01/rdf-schema#subClassOf --> occurrences: 258855


In [27]:
blank_nodes = tax_sub_NCIT.query(
    f'SELECT ?blank (COUNT(?s) as ?occurrences) WHERE {{ ?s <{properties[1][0]}> ?blank. FILTER (isBlank(?blank)) }} GROUP BY ?blank ORDER BY DESC(?occurrences) LIMIT 20'
)

pprint(list(blank_nodes))

[(rdflib.term.BNode('n6c5f7ee00070458c984da9cbc95aeaaab1'),
  rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))),
 (rdflib.term.BNode('n6c5f7ee00070458c984da9cbc95aeaaab2'),
  rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))),
 (rdflib.term.BNode('n6c5f7ee00070458c984da9cbc95aeaaab3'),
  rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))),
 (rdflib.term.BNode('n6c5f7ee00070458c984da9cbc95aeaaab4'),
  rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))),
 (rdflib.term.BNode('n6c5f7ee00070458c984da9cbc95aeaaab5'),
  rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))),
 (rdflib.term.BNode('n6c5f7ee00070458c984da9cbc95aeaaab6'),
  rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))),
 (rdflib.term.BNode('n6c5f7e

In [20]:
tax = tax_sub_NCIT.query(
    f'CONSTRUCT {{?s <{properties[1][0]}> ?o}}  WHERE {{ ?s <{properties[1][0]}> ?o. FILTER (isIRI(?o)) }} '
)

tax_NCIT = rdflib.Graph()

for t in tax:
    tax_NCIT.add(t)

# tax_NCIT.serialize('./data/tax_NCIT.owl', format='xml')

In [21]:
len(tax)

205453

In [8]:
properties = list(tax_NCIT.query(
    'SELECT ?p (COUNT(?o) as ?occurrences) WHERE { ?s ?p ?o. } GROUP BY ?p'
))
for p in properties:
    print(f'property: {p[0].toPython()} --> occurrences: {p[1].toPython()}')

property: http://www.w3.org/2000/01/rdf-schema#subClassOf --> occurrences: 258855


In [4]:
properties[0]

(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8'),
 rdflib.term.Literal('345099', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))

In [5]:
sets = list(tax_sub_NCIT.query(
    f'SELECT DISTINCT ?o (COUNT(?o) as ?occurences) WHERE {{ ?s <{properties[0][0]}> ?o. }} GROUP BY ?o ORDER BY ?occurences'
))
for s in sets:
    print(f'set: {s[0].toPython()} --> occurences {s[1].toPython()}')
# pprint(sets)

set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C100169 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C100170 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C163030 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C163031 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C100153 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C100154 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C101815 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C101816 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C103094 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C103458 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C103459 --> occurences 1
set: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C103488 --> occurences 1
set: http://ncicb.nci.nih.go

## Code for the semi-cleaned NCIT ontology

<Graph identifier=Nc343f429c97c4f5c88a50281c9a5a85a (<class 'rdflib.graph.Graph'>)>

In [42]:
print(properties[0])
type(
    properties[0][0]
)

(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R100'),)


rdflib.term.URIRef

In [46]:
properties = list(ncit.query(
    'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?p WHERE { ?p rdfs:range ?o.} '
))

property_list = [p[0] for p in properties]

for p in property_list:
    print(f'property: {p.n3()}')

property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R100>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R101>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R102>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R103>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R104>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R111>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R112>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R135>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R136>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R137>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R138>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R145>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R163>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R30>
property: <http://ncicb.nci.nih.gov/xml/owl/EVS/T

In [50]:
query = f'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?p WHERE {{ ?s <{property_list[0]}> ?o.}} '
prop_usage = list(ncit.query(
    query
))
for p in property_list:
    print(f'Currently processing {p}')
    prop_usage = list(ncit.query(
    f'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?p WHERE {{ ?s <{p}> ?o.}} '
    ))
    for u in prop_usage:
        print(f'occurences: {u}')

Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R100
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R101
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R102
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R103
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R104
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R111
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R112
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R135
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R136
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R137
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R138
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R145
Currently processing http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#R163

In [31]:
instance_classes = list(ncit.query(
    'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?o WHERE { ?p a ?o. } '
))
for p in instance_classes:
    print(f'class: {p}')

class: (rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Ontology'),)
class: (rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#Datatype'),)
class: (rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#List'),)
class: (rdflib.term.URIRef('http://www.w3.org/2002/07/owl#ObjectProperty'),)
class: (rdflib.term.URIRef('http://www.w3.org/2002/07/owl#TransitiveProperty'),)
class: (rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'),)
class: (rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Restriction'),)


In [None]:
subclass_relations = list(ncit.query(
    'PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT DISTINCT ?s ?o WHERE { ?s rdfs:subClassOf ?o. FILTER isURI(?o) }'
))