In [1]:

import rdflib
from rdflib import URIRef
from pprint import pprint
import pandas as pd
import os
import copy

## Cleaning NCIT ontology

In [2]:
# Load the semi-cleaned ontology (all datatype properties were removed beforehand).
ncit = rdflib.Graph()
ncit.parse("./data/semi_cleaned_NCIT.rdf", format='xml')

# Count how often every remaining property is used, ordered by ascending usage count.
property_count_query = 'SELECT ?p (COUNT(?o) as ?occurrences) WHERE { ?s ?p ?o. } GROUP BY ?p ORDER BY ?occurrences'
properties = list(ncit.query(property_count_query))

# Report every remaining property together with its number of occurrences.
for p in properties:
    prop_iri, n_occurrences = p[0].toPython(), p[1].toPython()
    print(f'property: {prop_iri} --> occurrences: {n_occurrences}')

property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A4 --> occurrences: 1
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P378 --> occurrences: 1
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A12 --> occurrences: 10
property: http://www.w3.org/2000/01/rdf-schema#subPropertyOf --> occurrences: 11
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A3 --> occurrences: 11
property: http://www.w3.org/2002/07/owl#oneOf --> occurrences: 19
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A9 --> occurrences: 28
property: http://www.w3.org/2000/01/rdf-schema#domain --> occurrences: 86
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A2 --> occurrences: 97
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A1 --> occurrences: 97
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A14 --> occurrences: 105
property: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A30 --> occurrences: 124
property: http://w

In [30]:
# For now, we are only interested in the rdfs:subClassOf relation and ncit:A8, which has the 'Concept_In_Subset' label.
# The two rows are looked up by IRI rather than by their position in the count-sorted list,
# so this cell keeps showing the right properties if the occurrence counts (and thus the ordering) change.

properties_of_interest = {
    URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
    URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8'),
}
for row in properties:
    if row[0] in properties_of_interest:
        pprint(row)

(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
 rdflib.term.Literal('258855', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8'),
 rdflib.term.Literal('345099', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))


In [32]:
# Extract the taxonomy structure (rdfs:subClassOf) from the semi-cleaned ontology, making sure to exclude all blank nodes.
# The predicate is named explicitly instead of being read from properties[-3]: that index only points at
# rdfs:subClassOf as long as the occurrence counts of the loaded ontology keep it in third-to-last position.

SUBCLASS_OF = URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')

query = f'CONSTRUCT {{?s1 <{SUBCLASS_OF}> ?o1 .}} WHERE {{ ?s1 <{SUBCLASS_OF}> ?o1. FILTER isIRI(?o1) FILTER isIRI(?s1)}} '

taxonomy = ncit.query(
    query
)

print(len(taxonomy))

205453


In [33]:
# Extract the subset structure (ncit:A8, 'Concept_In_Subset') from the semi-cleaned ontology, making sure to exclude all blank nodes.
# The predicate is named explicitly instead of being read from properties[-1]: that index only points at
# ncit:A8 as long as it remains the most frequently used property in the loaded ontology.

CONCEPT_IN_SUBSET = URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8')

query = f'CONSTRUCT {{?s1 <{CONCEPT_IN_SUBSET}> ?o1 .}} WHERE {{ ?s1 <{CONCEPT_IN_SUBSET}> ?o1. FILTER isIRI(?o1) FILTER isIRI(?s1)}} '

subset = ncit.query(
    query
)

print(len(subset))

345099


In [38]:
# Build one graph holding both the taxonomy and the subset structure.

tax_and_subset_KG = rdflib.Graph()

# Stage 1: copy over the subclass triples and write that intermediate state to disk,
# because at this point the graph contains the bare taxonomy only.
for triple in taxonomy:
    tax_and_subset_KG.add(triple)
tax_and_subset_KG.serialize(destination='./data/tax_NCIT.ttl', format='ttl')

# Stage 2: extend the same graph with the subset triples.
for triple in subset:
    tax_and_subset_KG.add(triple)


In [36]:
# Persist the combined taxonomy + subset graph as Turtle so that later cells can reload it

tax_and_subset_KG.serialize(destination='./data/tax_and_subset_NCIT.ttl', format='ttl')

<Graph identifier=Nffaac6ca329c467bbf20bd70202950a1 (<class 'rdflib.graph.Graph'>)>

In [12]:
# Sanity check: reload the serialized graph from disk and report how many triples it contains

tax_sub_NCIT = rdflib.Graph()
tax_sub_NCIT.parse(source="./data/tax_and_subset_NCIT.ttl", format='ttl')

print(len(tax_sub_NCIT))

603954


In [3]:
# Additionally export the reloaded graph in RDF/XML form
tax_sub_NCIT.serialize(destination="./data/tax_and_subset_NCIT.owl", format='xml')

<Graph identifier=Na5f7f55ce9d14a748e307bfc34d8e625 (<class 'rdflib.graph.Graph'>)>

In [4]:
# A function for determining the max depth of the ontology. It also shows the top node, which is needed further down the line

def find_max_taxonomy_level(g):
    """
    Determine how many reduction steps are needed to collapse a subclass graph to its top.

    Every step keeps only those rdfs:subClassOf triples whose subject is itself the
    superclass of something in the current graph, i.e. it strips the lowest layer.
    The size of the graph is printed before every step, and the object of the last
    remaining triple is printed as the top node.

    :param g: rdflib graph containing rdfs:subClassOf triples
    :return: the number of reduction steps that produced a non-empty graph
    :raises ValueError: if a step does not shrink the graph, which means the
        subclass relation contains a cycle and the reduction would never end
    """
    subClass = URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')
    query = f'CONSTRUCT {{?o <{subClass}> ?o2}} WHERE {{ ?s <{subClass}> ?o. ?o <{subClass}> ?o2 . }}'

    level = 0
    while True:
        size = len(g)
        print(f'current size graph: {size}')

        if size == 0:
            # Only reachable for an empty input graph: there is nothing to reduce.
            print('empty graph: no top node found')
            return level

        if size == 1:
            for t in g:
                print('found top node: ')
                pprint(t[2])
            return level

        # Strip the lowest layer: keep the triples whose subject still has subclasses.
        new_g = rdflib.Graph()
        for t in g.query(query):
            new_g.add(t)

        if len(new_g) == 0:
            # Several triples remain but none of their subjects has subclasses, so the
            # graph can never shrink to exactly one triple. Report the remaining
            # superclasses instead of reducing forever.
            print('found top node(s): ')
            for top in {t[2] for t in g}:
                pprint(top)
            return level

        if len(new_g) >= size:
            # The reduced graph is a subset of the current one; if nothing was removed
            # every subject has a subclass, which is only possible with a cycle.
            raise ValueError('subclass graph contains a cycle; the taxonomy depth is undefined')

        g = new_g
        level += 1


In [5]:
# Determine the depth of the bare taxonomy (subclass relations only)

g = rdflib.Graph()
g.parse(source="./data/tax_NCIT.ttl")

max_level = find_max_taxonomy_level(g)
print(max_level)

current size graph: 205453
current size graph: 39718
current size graph: 13164
current size graph: 6393
current size graph: 3397
current size graph: 1853
current size graph: 1053
current size graph: 604
current size graph: 365
current size graph: 221
current size graph: 127
current size graph: 72
current size graph: 42
current size graph: 26
current size graph: 17
current size graph: 11
current size graph: 8
current size graph: 6
current size graph: 4
current size graph: 3
current size graph: 2
current size graph: 1
found top node: 
rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Thing')
21


## Determining the nodes at different ranks

In [3]:
# The main research task requires determining the taxonomy rank.
# Starting at owl:Thing (rank 0), the subclass triples pointing at the nodes of the current rank are
# collected, written to one CSV per rank, and removed from the graph before moving one rank down.

SUBCLASS_OF = URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')

tax = rdflib.Graph()
tax.parse("./data/tax_NCIT.ttl") # only load in the taxonomy, as that is the only information relevant for this task.

# Create the output directories once, instead of re-checking them on every iteration
trip_store_dir = './data/taxonomy_rank_trip_store/'
os.makedirs(trip_store_dir, exist_ok=True)
rank_overview_dir = './data/taxonomy_ranks/'
os.makedirs(rank_overview_dir, exist_ok=True)

# One record per rank; the overview dataframe is built once after the loop
# (growing a dataframe with pd.concat inside the loop copies it on every iteration).
rank_records = []
node_set = {URIRef('http://www.w3.org/2002/07/owl#Thing')} # initialize the set with the top node of the ontology
rank = 0 # Initialize the current rank to be 0
while node_set:
    print(f'currently at rank: {rank}, analyzing {len(node_set)} nodes. Remaining size taxonomy: {len(tax)}')
    triples_at_rank = []
    for o in node_set:
        # All direct subclasses of the node. The entire triple is stored to maintain the inheritance structure.
        # A plain triple-pattern lookup returns the same triples as a per-node SPARQL CONSTRUCT, without parsing a query each time.
        triples_at_rank.extend(tax.triples((None, SUBCLASS_OF, URIRef(o))))
    df = pd.DataFrame(triples_at_rank, columns=['s','p','o']) # create pandas dataframe of the triple list
    df.to_csv(f'{trip_store_dir}rank_{rank+1}_to_{rank}.csv', sep=',', index=False)
    rank_records.append({'rank': rank+1, 'count': len(df), 'tax_remaining': len(tax)}) # keep track of some statistics for easier validation
    rank += 1
    node_set = {t[0] for t in triples_at_rank} # the subjects of the stored triples form the next rank
    for t in triples_at_rank: # remove the checked triples to prevent loops
        tax.remove(t)

rank_overview = pd.DataFrame(rank_records, columns=['rank', 'count', 'tax_remaining'])
rank_overview.to_csv(f'{rank_overview_dir}rank_overview.csv', sep=',', index=False)

currently at rank: 0, analyzing 1 nodes. Remaining size taxonomy: 205453
currently at rank: 1, analyzing 19 nodes. Remaining size taxonomy: 205434
currently at rank: 2, analyzing 776 nodes. Remaining size taxonomy: 204658
currently at rank: 3, analyzing 13154 nodes. Remaining size taxonomy: 191457
currently at rank: 4, analyzing 23698 nodes. Remaining size taxonomy: 166918
currently at rank: 5, analyzing 38164 nodes. Remaining size taxonomy: 127037
currently at rank: 6, analyzing 29814 nodes. Remaining size taxonomy: 94823
currently at rank: 7, analyzing 22939 nodes. Remaining size taxonomy: 70430
currently at rank: 8, analyzing 21440 nodes. Remaining size taxonomy: 47095
currently at rank: 9, analyzing 28933 nodes. Remaining size taxonomy: 15452
currently at rank: 10, analyzing 8374 nodes. Remaining size taxonomy: 5710
currently at rank: 11, analyzing 3545 nodes. Remaining size taxonomy: 1628
currently at rank: 12, analyzing 1118 nodes. Remaining size taxonomy: 326
currently at rank: 

## Function for inducing changes at ranks

In [3]:
import random

def change_tax(tax, rank, p, enforce_change = True, rand_seed = 42):
    """
    Select a random fraction of the subclass triples of one rank, give each a new
    object and apply those changes to the taxonomy.

    The triples are read from './data/taxonomy_rank_trip_store/rank_{rank}_to_{rank-1}.csv'
    and the selected changes are written to './data/change_log/rank_{rank}_to_{rank-1}/{p}.csv'.

    :param tax: the taxonomy that is to be changed (modified in place)
    :param rank: The rank to which the changes should be applied
    :param p: The fraction of subclass relations of this rank that is changed
    :param enforce_change: Enforce that each change actually alters the triple (removes the current object from the set of possible new options)
    :param rand_seed: Set the seed of the sampler
    :return: None
    """

    random.seed(rand_seed) # Setting the seed for reproducibility

    # Load the set of triples that can be changed
    trips = pd.read_csv(f'./data/taxonomy_rank_trip_store/rank_{rank}_to_{rank-1}.csv', sep=',')

    # The unique 'object' entities, used for determining the new object.
    # They are kept as a sorted list: the iteration order of a set of strings differs between
    # interpreter sessions, which would make the picks irreproducible despite the fixed seed.
    unique_o_nodes = sorted(set(trips['o']))

    # Sample a random subset from the selected triples. These are the triples that will be changed.
    # The 'o(bject)' column is renamed to 'old_o(bject)'.
    mod_trips = trips.sample(frac=p, random_state=random.randint(0, 2**32 - 1)).rename(columns={'o': 'old_o'})

    # Create a new column 'new_o(bject)' where the newly selected object is stored.
    # A copy of the candidates is handed over so the shared list can never be altered by the picker.
    mod_trips['new_o'] = [pick_alternative_o(unique_o_nodes.copy(), o, enforce_change) for o in mod_trips['old_o']]

    # Update the taxonomy with the selected changes.
    update_tax(tax, mod_trips, enforce_change)

    # Write the selected changes into a change log for reproducibility
    file_dir = f'./data/change_log/rank_{rank}_to_{rank-1}/'
    os.makedirs(file_dir, exist_ok=True)
    mod_trips.to_csv(file_dir+f'{p}.csv', sep=',', index=False)

    print(f"Total number of changes between rank {rank} and {rank-1}: {len(mod_trips)}")



def pick_alternative_o(options, current, enforce_change):
    """
    Selects a random alternative object from a given collection.

    The candidates are sorted before drawing, so for a given state of the `random`
    module the result does not depend on the iteration order of `options`
    (which for a set of strings differs between interpreter sessions).
    `options` itself is left unmodified.

    :param options: The collection of possible options (mutually comparable values, e.g. strings)
    :param current: The current object
    :param enforce_change: If true, the current object is excluded from the candidates
    :return: A valid alternative object
    :raises ValueError: if no candidate is left to choose from
    """

    # If we need to ensure the new ontology is different from the old one, leave the current 'object' out of the candidates.
    if enforce_change:
        candidates = [o for o in options if o != current]
    else:
        candidates = list(options)

    if not candidates:
        raise ValueError(f'no alternative object available for {current!r}')

    candidates.sort() # fixed order -> reproducible draw for a fixed seed
    return random.choice(candidates) # Sample a new object.


def update_tax(tax, changes, enforce_change):
    """
    Function for making adjustments to the taxonomy: every listed triple (s, p, old_o)
    is replaced by (s, p, new_o).

    :param tax: The taxonomy that is to be altered (modified in place)
    :param changes: Pandas Dataframe containing the columns ['s', 'p', 'old_o', 'new_o'].
        If enforce_change is true, an integer column 'New_triple_already_exists' (0/1) is added to it.
    :param enforce_change: if true, it is checked (and reported) whether each new triple was
        already present in the taxonomy before the change
    :return: None
    """

    already_present = [] # one 0/1 flag per change, only filled when enforce_change is set

    # Loop over all provided changes
    for t in changes.itertuples(index=False):
        old_triple = (URIRef(t.s), URIRef(t.p), URIRef(t.old_o))
        new_triple = (URIRef(t.s), URIRef(t.p), URIRef(t.new_o))

        if enforce_change: # If the change should result in a different ontology, perform additional checks.
            exists = new_triple in tax # is the new triple already present within the ontology?
            if exists: # If so, something is wrong and this should be fixed
                print(f'{t.old_o}, {t.new_o}')
            already_present.append(int(exists))

        # Remove first, then add: when new_o equals old_o (possible without enforce_change)
        # the triple is kept, whereas adding first and removing afterwards would delete it.
        tax.remove(old_triple)
        tax.add(new_triple)

    if enforce_change:
        # Assign the flags in one go; this also creates the column when there are no changes at all.
        changes['New_triple_already_exists'] = already_present
        print(f'{sum(already_present)} modified triplets were already present within the taxonomy')



In [4]:
# Experiment settings
change_ratio = 0.1
rand_seed = 42
graph_name = 'tax_NCIT'

# Load the unmodified taxonomy once; every rank works on its own copy of it
tax = rdflib.Graph()
tax.parse(f'./data/{graph_name}.ttl')

# Modify the subclass relations between entities of rank 2 and 1, 3 and 2, etc., all the way to rank 15 and 14
for rank in range(2, 16):
    # Deep copy, so that the changes made for one rank are not carried over into the graph of another rank
    tax_copy = copy.deepcopy(tax)
    change_tax(tax_copy, rank, change_ratio, rand_seed=rand_seed)

    # Store the modified graph in a directory per rank, named after the change ratio
    file_dir = f'./data/modified_graphs/{graph_name}/rank_{rank}_to_{rank-1}/'
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)
    file_name = f'{file_dir}{change_ratio}.ttl'
    tax_copy.serialize(file_name, format='ttl')

0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 2 and 1: 78
0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 3 and 2: 1320
0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 4 and 3: 2454
0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 5 and 4: 3988
0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 6 and 5: 3221
0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 7 and 6: 2439
0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 8 and 7: 2334
0.0 modified triplets were already present within the taxonomy
Total number of changes between rank 9 and 8: 3164
0.0 modified triplets were already present within the taxonomy
Total number of changes bet