In [1]:
import os
import random
import string
import re
import requests
import gzip
import shutil
import json
import types
import datetime
import decimal
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint = "http://localhost:9999/blazegraph/sparql"  # SPARQL endpoint hosting previous version of ITO.owl
prefixes = """
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix ito: <https://identifiers.org/ito:>
prefix edam: <http://edamontology.org/>
prefix obo: <http://www.geneontology.org/formats/oboInOwl#>
prefix dc: <http://purl.org/dc/elements/1.1/>
"""


def query(query, return_format = JSON):
    """Send a SPARQL request to the module-level ``endpoint`` and return the converted response.

    The shared ``prefixes`` header is prepended to ``query``. The request is
    sent with the POST method and the response is converted according to
    ``return_format``.
    """
    client = SPARQLWrapper(endpoint)
    client.method = 'POST'
    client.setReturnFormat(return_format)
    full_query = prefixes + query
    client.setQuery(full_query)
    return client.query().convert()


def query_df(query, numeric_cols=()):
    """Run a SPARQL SELECT query and return the result as a pandas DataFrame.

    Args:
        query: SPARQL query text; the module-level ``prefixes`` are prepended.
        numeric_cols: names of result columns to convert with ``pd.to_numeric``.
            The default is an immutable empty tuple rather than a shared
            mutable list.

    Returns:
        A DataFrame with one column per variable in the result head and one
        row per binding; variables unbound in a row are ``None``.
    """
    sparql = SPARQLWrapper(endpoint)
    sparql.method = 'POST'
    sparql.setReturnFormat(JSON)
    sparql.setQuery(prefixes + query)
    results = sparql.query()
    # The raw HTTP response is parsed directly as SPARQL JSON results.
    processed_results = json.load(results.response)
    cols = processed_results['head']['vars']

    rows = [
        [binding.get(col, {}).get('value') for col in cols]
        for binding in processed_results['results']['bindings']
    ]

    df = pd.DataFrame(rows, columns=cols)
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col])

    return df


def query_if_iri_exists(iri):
    """Return the boolean result of an ASK query testing whether ``iri`` is used in <tag:ITO>.

    The IRI counts as existing when it appears as subject, predicate or
    object of at least one triple.
    """
    head = """ASK FROM <tag:ITO> {
    VALUES (?r) { (<"""
    tail = """>) }
        { ?r ?p ?o }
        UNION
        { ?s ?r ?o }
        UNION
        { ?s ?p ?r }
    } """
    answer = query(head + iri + tail)
    return answer['boolean']


def query_entites_with_pwc_label(pwc_label):
    """Return the IRI of the entity whose ito:papers_with_code_id equals ``pwc_label``.

    Returns ``None`` when no entity matches and raises ``Exception`` when
    more than one entity carries the same id.
    """
    # .strip() added because some part of process (WebProtege?) seem to strip whitespace without user intent
    needle = escape(pwc_label.strip())
    q = 'SELECT ?x FROM <tag:ITO> WHERE { ?x ito:papers_with_code_id "' + needle + '" }'
    matches = query(q)['results']['bindings']
    if len(matches) > 1:
        raise Exception(
            f"query_entites_with_pwc_label: It seems like there are multiple existing entities for {pwc_label}!")
    if not matches:
        return None
    return matches[0]['x']['value']


def query_label(iri):
    """Return the first rdfs:label found for ``iri`` in <tag:ITO>, or ``None`` if it has none."""
    select = 'SELECT ?label FROM <tag:ITO> WHERE { <' + iri + '> rdfs:label ?label }'
    bindings = query(select)['results']['bindings']
    if not bindings:
        return None
    return bindings[0]['label']['value']


def add_triple(raw_sparql_fragment):
    """Insert the given raw SPARQL triple text into the <tag:ITO> named graph."""
    update = "".join(("INSERT DATA { GRAPH <tag:ITO> { ", raw_sparql_fragment, " }  } "))
    query(update)


def find_new_iri_fragment(prefix, number_format='{:05}'):
    """Return an IRI of the form ``prefix + number`` that is not yet used in the graph.

    The search starts at the module-level counter ``iri_increment`` and
    advances it (as a side effect) past every number whose IRI already
    exists, so later calls do not re-test IRIs known to be taken.

    Args:
        prefix: text placed in front of the formatted number.
        number_format: ``str.format`` template applied to the counter.

    Returns:
        The first candidate IRI for which ``query_if_iri_exists`` is false.
    """
    global iri_increment
    candidate_iri = prefix + number_format.format(iri_increment)
    while query_if_iri_exists(candidate_iri):
        iri_increment += 1
        candidate_iri = prefix + number_format.format(iri_increment)
    return candidate_iri


def clean_metric_value(metric_value):
    """Convert a raw metric value (e.g. ``"85.3%"``, ``"22M"``, ``"1.5k"``) to a float.

    Percent signs and surrounding whitespace are removed, and a trailing
    ``m``/``M`` or ``k``/``K`` multiplies the number by 1,000,000 or 1,000.

    Args:
        metric_value: the raw value; non-string values are converted with ``str``.

    Returns:
        The value as a finite ``float``.

    Raises:
        ValueError: if the value is ``None``, empty after clean-up, not
            parseable as a number, or not finite (NaN / infinity).
        decimal.InvalidOperation: if the part before an m/k suffix is not
            a valid decimal number.
    """
    import math  # local import: `math` is not among this file's top-level imports

    if metric_value is None:
        raise ValueError("Metric value empty")

    # Clean up BEFORE the emptiness check, so inputs such as " " or "%"
    # raise ValueError instead of an IndexError on the suffix lookup below.
    metric_value = str(metric_value).replace("%", "").strip()
    if not metric_value:
        raise ValueError("Metric value empty")

    suffix = metric_value[-1]
    if suffix in ('m', 'M'):
        # convert m/M postfixes to numbers (e.g. 22M)
        number = decimal.Decimal(metric_value[:-1]) * decimal.Decimal('1000000')
    elif suffix in ('k', 'K'):
        # convert k/K postfixes to numbers (e.g. 22k)
        number = decimal.Decimal(metric_value[:-1]) * decimal.Decimal('1000')
    else:
        number = metric_value

    result = float(number)
    if not math.isfinite(result):
        # "nan"/"inf" parse as floats but are not usable metric values.
        raise ValueError(f"Metric value not finite: {metric_value}")
    return result


def clean_ontology():
    """Run a series of SPARQL updates that tidy the class hierarchy in <tag:ITO>.

    Steps, in order:
      1. Delete subClassOf links to a superclass that is also reachable
         through another direct superclass.
      2. Delete subClassOf owl:Thing links from classes that have another
         superclass.
      3. Re-parent classes below ito:ITO_01625 that have no benchmark
         subclass in their subtree to ito:ITO_00492.
      4. Merge entities that share the same ito:papers_with_code_id value
         by re-pointing every triple from the duplicates to one kept IRI.
    """
    query("""
        WITH <tag:ITO>
        DELETE { ?class rdfs:subClassOf ?redundant_superclass . }
        WHERE {
            ?class rdfs:subClassOf ?redundant_superclass .
            ?class rdfs:subClassOf ?intermediate_class .
            ?intermediate_class rdfs:subClassOf+ ?redundant_superclass .
        }
        """)

    query("""
        WITH <tag:ITO>
        DELETE { ?class rdfs:subClassOf owl:Thing . }
        WHERE {
            ?class rdfs:subClassOf ?someClass .
            ?class rdfs:subClassOf owl:Thing .
            FILTER (!sameTerm(?someClass, owl:Thing))
        }""")

    query("""
        prefix ito: <https://identifiers.org/ito:>

        WITH <tag:ITO>
        DELETE { ?class rdfs:subClassOf ?superclass . }
        INSERT { ?class rdfs:subClassOf ito:ITO_00492 . }
        WHERE {
        ?class rdfs:subClassOf+ ito:ITO_01625 . 
        ?class rdfs:subClassOf ?superclass . 
        FILTER NOT EXISTS { 
            ?subclass rdfs:subClassOf* ?class .
            ?subclass rdfs:subClassOf ito:Benchmarking . } 
        }""")

    # Merge entities with the same papers_with_code_id value.

    df = query_df('''
        select (GROUP_CONCAT(?subject1) as ?subject1_concat) ?object from <tag:ITO> where { 
        ?subject1 ito:papers_with_code_id ?object .
        ?subject2 ito:papers_with_code_id ?object .
        filter ( ?subject1 != ?subject2 )
        }
        GROUP BY ?object
        LIMIT 1000

        ''')
    for _, row in df.iterrows():
        concatenated = row['subject1_concat']
        if not concatenated:
            continue
        # GROUP_CONCAT (without DISTINCT) lists a subject once per join
        # partner, so with three or more duplicates the same IRI is repeated
        # and the first two entries may be identical. De-duplicate while
        # preserving order, then merge every remaining IRI into the first.
        iris = list(dict.fromkeys(concatenated.split(' ')))
        kept_iri = iris[0]
        for duplicate_iri in iris[1:]:
            # Triples with the duplicate as subject.
            query("""
                WITH <tag:ITO>
                DELETE {<""" + duplicate_iri + """> ?p ?o}
                INSERT {<""" + kept_iri + """> ?p ?o}
                WHERE  {<""" + duplicate_iri + """> ?p ?o}
                """)

            # Triples with the duplicate as object.
            query("""
                WITH <tag:ITO>
                DELETE { ?s ?p <""" + duplicate_iri + """>}
                INSERT { ?s ?p <""" + kept_iri + """>}
                WHERE  { ?s ?p <""" + duplicate_iri + """>}
                """)

            # Triples with the duplicate as predicate.
            query("""
                WITH <tag:ITO>
                DELETE { ?s <""" + duplicate_iri + """> ?o}
                INSERT { ?s <""" + kept_iri + """> ?o}
                WHERE  { ?s <""" + duplicate_iri + """> ?o}
                """)


def escape(s):
    """Backslash-escape quotes, backslashes, CR and LF in ``s`` for use inside a SPARQL string literal."""
    replacements = {
        "\\": r"\\",
        "'": r"\'",
        '"': r'\"',
        "\n": r"\n",
        "\r": r"\r",
    }
    # Each character is looked up independently, so the order of the mapping is irrelevant.
    return "".join(replacements.get(char, char) for char in s)

In [3]:
def create_new_class_from_paperswithcode(class_pwc_label, label=None, superclass="http://www.w3.org/2002/07/owl#Thing", entity_type="class", random_iri=False, flag_as_imported=True):
    """Return the IRI of the entity for a paperswithcode label, creating it if needed.

    An entity is looked up by its ito:papers_with_code_id. If none exists, a
    new class, property or individual is inserted into <tag:ITO> together
    with its creation date, label, papers_with_code_id and dc:source.

    Args:
        class_pwc_label: paperswithcode identifier of the entity.
        label: rdfs:label to use; defaults to ``class_pwc_label``.
        superclass: IRI of the superclass (class), super-property (property)
            or type (individual).
        entity_type: ``"class"``, ``"property"`` or ``"individual"``.
        random_iri: if true, use a random ``ITO_i…`` IRI instead of the next
            free numbered IRI.
        flag_as_imported: if true, mark the entity as imported and, for
            classes and properties, as requiring curation.

    Returns:
        The IRI of the existing or newly created entity.

    Raises:
        ValueError: if ``entity_type`` is not one of the supported values.
    """
    # Quick fix for rare cases where entities contain '*' wildcard character (historic, was required for Owlready2)
    class_pwc_label = class_pwc_label.replace("*", "_")

    existing_matching_classes = query_entites_with_pwc_label(class_pwc_label)
    if existing_matching_classes is not None:
        # logfile.write(f"INFO: An entity for {class_pwc_label} already exists ({existing_matching_classes}) -- skipping entity creation\n")
        return existing_matching_classes

    if random_iri:
        # Use the full namespace so the result is an absolute IRI, like the
        # numbered IRIs produced in the other branch.
        iri = 'https://identifiers.org/ito:ITO_i' + \
              ''.join(random.choices(string.ascii_letters + string.digits, k=16))
    else:
        iri = find_new_iri_fragment('https://identifiers.org/ito:ITO_')

    logfile.write(
        f"INFO: Creating an new entity for {class_pwc_label} ({iri}) \n")

    if entity_type == "individual":
        add_triple(f"<{iri}> a <{superclass}>")

    elif entity_type == "property":
        add_triple(f"<{iri}> rdfs:subPropertyOf  <{superclass}>")
        add_triple(f"<{iri}> a owl:DatatypeProperty")
        if flag_as_imported:
            add_triple(f"<{iri}> rdfs:subPropertyOf ito:Data_property_requiring_curation")

    elif entity_type == "class":
        add_triple(f"<{iri}> rdfs:subClassOf  <{superclass}>")
        add_triple(f"<{iri}> a owl:Class")
        if flag_as_imported:
            add_triple(f"<{iri}> rdfs:subClassOf ito:Class_requiring_curation")

    else:
        raise ValueError('Illegal entity_type argument')

    if flag_as_imported:
        add_triple(f"<{iri}> edam:refactor_comment 'IMPORTED'")

    add_triple(f"<{iri}> obo:creation_date '{str(datetime.datetime.now().isoformat())}'^^xsd:dateTime")

    logfile.write(
        f"INFO: Created new entity for {class_pwc_label} ({iri})\n")

    if label:
        add_triple(f"<{iri}> rdfs:label '{escape(label.strip())}'")
    else:
        add_triple(f"<{iri}> rdfs:label '{escape(class_pwc_label.strip())}'")

    # Store the id stripped: query_entites_with_pwc_label() strips the label
    # before matching, so an unstripped id would never be found again and a
    # duplicate entity would be created on the next call.
    add_triple(f"<{iri}> ito:papers_with_code_id '{escape(class_pwc_label.strip())}'")

    add_triple(f"<{iri}> dc:source ito:Papers_with_code")

    return iri


def add_superclass(myclass, superclass):
    """Assert ``myclass rdfs:subClassOf superclass`` unless both IRIs are equal."""
    if myclass == superclass:
        # A class is never made a subclass of itself.
        return
    add_triple(f"<{myclass}> rdfs:subClassOf <{superclass}>")

In [3]:
# Start from a clean state: drop the <tag:ITO> graph, then load ITO.owl from
# the local file system into it.
# NOTE(review): the file path is machine-specific and must be adapted per user.
query("DROP GRAPH <tag:ITO>")
query("""
LOAD <file:///D:/Documents/Projekte/AI%20Strategies/ITO/ITO.owl> INTO GRAPH <tag:ITO>
""")

b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><meta http-equiv="Content-Type" content="text&#47;html;charset=UTF-8"><title>blazegraph&trade; by SYSTAP</title\n></head\n><body<p>totalElapsed=12764ms, elapsed=12764ms, connFlush=0ms, batchResolve=0, whereClause=0ms, deleteClause=0ms, insertClause=0ms</p\n><hr><p>COMMIT: totalElapsed=16810ms, commitTime=1628606373733, mutationCount=688346</p\n></html\n>'

In [5]:
# Total triple count in <tag:ITO> before the purge (compare with the count taken after it)

query_df("SELECT (count(*) as ?triple_count) FROM <tag:ITO> WHERE {?s ?p ?o}")

Unnamed: 0,triple_count
0,101572


In [6]:
# Purge existing benchmark result data / pre-July 2021 entities that became obsolete through new modeling

# 1) Delete all triples whose subject is typed with a class that is a
#    (transitive) subclass of ito:Software.
#    NOTE(review): 'a/rdfs:subClassOf+' needs at least one subClassOf step, so
#    subjects typed directly as ito:Software are not matched -- confirm intended.
query("""
WITH <tag:ITO>
DELETE { ?s ?p ?o }
WHERE {
?s a/rdfs:subClassOf+ ito:Software .
?s ?p ?o .
}
""")

# 2) Delete all triples whose subject is itself a (transitive) subclass of ito:Software.
query("""
WITH <tag:ITO>
DELETE { ?s ?p ?o }
WHERE {
?s rdfs:subClassOf+ ito:Software .
?s ?p ?o .
}
""")

# 3) Delete all triples whose subject is typed with a (transitive) subclass
#    of ito:Benchmarking. The benchmark classes themselves are not matched by
#    this pattern.
query("""
WITH <tag:ITO>
DELETE { ?s ?p ?o }
WHERE {
?s a/rdfs:subClassOf+ ito:Benchmarking .
?s ?p ?o .
}
""")



b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><meta http-equiv="Content-Type" content="text&#47;html;charset=UTF-8"><title>blazegraph&trade; by SYSTAP</title\n></head\n><body<p>totalElapsed=338ms, elapsed=336ms, connFlush=0ms, batchResolve=0, whereClause=335ms, deleteClause=0ms, insertClause=335ms</p\n><hr><p>COMMIT: totalElapsed=361ms, commitTime=1628147160555, mutationCount=0</p\n></html\n>'

In [7]:
# Tidy the class hierarchy and merge duplicate entities after the purge.
clean_ontology()

In [8]:
# Total triple count in <tag:ITO> after the purge and clean-up

query_df("SELECT (count(*) as ?triple_count) FROM <tag:ITO> WHERE {?s ?p ?o}")

Unnamed: 0,triple_count
0,65426


In [9]:
# Download and read PWC data dump
#
# The compressed dump is downloaded only if it is not already on disk. It is
# (re-)extracted after a fresh download or when the extracted JSON file is
# missing, and the JSON is then parsed into `data`.

filename = './evaluation-tables.json.gz'
json_filename = './evaluation-tables.json'

freshly_downloaded = False
if not os.path.exists(filename):
    url = 'https://paperswithcode.com/media/about/evaluation-tables.json.gz'
    myfile = requests.get(url)
    # Fail loudly rather than saving an HTTP error page as the dump.
    myfile.raise_for_status()
    with open(filename, 'wb') as f_out:
        f_out.write(myfile.content)
    freshly_downloaded = True

# Also extract when the archive exists but the JSON does not (e.g. after an
# interrupted earlier run); otherwise the open() below would fail.
if freshly_downloaded or not os.path.exists(json_filename):
    with gzip.open(filename, 'rb') as f_in, open(json_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Read explicitly as UTF-8 instead of the platform default encoding
# (assumes the dump is UTF-8 encoded JSON).
with open(json_filename, encoding='utf-8') as f:
    data = json.load(f)


In [10]:
# Set this value based on the highest IRI number in the current ontology to save some startup time
# (find_new_iri_fragment() starts probing for free IRIs at this number).
iri_increment = 17000
# Log file written to by the import functions below.
# NOTE(review): no close()/flush() of this handle is visible in this notebook.
logfile = open("logfile.txt", "w", encoding="utf-8")
# Number of result rows skipped during import (incremented in process_task).
failed_rows = 0

In [11]:
# IRI used as the superclass of every task category class created in create_task_class().
ai_process_class = "https://identifiers.org/ito:ITO_01625"

def create_task_class(task, depth=0, superclass=None):
    """Create the class for ``task`` and, recursively, for all of its subtasks.

    The task class becomes a subclass of each of its category classes (which
    are created below ``ai_process_class``) and of ``superclass`` when one is
    given. ``depth`` is only passed along to the recursive calls.
    """
    logfile.write(
        f"INFO: Processing task to create class hierarchy: {task['task']}\n")

    # Create class
    task_iri = create_new_class_from_paperswithcode(task['task'])

    for category in task['categories']:
        category_iri = create_new_class_from_paperswithcode(category, superclass=ai_process_class)
        # in some cases category name and task name are equal; this would create a cycle that we need to avoid
        if category_iri == task_iri:
            continue
        add_superclass(task_iri, category_iri)

    if superclass:
        add_superclass(task_iri, superclass)

    child_depth = depth + 1
    for subtask in task['subtasks']:
        create_task_class(subtask, child_depth, superclass=task_iri)


# First pass over the PWC dump: build the task class hierarchy for every
# top-level task (subtasks are handled recursively by create_task_class).
for task in data:
    #logfile.write(f"INFO: Creating task class for  {task['task']}\n")
    create_task_class(task)



In [12]:
# Tidy the hierarchy again now that the task classes have been created.
clean_ontology()

In [13]:
# PWC benchmark data contains duplicates, we make a set of hashes of results to avoid adding the same result more than once.
hashed_result_rows = set()

# IRI used as the type of article individuals created in process_task().
article_class = "http://edamontology.org/data_0971"

def _add_result_row(row, benchmark_class, dataset_entity):
    """Write one PWC result row as an individual of ``benchmark_class``.

    Rows without a model name are counted in ``failed_rows`` and skipped
    before anything is written, so no orphaned row individual is left behind.
    """
    global failed_rows

    if not row['model_name']:
        logfile.write("WARNING: Model name missing, skipping row\n")
        failed_rows += 1
        return

    row_entity = 'https://identifiers.org/ito:ITO_i' + ''.join(random.choices(string.ascii_letters + string.digits, k=16)) # create random IRI
    add_triple(f"<{row_entity}> a <{benchmark_class}>")

    for metric, value in row['metrics'].items():
        try:
            cleaned_metric_value = clean_metric_value(value)
        except (ValueError, TypeError, IndexError, decimal.InvalidOperation):
            logfile.write(
                f"ERROR: Could not convert metric value: {metric} {value}\n")
            continue

        # Every successfully parsed value is written, including 0
        # (a plain truthiness test would silently drop zero results).
        metric_property = create_new_class_from_paperswithcode(
            metric + " property", metric, superclass="https://identifiers.org/ito:performance_measure", entity_type="property")
        add_triple(f"<{row_entity}> <{metric_property}> {cleaned_metric_value}")

    software_entity = create_new_class_from_paperswithcode(row['model_name'] + " software", row['model_name'], superclass="https://identifiers.org/ito:Software", entity_type="individual")

    add_triple(f"<{row_entity}> rdfs:seeAlso <{software_entity}>")
    add_triple(f"<{row_entity}> ito:has_input <{dataset_entity}>")

    if row['paper_title']:
        row_entity_label = (row['model_name'] + " model in '" + row['paper_title'] + "'").strip()
        # Escape exactly once; escaping twice leaves literal backslashes in the label.
        add_triple(f"<{row_entity}> rdfs:label '{escape(row_entity_label)}'")

        article_entity = create_new_class_from_paperswithcode(row['paper_url'], row['paper_title'], superclass=article_class, entity_type="individual")

        # Full IRI for foaf:page: the shared `prefixes` header declares no
        # `foaf` prefix, so the prefixed name only resolves on endpoints that
        # predefine it.
        add_triple(f"<{article_entity}> <http://xmlns.com/foaf/0.1/page> <{row['paper_url']}>")

        add_triple(f"<{row_entity}> rdfs:seeAlso <{article_entity}>")
        add_triple(f"<{software_entity}> rdfs:seeAlso <{article_entity}>")

        try:
            paper_date = datetime.datetime.strptime(row['paper_date'], '%Y-%m-%d').date()
        except (TypeError, ValueError):
            # TypeError: date missing (None); ValueError: not in YYYY-MM-DD format.
            logfile.write("ERROR: No valid date found\n")
        else:
            add_triple(f"<{row_entity}> obo:date '{paper_date}'^^xsd:date")
            add_triple(f"<{article_entity}> obo:date '{paper_date}'^^xsd:date")
    else:
        add_triple(f"<{row_entity}> rdfs:label '{escape(row['model_name'])}'")

    for code_link in row['code_links']:
        add_triple(f"<{row_entity}> rdfs:seeAlso '{escape(code_link['url'])}'")


def process_task(task, depth=0, superclass=None):
    """Import definitions and benchmark results for ``task`` and all of its subtasks.

    For each dataset of the task a dataset individual and a benchmark class
    are created (or looked up); every metric is linked to the benchmark class
    and every not-yet-seen result row is written as an individual of it.

    Args:
        task: one task dictionary from the PWC dump.
        depth: recursion depth; only passed on to recursive calls.
        superclass: IRI of the parent task class; only passed on by the
            recursion and not otherwise used here.
    """
    task_class = create_new_class_from_paperswithcode(task['task'])

    # Add process definitions -- TODO: THIS PROBABLY LEADS TO DUPLICATES IF MINOR CHANGES IN PWC DESCRIPTIONS WERE INTRODUCED

    # `or ''` keeps a missing (None) description from becoming the text "None".
    description = str(task['description'] or '').split('<span')[0].strip()  # remove image attributions
    description = re.sub('<[^<]+?>', '', description)  # strip HTML
    if description and len(description) < 800:
        description += " (Source: paperswithcode.com)"
        add_triple(f"<{task_class}> obo:hasDefinition '{escape(description)}'")

    # Convert benchmark results data

    # The task label does not change inside the loop; query it once per task
    # instead of once per dataset.
    task_label = query_label(task_class) if task['datasets'] else None

    for dataset in task['datasets']:
        dataset_entity = create_new_class_from_paperswithcode(
            dataset['dataset'] + ' dataset', superclass="https://identifiers.org/ito:Benchmark_dataset", entity_type="individual")
        logfile.write(f"Dataset_entity: {dataset_entity}\n")

        benchmark_pwc_label = dataset['dataset'] + ' - ' + task_label + ' benchmarking'

        logfile.write(f"benchmark_pwc_label: {benchmark_pwc_label}\n")

        benchmark_class = create_new_class_from_paperswithcode(benchmark_pwc_label, superclass = "https://identifiers.org/ito:Benchmarking", flag_as_imported = False)
        if not benchmark_class == task_class:    # in rare cases the benchmark and task class end up the same - TODO: Deal with this better
            add_superclass(benchmark_class, task_class)

        for metric in dataset['sota']['metrics']:
            metric_property = create_new_class_from_paperswithcode(
                metric + " property", metric, superclass="https://identifiers.org/ito:performance_measure", entity_type="property")

            add_triple(f"<{benchmark_class}> rdfs:seeAlso <{metric_property}>")

        for row in dataset['sota']['rows']:
            hashed_row = hash(str(row))
            if hashed_row in hashed_result_rows:
                logfile.write(f"INFO: Skipping duplicate result row, hash {hashed_row}\n")
                continue
            hashed_result_rows.add(hashed_row)
            _add_result_row(row, benchmark_class, dataset_entity)

    # Recursively iterate through subtasks

    for subtask in task['subtasks']:
        process_task(subtask, depth + 1, superclass=task_class)


#logfile.write(f"INFO: Failed rows: {failed_rows}\n")
#logfile.write(f"INFO: Final iri_increment value: {iri_increment}\n")

# Second pass over the PWC dump: import definitions and benchmark results for
# every top-level task (subtasks are handled recursively by process_task).
for task in data:
    logfile.write(f"INFO: Processing benchmark results for {task['task']}\n")
    process_task(task)

In [14]:
# Final tidy-up of the hierarchy after the benchmark results have been imported.
clean_ontology()

# END OF MAIN SCRIPT, SCRATCHPAD FOLLOWS

In [None]:
# Scratch: look up the entities whose papers_with_code_id is "SSIM property".
query_df('SELECT ?x FROM <tag:ITO> WHERE { ?x ito:papers_with_code_id "SSIM property" }')

In [None]:
# Scratch: list subjects that carry two different papers_with_code_id values (first 1000 matches).
query_df('''
select * from <tag:ITO> where { 
  ?subject ito:papers_with_code_id ?object1, ?object2 .
  filter ( ?object1 != ?object2 )
}
LIMIT 1000

''')

# SPARQL examples

In [None]:
# Scratch: create an example named graph and insert one example triple into it.
# The commented-out lines select and delete that triple again.
query("CREATE GRAPH <https://samwald.info/graph>")
query("INSERT DATA { GRAPH <https://samwald.info/graph> { <http://example/book1> dc:title 'A new book' }  } ")
#query("SELECT * WHERE { <http://example/book1> ?b ?c . } ")
#query("DELETE DATA { GRAPH <https://samwald.info/graph> { <http://example/book1> dc:title 'A new book' }  } ")

In [None]:
# Scratch: show all predicates/objects of the example resource (no FROM clause, so no specific graph is named).
query_df("SELECT * WHERE { <http://example/book1> ?b ?c . } ")

In [None]:
# Scratch: list the named graphs known to the endpoint.
query_df("""
SELECT ?g 
WHERE {
  GRAPH ?g { }
}
""")

In [None]:
# Scratch: insert an example triple into a separate graph <tag:ITO_new>.
query("INSERT DATA { GRAPH <tag:ITO_new> { <http://example/book1> dc:title 'An alternative title!!' }  } ")

In [None]:
# Scratch: read the example triple back from <tag:ITO_new>.
query("SELECT * FROM <tag:ITO_new> WHERE { <http://example/book1> ?b ?c . } ")

In [None]:
# Scratch: create an empty named graph <tag:ITO_old>.
query("CREATE GRAPH <tag:ITO_old>")

In [None]:
# Scratch: check how escape() treats the single quotes in a label containing a paper title.
escape(r" el in 'Multiway Attention Networks for Modeling Sentence Pairs' ")

In [None]:
def escape2(s):
    """Scratch variant of escape(): backslash-escape quotes, backslashes, CR and LF in ``s``."""
    substitutions = {
        "\n": r"\n",
        "\r": r"\r",
        '"': r'\"',
        "'": r"\'",
        "\\": r"\\",
    }
    escaped_chars = [substitutions.get(char, char) for char in s]
    return "".join(escaped_chars)

In [None]:
# Scratch: check how escape2() treats a label containing both a backslash and single quotes.
escape2(r" el \ in 'Multiway Attention Networks for Modeling Sentence Pairs' ")

In [None]:
# Scratch: display the unescaped raw string itself (presumably for comparison with the escape2() output).
r" el \ in 'Multiway Attention Networks for Modeling Sentence Pairs' "