In [2]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Using cached SPARQLWrapper-1.8.5-py3-none-any.whl (26 kB)
Installing collected packages: SPARQLWrapper
Successfully installed SPARQLWrapper-1.8.5


You should consider upgrading via the 'c:\users\ottsi\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


In [2]:
# %%
#import some modules
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import seaborn as sns
from SPARQLWrapper import SPARQLWrapper, N3, JSON
from rdflib import Graph
from scipy import stats

In [3]:
# Read the semicolon-delimited metric/polarity table and preview its first rows.
polarities = pd.read_csv("data/polarities.csv", delimiter=";")
polarities.head()

Unnamed: 0,metric,polarity
0,ACC@1-100Clients,pos
1,AED,neg
2,AKD,neg
3,AP,pos
4,AVERAGE MAE,neg


In [6]:
# %%
# SPARQL endpoint, i.e. where the Blazegraph instance is running.
# This is the instance currently in use (hard-coded address).
endpoint = "http://149.148.229.242:9999/blazegraph/namespace/ito/sparql"
        
# PREFIX declarations collected in a single string.
# NOTE(review): `prefixes` is not referenced anywhere in the visible part of the
# notebook, and its `ito:` namespace ends in "#" whereas the query further down
# declares <https://identifiers.org/ito:> -- confirm which one is intended.
prefixes = """
PREFIX edam: <http://edamontology.org/>
PREFIX obo:  <http://purl.obolibrary.org/obo/>
PREFIX ito:  <https://identifiers.org/ito#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
"""


def escape(s):
    """
    Return ``s`` with quotes, backslashes and line breaks backslash-escaped.

    Single quotes, double quotes, backslashes, carriage returns and newlines
    are each replaced by a two-character escape sequence; every other
    character is passed through untouched. Per the original note, this is
    meant for desired_benchmark names that contain special characters which
    would otherwise cause crashes.
    """
    # Ordinal-keyed table, which is the form str.translate consumes directly.
    replacements = {
        ord("'"): r"\'",
        ord('"'): r'\"',
        ord("\\"): r"\\",
        ord("\r"): r"\r",
        ord("\n"): r"\n",
    }
    return s.translate(replacements)


def query(service, query, numeric_cols=()):
    """
    Run a SPARQL query and return its bindings as a pandas DataFrame.

    Parameters
    ----------
    service : str
        Endpoint URL handed to ``SPARQLWrapper``.
    query : str
        Complete SPARQL query text.
    numeric_cols : iterable of str, optional
        Names of result columns to convert with ``pd.to_numeric``.
        Defaults to no conversion, in which case every value stays a string.

    Returns
    -------
    pandas.DataFrame
        One row per entry in ``results.bindings`` and one column per variable
        listed in ``head.vars``. A variable that is missing from a binding
        yields ``None`` in that cell.

    Raises
    ------
    KeyError
        If a name in ``numeric_cols`` is not a column of the result.
    ValueError
        Propagated from ``pd.to_numeric`` when a value cannot be parsed.
    """
    sparql = SPARQLWrapper(service)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()
    try:
        processed_results = json.load(result.response)
    finally:
        # Release the underlying response handle as soon as it has been read.
        result.response.close()

    cols = processed_results['head']['vars']
    # Each binding maps variable name -> {"value": ...}; unbound variables are absent.
    rows = [
        [binding.get(c, {}).get('value') for c in cols]
        for binding in processed_results['results']['bindings']
    ]

    df = pd.DataFrame(rows, columns=cols)
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col])

    return df

# SPARQL text for the full result dump. It is bound to its own name so that it
# does not overwrite the `query()` helper defined above.
all_results_query = """
    PREFIX edam: <http://edamontology.org/>
    PREFIX obo:  <http://purl.obolibrary.org/obo/>
    PREFIX ito:  <https://identifiers.org/ito:>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

    SELECT DISTINCT *
    WHERE {
            ?paper a edam:data_0971 . 
            ?paper rdfs:label ?paper_label. 
            ?paper oboInOwl:date ?date. 
            ?model  rdfs:seeAlso ?paper ;
                    rdfs:label ?model_label ;
                    a ?dataset . # this will create a place holder for the rdfs:type results that contains the information about the individual
            ?dataset rdfs:label ?dataset_label ;
                    rdfs:subClassOf* ?top_level_class .
            ?top_level_class rdfs:subClassOf ito:ITO_01625 ;
                            rdfs:label ?top_level_class_label .
            ?metric rdfs:subPropertyOf* ito:performance_measure .
            ?metric rdfs:label ?metric_label .
            ?model ?metric ?result

            FILTER(?top_level_class != ito:Benchmarking) 
            FILTER(?top_level_class != ito:ITO_01524) 
        } ORDER by ?date
"""

# Send the query through the shared helper instead of repeating its
# request/JSON-parsing loop here. No numeric_cols are requested, so every
# column (including `result`) stays a string, as before.
benchmark_results = query(endpoint, all_results_query)

# Split the raw dataset label on its last ' - ' into dataset and task parts,
# then drop the " benchmarking" suffix from the task part.
# `n` is passed by keyword (newer pandas rejects it positionally) and
# `regex=False` pins literal matching regardless of the pandas default.
benchmark_results[['dataset_label', 'task_label']] = (
    benchmark_results['dataset_label'].str.rsplit(' - ', n=1, expand=True)
)
benchmark_results["task_label"] = (
    benchmark_results["task_label"].str.replace(" benchmarking", "", regex=False)
)

# Reorder columns (more hierarchical).
column_order = [
    'date',
    'top_level_class', 'top_level_class_label',
    'task_label',
    'dataset', 'dataset_label',
    'paper', 'paper_label',
    'model', 'model_label',
    'metric', 'metric_label',
    'result',
]
benchmark_results = benchmark_results.reindex(columns=column_order)

# Save to csv without the positional index.
benchmark_results.to_csv("data/all.csv", index=False)

# Legacy alias: `all` shadows the built-in of the same name, but it is kept
# because cells outside this view may still reference it.
all = benchmark_results
all.head(5)


Unnamed: 0,date,top_level_class,top_level_class_label,task_label,dataset,dataset_label,paper,paper_label,model,model_label,metric,metric_label,result
0,2000-09-24,https://identifiers.org/ito:ITO_00126,Biomedical AI process,Atrial Fibrillation Detection,https://identifiers.org/ito:ITO_28989,MIT-BIH AF,https://identifiers.org/ito:ITO_10000,A method for detection of atrial fibrillation ...,https://identifiers.org/ito:ITO_iBBnwwayqk6JXo3DV,/spl Delta/RR intervals model in \'A method fo...,https://identifiers.org/ito:ITO_02411,Accuracy,94.95
1,2003-07-18,https://identifiers.org/ito:ITO_00137,Graph process,Graph Classification,https://identifiers.org/ito:ITO_39851,PROTEINS,https://identifiers.org/ito:ITO_16777,Distinguishing Enzyme Structures from Non-enzy...,https://identifiers.org/ito:ITO_izaNxE1rJseMr89CW,RW model in \'Distinguishing Enzyme Structures...,https://identifiers.org/ito:ITO_02411,Accuracy,74.22
2,2004-07-01,https://identifiers.org/ito:ITO_00141,Natural language processing,Unsupervised Dependency Parsing,https://identifiers.org/ito:ITO_30848,Penn Treebank,https://identifiers.org/ito:ITO_30858,Corpus-Based Induction of Syntactic Structure:...,https://identifiers.org/ito:ITO_iAIZzoiwy3CPQwKP7,DMV model in \'Corpus-Based Induction of Synta...,https://identifiers.org/ito:ITO_11157,UAS,35.9
3,2005-01-01,https://identifiers.org/ito:ITO_00126,Biomedical AI process,Arrhythmia Detection,https://identifiers.org/ito:ITO_31974,MIT-BIH AR,https://identifiers.org/ito:ITO_12116,Support vector machine based arrhythmia classi...,https://identifiers.org/ito:ITO_iUZSTbCtrW509t1JR,SVM model in \'Support vector machine based ar...,https://identifiers.org/ito:ITO_12104,Accuracy (Inter-Patient),76.3
4,2005-01-01,https://identifiers.org/ito:ITO_00126,Biomedical AI process,Arrhythmia Detection,https://identifiers.org/ito:ITO_31974,MIT-BIH AR,https://identifiers.org/ito:ITO_12116,Support vector machine based arrhythmia classi...,https://identifiers.org/ito:ITO_iUZSTbCtrW509t1JR,SVM model in \'Support vector machine based ar...,https://identifiers.org/ito:ITO_12105,Accuracy (Intra-Patient),98.7
