# Transform datasets to RDF as defined in the JSON configuration file

In [170]:
# Folder that contains the JSON configuration files listed in `configurations`.
configurations_folder = "configurations"
# Configuration files to process; comment entries in or out to select datasets.
configurations = [
    # "qald.json",
    #"qald.short.json",
    # "lc-quad.json",
    # "rubq.json",
    #"rubq.short.json",
    # "cwq.json",
    "mintaka.json"
]

# URL the dataset descriptions are POSTed to; the trailing comment keeps an
# alternative (remote) instance of the same service.
service_url = "http://localhost:8080/json2rdf" # "http://webengineering.ins.hs-anhalt.de:41399/json2rdf"
# Directory in which the Turtle responses of the service are cached as .ttl files.
cache_directory = "/tmp"
# Stardog server endpoint and the database the connection is opened on.
stardog_endpoint = "http://localhost:5820"
stardog_database = "rdfizer"

In [171]:
import json
from pprint import pprint
import requests
import rdflib
import os
import owlrl
import stardog
from stardog import Connection

In [172]:
# Open the Stardog connection used by all helper functions below.
# The credentials default to admin/admin (as before) but can be overridden
# through the STARDOG_USERNAME / STARDOG_PASSWORD environment variables so
# that real credentials do not have to be written into the notebook.
conn = Connection(
    stardog_database,
    endpoint=stardog_endpoint,
    username=os.environ.get("STARDOG_USERNAME", "admin"),
    password=os.environ.get("STARDOG_PASSWORD", "admin"),
)
pprint(conn)

<stardog.connection.Connection object at 0x7f6bfc15fc40>


In [173]:
def red(s):
    """Return ``s`` wrapped in the ANSI escape codes 31 (start) and 0 (reset)."""
    return "".join(("\x1b[31m", s, "\x1b[0m"))

def green(s):
    """Return ``s`` wrapped in the ANSI escape codes 32 (start) and 0 (reset)."""
    return "".join(("\x1b[32m", s, "\x1b[0m"))

def yellow(s):
    """Return ``s`` wrapped in the ANSI escape codes 33 (start) and 0 (reset)."""
    return "".join(("\x1b[33m", s, "\x1b[0m"))

def blue(s):
    """Return ``s`` wrapped in the ANSI escape codes 34 (start) and 0 (reset)."""
    return "".join(("\x1b[34m", s, "\x1b[0m"))

In [174]:
def write_text_to_file(filename, text):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content.

    The encoding is fixed to UTF-8 instead of the platform default so that
    non-ASCII characters are written the same way on every system.

    Args:
        filename: Path of the file to create or overwrite.
        text: String to write.
    """
    # The context manager closes the file; no explicit close() is required.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
        
def read_text_from_file(filename):
    """Return the complete content of ``filename`` decoded as UTF-8.

    The encoding is fixed to UTF-8 instead of the platform default so that
    non-ASCII characters are decoded the same way on every system.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a string.
    """
    # The context manager closes the file; no explicit close() is required.
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()

def cache_filename_for_url(url):
    """Derive a flat file name from ``url``.

    Everything from the first ``?`` on is discarded, then all ``/``, ``:``
    and ``.`` characters are removed from the remainder.
    """
    without_query, _, _ = url.partition("?")
    strip_separators = str.maketrans("", "", "/:.")
    return without_query.translate(strip_separators)

def drop_graph(conn, graph):
    """Drop the named graph ``graph``, ignoring failures of the update itself.

    Args:
        conn: Connection object providing ``update(query)``.
        graph: IRI of the named graph to drop (inserted between ``<`` and ``>``).
    """
    query = f"""DROP GRAPH <{graph}>"""
    try:
        conn.update(query)
    except Exception:
        # Best effort: a failing DROP must not abort the caller.
        # Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        pass

def load_data_into_graph(conn, graph, filename):
    """Load the file ``filename`` into the named graph ``graph`` in one transaction.

    No explicit CREATE GRAPH is issued; the graph IRI is only passed to
    ``conn.add``.

    Args:
        conn: Stardog connection (uses ``begin``, ``add``, ``commit``, ``rollback``).
        graph: IRI of the target named graph.
        filename: Path of the RDF file to load.

    Raises:
        Exception: Whatever ``add`` or ``commit`` raised, after the
            transaction has been rolled back.
    """
    conn.begin()
    try:
        conn.add(stardog.content.File(filename), graph_uri=graph)
        conn.commit()
    except Exception:
        # Do not leave the connection inside an open transaction, otherwise
        # the next begin() on the same connection would fail as well.
        conn.rollback()
        raise


def get_count_from_result(results):
    """Return the ``count`` value of the first binding as an ``int``.

    ``results`` is indexed as ``results["results"]["bindings"]``; only the
    first row is looked at. Returns ``None`` when there are no bindings.
    """
    rows = iter(results["results"]["bindings"])
    try:
        first_row = next(rows)
    except StopIteration:
        return None
    return int(first_row["count"]["value"])

def get_number_of_triples_in_graph(conn, graph):
    """Run the count query against the named graph ``graph`` and return the count.

    NOTE(review): despite the function name, the query counts DISTINCT ?s,
    i.e. distinct subjects rather than triples. Confirm whether that is
    intended before changing it; callers may depend on the current number.
    """
    count_query = f"SELECT (COUNT(DISTINCT ?s) AS ?count) FROM <{graph}> {{ ?s ?p ?o }}"
    return get_count_from_result(conn.select(count_query))

def get_number_of_questions_in_graph(conn, graph):
    """Count the distinct resources in ``graph`` that are typed as a question.

    A resource is counted when its type is declared (within the same graph)
    as ``rdfs:subClassOf <urn:qa:benchmark#Question>``.
    """
    sparql = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT (COUNT(DISTINCT ?s) AS ?count) FROM <{graph}> WHERE {{
        ?s a ?question .
        ?question rdfs:subClassOf <urn:qa:benchmark#Question> . }}"""

    question_count = get_count_from_result(conn.select(sparql))
    return int(question_count)

def get_number_of_valid_questions_in_graph(conn, graph):
    """Count the questions in ``graph`` that have a text, an answer and a query.

    A question is counted when it is typed with a subclass of
    ``<urn:qa:benchmark#Question>`` and has at least one value for each of
    the question-text, answer and query properties listed in the VALUES
    blocks of the query.

    Args:
        conn: Connection object providing ``select(query)``.
        graph: IRI of the named graph to inspect.

    Returns:
        The count reported by the query (see ``get_count_from_result``).
    """
    # SPARQL variable names are case-sensitive: the VALUES block must bind
    # exactly ?hasQuestionTextProperty (it used to bind
    # ?HasQuestionTextProperty, which left the variable in the triple
    # pattern unconstrained so that any property was accepted).
    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT (COUNT(DISTINCT ?question) AS ?count)
        FROM <{graph}>
        WHERE {{
            VALUES ?hasQuestionTextProperty {{
                <urn:qa:benchmark#questionEng> # RuBQ
                <urn:qa:benchmark#questionText> # RuBQ
                <urn:qa:benchmark#hasQuestion>  # QALD
            }} # needs to be aligned using OWLRL

            VALUES ?hasAnswerProperty {{
                <urn:qa:benchmark#hasAnswer> # QALD
                <urn:qa:benchmark#answer> # RuBQ
            }}

            VALUES ?hasQueryProperty {{
                <urn:qa:benchmark#hasSPARQLQuery> # QALD
                <urn:qa:benchmark#query> #RuBQ
            }}

            ?question a ?questionType .
            ?question ?hasQueryProperty ?query .
            ?question ?hasAnswerProperty ?answer .
            ?question ?hasQuestionTextProperty ?questionText .
            ?questionType rdfs:subClassOf <urn:qa:benchmark#Question> .
        }}
    """
    result = conn.select(query)
    count_valid_questions = get_count_from_result(result)

    return count_valid_questions

## Push all data to Stardog and retrieve statistics about each dataset using SPARQL queries

In [175]:
# For every dataset described in the selected configuration files:
#   1. fetch its Turtle representation from the conversion service (cached on disk),
#   2. (re)load it into a named graph in Stardog,
#   3. print one line of statistics with colored plausibility markers.
for configuration_filename in configurations:
    # Use a context manager so the configuration file handle is closed again
    # (json.load(open(...)) left it open).
    with open(configurations_folder + "/" + configuration_filename, encoding="utf-8") as configuration_file:
        configuration_list = json.load(configuration_file)

    for configuration in configuration_list:
        cache_filename = cache_filename_for_url(configuration["filePath"]) + ".ttl"
        cache_path = cache_directory + "/" + cache_filename

        # Only call the conversion service when no cached Turtle file exists.
        if not os.path.isfile(cache_path):
            post_data = {
                "filePath": configuration["filePath"],
                "homepage": configuration["homepage"],
                "format": configuration["format"],
                "label": configuration["label"],
            }

            turtle_data = requests.post(service_url, json=post_data, headers={'Content-Type': 'application/json'})

            # A failed conversion is reported and the dataset is skipped;
            # nothing is written to the cache in that case.
            if not turtle_data.ok:
                print(configuration["label"], turtle_data.text, "skipping")
                continue

            write_text_to_file(cache_path, turtle_data.text)

        try:
            # Only the length of the text is used (first column of the output).
            turtle_data_as_text = read_text_from_file(cache_path)

            # Parsing with rdflib and expanding with owlrl locally was too
            # slow, so the data is loaded into Stardog as-is.

            # The source URL of the dataset doubles as the named graph IRI.
            graph = configuration["filePath"]

            # init graph
            drop_graph(conn, graph)
            load_data_into_graph(conn, graph, cache_path)

            # stats
            number_of_triples_in_graph = get_number_of_triples_in_graph(conn, graph)
            number_of_questions_in_graph = get_number_of_questions_in_graph(conn, graph)
            number_of_valid_questions_in_graph = get_number_of_valid_questions_in_graph(conn, graph)

            # create colorful output
            number_of_triples_in_graph_output = "%6d triples" % (number_of_triples_in_graph,)
            if number_of_triples_in_graph == 0:
                number_of_triples_in_graph_output = red(number_of_triples_in_graph_output)

            # NOTE(review): green requires count == 3 * questions + 1; the
            # rationale for this formula is not visible here - confirm.
            number_of_questions_in_graph_output = "%6d questions" % (number_of_questions_in_graph,)
            if number_of_triples_in_graph == 3 * number_of_questions_in_graph + 1 and number_of_questions_in_graph != 0:
                number_of_questions_in_graph_output = green(number_of_questions_in_graph_output)
            else:
                number_of_questions_in_graph_output = red(number_of_questions_in_graph_output)

            # Green only when every question is valid and there is at least one.
            number_of_valid_questions_in_graph_output = "%6d valid questions" % (number_of_valid_questions_in_graph,)
            if number_of_valid_questions_in_graph == number_of_questions_in_graph and number_of_valid_questions_in_graph != 0:
                number_of_valid_questions_in_graph_output = green(number_of_valid_questions_in_graph_output)
            else:
                number_of_valid_questions_in_graph_output = red(number_of_valid_questions_in_graph_output)

            print("%10d" % (len(turtle_data_as_text),), number_of_triples_in_graph_output, number_of_questions_in_graph_output, number_of_valid_questions_in_graph_output, "\t", configuration["label"], "\t", configuration["filePath"])

        except Exception as e:
            # Report the failure for this dataset and continue with the next one.
            print("    ERROR", "\t", configuration["label"],"\t", configuration["filePath"], e)



   3296669   6057 triples [31m  2000 questions[0m [32m  2000 valid questions[0m 	 Mintaka dev 	 https://raw.githubusercontent.com/amazon-science/mintaka/main/data/mintaka_dev.json
   6568887  12057 triples [31m  4000 questions[0m [32m  4000 valid questions[0m 	 Mintaka test 	 https://raw.githubusercontent.com/amazon-science/mintaka/main/data/mintaka_test.json
  23068679  42057 triples [31m 14000 questions[0m [32m 14000 valid questions[0m 	 Mintaka train 	 https://raw.githubusercontent.com/amazon-science/mintaka/main/data/mintaka_train.json
