In [3]:
# create a dictionary based on all the words in the ontology

import csv

# Build the set of CSO topic names that occur as subject (column 0) or
# object (column 2) of the triples in the CSV export, then write them out
# one per row.
TOPIC_PREFIX = 'https://cso.kmi.open.ac.uk/topics/'

# load from original dataset
with open('CSO.3.1_short.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    entitySet = set()
    for item in reader:
        # csv.reader yields [] for blank lines and short lists for malformed
        # rows; slicing (instead of indexing) simply skips missing columns.
        for cell in item[0:1] + item[2:3]:
            if TOPIC_PREFIX in cell:
                entity = cell.replace(TOPIC_PREFIX, '')
                # Drop the first and last character of what remains
                # (presumably the enclosing '<' and '>' -- TODO confirm).
                entitySet.add(entity[1:-1])
    #print (entitySet)

# store to new dictionary
with open('cso_dict_short.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Sorted so that re-running the cell produces an identical file.
    for entity in sorted(entitySet):
        # A row is a sequence of fields: one single-field row per entity.
        writer.writerow([entity])

print ("Dictionary created!")

Dictionary created!


In [33]:
# query a local ontology dump ('nt' format) with rdflib and print every
# rdf:type value of the CSO "robotics" topic
from rdflib import Graph
import json

g = Graph()
#g.parse("CSO.3.1.nt", format="nt")
g.parse("dbpedia.nt", format="nt")

# The variable is called ?label but it is bound to the rdf:type objects.
type_query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?label
    WHERE { <https://cso.kmi.open.ac.uk/topics/robotics> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?label }
"""

# Serialize the result set to SPARQL-JSON and parse it back into a dict.
results = json.loads(g.query(type_query).serialize(format="json"))

for binding in results["results"]["bindings"]:
    print(binding["label"]["value"])

In [3]:
# query a SPARQL endpoint with SPARQLWrapper and print every rdf:type value
# of the CSO "robotics" topic
from SPARQLWrapper import SPARQLWrapper, JSON

#sparql = SPARQLWrapper("dbpedia.owl")
sparql = SPARQLWrapper("http://localhost:8890/sparql")
sparql.setReturnFormat(JSON)

# Catch-all alternative kept for debugging:
# SELECT * WHERE {
#        ?s ?p ?o .
#}
sparql.setQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?label
    WHERE { <https://cso.kmi.open.ac.uk/topics/robotics> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?label }
""")
results = sparql.query().convert()

for binding in results["results"]["bindings"]:
    print(binding["label"]["value"])
    #print(binding["thumb"]["value"])

http://cso.kmi.open.ac.uk/schema/cso#Topic


In [2]:
import gensim 
from gensim.models import Word2Vec
import csv

sentences = [['first', 'sentence'], ['second', 'sentence']]
# train word2vec on the two sentences
model = gensim.models.Word2Vec(sentences, min_count=1)

# load from original dataset
with open('CSO.3.1_short.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    entitySet = set()
    for item in reader:
        # FIXME: this cell was left unfinished -- the loop header had no body,
        # which made the whole cell a SyntaxError. The placeholder keeps the
        # cell runnable until the intended per-row processing is written.
        pass

In [4]:
import csv
import re
import spacy
from spacy import displacy
from spacy.pipeline import EntityRecognizer
import json

import urllib
#from owlready2 import *
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON

import re
import xml.etree.ElementTree as ET

from allennlp.common.testing import AllenNlpTestCase
from allennlp.predictors.predictor import Predictor

# pre-processing
def PreProcess(senSet):
    """Strip bracketed spans such as "[12]" from every sentence, in place.

    For each sentence, every span running from a '[' to the next ']' is
    removed together with the single character that follows the ']'
    (the original slice ended at ``i_end + 2``).

    A '[' with no closing ']' after it is left untouched. Previously such
    input (or a ']' located before the '[') produced an empty slice, so the
    sentence never changed and the loop never terminated.

    Args:
        senSet: Mutable list of sentence strings; modified in place.

    Returns:
        None.
    """
    #remove content between [ ]
    print("Pre-processing...")
    for index in range(len(senSet)):
        sentence = senSet[index]
        search_from = 0
        while True:
            i_start = sentence.find('[', search_from)
            if i_start < 0:
                break
            # Look for the closing bracket *after* the opening one.
            i_end = sentence.find(']', i_start)
            if i_end < 0:
                # Unbalanced '[': nothing sensible to remove, stop here.
                break
            sentence = sentence[:i_start] + sentence[i_end + 2:]
            # Text before i_start holds no '[' any more; resume from there.
            search_from = i_start
        senSet[index] = sentence


def QueryURI(keywords, index=-2):
    """Look up candidate resource URIs for a keyword phrase.

    Sends the phrase to the DBpedia Lookup ``KeywordSearch`` URL (asking for
    at most five hits) and collects the text of the ``URI`` child of every
    ``Result`` element in the XML reply.

    Args:
        keywords: Free-text phrase to search for.
        index: Unused. It only fed an interactive selection prompt that was
            disabled; the parameter is kept so existing calls stay valid.

    Returns:
        A list of URI strings, or None (after printing a notice) when the
        reply contains no usable result.

    Raises:
        urllib.error.URLError: If the lookup service cannot be reached.
        xml.etree.ElementTree.ParseError: If the reply is not well-formed XML.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``request`` / ``parse`` submodules are loaded.
    import urllib.parse
    import urllib.request

    #localSite = 'http://localhost:1111/api/search/KeywordSearch?'
    onlineSite = 'http://lookup.dbpedia.org/api/search/KeywordSearch?'
    prefix = "{http://lookup.dbpedia.org/}"

    # Percent-encode the whole phrase: escaping only spaces lets '&', '#'
    # or non-ASCII characters (e.g. a typographic apostrophe) corrupt the URL.
    request = onlineSite + \
    'QueryClass='   + ''  + \
    '&MaxHits='     + '5' + \
    '&QueryString=' + urllib.parse.quote(keywords, safe='')

    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(request) as reply:
        response = str(reply.read(), 'utf-8')

    root = ET.fromstring(response)
    uriList = []
    for entity in root.findall(prefix + "Result"):
        uriNode = entity.find(prefix + "URI")
        # Skip results without a URI element instead of crashing on None.
        if uriNode is not None and uriNode.text:
            uriList.append(uriNode.text)

    if uriList:
        return uriList

    print("Sorry, we find nothing for this stuff :(\n")
    return None

# turn a free-text phrase into a camel-cased identifier usable in a URI
def FormatURI(phrase, isS_O = False):
    """Camel-case ``phrase`` and strip everything that is not alphanumeric.

    Every character directly following a space is upper-cased, as is the
    very first character unless ``isS_O`` is true. Spaces are then removed
    and any remaining character that is neither an ASCII letter/digit nor
    whitespace is dropped.

    Args:
        phrase: Text to convert.
        isS_O: When true, the first character keeps its original case
            (presumably "is subject/object" -- verify against callers).

    Returns:
        The converted string.
    """
    pieces = []
    for pos, ch in enumerate(phrase):
        if pos == 0:
            starts_word = not isS_O
        else:
            starts_word = phrase[pos - 1] == ' '
        pieces.append(ch.upper() if starts_word else ch)

    camel = ''.join(pieces).replace(' ', '')
    return re.sub(r'[^a-zA-Z0-9\s]', '', camel)

# extract the last path segment of a URI
def NameURI(url):
    """Return the part of ``url`` that follows its last '/'.

    When ``url`` contains no '/', it is returned unchanged; when it ends
    with '/', the result is the empty string.
    """
    # rfind gives -1 when there is no slash, so the slice starts at 0.
    return url[url.rfind('/') + 1:]

# query the given triple in the ontology with SPARQL
def QueryTriple(subj, pred, obj):
    """Run a SPARQL ASK for the triple (subj, pred, obj) against ``m_graph``.

    Each term is camel-cased with ``FormatURI``; subject and object are
    placed in the ``dbpd:`` namespace and the predicate in the namespace
    bound to the ``rdf:`` prefix below.

    Args:
        subj: Subject phrase, or None.
        pred: Predicate phrase, or None.
        obj: Object phrase, or None.

    Returns:
        None when any argument is None; otherwise the query result of
        ``m_graph.query`` materialised as a list (not a plain True/False).

    NOTE(review): ``m_graph`` is a module-level name; the only assignment
    visible in this notebook is commented out, so confirm it is defined
    before calling this function.
    """
    if subj is None or pred is None or obj is None:
        return None

    # NOTE: the prefix label is "rdf" but the IRI is the RDF *Schema*
    # namespace, so predicates resolve to rdfs terms.
    prefix = """
        PREFIX rdf:<http://www.w3.org/2000/01/rdf-schema#>
        PREFIX dbpd:<http://dbpedia.org/ontology/>
        """
    #subj = "provinceLink"
    #pred = "range"
    #obj = "Province"
    qAsk = prefix + """
        ASK {
            dbpd:""" + FormatURI(subj) + """ rdf:""" + FormatURI(pred) + """ dbpd:""" + FormatURI(obj) + """.
        }"""

    return list(m_graph.query(qAsk))

# given a URI, query the ontology iteratively to get its path to root
def QueryHierarchy(URI):
    """Walk from ``URI`` up the DBpedia ontology and return the path found.

    Starting with ``rdf:type``, the endpoint at http://dbpedia.org/sparql is
    queried for the current node. The first result under
    ``http://dbpedia.org/ontology`` becomes the next node; a result
    containing ``owl#Class`` switches the walk to ``rdfs:subClassOf``. The
    walk stops when a query yields neither.

    Args:
        URI: Term inserted verbatim into the query text. The results are
            re-wrapped in '<' '>' before being reused, so ``URI`` is
            presumably expected in that form too -- verify against callers.

    Returns:
        A list ordered root-first: ``<http://www.w3.org/2002/07/owl#Thing>``,
        then the ontology classes found, and finally ``URI`` itself.
    """
    path = [URI]

    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)
    curURI = URI
    predicate = "rdf:type"
    endFlag = False # to mark whether a dbo:entity is found in current level

    while not endFlag:
        endFlag = True

        # Explicit space between the term and the predicate so the query
        # stays well-formed whatever form curURI is given in.
        qSelect = """
            SELECT ?type WHERE
            {
            """ + curURI + " " + predicate + """ ?type.
            }
        """

        sparql.setQuery(qSelect)
        results = sparql.query().convert()

        for result in results["results"]["bindings"]:
            resultURI = '<' + result["type"]["value"] + '>'
            # begin the class part
            if "owl#Class" in resultURI:
                if predicate == "rdfs:subClassOf":
                    # Already in the subclass phase: switching again would
                    # repeat the same query forever.
                    continue
                endFlag = False
                predicate = "rdfs:subClassOf"
                break
            # insert the first found dbo:entity into the path
            elif "http://dbpedia.org/ontology" in resultURI:
                if resultURI in path:
                    # Cycle guard: never revisit a node already on the path.
                    continue
                endFlag = False
                curURI = resultURI
                path.insert(0, resultURI)
                break

    # insert the common root node to current path
    path.insert(0, '<http://www.w3.org/2002/07/owl#Thing>')
    return path
            
# merge the ontology path of every URI into the nested knowledge tree
def AppendTree(URIList, treeDict):
    """Insert the hierarchy of each URI into ``treeDict`` (modified in place).

    ``treeDict`` is a dict of dicts. For every URI the sequence returned by
    ``QueryHierarchy`` is followed key by key from the top of the tree, and
    an empty dict is created for each key that is not present yet.

    Args:
        URIList: Iterable of URIs handed to ``QueryHierarchy``.
        treeDict: Nested dict to extend.

    Returns:
        None.
    """
    for uri in URIList:
        node = treeDict
        for step in QueryHierarchy(uri):
            # Reuse the existing branch, or open a new empty one.
            node = node.setdefault(step, dict())
                
def QueryInfo(URI, entityInfo):
    """Fill ``entityInfo`` with the English abstract and thumbnail of ``URI``.

    Uses the module-level ``sparql`` endpoint object. Every returned row
    overwrites ``entityInfo["abstract"]`` and ``entityInfo["thumbnail"]``,
    so the last row wins; with no rows the dict is left untouched.

    Args:
        URI: Term inserted verbatim (twice) into the query text.
        entityInfo: Dict updated in place.

    Returns:
        None.
    """
    #sparql = SPARQLWrapper("dbpedia.owl")
    # NOTE: "dbo:" is not declared in the query itself; it is presumably
    # predefined by the endpoint -- TODO confirm.
    query_head = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?abstract ?thumb
        WHERE {"""
    query_tail = """ dbo:thumbnail ?thumb .
        FILTER (lang(?abstract) = 'en')
        }
    """
    sparql.setQuery(query_head + URI + """ dbo:abstract ?abstract .""" + URI + query_tail)
    sparql.setReturnFormat(JSON)

    rows = sparql.query().convert()["results"]["bindings"]
    for row in rows:
        entityInfo["abstract"] = row["abstract"]["value"]
        entityInfo["thumbnail"] = row["thumb"]["value"]
    
# Recursive helper: turn a nested dict tree into a list of {"name", "children"} nodes
def PreorderFormat(curDict):
    """Convert one level of the nested dict tree into a list of node dicts.

    Each key becomes ``{"name": key}``; a ``"children"`` entry holding the
    converted subtree is added only when that subtree is non-empty.

    Args:
        curDict: Dict mapping a name to its (possibly empty) child dict.

    Returns:
        A list of node dicts in the dict's iteration order, or None when
        ``curDict`` is empty.
    """
    if len(curDict) == 0:
        return None

    nodes = []
    for name, subtree in curDict.items():
        node = {"name": name}
        nested = PreorderFormat(subtree)
        if nested:
            node["children"] = nested
        nodes.append(node)
    return nodes
    
        
def FormatToJson(treeDict):
    """Wrap the converted tree under a single "GroundRoot" node.

    Args:
        treeDict: Nested dict tree, passed to ``PreorderFormat``.

    Returns:
        ``{"name": "GroundRoot", "children": [...]}``, or None when
        ``PreorderFormat`` yields nothing for ``treeDict``.
    """
    children = PreorderFormat(treeDict)
    if not children:
        return None
    return {
        "name": "GroundRoot",
        "children": children
    }

def PrintQueryResult(results, sub, pred, obj):
    """Print every result row as a "( subject - predicate - object )" line.

    ``results`` is an iterable of SPARQL-JSON result documents (as returned
    by SPARQLWrapper). For each row, a position bound in the row ("sub",
    "pred", "obj") is shown through ``NameURI``; an unbound position falls
    back to the matching argument. A fallback containing '<' is first
    shortened with ``[29:-1]`` (29 is the length of
    "<http://dbpedia.org/ontology/"), and stays shortened for later rows.

    Note the separators differ: bound terms are followed by ' - ', fallback
    terms by ' -'.

    Returns:
        None.
    """
    # for sparqlWrapper
    for group in results:
        for row in group["results"]["bindings"]:
            pieces = ['( ']

            if "sub" in row:
                pieces.append(NameURI(row["sub"]["value"]) + ' - ')
            else:
                sub = sub[29:-1] if '<' in sub else sub
                pieces.append(sub + ' -')

            if "pred" in row:
                pieces.append(NameURI(row["pred"]["value"]) + ' - ')
            else:
                pred = pred[29:-1] if '<' in pred else pred
                pieces.append(pred + ' -')

            if "obj" in row:
                pieces.append(NameURI(row["obj"]["value"]) + ' )\n')
            else:
                obj = obj[29:-1] if '<' in obj else obj
                pieces.append(obj + ' )\n')

            print(''.join(pieces), end='')
    
# extract the subject/object noun phrases of a sentence
def RunNER(sen):
    """Return the content words of each subject/object noun chunk in ``sen``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Noun
    chunks whose root dependency label contains "subj" or "obj" are kept;
    each one is re-parsed on its own, stop words and tokens whose lemma is
    "-PRON-" are dropped, and the remaining token texts are joined with
    single spaces. Chunks left with no tokens are skipped.

    Also prints the sentence, prefixed with the module-level ``index``.

    Args:
        sen: Sentence text (a str; it is concatenated into the printed line).

    Returns:
        List of filtered phrase strings, in chunk order.
    """
    # parse sentence
    doc = nlp(str(sen))
    print('\n' + str(index) + '. Original Sentence:\n' + sen)

    #ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
    chunks = []
    for chunk in doc.noun_chunks:
        role = chunk.root.dep_
        if not ("subj" in role or "obj" in role):
            continue

        # exclude stop words and personal pronouns (whose lemma_ is "-PRON-")
        kept = [
            token.text
            for token in nlp(chunk.text)
            if not token.is_stop and token.lemma_ != "-PRON-"
        ]
        if kept:
            chunks.append(' '.join(kept))

    return chunks
        
# load Spacy NLP dictionary
nlp = spacy.load('en_core_web_sm')

# load DBPD ontology and construct graph for query
#m_world = World()# Owlready2 stores every triples in a ‘World’ object
#m_onto = m_world.get_ontology("dbpedia.owl").load()
#m_graph = m_world.as_rdflib_graph()
# Shared SPARQL endpoint object (read by QueryInfo).
sparql = SPARQLWrapper("http://localhost:8890/sparql")
#sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

# load data
senSet = []
# `with` guarantees the file is closed even if a row fails to parse;
# newline='' is what the csv module expects from the file object.
with open("shortdataset.csv", "r", newline='') as file:
#with open("newdataset_formatted.csv", "r", newline='') as file:
    reader = csv.reader(file)
    for item in reader:
        # format sentences in item as string
        fullP = "".join(item)
        # Everything after the third ';' is treated as the text.
        splitP = fullP.split(";", 3)
        if len(splitP) < 4:
            # Blank or malformed row: no text field to read.
            continue
        # Skip the first character of the text field, then split on '.'.
        splitS = splitP[3][1:].split(".")
        #print(splitS)
        senSet.extend(splitS) #store the sentences into an array
print("Total sentences: " + str(len(senSet)))

# pre-processing
PreProcess(senSet)

treeDict = dict()
cacheDict = dict()
# parse and query each sentence
entityList = []
#for index in range(10, 20):
#for index in range(len(senSet)):
index = 26
sampleSentence = "We examine how animating a viewpoint change in a spatial \
information system affects a user’s ability to build a mental \
map of the information in the space. We found that \
animation improves users' ability to reconstruct the \
information space, with no penalty on task performance \
time. We believe that this study provides strong evidence \
for adding animated transitions in many applications with \
fixed spatial data where the user navigates around the data \
space."
# sampleSentence = "Neverland has the tree house."

# The text under analysis. It is also the text stored with each result
# below, so the output always matches what was actually analysed (it used
# to record senSet[index] even when sampleSentence was processed).
currentSentence = sampleSentence
#currentSentence = senSet[index]

# extract named entities from current sentence
entityList = RunNER(currentSentence)
print(entityList)

# look up the URI for the entities
URIList = []
for entity in entityList:
    print("\nFor \"" + entity + "\":")
    try:
        if entity in cacheDict:
            entityURI = cacheDict[entity]
            if entityURI is not None:
                print("You mentioned", entity, "before. Do you mean", entityURI, "?")
            else:
                print("You mentioned", entity, "before, but we can't find anything about it.")

        else:
            entityURI = QueryURI(entity)
            print(entityURI)
            # Caching is currently disabled, so the branch above is not reached.
            #cacheDict[entity] = entityURI

        print("\n")
        #print("URI: " + entityURI[1:len(entityURI)-1])
        if entityURI is not None:
            #URIList.append(entityURI)
            for dbpediaURI in entityURI:
                # NOTE(review): DBPD2CSO is not defined in the visible part
                # of this notebook -- confirm it exists in the kernel,
                # otherwise every lookup ends here with a NameError.
                csoURIs = DBPD2CSO(dbpediaURI)
                URIList.extend(csoURIs)
    except Exception as err:
        # Best effort: keep going with the next entity, but say what went
        # wrong instead of hiding it (a bare `except:` also swallowed
        # KeyboardInterrupt and programming errors).
        print("none (" + type(err).__name__ + ": " + str(err) + ")")

print(URIList)

# Disabled: merge the hierarchies into treeDict.
#if len(URIList)>0:
#    AppendTree(URIList, treeDict)

outputList = []
for URI in URIList:
    entityInfo = {
        "uri": URI,
        "strPath": "",
        "sentence": currentSentence,
        "abstract": None,
        "thumbnail": None
    }
    hierarchy = QueryHierarchy(URI)
    # Path from the root down to the URI, joined with the "&-&" marker.
    entityInfo["strPath"] = "&-&".join(hierarchy)
    QueryInfo(URI, entityInfo)
    outputList.append(entityInfo)

print(outputList)

# Disabled: export the tree as JSON. (Kept as comments: a trailing string
# literal would be echoed as the cell's output.)
#treeJson = FormatToJson(outputList)
#print(treeJson)
#
#with open('../IdeaTest/Tree/conv-test.json', 'w') as outfile:
#    json.dump(treeJson, outfile, indent = 2)
    


Total sentences: 46
Pre-processing...

26. Original Sentence:
We examine how animating a viewpoint change in a spatial information system affects a user’s ability to build a mental map of the information in the space. We found that animation improves users' ability to reconstruct the information space, with no penalty on task performance time. We believe that this study provides strong evidence for adding animated transitions in many applications with fixed spatial data where the user navigates around the data space.
['viewpoint change', 'spatial information system', 'user ’s ability', 'mental map', 'information', 'space', 'animation', "users ' ability", 'information space', 'penalty', 'task performance time', 'study', 'strong evidence', 'animated transitions', 'applications', 'fixed spatial data', 'user', 'data space']

For "viewpoint change":
Sorry, we find nothing for this stuff :(

None



For "spatial information system":
Sorry, we find nothing for this stuff :(

None



For "user

TypeError: must be str, not list



31050


Exception: DESCRIBE not implemented

In [29]:
# Sample tree in the {"name", "children"} layout, used to check that a
# reference taken from inside the structure edits the structure itself.
d = {
    "name": "Root Level",
    "children": [
        {
            "name": "Top Level",
            "children": [
                {
                    "name": "Level 2: A",
                    "children": [
                        {"name": "Child1 of A"},
                        {
                            "name": "Child2 of A",
                            "children": [
                                {"name": "Child1 of Child2"},
                                {"name": "Child2 of Child2"},
                                {"name": "Child3 of Child2"},
                            ],
                        },
                        {"name": "Child3 of A"},
                        {"name": "Child4 of A"},
                        {"name": "Child5 of A"},
                    ],
                },
                {
                    "name": "Level 2: B",
                    "children": [
                        {"name": "Child1 of B"},
                        {"name": "Child2 of B"},
                        {"name": "Child3 of B"},
                    ],
                },
            ],
        },
    ],
}

# `node` is the same object as the first "Level 2" entry, not a copy,
# so renaming it is visible when `d` is printed.
top_level = d["children"][0]
node = top_level["children"][0]
node["name"] = "TestTTTTT"

print(d)

{'name': 'Root Level', 'children': [{'name': 'Top Level', 'children': [{'name': 'TestTTTTT', 'children': [{'name': 'Child1 of A'}, {'name': 'Child2 of A', 'children': [{'name': 'Child1 of Child2'}, {'name': 'Child2 of Child2'}, {'name': 'Child3 of Child2'}]}, {'name': 'Child3 of A'}, {'name': 'Child4 of A'}, {'name': 'Child5 of A'}]}, {'name': 'Level 2: B', 'children': [{'name': 'Child1 of B'}, {'name': 'Child2 of B'}, {'name': 'Child3 of B'}]}]}]}
