In [3]:
# create a dictionary based on all the words in the ontology

import csv

# load from original dataset
with open('CSO.3.1_short.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    entitySet = set()
    for item in reader:
        # subject
        if "https://cso.kmi.open.ac.uk/topics/" in item[0]:
            entity = item[0].replace('https://cso.kmi.open.ac.uk/topics/', '')
            entitySet.add(entity[1:-1])
        
        # object
        if "https://cso.kmi.open.ac.uk/topics/" in item[2]:
            entity = item[2].replace('https://cso.kmi.open.ac.uk/topics/', '')
            entitySet.add(entity[1:-1])
    #print (entitySet)

# store to new dictionary
with open('cso_dict_short.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for entity in entitySet:
        writer.writerow({entity})

print ("Dictionary created!")

Dictionary created!


In [32]:
# query local ontology ('nt' format) with rdflib
# export the hierarchy structure of current ontology
# the max depth can be decided by users with MAX_DEPTH
from rdflib import Graph
import json
import time

MAX_DEPTH = 2
totalEntity = 1

def QueryChildren (parentURI, depth): 
    # constraint the max depth for testing
    if depth > MAX_DEPTH:
        return []
    
    results = g.query("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
        SELECT ?children
        WHERE { <""" + parentURI + """> <http://cso.kmi.open.ac.uk/schema/cso#superTopicOf> ?children. }
    """).serialize(format="json")
    results = json.loads(results)

    # the termination case
    if not len(results):
        return []
    
    chilrenList = []
    for result in results["results"]["bindings"]:
        entity = {
            "name": result["children"]["value"],
            "uncertainty" : 3
        }
        subChildren = QueryChildren (result["children"]["value"], depth+1)
        if len(subChildren):
            entity.update( {"children" : subChildren} )
        else:
            entity.update( {"size" : 3} )
        #print(result["children"]["value"])
        chilrenList.append(entity)
        
        global totalEntity
        totalEntity = totalEntity+1

    return chilrenList
    
g = Graph()
g.parse("CSO.3.1.nt", format="nt")
#g.parse("dbpedia.nt", format="nt")

start = time.clock()
csoHierarchy = {
    "name": "https://cso.kmi.open.ac.uk/topics/computer_science",
    "uncertainty" : 3,
    "children": QueryChildren("https://cso.kmi.open.ac.uk/topics/computer_science", 1)
}

#treeJson = FormatToJson(csoHierarchy)
#print(treeJson)

with open('../IdeaTest/bubble-treemaps/html/cso_hierarchy.json', 'w') as outfile:  
    json.dump(csoHierarchy, outfile, indent = 2)

print ("Json file with " + str(totalEntity) + " entities is created successfully!")
print ("Total time: " + str(time.clock()-start))

Json file with 919 entities is created successfully!
Total time: 0.32504400000016176


In [3]:
# query online ontology ('owl' format) with sparqlWrapper
from SPARQLWrapper import SPARQLWrapper, JSON

#sparql = SPARQLWrapper("dbpedia.owl")
sparql = SPARQLWrapper("http://localhost:8890/sparql")
sparql.setQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?
    WHERE { <https://cso.kmi.open.ac.uk/topics/robotics> <<http://www.w3.org/2002/07/owl#sameAs>> <http://dbpedia.org/resource/Database> }
""")
# SELECT * WHERE {
#        ?s ?p ?o .
#}
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print(result["label"]["value"])
    #print(result["thumb"]["value"])

http://cso.kmi.open.ac.uk/schema/cso#Topic


In [42]:
import csv
import re

import spacy
from spacy import displacy
from spacy.pipeline import EntityRecognizer
from nltk.corpus import wordnet 

import json

import urllib
#from owlready2 import *
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON

import re
import xml.etree.ElementTree as ET

from allennlp.common.testing import AllenNlpTestCase
from allennlp.predictors.predictor import Predictor

# pre-processing
def PreProcess(senSet):
    #remove content between [ ]
    print("Pre-processing...")
    for index in range(len(senSet)):
        while senSet[index].find('[')>=0:
            i_start = senSet[index].find('[')
            i_end = senSet[index].find(']')
            s = senSet[index][i_start:i_end+2]
            senSet[index] = senSet[index].replace(s, "")


def QueryURI(keywords, index=-2):
    localSite = 'http://localhost:1111/api/search/KeywordSearch?'
    onlineSite = 'http://lookup.dbpedia.org/api/search/KeywordSearch?'
    prefix = "{http://lookup.dbpedia.org/}"
    
    keywords = keywords.replace(' ', "%20")
    request = onlineSite + \
    'QueryClass='   + ''  + \
    '&MaxHits='     + '5' + \
    '&QueryString=' + keywords
    response = str(urllib.request.urlopen(request).read(), 'utf-8')

    root = ET.fromstring(response)
    result = root.findall(prefix + "Result")
    uriList = []
    
    if len(result)>0:
        for entity in result:
            uriList.append(entity.find(prefix + "URI").text);
        return uriList
    else:
        print("Sorry, we find nothing for this stuff :(\n")
        return None
    
    '''if len(result)>0:
        selected = -1
        count = 0
        for name in result:
            print(str(count) + ": " + name.find(prefix + "Label").text)
            count += 1
        # for some default input during debugging
        if index<-1:
            index = int(input("Which one is closer to what you mean? (type \"-1\" if nothing seems correct) "))
        if index >= 0:
            selected = "<" + result[index].find(prefix + "URI").text + ">"
        else:
            selected = None
        return selected.replace("/resource", "/ontology")
    else:
        print("Sorry, we find nothing for this stuff :(\n")
        return None'''

            
# get ontology hierarchy for every keyword and append the knowledge tree
def AppendTree(URIList, treeDict):
    for URI in URIList:
        hierarchy = QueryHierarchy(URI);
        #print(hierarchy)
        
        curDict = treeDict;
        for curKey in hierarchy:
            if curKey in curDict:
                curDict = curDict[curKey]
            else:
                curDict[curKey] = dict()
                curDict = curDict[curKey]
    
# A recursive helper function to traverse treeDict and format it to json
def PreorderFormat(curDict):
    if len(curDict) == 0:
        return;
    
    childList = []
    for key in curDict:
        children = PreorderFormat(curDict[key])
        if children:
            childList.append({
                "name": key,
                "uncertainty": 3,
                "children": children
            })
        else:
            childList.append({
                "name": key,
                "uncertainty": 3,
                "size": 10
            })
    return childList
    
        
def FormatToJson(treeDict):
    resultList = PreorderFormat(treeDict)
    finalResult = None
    
    # only show the computer science part (remove math ...)
    for result in resultList:
        if "computer_science" in result["name"]:
            finalResult = result
    
    # show all part and add a root node
    '''finalResult = {
        "name": "GroundRoot",
        "uncertainty": 3,
        "children": resultList
    }'''
    
    return finalResult
    
# extract one triple from given sentence
def RunNER(sen):
    # initialize the named entity list
    entityList = []
    
    # parse sentence
    doc = nlp(str(sen))
    print('\n' + str(index) + '. Original Sentence:\n' + sen)

    #ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
    chunks = []
    for chunk in doc.noun_chunks:
        if "subj" in chunk.root.dep_ or "obj" in chunk.root.dep_:
            # test whether current chunk is or contains stop words
            result = ''
            doc_phrase = nlp(chunk.text)
            for token in doc_phrase:
                #print(token.text, token.is_stop, token.lemma_)
                if not token.is_stop and token.lemma_ != "-PRON-":
                # exclude stop words and personal pronouns (whose lemma_ is "-PRON-")
                    result = result + token.text + ' '
            
            if result != '':
                chunks.append(result[:-1])
    
    return chunks

# given a URI in DBPedia, query corresponding URI in CSO
def DBPD2CSO(dbpediaURI):
    csoURIList = []
    
    results = csoGraph.query("""
        SELECT ?csoURI
        WHERE { ?csoURI <http://www.w3.org/2002/07/owl#sameAs> <""" + dbpediaURI + """>. }
    """).serialize(format="json")
    results = json.loads(results)

    for result in results["results"]["bindings"]:
        csoURIList.append('<' + result["csoURI"]["value"] + '>');
        
    return csoURIList

# given a list of candidate uri, select the best one
def SelectURI(source, candiList):
    maxSim = 0;
    maxURI = candiList[0]
    
    for candidate in candiList:
        print(source[source.rfind("/")+1: -1])
        print(candidate[candidate.rfind("/")+1: -1])
        w1 = wordnet.synsets(source[source.rfind("/")+1: -1])
        w2 = wordnet.synsets(candidate[candidate.rfind("/")+1: -1])
        if len(w1) and len(w2):
            similarity = w1[0].wup_similarity(w2[0])
            print(similarity)
            if similarity > maxSim:
                maxSim = similarity
                maxURI = candidate
                print(candidate)
                print(similarity)
    return maxURI

# given a URI, query the ontology iteratively to get its path to root
def QueryHierarchy(URI):
    print(URI)
    path = []
    path.insert(0, URI)
    
    curURI = URI
    endFlag = False # to mark whether a dbo:entity is found in current level
    
    while not endFlag:
        endFlag = True
        
        qSelect = """
            SELECT ?parentURI
            WHERE { ?parentURI <http://cso.kmi.open.ac.uk/schema/cso#superTopicOf> """ + curURI + """. }
        """

        results = csoGraph.query(qSelect).serialize(format="json")
        results = json.loads(results)

        
        for result in results["results"]["bindings"]:
            resultURI = '<' + result["parentURI"]["value"] + '>'
            #print(resultURI)
            curURI = resultURI
            path.insert(0, resultURI)
            endFlag = False
            break;
     
    # insert the common root node to current path
    # path.insert(0, '<https://cso.kmi.open.ac.uk/topics/computer_science>')
    print(path)
    return path
        
# load Spacy NLP dictionary
nlp = spacy.load('en_core_web_sm')

# load DBPD ontology and construct graph for query
#m_world = World()# Owlready2 stores every triples in a ‘World’ object
#m_onto = m_world.get_ontology("dbpedia.owl").load()
#m_graph = m_world.as_rdflib_graph()
sparql = SPARQLWrapper("http://localhost:8890/sparql")
#sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

csoGraph = Graph()
csoGraph.parse("CSO.3.1.nt", format="nt")

# load data
file = open("shortdataset.csv", "r")
#file = open("newdataset_formatted.csv", "r")
reader = csv.reader(file)
senSet = []
for item in reader:
    # format sentences in item as string
    fullP = "".join(item)
    splitP = fullP.split(";", 3);
    splitS = splitP[3][1:len(splitP[3])].split(".");
    #print(splitS)
    for sen in splitS:
        senSet.append(sen)#store the sentence into an array
file.close()
print("Total sentences: " + str(len(senSet)))

# pre-processing
PreProcess(senSet)

cacheDict = dict()
# parse and query each sentence
entityList = []
#for index in range(10, 20):
#for index in range(len(senSet)):
index = 26
sampleSentence = "We examine how animating a viewpoint change in a spatial \
information system affects a user’s ability to build a mental \
map of the information in the space. We found that \
animation improves users' ability to reconstruct the \
information space, with no penalty on task performance \
time. We believe that this study provides strong evidence \
for adding animated transitions in many applications with \
fixed spatial data where the user navigates around the data \
space."
'''sampleSentence = "We believe that this study provides strong evidence \
for adding animated transitions in many applications with \
fixed spatial data where the user navigates around the data \
space."'''

# extract named entities from current sentence
entityList = RunNER(sampleSentence)
#entityList = RunNER(senSet[index])
print(entityList)

# look up the URI for the entities
URIList = []
for entity in entityList:
    print("\nFor \"" + entity + "\":")
    try:
        if entity in cacheDict:
            entityURI = cacheDict[entity];
            if entityURI != None: 
                print("You mentioned", entity, "before. Do you mean", entityURI, "?")
            else:
                print("You mentioned", entity, "before, but we can't find anything about it.")

        else:
            entityURI = QueryURI(entity)
            #print(entityURI)
            #cacheDict[entity] = entityURI

        print("\n")
        #print("URI: " + entityURI[1:len(entityURI)-1])
        if entityURI != None:
            #URIList.append(entityURI)
            csoURIList = []
            for dbpediaURI in entityURI:
                csoURIList.extend(DBPD2CSO(dbpediaURI))
            if len(csoURIList):
                URIList.append(SelectURI(entity, csoURIList))
    except:
        print("none")

print(URIList)

# output the concatenated hierarchy
treeDict = dict()
if len(URIList)>0:
    AppendTree(URIList, treeDict)

treeJson = FormatToJson(treeDict)
print(treeJson)

with open('../IdeaTest/bubble-treemaps/html/cso_query_result.json', 'w') as outfile:  
    json.dump(treeJson, outfile, indent = 2)

'''
# output separated information for each entity
outputList = []
if len(URIList)>0:
    for URI in URIList:
        entityInfo = {
            "uri": URI,
            "strPath": "",
            "sentence": senSet[index],
            "abstract": None,
            "thumbnail": None
        }
        hierarchy = QueryHierarchy(URI)
        for curKey in hierarchy:
            entityInfo["strPath"] = entityInfo["strPath"]  + curKey + "&-&"
        entityInfo["strPath"] = entityInfo["strPath"][:-3]
        #QueryInfo(URI, entityInfo)
        outputList.append(entityInfo)

print(outputList)
'''


SyntaxError: invalid syntax (<ipython-input-42-7428b4e5cd3b>, line 276)

In [39]:
from nltk.corpus import wordnet 
    
w1 = wordnet.synsets("%$3")
w2 = wordnet.synsets("cat")
print(w1)
print(w2)
#print(w1.wup_similarity(w2))

[]
[Synset('cat.n.01'), Synset('guy.n.01'), Synset('cat.n.03'), Synset('kat.n.01'), Synset('cat-o'-nine-tails.n.01'), Synset('caterpillar.n.02'), Synset('big_cat.n.01'), Synset('computerized_tomography.n.01'), Synset('cat.v.01'), Synset('vomit.v.01')]


In [14]:
source = "<https://cso.kmi.open.ac.uk/topics/sequent_calculus>"
source = source[source.rfind("/")+1: -1]
print(source)

sequent_calculus


In [29]:
d = {
    "name": "Root Level",
    "children": [{ 
        "name": "Top Level",
        "children": [{ 
            "name": "Level 2: A",
                "children": [
                    { "name": "Child1 of A" },
                    { "name": "Child2 of A",
                        "children": [
                            { "name": "Child1 of Child2" },
                            { "name": "Child2 of Child2" },
                            { "name": "Child3 of Child2" }
                        ]
                    },
                    { "name": "Child3 of A" },
                    { "name": "Child4 of A" },
                    { "name": "Child5 of A" }
                ]
            },
            { 
                "name": "Level 2: B",
                "children": [
                    { "name": "Child1 of B" },
                    { "name": "Child2 of B" },
                    { "name": "Child3 of B" }
                ]
            }
        ]
    }]
}

# print(d)

node = d["children"][0]["children"][0]
node["name"] = "TestTTTTT"

print(d)

{'name': 'Root Level', 'children': [{'name': 'Top Level', 'children': [{'name': 'TestTTTTT', 'children': [{'name': 'Child1 of A'}, {'name': 'Child2 of A', 'children': [{'name': 'Child1 of Child2'}, {'name': 'Child2 of Child2'}, {'name': 'Child3 of Child2'}]}, {'name': 'Child3 of A'}, {'name': 'Child4 of A'}, {'name': 'Child5 of A'}]}, {'name': 'Level 2: B', 'children': [{'name': 'Child1 of B'}, {'name': 'Child2 of B'}, {'name': 'Child3 of B'}]}]}]}
