In [None]:
import spacy
import requests
import itertools
import json
import time
import sys
# Words that the question handlers drop (matched against token text, lemma or raw word)
# while they assemble the terms that are looked up on Wikidata.
excluded = ["average", "called", 'lifespan', 'maximum', 'minimum'] #words to completely ignore

In [None]:
# Pipeline kept as nlp_web: termToEnt falls back to it, and the question handlers run it on
# single lemmas to find property IDs.
nlp_web = spacy.load("en_core_web_lg")

In [None]:
# Pipeline kept as nlp_sci: termToEnt tries this one first.
nlp_sci = spacy.load("en_core_sci_lg")

In [None]:
#add patterns.json (make sure it's saved in the same directory)
#this cell can be re-run on its own: the patterns are read before the pipelines are touched
#(a missing file leaves them unchanged) and an existing entity_ruler is replaced, not duplicated

with open('./patterns.json', encoding='utf-8') as json_file:
    patterns = json.load(json_file)

for pipeline in (nlp_web, nlp_sci):
    if "entity_ruler" in pipeline.pipe_names:
        pipeline.remove_pipe("entity_ruler")

ruler_web = nlp_web.add_pipe("entity_ruler", first=True)
ruler_sci = nlp_sci.add_pipe("entity_ruler", first=True)

ruler_web.add_patterns(patterns)
ruler_sci.add_patterns(patterns)

In [None]:
#run this and then the block above to update patterns.json

# nlp_web.remove_pipe("entity_ruler")
# nlp_sci.remove_pipe("entity_ruler")

# General functions

In [None]:
def querySPARQL(query, returnTries=False):
    """Run a SPARQL query against the Wikidata query service and return the decoded JSON.

    query: the SPARQL query text.
    returnTries: when True, return a (results, tries) tuple where tries is the number of
        requests that were needed; otherwise return only the results.

    Rate limiting (429) and server errors are retried after a short pause. Any other 4xx
    answer means the service rejected the request itself, so retrying could never succeed;
    in that case requests.HTTPError is raised instead of looping forever.
    """
    url = 'https://query.wikidata.org/sparql'
    tries = 1
    while True:
        response = requests.get(url, params={'query': query, 'format': 'json'})
        if response.status_code == 200:
            results = response.json()
            if returnTries: return results, tries
            return results

        #permanent client-side failure (e.g. malformed query): give up right away
        if 400 <= response.status_code < 500 and response.status_code != 429:
            response.raise_for_status()

        time.sleep(.2)
        tries += 1

def getWikidataEntityId(term, depth=0, prop=False, returnAll=False):
    """Search Wikidata (wbsearchentities) for a term and return matching ID(s).

    term: anything whose str() is the search text.
    depth: index of the search hit to return when returnAll is False.
    prop: search for properties instead of entities.
    returnAll: return the IDs of all hits as a list (depth is ignored).

    Returns '' when the response has no 'search' section or has too few hits for depth.
    """
    searchParams = {'action':'wbsearchentities', 'language':'en', 'format':'json', 'search':str(term)}
    if prop:
        searchParams['type'] = 'property'

    response = requests.get('https://www.wikidata.org/w/api.php', searchParams).json()
    if 'search' not in response:
        return ''

    hits = response['search']
    if returnAll:
        return [hit['id'] for hit in hits]
    if depth < len(hits):
        return hits[depth]['id']
    return ''

def termToEnt(term):
    """Return the first entity either parser finds in term, or '' when neither finds one.

    nlp_sci is tried first; nlp_web is only consulted when nlp_sci yields no entities.
    """
    for pipeline in (nlp_sci, nlp_web):
        entities = pipeline(term).ents
        if entities:
            return entities[0]
    return ""

def removePlural(term):
    """Return a naive singular form of an English plural.

    term: a word (or phrase) that the caller already knows ends in 's'.

    Rules, tried in order for terms of at least four characters:
      * '...ses', '...xes', '...zes', '...shes', '...ches' -> drop 'es'  (boxes -> box)
      * '...ies' -> replace with 'y'                                      (berries -> berry)
    Everything else just loses its last character (cats -> cat).
    """
    if len(term) >= 4:
        #the 'es' rules only make sense when the term really ends in 'es';
        #without this check 'lists' became 'lis' and 'texts' became 'tex'
        if term.endswith("es"):
            if term[-3] in ("s", "x", "z"): return term[:-2]
            if term[-4:-2] in ("ss", "sh", "ch"): return term[:-2]
        if term.endswith("ies"): return term[:-3] + 'y'
    return term[:-1]

def termToIds(term, prop):
    """Return a de-duplicated list of candidate Wikidata IDs for a term.

    term: the text to resolve; anything with a str() form is accepted (e.g. a spaCy span).
    prop: whether to search for properties instead of entities.

    Candidates come, in order, from: the term itself, its singular form, the entities the
    parsers find in it, and finally every separate word of the term. When an entity found in
    the whole term carries a Wikidata ID as its label (an ID stored in patterns.json), only
    that ID is returned.
    """
    def isWikidataId(label):
        #labels from patterns.json are IDs like 'P2048' / 'Q42'; ordinary NER labels such as
        #'PERSON' or 'QUANTITY' also start with P/Q and must not be mistaken for IDs
        return label[:1] in ("P", "Q") and label[1:].isdigit()

    if not term: return []
    #work on stripped text so spans, stray whitespace and whitespace-only input are all safe
    term = str(term).strip()
    if not term: return []

    IDS = []
    ents = [termToEnt(term)]

    #use the initial term to get IDs
    IDS.extend(getWikidataEntityId(term, returnAll=True, prop=prop))

    #remove 's' from the term in case it's plural, and add it and its entities
    if term[-1] == 's':
        singular = removePlural(term)
        ents.append(termToEnt(singular))
        IDS.extend(getWikidataEntityId(singular, returnAll=True, prop=prop))

    #add IDs of the entities in the term, found by the parser
    for ent in ents:
        if str(ent) != "":
            #if the ID is stored in patterns.json, return only that ID
            if isWikidataId(ent.label_):
                return [ent.label_]

            #add the IDs that this entity gives
            IDS.extend(getWikidataEntityId(ent, returnAll=True, prop=prop))

    #split the term into words and get more IDs from those
    #(split() without an argument never yields empty words, so word[-1] is always valid)
    for splitTerm in term.split():
        ents = [termToEnt(splitTerm)]
        if splitTerm[-1] == 's': ents.append(termToEnt(removePlural(splitTerm)))
        for ent in ents:
            if str(ent) != "":
                if isWikidataId(ent.label_):
                    IDS.append(ent.label_)
                IDS.extend(getWikidataEntityId(ent, returnAll=True, prop=prop))

    #return the list without duplicates, keeping the order of first appearance
    return list(dict.fromkeys(IDS))

def getAltLabelsQuerySPARQL(ID):
    """Return the casefolded skos:altLabels of a Wikidata ID.

    Every label is followed in the list by the same label with an 's' appended, as a
    possible plural.
    """
    query = '''SELECT ?name WHERE { wd:''' + ID + ' skos:altLabel ?name }'
    results = querySPARQL(query)
    if not results['head']:
        return []

    altLabels = []
    for binding in results['results']['bindings']:
        label = binding['name']['value'].casefold()
        altLabels += [label, label + 's']
    return altLabels

# Functions for how and verb questions

In [None]:
def getID(term, depth=0, property = False, returnAll = False):
    """Return an ID (or all IDs) that Wikidata's search finds for a property or entity.

    term: the text to search for.
    depth: index of the search hit to return when returnAll is False.
    property: search for properties instead of entities.
    returnAll: return the IDs of all hits as a list (depth is ignored).

    Returns '' when nothing suitable is found. This used to be a line-for-line copy of
    getWikidataEntityId; it now delegates so the two cannot drift apart.
    """
    return getWikidataEntityId(term, depth=depth, prop=property, returnAll=returnAll)

def querySearch(entID, propID):
    """Query Wikidata for the English labels of the values of propID on entID.

    Returns 0 when the query has no results, otherwise the list of all answer labels.
    The list used to be re-created for every binding and returned right after the first
    one, so every answer except the first was silently dropped.
    """
    query = 'SELECT ?answerLabel WHERE { wd:' + entID + ' wdt:' + propID + ' ?answer . SERVICE wikibase:label {  bd:serviceParam wikibase:language "en" .  ?answer rdfs:label ?answerLabel . } }'
    url = 'https://query.wikidata.org/sparql'

    data = requests.get(url, params={'query': query, 'format': 'json'}).json()

    bindings = data['results']['bindings']
    if bindings == []:
        return 0 #callers rely on 0 as the 'nothing found' marker
    return [item[var]['value'] for item in bindings for var in item]

# Function to find entity of parsed questions
def findEntity(parse):
    """Pick the entity term of a question from its dependency parse.

    parse: iterable of tokens exposing pos_, dep_, lemma_ and head (presumably a spaCy
        parse - confirm against the callers).

    Returns one to three lemmas joined by single spaces, or '' when no rule matches.
    The rules are tried strictly in the order written; the first one that fires wins.
    """
    entity = ''
    numberOfNouns = 0
    i = 0 # NOTE(review): incremented in the loops below but never read
    for token in parse:
        if token.pos_ == 'NOUN' or token.pos_ == 'PRON':
            numberOfNouns += 1 # First checks the number of nouns in the question
    if numberOfNouns == 1:
        for token in parse: # The search criteria if only one noun is present
            # the subject (or a noun attribute) is taken as the entity on its own
            if token.dep_ == 'nsubj'or (token.dep_ == 'attr' and token.pos_ == 'NOUN'):
                entity = token.lemma_
                break
            # a modifier is joined with its head, and with the head's head when the head is itself a modifier
            if token.dep_ == 'amod' or token.dep_ == 'compound':
                if (token.head.dep_ == 'amod' or token.head.dep_ == 'compound'):
                    entity = token.lemma_ + ' ' + token.head.lemma_ + ' ' + token.head.head.lemma_
                else:
                    entity = token.lemma_ + ' ' + token.head.lemma_
                break
            i += 1
    else:
        for token in parse: # Search criterias for when there are multiple nouns in question
            if token.lemma_ not in excluded:
                # subject of a clausal complement, or any particle: use the head's lemma
                if ((token.dep_ == 'nsubj' and token.head.dep_ == 'ccomp') or token.pos_ == 'PART'):
                    entity = token.head.lemma_
                    break
                # adjective on a subject whose own head is not an auxiliary
                elif token.pos_ == 'ADJ' and token.head.dep_ == 'nsubj' and token.head.head.pos_ != 'AUX':
                    entity = token.lemma_ + ' ' + token.head.lemma_
                    break
                # NOTE(review): 'and' binds tighter than 'or', so the last alternative is
                # (compound AND head is nsubj AND head's head is a VERB) as a whole
                elif ((token.dep_ == 'amod' and token.head.dep_ == 'pobj') or (token.dep_ == 'compound' and token.head.dep_ == 'pobj') or 
                (token.dep_ == 'compound' and token.head.dep_ == 'compound') or (token.dep_ == 'compound') and token.head.dep_ == 'nsubj' and token.head.head.pos_ == 'VERB'):
                    if (token.head.dep_ == 'amod' or token.head.dep_ == 'compound'):
                        entity = token.lemma_ + ' ' + token.head.lemma_ + ' ' + token.head.head.lemma_
                    else:
                        entity = token.lemma_ + ' ' + token.head.lemma_   
                    break
                i += 1 
        if entity == '': # Second search for when there are multiple nouns in the question
            for token in parse:
                # first token whose grandparent in the tree is 'of'
                if (token.head.head.lemma_ == 'of'):
                    if token.dep_ == 'compound' or token.dep_ == 'amod':
                        if (token.head.dep_ == 'amod' or token.head.dep_ == 'compound'):
                            entity = token.lemma_ + ' ' + token.head.lemma_ + ' ' + token.head.head.lemma_
                        else:
                            entity = token.lemma_ + ' ' + token.head.lemma_  
                    else:
                        entity = token.head.lemma_
                    break

        if entity == '': # Final search for when there are multiple nouns in the question
            for token in parse:
                # fall back to the subject of an auxiliary, or to the first prepositional object
                if ((token.dep_ == 'nsubj' and token.head.pos_ == 'AUX')):
                    entity = token.lemma_ 
                    break   
                elif token.dep_ == 'pobj':
                    entity = token.lemma_
                    break
      
    return entity

# Verb questions

In [None]:
def verbQuerySPARQL(ent, prop):
    """Query the labels of the values of property prop on entity ent ('verb' questions).

    ent, prop: Wikidata IDs as strings. Returns the list of answer values, empty when the
    query yields nothing.
    """
    query = 'SELECT ?answerLabel WHERE { wd:' + ent + ' wdt:' + prop + ''' ?answer .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }}'''
    response = querySPARQL(query)
    if not response['head']:
        return []

    answers = []
    for binding in response['results']['bindings']:
        for cell in binding.values():
            print("                          ", end="\r") #clear the 'loading' message
            answers.append(cell['value'])
    return answers


def verbQuestions(parse):
    """Answer a 'verb' question: look up the property named by a verb on the question's entity.

    parse: the parsed question (tokens with pos_, dep_, lemma_ and head).

    Returns the answers of the first candidate entity ID that yields any, or [] when no
    property or no answer is found.
    """
    #lemma of the first verb, used to recognise its subject / object
    verb = ''
    for token in parse:
        if token.pos_  == 'VERB':
            verb = token.lemma_
            break

    entity, propID = '', ''
    for token in parse:
        if (token.dep_ == 'nsubj' or token.dep_ == 'dobj') and (token.head.lemma_ == verb or token.head.pos_ == 'AUX') and token.pos_ == 'NOUN':
            entity = token.lemma_ # First attempt at finding the entity
        if token.pos_  == 'VERB':
            ents = nlp_web(token.lemma_).ents
            if ents:
                label = ents[0].label_
                #only labels that are property IDs (from the patterns file) count;
                #NER labels such as 'PERSON' also start with 'P'
                if label[:1] == "P" and label[1:].isdigit():
                    propID = label

    #without a property every query would be 'wd:<id> wdt: ?answer', which cannot match
    #anything, so skip the entity lookups and the queries altogether
    if not propID: return []

    if not entity: entity = findEntity(parse) # If entity was not found prior, use the findEntity function
    for entityID in termToIds(entity, False):
        answers = verbQuerySPARQL(entityID, propID)
        if answers: return answers
    return []

# How questions

In [None]:
def howQuerySPARQL(ent, prop):
    """Query the labels of the values of property prop on entity ent ('how' questions).

    ent, prop: Wikidata IDs as strings. Returns the list of answer values, empty when the
    query yields nothing.
    """
    query = 'SELECT ?answerLabel WHERE { wd:' + ent + ' wdt:' + prop + ''' ?answer .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }}'''
    reply = querySPARQL(query)
    found = []
    if reply['head']:
        for row in reply['results']['bindings']:
            for column in row:
                print("                          ", end="\r") #clear the 'loading' message
                found.append(row[column]['value'])
    return found

def alternateHowQuestion(parse):
    """Return True when the question contains more than one token tagged NOUN."""
    return sum(1 for token in parse if token.pos_ == 'NOUN') > 1

def findAlternateProperty(parse):
    """Alternate way to find the property term of a 'how' question from the subject.

    parse: the parsed question; must support slicing to a span with a .text attribute.

    Returns the property text, or '' when no rule fires. The loop stops at the first
    compound noun attached to a subject; otherwise the last matching rule wins.
    """
    prop = ''
    sawMany = False
    modifierCount = 0 #adjectival modifiers of a subject seen so far ('many' excluded)
    for position, token in enumerate(parse):
        #'how many X': the counted noun is the property
        if token.lemma_ == 'many' and token.head.dep_ == 'nsubj':
            sawMany = True
            prop = token.head.lemma_
        if token.dep_ == 'amod' and token.head.dep_ == 'nsubj' and token.lemma_ != 'many':
            modifierCount += 1

        if modifierCount > 0 and token.dep_ == 'nsubj':
            #take the subject together with the modifiers counted before it
            prop = parse[position - modifierCount : position + 1].text
        elif token.pos_ == 'NOUN' and token.dep_ == 'compound' and token.head.dep_ == 'nsubj':
            prop = token.text + ' ' + token.head.text
            break
        elif token.pos_ == 'NOUN' and token.dep_ == 'nsubj' and not sawMany:
            prop = token.text

    return prop

def howQuestions(parse):
    """Answer a 'how' question by cross-searching candidate entity and property IDs.

    parse: the parsed question. Returns the list of answers, or [] when none are found.
    """
    entity = findEntity(parse) # Search for the entity using the findEntity method

    # First attempt: look every lemma up in the patterns file; the last property found wins,
    # except that the age property (P4214) can never be overwritten once it has been seen
    propID = []
    ageSeen = False
    for token in parse:
        ents = nlp_web(token.lemma_).ents
        if not ents:
            continue
        if hasattr(ents[0], 'label_') and ents[0].label_[0] == "P":
            propID = [ents[0].label_]
        if propID == ['P4214']:
            ageSeen = True
    if ageSeen:
        propID = ['P4214']

    # Still no property and more than one noun: use the alternate property search
    if alternateHowQuestion(parse) and propID == []:
        propID = termToIds(findAlternateProperty(parse), True)

    # With an entity term, search that term only; otherwise try each entity of the parse
    if entity:
        candidateTerms = [entity]
    else:
        candidateTerms = (str(ent) for ent in parse.ents)

    for candidate in candidateTerms:
        entID = termToIds(candidate, False)
        if entID and propID:
            answers = howQuestionAnswerLoop(propID, entID) # Cross search every saved entity and property ID
            if answers: return answers
    return []
    
def howQuestionAnswerLoop(IDS_P, IDS_E):
    """Try property/entity ID pairs in widening order and return the first answers found.

    IDS_P: list of property IDs; IDS_E: list of entity IDs. At every step one more property
    and one more entity join the search, so pairs of high-ranked IDs are queried first.
    Returns [] when no pair yields an answer.
    """
    steps = max(len(IDS_E), len(IDS_P))
    for depth in range(steps):
        #show the progress of the loop
        print("\rloading:", depth, "/", steps, end="\r")

        #the new property against every entity added before this step
        if depth < len(IDS_P):
            newProperty = IDS_P[depth]
            for knownEntity in IDS_E[:depth]:
                answers = howQuerySPARQL(knownEntity, newProperty)
                if answers:
                    print("                          ", end="\r") #clear the 'loading' message
                    return answers

        #the new entity against every property up to and including the new one
        if depth < len(IDS_E):
            newEntity = IDS_E[depth]
            for knownProperty in IDS_P[:depth+1]:
                answers = howQuerySPARQL(newEntity, knownProperty)
                if answers:
                    print("                          ", end="\r")
                    return answers

    print("                          ", end="\r")
    return []

# What is the X of Y questions

In [None]:
def whatQuerySPARQL(prop, ent):
    """Run 'SELECT ?answerLabel WHERE wd:<ent> wdt:<prop> ?answer' and return all answers.

    prop: property ID (the X of the question); ent: entity ID (the Y of the question).
    Returns the list of answer values, empty when the query yields nothing.
    """
    query = 'SELECT ?answerLabel WHERE { wd:' + ent + ' wdt:' + prop + ''' ?answer .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }}'''
    outcome = querySPARQL(query)
    if not outcome['head']:
        return []

    collected = []
    for record in outcome['results']['bindings']:
        for field in record.values():
            print("                          ", end="\r") #clear the 'loading' message
            collected.append(field['value'])
    return collected

def answer_what(parse, question):
    """Answer 'What/Who was/is/were (the/a/an) X of (the/a/an) Y?' and return all answers.

    parse is unused here; the X and Y terms are taken from the words of question. The first
    two words are skipped, the last 'of'/'for' separates X from Y, and articles as well as
    excluded words are dropped.
    """
    words = question.split()

    #position of the separator, relative to the words after the first two
    ofSplit = 0
    for position, word in enumerate(words):
        if word in ["of", "for"]: ofSplit = position - 2

    xWords, yWords = [], []
    for offset, word in enumerate(words[2:]):
        if word in ["the", "an", "a"] or offset == ofSplit or word in excluded:
            continue
        side = xWords if offset < ofSplit else yWords
        side.append(word.strip("?"))

    x = " ".join(xWords)
    y = " ".join(yWords)

    #get the entityIDs corresponding to x and y
    IDS_X = termToIds(x, True)
    IDS_Y = termToIds(y, False)

    #query loop, one more X and one more Y join the search at every step
    steps = max(len(IDS_Y), len(IDS_X))
    for depth in range(steps):
        #show the progress of the loop
        print("\rloading:", depth, "/", steps, end="\r")

        #the new X against every Y added before this step
        if depth < len(IDS_X):
            for ID_Y in IDS_Y[:depth]:
                answers = whatQuerySPARQL(IDS_X[depth], ID_Y)
                if answers:
                    print("                          ", end="\r") #clear the 'loading' message
                    return answers

        #the new Y against every X up to and including the new one
        if depth < len(IDS_Y):
            for ID_X in IDS_X[:depth+1]:
                answers = whatQuerySPARQL(ID_X, IDS_Y[depth])
                if answers:
                    print("                          ", end="\r")
                    return answers

    print("                          ", end="\r")
    return []

# Or questions

In [None]:
def answer_isXaYor(parse, question):
    """Answer 'Is/Was/Are/Were X (a/an) Y(, Z) or W?'.

    Returns the list of alternatives (as text) for which answer_isXaY confirms that the
    subject belongs to them.
    """
    skippedDeps = ["det", "conj", "cc", "attr", "appos"]

    def subtreeWords(token):
        #words below token, without determiners, coordinated/attribute heads and excluded words
        return [d.text.strip(', ') for d in token.subtree
                if d.dep_ not in skippedDeps and d.text not in excluded]

    #Dependency analysis
    subject, attributes = [], []
    for w in parse:
        if w.dep_ == "nsubj":
            subject.extend(subtreeWords(w))
        if w.dep_ in ["appos", "conj", "attr"]:
            #every alternative starts with its head word, followed by its remaining words
            attributes.append([w.text] + subtreeWords(w))

    subj = " ".join(subject)

    #keep every alternative that answer_isXaY accepts as a superclass of the subject
    answers = []
    for attr in (" ".join(attribute) for attribute in attributes):
        if answer_isXaY(parse, question, subj=subj, attr=attr):
            answers.append(attr)
    return answers

# Are X and Y the same / different questions

In [None]:
def answer_isXYdiff(parse, question, same=False):
    """Answer whether X and Y are different (default) or whether they are the same.

    parse: the parsed question; question: its raw text.
    same: True when the question asks whether X and Y are the same.

    Returns a bool: the yes/no answer to the question as asked. Three sources are tried:
    the 'different from' (P1889) and 'said to be the same as' (P460) statements of either
    term, then the alternative labels of either term, and finally the default assumption
    that two unrelated terms are different.
    """
    #Dependency analysis
    subject, attribute = [], []

    #question form 1
    if "different from" in question or "same as" in question:
        for w in parse:
            if w.dep_ in ["nsubj", "npadvmod"]:
                for d in w.subtree:
                    if d.dep_ != "det" and d.text not in excluded and d.text != "same":
                        subject.append(d.text)
            if w.dep_ == "pobj":
                for d in w.subtree:
                    if d.dep_ != "det" and d.text not in excluded and d.text != "same":
                        attribute.append(d.text)

    #question form 2
    else:
        for w in parse:
            if w.dep_ in ["nsubj","attr"]:
                for d in w.subtree:
                    if d.dep_ not in ["det", "conj", "cc"] and d.text not in excluded and d.text not in ["same", "thing"]:
                        subject.append(d.text)
                    elif d.dep_ == "conj":
                        for e in d.subtree:
                            if e.dep_ != "det" and e.text not in excluded and e.text not in ["same", "thing"]:
                                attribute.append(e.text)

    x = " ".join(subject); y = " ".join(attribute)

    #get the entityIDs corresponding to the subject and attribute
    IDS_X = termToIds(x, False); IDS_Y = termToIds(y, False)

    #query loop 1, first IDS_X and then IDS_Y
    for currIDS, otherList in ((IDS_X, IDS_Y), (IDS_Y, IDS_X)):
        for ID in currIDS:
            #is an ID of the other term a 'different from' or 'said to be the same as' value of this ID?
            for propId in ('P1889', 'P460'):
                query = 'SELECT ?answer WHERE { wd:' + ID + ' wdt:' + propId + ' ?answer . }'
                results = querySPARQL(query)
                for item in results['results']['bindings']:
                    for var in item:
                        if item[var]['value'].split('/')[-1] in otherList: #it is
                            linkedAsSame = (propId == 'P460')
                            #'yes' exactly when the statement found matches what was asked
                            return linkedAsSame == bool(same)

    #query loop 2, first IDS_X and then IDS_Y
    for currIDS, otherTerm in ((IDS_X, y), (IDS_Y, x)):
        #the alternative labels are casefolded, so the term has to be as well
        otherTerm = otherTerm.casefold()
        for ID in currIDS:
            #do the altLabels of this ID contain the other term, or one of its words?
            alts = getAltLabelsQuerySPARQL(ID)
            #the whole-term test used to be " ".join(term), which spaces out the characters
            #of the string ('cat' -> 'c a t') and therefore never matched
            if otherTerm in alts or not set(alts).isdisjoint(otherTerm.split()): #it does
                return bool(same)

    #nothing was found, so X and Y are likely different
    return not same

# Yes/no questions

In [None]:
def answer_isXpY(parse, question, subj, val):
    """Return True when some candidate ID of subj has any statement whose value is a candidate ID of val.

    parse and question are accepted for signature symmetry with the other handlers but are
    not used. subj, val: the subject and value terms as text.
    """
    #get entityIDs corresponding to the subject and object
    IDS_X = termToIds(subj, False)
    IDS_Y = termToIds(val, False)

    for ID_X in IDS_X:
        query = '''SELECT ?answer 
              WHERE { wd:''' + ID_X + ' ?prop ?answer . }'
        bindings = querySPARQL(query)['results']['bindings']
        #values are URIs; their last path segment is the ID that is compared
        if any(cell['value'].split('/')[-1] in IDS_Y
               for binding in bindings for cell in binding.values()):
            return True
    return False
    
def answer_isXY(parse, question):
    """Answer 'Is/Was/Are/Were/Does/Did/Do/Can/Could X (a/an) Y?', where Y is an entity or a property value.

    Returns True as soon as a candidate value is found among the statements of X. Otherwise
    questions starting with is/was/are/were are handed to answer_isXaY; every other question
    is answered with True.
    """
    #Dependency analysis: the subject words first ...
    subject = [d.text.strip(', ')
               for w in parse if w.dep_ == "nsubj"
               for d in w.subtree
               if d.dep_ != "det" and d.text not in excluded]

    #... then one candidate value per token that is not part of the subject
    ignoredDeps = ["det", "nsubj", "punct", "ROOT"]
    values = []
    for w in parse:
        if w.text in subject:
            continue
        currValue = [d.text.strip(', ') for d in w.subtree
                     if d.dep_ not in ignoredDeps and d.text not in excluded]
        if currValue: values.append(" ".join(currValue))

    subj = " ".join(subject)

    #try to find Y in a property of X with the function answer_isXpY
    total = len(values)
    for index, value in enumerate(values):
        print("\rloading:", index, "/", total, end="\r")

        if answer_isXpY(parse, question, subj, value):
            print("                          ", end="\r")
            return True

    print("                          ", end="\r")

    #use the function answer_isXaY in case Y is an entity
    if parse[0].text.casefold() in ["is","was","are","were"]:
        return(answer_isXaY(parse, question))
    return True

# Is X a Y questions

In [None]:
def isSuperclass(cls, sprClasses, sprTerm, checked=None, depth=0):
    """Recursively check whether cls has one of sprClasses as a (not necessarily direct) superclass.

    cls: Wikidata ID of the class or instance to start from.
    sprClasses: collection of IDs that count as the wanted superclass.
    sprTerm: the wanted superclass as text; also matched against alternative labels.
    checked: set of IDs that were already examined; it is updated in place. When omitted a
        fresh set is created for this call (it used to be a mutable default argument, so
        the classes seen for one question were silently skipped for every later one).
    depth: number of superclass steps away from the original class; the search stops at 3.

    Returns (found, checked).
    """
    if checked is None:
        checked = set()
    if depth >= 3:
        return False, checked

    #query all superclasses (and 'instance of' classes) of this class
    queries = ['SELECT ?answer WHERE { wd:' + cls + ' wdt:P279 ?answer . }',
               'SELECT ?answer WHERE { wd:' + cls + ' wdt:P31 ?answer . }']
    for query in queries:
        results = querySPARQL(query)
        if not results['head']:
            continue
        for item in results['results']['bindings']:
            for var in item:
                actualSpr = item[var]['value'].split('/')[-1]
                if actualSpr in checked: #don't do the same queries again
                    continue
                checked.add(actualSpr)

                #check whether the actual superclass of this class is the correct one
                if actualSpr in sprClasses: return True, checked
                if sprTerm.casefold() in getAltLabelsQuerySPARQL(actualSpr): return True, checked

                #the recursive call works on the same set, so nothing has to be merged back
                found, _ = isSuperclass(actualSpr, sprClasses, sprTerm, checked=checked, depth=depth+1)
                if found: return True, checked

    return False, checked
    
def answer_isXaY(parse, question, subj="", attr=""):
    """Answer 'Is/Was/Are/Were X (a/an) Y?' by searching the superclasses of X for Y.

    parse: the parsed question; question: its raw text (unused here).
    subj, attr: subject and attribute as given by answer_isXaYor. When subj is empty both
        are derived from the parse instead.

    Returns True when a candidate ID of the attribute is an (indirect) superclass of a
    candidate ID of the subject, else False.
    """
    if not subj:
        #Dependency analysis
        subject, attribute = [], []
        for w in parse:
            if w.dep_ == "nsubj":
                for d in w.subtree:
                    if d.dep_ != "det" and d.text not in excluded: subject.append(d.text)
            elif w.dep_ == "attr":
                for d in w.subtree:
                    if d.dep_ != "det" and d.text not in excluded:
                        attribute.append(d.text)

        subj = " ".join(subject); attr = " ".join(attribute)

    #get the entityIDs corresponding to subj and attr
    IDS_S = termToIds(subj, False); IDS_O = termToIds(attr, False)

    #classes already examined for THIS question; handing it over explicitly keeps the
    #search from skipping classes that were only examined for an earlier question
    checked = set()

    #query loop
    for i, ID_S in enumerate(IDS_S):
        print("\rloading:", i, "/", len(IDS_S), end="\r")

        #use the isSuperclass function to recursively check whether an ID_O is a(n indirect) superclass of ID_S
        found, checked = isSuperclass(ID_S, IDS_O, attr, checked=checked)
        if found:
            print("                          ", end="\r")
            return True

    print("                          ", end="\r")
    return False

# List questions

In [None]:
def answer_listXofY(parse, question, reverse=False):
    """Answer 'List/Name X of/for/in Y' and return all answers.

    reverse: swap the roles of the two extracted terms. When the normal order finds nothing
    (and at least one term was extracted) the function retries itself once with reverse=True.
    """
    #Dependency analysis
    entWords, propWords = [], []
    for w in parse:
        if w.dep_ in ["nsubj"]:
            entWords += [d.text for d in w.subtree
                         if d.dep_ not in ["det", "prep", "dobj", "pobj"] and d.text not in excluded]
        if w.dep_ in ["relcl", "dobj", "amod", "pobj", "prep", "attr"]:
            propWords += [d.text for d in w.subtree
                          if d.dep_ not in ["det", "acl", "nsubj"] and d.text not in excluded]

    ent = " ".join(entWords)
    prop = " ".join(propWords)

    if reverse:
        ent, prop = prop, ent

    #get the entityIDs corresponding to x and y
    IDS_X = termToIds(prop, True)
    IDS_Y = termToIds(ent, False)

    #query loop, one more X and one more Y join the search at every step
    steps = max(len(IDS_Y), len(IDS_X))
    for depth in range(steps):
        #show the progress of the loop
        print("\rloading:", depth, "/", steps, end="\r")

        #the new X against every Y added before this step
        if depth < len(IDS_X):
            for ID_Y in IDS_Y[:depth]:
                answers = whatQuerySPARQL(IDS_X[depth], ID_Y)
                if answers:
                    print("                          ", end="\r") #clear the 'loading' message
                    return answers

        #the new Y against every X up to and including the new one
        if depth < len(IDS_Y):
            for ID_X in IDS_X[:depth+1]:
                answers = whatQuerySPARQL(ID_X, IDS_Y[depth])
                if answers:
                    print("                          ", end="\r")
                    return answers

    print("                          ", end="\r")
    if not reverse and (ent or prop):
        return answer_listXofY(parse, question, reverse=True)
    return []

# Count Questions

In [None]:
def queryCountSearch(ent, prop):
    """Count the values of property prop on entity ent with a SPARQL COUNT query.

    ent, prop: Wikidata IDs as strings. Returns the values of the result rows as a list.
    """
    query = 'SELECT (COUNT(?prop) as ?answer) WHERE { wd:' + ent + ' wdt:' + prop + ' ?prop . SERVICE wikibase:label {  bd:serviceParam wikibase:language "en" . } }'
    response = querySPARQL(query)
    if not response['head']:
        return []

    counts = []
    for row in response['results']['bindings']:
        for cell in row.values():
            counts.append(cell['value'])
            print("                          ", end="\r") #clear the 'loading' message
    return counts

def moreThanOneNoun(parse):
    """Return True when the question has more than one token tagged NOUN or PROPN."""
    nounTags = ('NOUN', 'PROPN')
    return len([token for token in parse if token.pos_ in nounTags]) > 1

def findCountEntity(parse):
    """Find the entity term of a count question.

    Three attempts, each only made when the previous one found nothing:
      1. the first proper noun or noun at or after a token whose lemma is 'do';
      2. the first proper noun, or the first noun when the question has at most one noun;
      3. the generic findEntity search.
    """
    def properNounTerm(token):
        #a proper noun attached to another proper noun is kept as a two-word name
        if token.head.pos_ == 'PROPN':
            return token.lemma_ + ' ' + token.head.lemma_
        return token.lemma_

    entity = [] #the empty list marks 'nothing found yet'

    afterDo = False
    for token in parse:
        afterDo = afterDo or token.lemma_ == 'do'
        if not afterDo:
            continue
        if token.pos_ == 'PROPN':
            entity = properNounTerm(token)
            break
        if token.pos_ == 'NOUN':
            entity = token.lemma_
            break

    if entity == []: # If entity was not found after first attempt, try second set of search criteria
        for token in parse:
            if token.pos_ == 'PROPN':
                entity = properNounTerm(token)
                break
            if not moreThanOneNoun(parse) and token.pos_ == 'NOUN':
                entity = token.lemma_
                break

    if entity == []: # If entity is still not found, use the findEntity method
        entity = findEntity(parse)
    return entity

def searchForCountProperty(parse):
    """Search for the property of a count question via the noun that 'many' belongs to.

    A token with lemma 'many' qualifies when its head is a noun and that noun's own head is
    not a verb; the noun's lemma is then looked up as a property with getID. The last
    qualifying token wins. Returns [] when no token qualifies.
    """
    propID = []
    for token in parse:
        if token.lemma_ != 'many':
            continue
        counted = token.head
        if counted.pos_ == 'NOUN' and counted.head.pos_ != 'VERB':
            propID = getID(counted.lemma_, property = True)
    return propID

def getCountProperty(parse):
    """Determine the property ID of a count question.

    A numeral anywhere in the question selects 'P1082' at once. Otherwise every lemma before
    it is looked up in the patterns file (the last property label found wins), and when
    that finds nothing searchForCountProperty is used.
    """
    propID = []
    for token in parse: # First, we search for the property in the patterns file
        if token.pos_ == 'NUM':
            return 'P1082'
        ents = nlp_web(token.lemma_).ents
        if ents and hasattr(ents[0], 'label_') and ents[0].label_[0] == "P":
            propID = ents[0].label_

    if propID == []: # If not found, fall back to the 'many' based search
        return searchForCountProperty(parse)
    return propID

# Main count question function
def countQuestion(parse):
    """Answer a 'how many ...' question.

    The entity comes from findCountEntity and the property from
    getCountProperty. When findCountEntity yields [], every entity in
    parse.ents is tried instead. Each candidate term is resolved with
    termToIds(term, False) and the resulting IDs are queried in order.

    Properties P7725, P3395 and P1082 are answered with the query used for
    'how' questions (howQuerySPARQL); every other property goes through
    countQuestionAnswerLoop.

    Returns the first non-empty answer, or [] when nothing was found.
    """
    entity = findCountEntity(parse)
    propID = getCountProperty(parse)
    useHowQuery = propID in ('P7725', 'P3395', 'P1082')

    # One lookup loop serves both cases: the single detected entity, or, when
    # detection failed, each named entity spaCy found in the question.
    candidateTerms = parse.ents if entity == [] else [entity]
    for term in candidateTerms:
        for entityID in termToIds(term, False):
            if useHowQuery:
                answers = howQuerySPARQL(entityID, propID)
            else:
                answers = countQuestionAnswerLoop(propID, entityID)
            if answers:
                return answers
    return []

def countQuestionAnswerLoop(IDS_P, IDS_E):
    """Try entity/property combinations with queryCountSearch until one answers.

    Combinations are explored in rounds. Round `depth` first pairs the new
    property IDS_P[depth] with every earlier entity, then pairs the new
    entity IDS_E[depth] with every property up to and including
    IDS_P[depth]. A '\\rloading: depth / total' progress line is printed per
    round and blanked out before returning.

    Returns the first truthy result of queryCountSearch(entity, property).
    Falls off the end (returning None) when no combination answers.
    """
    blank = "                          "  # overwrites the 'loading' message
    total = max(len(IDS_E), len(IDS_P))

    for depth in range(total):
        print("\rloading:", depth, "/", total, end="\r")

        # Collect this round's (entity, property) pairs in query order.
        pairs = []
        if depth < len(IDS_P):
            pairs.extend((ID_E, IDS_P[depth]) for ID_E in IDS_E[:depth])
        if depth < len(IDS_E):
            pairs.extend((IDS_E[depth], ID_P) for ID_P in IDS_P[:depth+1])

        for ID_E, ID_P in pairs:
            answers = queryCountSearch(ID_E, ID_P)
            if answers:
                print(blank, end="\r")
                return answers

    print(blank, end="\r")

# Different language question

In [None]:
def otherLanguage(qtype, language, parse):
    """Answer "what is Y called in language X" style questions.

    qtype selects how the entity Y is cut out of the parsed question:
      1 -- "What is the X word/name for Y?": tokens after the last 'for',
           up to but excluding the final token.
      2 -- "What is Y called in X?": tokens between the last 'be' lemma and
           the last 'call' lemma.
      3 -- "How do I say Y in X?": tokens between the last 'say' lemma and
           the last 'in' lemma.
      4 -- anything else: the subtree words of every 'pobj' token, minus
           determiners and the words in `excluded`.

    language is matched case-insensitively against the '; '-separated
    aliases in ./language_codes.json; the label of the last matching entry
    is used, or "" when nothing matches.

    Returns the first non-empty languageQuery result over the IDs from
    termToIds(entity, False). Returns [] when nothing is found, when the
    marker tokens for qtype 1-3 are missing, or when qtype is not 1-4.
    Those last two cases used to raise UnboundLocalError.
    """
    with open('./language_codes.json') as json_file:
        languageCodes = json.load(json_file)

    wanted = language.lower()
    languageLabel = ""
    for code in languageCodes:
        if wanted in code["pattern"].lower().split("; "):
            languageLabel = code["label"]  # no break: the last matching entry wins

    if qtype == 4:  # Other
        words = []
        for w in parse:
            if w.dep_ != "pobj":
                continue
            for d in w.subtree:
                if d.dep_ != "det" and d.text not in excluded:
                    words.append(d.text)
        entityText = " ".join(words)
    else:
        if qtype == 1:    # What is the X word/name for Y?
            before = _lastTokenIndex(parse, 'text', 'for')
            end = len(parse) - 1  # leave out the closing token (the question mark)
        elif qtype == 2:  # What is Y called in X?
            before = _lastTokenIndex(parse, 'lemma_', 'be')
            end = _lastTokenIndex(parse, 'lemma_', 'call')
        elif qtype == 3:  # How do I say Y in X?
            before = _lastTokenIndex(parse, 'lemma_', 'say')
            end = _lastTokenIndex(parse, 'lemma_', 'in')
        else:
            return []     # unknown question type: nothing to look up
        if before is None or end is None:
            return []     # the question does not have the expected shape
        entityText = parse[before + 1:end].text

    for ID_E in termToIds(entityText, False):
        answers = languageQuery(ID_E, languageLabel)
        if answers: return answers

    return []

def _lastTokenIndex(parse, attr, value):
    """Return the index of the last token whose `attr` equals `value`, or None."""
    found = None
    for index, token in enumerate(parse):
        if getattr(token, attr) == value:
            found = index
    return found

def languageQuery(entID, languageLabel):
    """Fetch the rdfs:label of a Wikidata entity in one language.

    entID is the entity ID placed after 'wd:' and languageLabel is the
    language tag the label must carry.

    Only the first result row matters: the value of its first variable is
    returned as a one-element list. An empty first row, or no rows at all,
    gives [].
    """
    query = "".join([
        'SELECT ?name WHERE { wd:', entID,
        ' rdfs:label ?name FILTER(lang(?name) = "', languageLabel, '")}',
    ])
    rows = querySPARQL(query)['results']['bindings']
    for row in rows:
        firstVar = next(iter(row), None)
        if firstVar is None:  # row without any bound variable
            return []
        return [row[firstVar]['value']]
    return []

# Description question

In [None]:
def descriptionQuestion(parse):
    """Answer 'What is (a/an) X?' with the description of X.

    X is every token after 'What is' (or after 'What is a/an'), excluding
    the final token. Each token's text is prefixed with a space, so the
    search term starts with a leading space. The term is resolved with
    termToIds(term, False) and the first ID for which descriptionQuery
    returns something decides the answer.

    Returns that answer, or [] when no ID has a description.
    """
    # Skip the article when the question reads "What is a/an X?".
    start = 3 if parse[2].lemma_ in ("a", "an") else 2
    term = "".join(" " + word.text for word in parse[start:len(parse) - 1])

    for ID_E in termToIds(term, False):
        found = descriptionQuery(ID_E)
        if found:
            return found
    return []
        
def descriptionQuery(entID):
    """Fetch the English schema:description of a Wikidata entity.

    Only the first result row matters: the value of its first variable is
    returned as a one-element list. An empty first row, or no rows at all,
    gives [].
    """
    query = '''SELECT ?description
  WHERE {
    wd:'''+entID+''' schema:description ?description .
    FILTER(lang(?description) = "en")
        }'''
    rows = querySPARQL(query)['results']['bindings']
    for row in rows:
        firstVar = next(iter(row), None)
        if firstVar is None:  # row without any bound variable
            return []
        return [row[firstVar]['value']]
    return []

# Main

In [None]:
def printAnswers(answers, question, depth=0, isListQuestion=False):
    """Print the answers comma-separated, or "null" when there are none.

    When answers is empty on a first attempt (depth == 0) and the question
    was not already handled as a list question, the question is re-parsed
    with nlp_web and answer_listXofY gets one chance to produce answers
    before "null" is printed.
    """
    mayRetry = depth == 0 and not isListQuestion
    if not answers and mayRetry:
        answers = answer_listXofY(nlp_web(question), question)

    if answers:
        print(", ".join(answers))
    else:
        print("null")

def main(start=0, stop=999):
    """Answer the questions in ./all_submitted_questions.json and print the results.

    start, stop: half-open window of question indices to process. The
    defaults reproduce the previously hard-coded 0 <= i < 999.

    Each question is printed with its index, parsed with nlp_web and handed
    to the first matching handler, in this order:
      1. language questions (three fixed phrasings, then any question that
         mentions a known language outside an 'amod' position),
      2. "What/Who/Which is/are ... of/for ..." questions,
      3. yes/no questions,
      4. "How ..." questions (not "how many"),
      5. "How many ..." questions,
      6. everything else: verb question, then list question, then a
         "What is (a/an) X?" description.

    Questions too short for a positional check now fall through to a later
    handler instead of raising IndexError.
    """
    with open('./all_submitted_questions.json') as json_file:
        questions = json.load(json_file)

    languageCodes = None  # read once, the first time the generic language check needs it

    for i, item in enumerate(questions):
        if not (start <= i < stop): continue
        question = item['string']
        print(i, question)
        parse = nlp_web(question)
        question_split = question.split()
        # Guarded positional lookups: short questions yield "" instead of an IndexError.
        firstWord = question_split[0].casefold() if question_split else ""
        secondWord = question_split[1].casefold() if len(question_split) > 1 else ""
        fifthLemma = parse[4].lemma_ if len(parse) > 4 else ""
        isLanguageQuestion = False

        # What is the X word/name for Y? (Where X is a language)
        if (len(parse) > 6) and (parse[0].lemma_ == "what") and (fifthLemma in ("word", "name")) and (parse[5].lemma_ == "for"):
            printAnswers(otherLanguage(1, parse[3].lemma_, parse), question)
            isLanguageQuestion = True

        # What is Y called in X? (Where X is a language)
        elif (len(parse) > 5) and (parse[0].lemma_ == "what") and (parse[3].lemma_ == "call") and (parse[4].lemma_ == "in"):
            printAnswers(otherLanguage(2, parse[len(parse) - 2].lemma_, parse), question)
            isLanguageQuestion = True

        # How do I say Y in X? (Where X is a language)
        elif (len(parse) > 5) and (parse[0].lemma_ == "how") and (parse[3].lemma_ == "say"):
            printAnswers(otherLanguage(3, parse[len(parse) - 2].lemma_, parse), question)
            isLanguageQuestion = True

        # Other question with a language in it
        else:
            if languageCodes is None:
                with open('./language_codes.json') as json_file:
                    languageCodes = json.load(json_file)

            for word in parse:
                if word.dep_ == "amod":
                    continue
                lemma = word.lemma_.lower()
                # NOTE(review): this compares against the whole pattern string, while
                # otherLanguage splits patterns on "; " -- confirm which is intended.
                if any(code["pattern"].lower() == lemma for code in languageCodes):
                    printAnswers(otherLanguage(4, word.lemma_, parse), question)
                    isLanguageQuestion = True
                    break

        if isLanguageQuestion: continue

        #What is the X of Y question
        if firstWord in ["what","who","which"] and secondWord in ["was","is","were","are"] and ("of" in question_split or "for" in question_split) and fifthLemma not in ("word", "name"):
            printAnswers(answer_what(parse, question), question)

        #Yes/no questions
        elif firstWord in ["is","was","are","were","does","did","do","can","could"]:
            if "different" in question_split:
                print(["No","Yes"][answer_isXYdiff(parse, question)])

            elif "same" in question_split:
                print(["No","Yes"][answer_isXYdiff(parse, question, same=True)])

            elif "or" in question_split:
                printAnswers(answer_isXaYor(parse, question), question)

            else: print(["No","Yes"][answer_isXY(parse, question)])

        elif firstWord == "how" and secondWord != "many":
            answers = howQuestions(parse)
            if not answers:
                # Second attempt with the scientific model's parse.
                parse = nlp_sci(question)
                answers = howQuestions(parse)
            if not answers:
                # Last resort: rephrase "How <w> rest" as "What rest". Needs a third part.
                parts = question.split(' ', 2)
                if len(parts) > 2:
                    answers = answer_what(parse, "What " + parts[2])
            printAnswers(answers, question)

        elif firstWord == "how" and secondWord == "many":
            answers = countQuestion(parse)
            if not answers:
                parse = nlp_sci(question)
                answers = countQuestion(parse)
            printAnswers(answers, question)

        #Other questions
        else:
            isListQuestion = False
            #Try verb question
            answers = verbQuestions(nlp_web(question))
            if not answers:
                #Try general/list question
                answers = answer_listXofY(parse, question)
                isListQuestion = True
                if not answers:
                    isListQuestion = False
                    #What is/are (a/an) X? (description question)
                    if (len(parse) > 2) and (parse[0].lemma_ == "what") and (parse[1].lemma_ == "be"):
                        answers = descriptionQuestion(parse)

            printAnswers(answers, question, isListQuestion=isListQuestion)
                    
# Entry point: answer all questions when this code runs as the main module
# rather than being imported.
if __name__ == '__main__':
    main()