In [None]:
from wikipedia import *
from operator import itemgetter
import requests
# Scratch call: look up the anchor text 'Hello'. anchor2concept is not imported
# by name here, so it presumably comes from the star-import of `wikipedia`
# above — TODO confirm. Elsewhere in this notebook its result is treated as a
# list of entries whose field 0 is an entity id and field 1 a count.
anchor2concept('Hello')

In [None]:
# Scratch call: presumably maps a Wikipedia page id to its title (id2title is
# not defined in this notebook — verify against the `wikipedia` module).
# The trailing `L` is a Python 2 long-integer literal.
id2title(48324759L)

In [None]:
"""
Testing the Solr splitting
"""

import requests
import json
from wikipedia import *
from __future__ import division
import nltk
from nltk.corpus import stopwords
from unidecode import unidecode
import tagme

# NOTE(review): this is a service token committed in plain text. Consider
# loading it from the environment or an untracked config file and rotating
# the value, since anyone with access to this notebook can reuse it.
tagme.GCUBE_TOKEN = "f6c2ba6c-751b-4977-a94c-c140c30e9b92-843339462"

def get_solr_count(s):
    """ Gets the number of documents the string occurs in.
        NOTE: The string is placed between double quotes inside the query
        without any escaping, so it must not contain a double quote itself.
    Arg:
        s: the string to look for in the ``text`` field
    Returns:
        The ``numFound`` value reported by Solr, or 0 when the reply is not
        valid JSON or has no ``response``/``numFound`` entry.
    """

    q = '+text:("%s")' % (s,)
    qstr = 'http://localhost:8983/solr/enwiki20160305/select'
    params = {'indent':'on', 'wt':'json', 'q':q, 'rows':0}  # rows=0: only the count is needed
    r = requests.get(qstr, params=params)

    # Decode the body once. Only the failures a malformed or error reply can
    # produce are mapped to "no hits"; anything else (KeyboardInterrupt,
    # programming errors) is allowed to propagate instead of being swallowed.
    try:
        return r.json()['response']['numFound']
    except (ValueError, KeyError, TypeError):
        return 0

def get_mention_count(s):
    """
    Description:
        Totals how often the given string is used as a mention (anchor) in wikipedia.
    Args:
        s: the anchor text handed to anchor2concept.
    Return:
        The sum of field 1 over every entry anchor2concept returns for s;
        0 when it returns no entries.
    """

    return sum(entry[1] for entry in anchor2concept(s))

def getTextMentions(line):
    """
    Description:
        Converts the word-indexed mentions of a dataset line into character spans,
        assuming the words are joined by single spaces.
    Args:
        line: The json data; line['text'] is the list of words and each entry of
            line['mentions'] starts with the index of the mentioned word.
            The scan only moves forward, so mentions are expected in
            ascending word order.
    Return:
        The mentions in the form [[start, end, text],...].
    """

    spans = []
    wordPos = 0  # index of the word the character offset currently points at
    charPos = 0  # character offset of that word in the space-joined text
    for entry in line['mentions']:
        # walk forward to the mentioned word, counting one separator per word
        while wordPos < entry[0]:
            charPos += len(line['text'][wordPos]) + 1
            wordPos += 1
        token = line['text'][wordPos]
        spans.append([charPos, charPos + len(token), token])

    return spans

def destroyExclusiveOverlaps(textData):
    """
    Description:
        Collapses every run of consecutive entries that share a start offset
        down to the single entry with the highest anchor probability.
    Args:
        textData: [[start, end, text, anchProb],...] with entries that share a
            start offset adjacent to one another. Any further fields are
            carried along untouched.
    Return:
        A new list holding, for each run, the original entry (the same list
        object, not a copy) whose anchProb is highest; the first such entry
        wins ties. Empty input gives an empty list and a lone entry is kept.
    """

    newTextData = []  # one surviving entry per run, in input order
    runStart = None   # start offset shared by the run currently being scanned

    for item in textData:
        if newTextData and item[0] == runStart:
            # Same run: keep the stronger candidate. The score is the anchProb
            # already stored on the entry, so nothing is queried again here.
            if item[3] > newTextData[-1][3]:
                newTextData[-1] = item
        else:
            # A different start offset opens a new run. This branch also
            # handles the very first entry, so a one-element input survives.
            newTextData.append(item)
            runStart = item[0]

    return newTextData
                    
def getSolrMentions(text):
    """
    Description:
        Tags the text with the Solr anchor tagger and keeps the tags that look
        like real mentions: frequent enough as a link and tagged as noun or adjective.
    Args:
        text: The text to find mentions in.
    Return:
        The mentions found, in the form [[start, end, text],...].
    """

    tagUrl = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    query = {'overlaps':'ALL', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
    reply = requests.post(tagUrl, params=query, data=text.encode('utf-8'))
    rawTags = reply.json()['tags']

    # reduce each tag to [start, end, surface, anchor probability]
    spans = []
    for tag in rawTags:
        begin, end = tag[1], tag[3]
        surface = text[begin:end]
        linked = get_mention_count(surface)
        seen = get_solr_count(surface.replace(".", ""))
        prob = 0 if seen == 0 else linked/seen
        spans.append([begin, end, surface, prob])

    spans = destroyExclusiveOverlaps(spans)

    # attach the (word, POS tag) pair as field 4 of every surviving span
    tagged = nltk.pos_tag([span[2] for span in spans])
    for span, posPair in zip(spans, tagged):
        span.append(posPair)

    minProb = 0.001
    return [[span[0], span[1], span[2]]
            for span in spans
            if span[3] >= minProb
            and (span[4][1][0:2] == 'NN' or span[4][1] == 'JJ')]

def precision(trueMentions, otherMentions):
    """
    Description:
        Calculates the precision of otherMentions against the trueMentions.
        A mention counts as correct only when both its start and its end
        equal those of a true mention. Both lists are walked in step, so
        they are expected to be ordered by start offset.
    Args:
        trueMentions: The 'right' answers for what the mentions are, [[start, end, ...],...].
        otherMentions: Our mentions obtained through some means, [[start, end, ...],...].
    Return:
        The precision: (# of correct mentions)/(# of found mentions),
        or 0 when no mentions were found. Also prints both counts.
    """

    numFound = len(otherMentions)
    numCorrect = 0 # incremented in the merge loop

    trueIndex = 0
    otherIndex = 0

    while trueIndex < len(trueMentions) and otherIndex < numFound:
        truth = trueMentions[trueIndex]
        other = otherMentions[otherIndex]
        if truth[0] == other[0] and truth[1] == other[1]:
            # same start and end: a hit, both sides move on
            numCorrect += 1
            trueIndex += 1
            otherIndex += 1
        elif truth[0] < other[0]:
            # true mention starts first and was missed
            trueIndex += 1
        else:
            # other mention starts first, or same start with a different end;
            # a plain else guarantees one index always advances
            otherIndex += 1

    # single parenthesised argument: prints the same text on Python 2 and 3
    print('correct: ' + str(numCorrect) + '\nfound: ' + str(numFound))
    if numFound == 0:
        return 0
    # float() keeps this a true ratio even without `from __future__ import division`
    return float(numCorrect) / numFound

def recall(trueMentions, otherMentions):
    """
    Description:
        Calculates the recall of otherMentions against the trueMentions.
        A true mention counts as recovered only when a found mention has the
        same start and the same end. Both lists are walked in step, so they
        are expected to be ordered by start offset.
    Args:
        trueMentions: The 'right' answers for what the mentions are, [[start, end, ...],...].
        otherMentions: Our mentions obtained through some means, [[start, end, ...],...].
    Return:
        The recall: (# of correct mentions)/(# of actual mentions),
        or 0 when there are no actual mentions. Also prints both counts.
    """

    numActual = len(trueMentions)
    numCorrect = 0 # incremented in the merge loop

    trueIndex = 0
    otherIndex = 0

    while trueIndex < numActual and otherIndex < len(otherMentions):
        truth = trueMentions[trueIndex]
        other = otherMentions[otherIndex]
        if truth[0] == other[0] and truth[1] == other[1]:
            # same start and end: a hit, both sides move on
            numCorrect += 1
            trueIndex += 1
            otherIndex += 1
        elif truth[0] < other[0]:
            # true mention starts first and was missed
            trueIndex += 1
        else:
            # other mention starts first, or same start with a different end
            otherIndex += 1

    # single parenthesised argument: prints the same text on Python 2 and 3
    print('correct: ' + str(numCorrect) + '\nactual: ' + str(numActual))
    if numActual == 0:
        return 0
    # float() keeps this a true ratio even without `from __future__ import division`
    return float(numCorrect) / numActual

# Directory holding the evaluation datasets (one JSON document per line).
# NOTE(review): `os` is not imported by name in this cell; presumably it
# arrives through `from wikipedia import *` — confirm.
pathStrt = '/users/cs/amaral/wsd-datasets'
#pathStrt = 'C:\\Temp\\wsd-datasets'

# the full list of data sets to evaluate on
# (overwritten by the "quick test" assignment further down)
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')},
            {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')},
            {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},
            {'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]

# shorter lists for quick tests: the one uncommented assignment below is the
# list that is actually used; comment it out to run on everything
#datasets = [{'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]

# dataset name -> {'Precision': ..., 'Recall': ...}, averaged over the lines
performances = {}

# for each dataset, run the Solr mention extractor over every line
for dataset in datasets:
    performances[dataset['name']] = {}
    # every line of the file is one JSON document; the handle is closed as
    # soon as all lines are parsed
    with open(dataset['path'], 'r') as dataFile:
        dataLines = [json.loads(rawLine.decode('utf-8').strip()) for rawLine in dataFile]

    # reset counters
    totalPrec = 0
    totalRec = 0
    totalLines = 0

    for line in dataLines:

        print(str(totalLines + 1))

        trueMentions = getTextMentions(line)
        solrMentions = getSolrMentions(" ".join(line['text']))

        print(line['text'])
        print(trueMentions)
        print(str(solrMentions) + '\n')

        # alternative extractor kept for comparison runs:
        #solrMentions0 = tagme.mentions(" ".join(line['text']))
        #solrMentions = []
        #for item in solrMentions0.mentions:
        #    solrMentions.append([item.begin, item.end, item.mention])

        ## get statistical results from true mentions and solr mentions
        prec = precision(trueMentions, solrMentions)
        rec = recall(trueMentions, solrMentions)
        print(str(prec) + ' ' + str(rec) + '\n')

        # track results
        totalPrec += prec
        totalRec += rec
        totalLines += 1

    # record the averages; an empty dataset keeps its empty entry instead of
    # dividing by zero
    if totalLines > 0:
        performances[dataset['name']] = {'Precision':totalPrec/totalLines,
                                         'Recall':totalRec/totalLines}

print(performances)

In [None]:
# Spot check: print the two counts behind the anchor probability of one
# phrase, then their ratio.
text = "Dalhousie University"

print get_mention_count(text)
print get_solr_count(text)

# NOTE(review): both helpers are called a second time here, and the division
# raises ZeroDivisionError whenever get_solr_count returns 0.
print get_mention_count(text)/get_solr_count(text)

```curl -X POST \
  'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'Hello New York City'```
 

In [1]:
"""
Wikification for evaluation purposes
"""

from wikipedia import *
from operator import itemgetter
import requests
import json
from __future__ import division
import nltk

MIN_MENTION_LENGTH = 3 # mentions must be at least this long
MIN_FREQUENCY = 20 # anchor with frequency below is ignored

def get_solr_count(s):
    """ Gets the number of documents the string occurs in.
        NOTE: The string is placed between double quotes inside the query
        without any escaping, so it must not contain a double quote itself.
    Arg:
        s: the string to look for in the ``text`` field
    Returns:
        The ``numFound`` value reported by Solr, or 0 when the reply is not
        valid JSON or has no ``response``/``numFound`` entry.
    """

    q = '+text:("%s")' % (s,)
    qstr = 'http://localhost:8983/solr/enwiki20160305/select'
    params = {'indent':'on', 'wt':'json', 'q':q, 'rows':0}  # rows=0: only the count is needed
    r = requests.get(qstr, params=params)

    # Decode the body once. Only the failures a malformed or error reply can
    # produce are mapped to "no hits"; anything else (KeyboardInterrupt,
    # programming errors) is allowed to propagate instead of being swallowed.
    try:
        return r.json()['response']['numFound']
    except (ValueError, KeyError, TypeError):
        return 0

def get_mention_count(s):
    """
    Description:
        Totals how often the given string is used as a mention (anchor) in wikipedia.
    Args:
        s: the anchor text handed to anchor2concept.
    Return:
        The sum of field 1 over every entry anchor2concept returns for s;
        0 when it returns no entries.
    """

    return sum(entry[1] for entry in anchor2concept(s))

def destroyExclusiveOverlaps(textData):
    """
    Description:
        Collapses every run of consecutive entries that share a start offset
        down to the single entry with the highest anchor probability.
    Args:
        textData: [[start, end, text, anchProb, ...],...] with entries that
            share a start offset adjacent to one another. Fields after
            anchProb are carried along untouched.
    Return:
        A new list holding, for each run, the original entry (the same list
        object, not a copy) whose anchProb is highest; the first such entry
        wins ties. Empty input gives an empty list and a lone entry is kept.
    """

    newTextData = []  # one surviving entry per run, in input order
    runStart = None   # start offset shared by the run currently being scanned

    for item in textData:
        if newTextData and item[0] == runStart:
            # Same run: keep the stronger candidate. The score is the anchProb
            # already stored on the entry, so nothing is queried again here.
            if item[3] > newTextData[-1][3]:
                newTextData[-1] = item
        else:
            # A different start offset opens a new run. This branch also
            # handles the very first entry, so a one-element input survives.
            newTextData.append(item)
            runStart = item[0]

    return newTextData

def mentionStartsAndEnds(textData, forTruth = False):
    """
    Description:
        Rewrites, in place, every mention of textData from [wordIndex, entityTitle]
        into [wIndex, start, end], or into [start, end, entityId] when forTruth is set.
        Character offsets assume the words are joined by single spaces.
    Args:
        textData: {'text': [w1,w2,w3,...] , 'mentions': [[wordIndex,entityTitle],...]}.
            The scan only moves forward, so mentions are expected in ascending
            word order.
        forTruth: When true the word index is dropped and title2id(entityTitle)
            is appended instead.
    Return:
        textData['mentions'] itself (the same, now modified, list).
    """

    wordPos = 0  # index of the word the offset currently points at
    offset = 0   # character offset of that word in the space-joined text
    for entry in textData['mentions']:
        # advance to the mentioned word, counting one separator per word
        while wordPos < entry[0]:
            offset += len(textData['text'][wordPos]) + 1
            wordPos += 1

        title = entry[1] # remembered because it is removed just below
        del entry[-1] # drop the entity title
        if forTruth:
            del entry[-1] # drop the word index as well

        entry.append(offset) # start of the mention
        entry.append(offset + len(textData['text'][wordPos])) # end of the mention

        if forTruth:
            entry.append(title2id(title))

    return textData['mentions']
     
def mentionExtract(text):
    """
    Description:
        Tags the text with the Solr anchor tagger and returns the tagged pieces
        together with the ones that qualify as mentions.
    Args:
        text: The text to be split.
    Return:
        {'text':[w1,w2,...], 'mentions': [[wIndex,begin,end],...]} where 'text'
        holds the surface string of every tag Solr returned and wIndex points
        into that list.
    """

    tagUrl = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    query = {'overlaps':'ALL', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
    reply = requests.post(tagUrl, params=query, data=text.encode('utf-8'))
    rawTags = reply.json()['tags']

    surfaces = [] # surface string of every tag, in Solr's order
    spans = [] # [[begin,end,word,anchorProb,wIndex],...]

    for wIndex, tag in enumerate(rawTags):
        begin, end = tag[1], tag[3]
        surface = text[begin:end]
        linked = get_mention_count(surface)
        seen = get_solr_count(surface.replace(".", ""))
        prob = 0 if seen == 0 else linked/seen
        spans.append([begin, end, surface, prob, wIndex])
        surfaces.append(surface)

    # keep one span per group of tags sharing a start offset
    spans = destroyExclusiveOverlaps(spans)

    # attach the (word, POS tag) pair as field 5 of every surviving span
    tagged = nltk.pos_tag([span[2] for span in spans])
    for span, posPair in zip(spans, tagged):
        span.append(posPair)

    minProb = 0.001 # anchor probability below this is treated as unlikely

    # keep spans that are likely enough and are some kind of noun or a JJ
    kept = [[span[4], span[0], span[1]] # wIndex, start, end
            for span in spans
            if span[3] >= minProb
            and (span[5][1][0:2] == 'NN' or span[5][1] == 'JJ')]

    # same layout as the data provided by the datasets
    return {'text':surfaces, 'mentions':kept}

def generateCandidates(textData, maxC):
    """
    Description:
        Looks up, for every mention, the entries anchor2concept returns for the
        mentioned word and keeps the maxC entries with the largest field 1.
    Args:
        textData: A text in split form along with its suspected mentions;
            field 0 of each mention is an index into textData['text'].
        maxC: The max amount of candidates to accept per mention.
    Return:
        One list per mention, sorted by field 1 in descending order and cut to maxC.
    """
    byFrequency = itemgetter(1)

    return [
        sorted(anchor2concept(textData['text'][entry[0]]),
               key = byFrequency, reverse = True)[:maxC]
        for entry in textData['mentions']
    ]

def precision(truthSet, mySet):
    """
    Description:
        Calculates the precision of mySet against the truthSet. An entry of
        mySet counts as correct when its span overlaps a truth span (or starts
        at the same offset) and carries the same entity id. Both lists are
        walked in step, so they are expected to be ordered by start offset.
    Args:
        truthSet: The 'right' answers for what the entities are. [[start,end,id],...]
        mySet: My code's output for what it thinks the right entities are. [[start,end,id],...]
    Return:
        The precision: (# of correct entities)/(# of found entities),
        or 0 when nothing was found. Also prints both counts.
    """

    numFound = len(mySet)
    numCorrect = 0 # incremented in the merge loop

    truthIndex = 0
    myIndex = 0

    while truthIndex < len(truthSet) and myIndex < numFound:
        truth = truthSet[truthIndex]
        mine = mySet[myIndex]

        if mine[0] < truth[0] and mine[1] <= truth[0]:
            # mine ends before truth begins: no overlap, skip mine
            myIndex += 1
        elif mine[0] > truth[0] and mine[0] >= truth[1]:
            # mine begins after truth has ended: no overlap, skip truth
            truthIndex += 1
        elif truth[2] == mine[2]:
            # the spans overlap (or share a start) and name the same entity
            numCorrect += 1
            truthIndex += 1
            myIndex += 1
        elif truth[1] < mine[1]:
            # wrong entity, truth ends first
            truthIndex += 1
        else:
            # wrong entity, mine ends first
            myIndex += 1

    # single parenthesised argument: prints the same text on Python 2 and 3
    print('correct: ' + str(numCorrect) + '\nfound: ' + str(numFound))
    if numFound == 0:
        return 0
    # float() keeps this a true ratio even without `from __future__ import division`
    return float(numCorrect) / numFound

def recall(truthSet, mySet):
    """
    Description:
        Calculates the recall of mySet against the truthSet. A truth entry counts
        as recovered when an entry of mySet overlaps it (or starts at the same
        offset) and carries the same entity id.
    Args:
        truthSet: The 'right' answers for what the entities are. [[start,end,id],...]
        mySet: My code's output for what it thinks the right entities are. [[start,end,id],...]
    Return:
        The recall: (# of correct entities)/(# of actual entities),
        or 0 when there are no actual entities.
    """

    numActual = len(truthSet)
    hits = 0

    t = 0 # position in truthSet
    m = 0 # position in mySet

    while t < len(truthSet) and m < len(mySet):
        truth = truthSet[t]
        mine = mySet[m]

        if mine[0] < truth[0] and not (mine[1] > truth[0]):
            # mine stops before truth begins
            m += 1
            continue
        if mine[0] > truth[0] and not (mine[0] < truth[1]):
            # mine begins once truth is over
            t += 1
            continue

        # from here on the two spans overlap or share their start
        if truth[2] == mine[2]:
            hits += 1
            t += 1
            m += 1
        elif truth[1] < mine[1]:
            t += 1 # truth ends first
        else:
            m += 1 # mine ends first

    if numActual == 0:
        return 0
    return (hits/numActual)
    
def getSurroundingWords(text, mIndex, window):
    """
    Description:
        Collects the words around position mIndex, up to window words on each
        side, leaving out the word at mIndex itself.
    Args:
        text: A list of words.
        mIndex: The index of the word that is the center of where to get surrounding words.
        window: The amount of words to the left and right to get.
    Return:
        A new list: the words before mIndex followed by the words after it,
        clipped to the bounds of text.
    """

    # clip the window to the list bounds
    lo = max(mIndex - window, 0)
    hi = min(mIndex + window + 1, len(text))

    before = text[lo:mIndex]
    after = text[mIndex+1:hi]
    return before + after

def escapeStringSolr(text):
    """
    Description:
        Escapes a given string for use in a Solr (Lucene query parser) query by
        putting a backslash in front of every special token:
        \\ + - && || ! ( ) { } [ ] ^ " ~ * ? : /
    Args:
        text: The string to escape.
    Return:
        The escaped text.
    """

    # Backslashes go first, and are doubled (not tripled), so that the escapes
    # added below are not escaped again and no stray backslash is left over to
    # swallow the escape of the character that follows.
    text = text.replace('\\', '\\\\')

    specials = ('+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']',
                '^', '"', '~', '*', '?', ':', '/')
    for token in specials:
        text = text.replace(token, '\\' + token)

    return text

def bestContextMatch(mention, context, candidates):
    """
    Description:
        Uses Solr to find the candidate that gives the highest relevance when given the context.
        The query matches the context words against the ``text`` field and the
        mention against the ``title`` field (boosted by 1.35), restricted to
        the candidates' ids.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
    Return:
        The index (into candidates) of the candidate whose id is the first
        document Solr returns. Falls back to 0, the first candidate, when Solr
        returns no usable result or an id that is not among the candidates.
    """

    # put text in right format
    text = escapeStringSolr(" ".join(context))
    mention = escapeStringSolr(mention)

    strIds = ['id:' + str(cand[0]) for cand in candidates]

    # ask for the documents among the candidates, as scored by Solr
    addr = 'http://localhost:8983/solr/enwiki20160305/select'
    params={'fl':'id score', 'fq':" ".join(strIds), 'indent':'on',
            'q':'text:('+text.encode('utf-8')+')^1 title:(' + mention.encode('utf-8')+')^1.35',
            'wt':'json'}
    r = requests.get(addr, params = params)

    payload = r.json() # decoded once and reused below

    if 'response' not in payload or 'docs' not in payload['response']:
        return 0 # default to the first candidate

    docs = payload['response']['docs']
    if len(docs) == 0:
        return 0 # default to the first candidate

    # int() yields a long automatically on Python 2 when the id needs it
    bestId = int(docs[0]['id'])

    # find which index has bestId
    for index, cand in enumerate(candidates):
        if cand[0] == bestId:
            return index

    # Not among the candidates: return a valid index rather than
    # len(candidates), which would make the caller index out of range.
    return 0
    
def wikifyPopular(textData, candidates):
    """
    Description:
        Picks, for every mention, the first candidate in its candidate list.
        Mentions whose candidate list is empty are left out.
    Args:
        textData: A text in split form along with its suspected mentions ([wIndex,start,end]).
        candidates: One candidate list per mention, in the same order as the mentions;
            field 0 of a candidate is the entity id.
    Return:
        All of the proposed entities for the mentions, of the form: [[start,end,entityId],...].
    """

    return [[entry[1], entry[2], candidates[position][0][0]]
            for position, entry in enumerate(textData['mentions'])
            if len(candidates[position]) > 0]

def wikifyContext(textData, candidates, window = 7):
    """
    Description:
        Picks, for every mention, the candidate that bestContextMatch selects
        from the words surrounding the mention. Mentions whose candidate list
        is empty are left out.
    Args:
        textData: A textData in split form along with its suspected mentions ([wIndex,start,end]).
        candidates: One candidate list per mention, in the same order as the mentions;
            field 0 of a candidate is the entity id.
        window: How many words on both sides of a mention to search.
    Return:
        All of the proposed entities for the mentions, of the form: [[start,end,entityId],...].
    """

    chosen = []
    for position, entry in enumerate(textData['mentions']):
        options = candidates[position]
        if len(options) == 0:
            continue # nothing to choose from for this mention

        wordIndex = entry[0]
        neighbours = getSurroundingWords(textData['text'], wordIndex, window)
        pick = bestContextMatch(textData['text'][wordIndex], neighbours, options)
        chosen.append([entry[1], entry[2], options[pick][0]])

    return chosen

def wikifyEval(text, mentionsGiven, maxC = 20, method='popular', strict = False):
    """
    Description:
        Takes the text (maybe text data), and wikifies it for evaluation purposes using the desired method.
    Args:
        text: The string to wikify. Either just the original string, or, if the
            mentions are given, the split form
            {'text': [w1,w2,...], 'mentions': [[wIndex,entityTitle],...]}.
            In the second case the passed object is modified in place.
        mentionsGiven: Whether the mentions are given to us and the text is already split.
        maxC: The max amount of candidates to extract.
        method: The method used to wikify: 'popular' or 'context'.
        strict: Whether to drop mentions shorter than MIN_MENTION_LENGTH and chosen
            entities whose candidate frequency is below MIN_FREQUENCY.
    Return:
        All of the proposed entities for the mentions, of the form: [[start,end,entityId],...].
    Raises:
        ValueError: if method is not one of the supported names.
    """

    # fail early and clearly instead of hitting an unbound variable later on
    if method not in ('popular', 'context'):
        raise ValueError("unknown wikification method: %r" % (method,))

    if not(mentionsGiven): # if words are not in pre-split form
        textData = mentionExtract(text) # extract mentions from text
    else: # if they are
        textData = text
        textData['mentions'] = mentionStartsAndEnds(textData) # put mentions in right form

    # get rid of small mentions
    if strict:
        textData['mentions'] = [item for item in textData['mentions']
                    if  len(textData['text'][item[0]]) >= MIN_MENTION_LENGTH]

    candidates = generateCandidates(textData, maxC)

    if method == 'popular':
        wikified = wikifyPopular(textData, candidates)
    else:
        wikified = wikifyContext(textData, candidates, window = 7)

    # get rid of very unpopular choices
    if strict:
        # The results are [start, end, entityId] and carry no frequency, so it
        # is looked up from the candidate list of the mention with that span.
        frequencies = {} # (start, end) -> {entityId: frequency}
        for mention, cands in zip(textData['mentions'], candidates):
            spanFreqs = frequencies.setdefault((mention[1], mention[2]), {})
            for cand in cands:
                # candidates are sorted most frequent first; keep the first value
                spanFreqs.setdefault(cand[0], cand[1])

        wikified = [item for item in wikified
                    if frequencies.get((item[0], item[1]), {}).get(item[2], 0) >= MIN_FREQUENCY]

    return wikified

In [2]:
"""
This is for testing performance of different wikification methods.
"""

from IPython.display import clear_output
import copy
from datetime import datetime
import tagme

tagme.GCUBE_TOKEN = "f6c2ba6c-751b-4977-a94c-c140c30e9b92-843339462"
    

pathStrt = '/users/cs/amaral/wsd-datasets'
#pathStrt = 'C:\\Temp\\wsd-datasets'

# the data sets for performing on
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')},
            {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')},
            {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},
            {'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]

# short for quick tests
#datasets = [{'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
#datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]

methods = ['popular','context']

performances = {}

# for each dataset, run all methods
for dataset in datasets:
    performances[dataset['name']] = {}
    # get the data from dataset
    dataFile = open(dataset['path'], 'r')
    dataLines = []
    
    # put in all lines that contain proper ascii
    for line in dataFile:
        dataLines.append(json.loads(line.decode('utf-8').strip()))
        
    print dataset['name'] + '\n'
    
    # run each method on the data set
    for mthd in methods:
        print mthd
        print str(datetime.now()) + '\n'
        
        # reset counters
        totalPrecS = 0
        totalPrecM = 0
        totalRecS = 0
        totalRecM = 0
        totalLines = 0
        
        # each method tests all lines
        for line in dataLines:
            
            print str(totalLines + 1)
            
            # original split string with mentions given
            resultS = wikifyEval(copy.deepcopy(line), True, maxC = 7, method = mthd)
            # unsplit string to be manually split and mentions found
            resultM = wikifyEval(" ".join(line['text']), False, maxC = 7, method = mthd)
            # get absolute text indexes and entity id of each given mention
            trueEntities = mentionStartsAndEnds(copy.deepcopy(line), forTruth = True) # the ground truth
                
            ## get statistical results from true entities and results S and M
            precS = precision(trueEntities, resultS) # precision of pre-split
            precM = precision(trueEntities, resultM) # precision of manual split
            recS = recall(trueEntities, resultS) # recall of pre-split
            recM = recall(trueEntities, resultM) # recall of manual split
            
            #clear_output() # delete this after
            print str(precS) + ' ' + str(precM) + ' ' + str(recS) + ' ' + str(recM) + '\n'
            #print str(precS) + ' ' + str(recS)
            
            # track results
            totalPrecS += precS
            totalPrecM += precM
            totalRecS += recS
            totalRecM += recM
            totalLines += 1
        
        # record results for this method on this dataset
        # [avg precision split, avg precision manual, avg recall split, avg recall manual]
        performances[dataset['name']][mthd] = {'Pre-Split Precision':totalPrecS/totalLines, 
                                               'Manual Split Precision':totalPrecM/totalLines,
                                              'Pre-Split Recall':totalRecS/totalLines, 
                                               'Manual Split Recall':totalRecM/totalLines}
            
print performances

kore

popular

1
correct: 0
found: 2
correct: 0
found: 8
0.0 0.0 0.0 0.0

2
correct: 0
found: 2
correct: 0
found: 4
0.0 0.0 0.0 0.0

3
correct: 1
found: 2
correct: 1
found: 2
0.5 0.5 0.5 0.5

4
correct: 0
found: 2
correct: 0
found: 2
0.0 0.0 0.0 0.0

5
correct: 1
found: 1
correct: 1
found: 3
1.0 0.333333333333 1.0 1.0

6
correct: 1
found: 2
correct: 1
found: 4
0.5 0.25 0.5 0.5

7
correct: 0
found: 3
correct: 0
found: 7
0.0 0.0 0.0 0.0

8
correct: 1
found: 3
correct: 1
found: 6
0.333333333333 0.166666666667 0.333333333333 0.333333333333

9
correct: 0
found: 2
correct: 0
found: 6
0.0 0.0 0.0 0.0

10
correct: 2
found: 5
correct: 2
found: 7
0.4 0.285714285714 0.4 0.4

11
correct: 2
found: 4
correct: 2
found: 7
0.5 0.285714285714 0.5 0.5

12
correct: 1
found: 3
correct: 1
found: 5
0.333333333333 0.2 0.333333333333 0.333333333333

13
correct: 0
found: 3
correct: 0
found: 5
0.0 0.0 0.0 0.0

14
correct: 3
found: 5
correct: 3
found: 9
0.6 0.333333333333 0.6 0.6

15
correct: 3
found: 6
correct: 

In [7]:
"""
This is for testing performance of TagMe wikification method.
"""

from IPython.display import clear_output
import copy
from datetime import datetime
import tagme

tagme.GCUBE_TOKEN = "f6c2ba6c-751b-4977-a94c-c140c30e9b92-843339462"
    

pathStrt = '/users/cs/amaral/wsd-datasets'
#pathStrt = 'C:\\Temp\\wsd-datasets'

# the data sets for performing on
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')},
            {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')},
            {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},
            {'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]

# short for quick tests
#datasets = [{'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
#datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]

performances = {}

# for each dataset, run all methods
for dataset in datasets:
    performances[dataset['name']] = {}
    # get the data from dataset
    dataFile = open(dataset['path'], 'r')
    dataLines = []
    
    # put in all lines that contain proper ascii
    for line in dataFile:
        dataLines.append(json.loads(line.decode('utf-8').strip()))
        
    print dataset['name'] + '\n'

    print str(datetime.now()) + '\n'

    # reset counters
    totalPrecM = 0
    totalRecM = 0
    totalLines = 0

    # each method tests all lines
    for line in dataLines:

        print str(totalLines + 1)
        
        antns = tagme.annotate(" ".join(line['text']))
        resultM = []
        for an in antns.get_annotations(0.1):
            resultM.append([an.begin,an.end,title2id(an.entity_title)])
        trueEntities = mentionStartsAndEnds(copy.deepcopy(line), forTruth = True) # the ground truth

        ## get statistical results from true entities and results
        precM = precision(trueEntities, resultM)
        recM = recall(trueEntities, resultM)

        #clear_output() # delete this after
        print str(precM) + str(recM) + '\n'
        #print str(precS) + ' ' + str(recS)

        # track results
        totalPrecM += precM
        totalRecM += recM
        totalLines += 1

    # record results for this method on this dataset
    # [avg precision split, avg precision manual, avg recall split, avg recall manual]
    performances[dataset['name']] = {'Precision':totalPrecM/totalLines,
                                          'Recall':totalRecM/totalLines}
            
print performances

kore

2017-05-25 17:44:09.721014

1
correct: 0
found: 8
0.00.0

2
correct: 0
found: 4
0.00.0

3
correct: 0
found: 2
0.00.0

4
correct: 1
found: 1
1.00.5

5
correct: 1
found: 3
0.3333333333331.0

6
correct: 1
found: 3
0.3333333333330.5

7
correct: 0
found: 0
00.0

8
correct: 2
found: 4
0.50.666666666667

9
correct: 0
found: 7
0.00.0

10
correct: 2
found: 6
0.3333333333330.4

11
correct: 0
found: 1
0.00.0

12
correct: 1
found: 3
0.3333333333330.333333333333

13
correct: 2
found: 7
0.2857142857140.666666666667

14
correct: 4
found: 5
0.80.8

15
correct: 5
found: 8
0.6250.833333333333

16
correct: 1
found: 6
0.1666666666670.333333333333

17
correct: 3
found: 6
0.51.0

18
correct: 3
found: 4
0.751.0

19
correct: 3
found: 6
0.50.6

20
correct: 1
found: 5
0.20.25

21
correct: 2
found: 5
0.40.666666666667

22
correct: 3
found: 5
0.60.75

23
correct: 1
found: 6
0.1666666666670.333333333333

24
correct: 0
found: 4
0.00.0

25
correct: 0
found: 1
0.00.0

26
correct: 1
found: 5
0.20.333333333333

2

KeyboardInterrupt: 

In [None]:
"""
Test individual text on wikification.
"""

data = json.loads("""{"text": ["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."], "mentions": [[13, "Eric_Clapton"], [15, "Jeff_Beck"], [18, "Jimmy_Page"]]}
""".decode('utf-8').strip())

print str(data) + '\n'

print " ".join(data['text']).encode('utf-8').strip()

#results = wikifyEval(data['text'], True, 'popular', True)
results = wikifyEval(" ".join(data['text']).encode('utf-8').strip(), False, method='popular')
print results[0]
for result in results[1]:
    print id2title(result[1])

prec = precision(data['mentions'], results[1])
rec = recall(data['mentions'], results[1])

print '\nprecision: ' + str(prec) + ', rec: ' + str(rec) + '\n'

In [None]:
"""
This is for testing if the wikification works.
"""

from IPython.core.display import display, HTML

phrase = 'Three of the greatest guitarists started their career in a single band : Clapton , Beck , and Page'
print phrase + "\n"

anchors = wikify(phrase, False)
for anchor in anchors:
    print anchor['mention'] + '-->' + anchor['wikiTitle']
    
print

anchors = wikify(phrase, True)
for anchor in anchors:
    print anchor['mention'] + '-->' + anchor['wikiTitle']
    
print
    
newText = ""

anchors = sorted(anchors, key=itemgetter('start')) # make sure anchors are sorted
anchorIndex = 0 # keep track of current anchor added
i = 0 
while i < len(phrase):
    if anchorIndex < len(anchors) and i == anchors[anchorIndex]['start']:
        anchor = anchors[anchorIndex]
        newText += ("<a href=\"https://en.wikipedia.org/wiki/" + anchor['wikiTitle']
                   + "\" target=\"_blank\">" + anchor['mention'] + "</a>")
        i = anchors[anchorIndex]['end']
        anchorIndex += 1
    else:
        newText += phrase[i]
        i += 1
    
display(HTML(newText))

In [None]:
"""
Ideas:
    -In wikifyContext make the current sentence worth 1 and each surrounding sentence worth 0.5.
    -anchor frequency adjuster
    -use similarity with other anchors

Sample Queries:
    'I walked down to the park and found a duck and a pebble'
    'I walked into an electronic store and bought a pebble'
    'I walked down to the park and found a duck studying quantum mechanics'
    'I walked down to the park and found a duck studying quantum mechanical systems'
    'I met David in Spain'
    'An entomologist spots what might be a rare subspecies of beetle, due to the pattern on its back'
"""

In [None]:
tmp = sorted(anchor2concept("David Edgar"), key = itemgetter(1), 
                          reverse = True)

for tmpp in tmp:
    print 'id: ' + str(tmpp[0]) + ', title: ' + id2title(tmpp[0])

In [None]:
phrase = {u'text': [u'Voller', u'presidential', u'preferences', u'How', u'will', u'American', u'voters', u'compensate', u'in', u'the', u'next', u'search', u'for', u'a', u'president?', u'WASHINGTON', u'-', u'Now', u'that', u'the', u'38th', u'president', u'has', u'been', u'laid', u'to', u'rest,', u'the', u'capital', u'can', u'take', u'up', u'the', u'main', u'business', u'of', u'2007:', u'trying', u'to', u'figure', u'out', u'who', u'will', u'be', u'the', u'44th.', u'What', u'type', u'of', u'leader', u'does', u'the', u'country', u'want?', u'Here', u'is', u'my', u'sense', u'of', u'it,', u'based', u'on', u'talking', u'to', u'politicians,', u'strategists', u'and', u'voters', u'here', u'and', u'around', u'the', u'nation.', u'No', u'ideologues,', u'please', u'There', u'was', u'a', u'time', u'when', u'President George W. Bush', u"'s", u'ideological', u'certitude', u'was', u'politically', u'appealing', u'and', u'perhaps', u'functionally', u'necessary.', u'That', u'time', u'has', u'long', u'since', u'passed.', u'The', u'country', u'is', u'tired,', u'even', u'fearful,', u'of', u'leaders', u'with', u'fervent', u'beliefs', u'that', u'seem', u'impervious', u'to', u'new', u'(or', u'even', u'old)', u'facts.', u'Voters', u'see', u'the', u'war', u'in', u'Iraq', u'as', u'an', u'"idea,"', u'not', u'a', u'solution', u'-', u'and', u'Americans', u'do', u'not', u'like', u'ideas', u'that', u'do', u'not', u'work.', u'Voters', u'likely', u'will', u'view', u'Bush', u"'s", u'"surge"', u'of', u'troops', u'into', u'Iraq', u'as', u'new', u'evidence', u'of', u'failure,', u'and', u'the', u'dangers', u'of', u'a', u'leader', u'who', u'depends', u'on', u'preconceived', u'ideas.', u'Serious', u'student', u'Presidential', u'elections', u'are', u'a', u'never-ending', u'series', u'of', u'mid-course', u'corrections.', u'Voters', u'look', u'to', u'compensate', u'for', u'the', u'leadership', u'weaknesses', u'of', u'the', u'incumbent.', u'An', u'example', u'comes', u'from', u'the', u'life', u'and', u'career', 
u'of', u'Gerald Ford', u'.', u'In', u'1976,', u'voters', u'wanted', u'a', u'pure', u'antidote', u'to', u'Richard Nixon', u"'s", u'paranoid', u'megalomania.', u'Once', u'Ford', u'pardoned', u'Nixon', u',', u'he', u'could', u'not', u'be', u'that', u'candidate.', u'Instead,', u'Americans', u'chose', u'Jimmy Carter', u',', u'a', u'peanut', u'farmer', u'who', u'had', u'never', u'worked', u'in', u'Washington', u',', u'and', u'who', u'promised', u'never', u'to', u'lie', u'to', u'the', u'American people', u'.', u'The', u'counterpoint', u'thinking', u'continues.', u'Voters', u'in', u'2008', u'are', u'going', u'to', u'want', u'someone', u'who', u'prides', u'himself', u'(or', u'herself)', u'on', u'spending', u'time', u'in', u'the', u'library', u'-', u'who', u'has', u'a', u'hands-on', u'curiosity', u'about', u'the', u'details.', u'Washington', u'experience', u'not', u'necessary', u'Voters', u'these', u'days', u'not', u'only', u'do', u'not', u'value', u'Washington', u'experience', u'-', u'or', u'any', u'office-holding', u'experience', u'-', u'it', u'can', u'make', u'them', u'suspicious.', u'That', u'is', u'what', u'strategists', u'and', u'polltakers', u'for', u'Sen.', u'Evan Bayh', u'found', u'when', u'they', u'studied', u'whether', u'he', u'should', u'run', u'for', u'president.', u'They', u'found', u'that', u'his', u'remarkably', u'deep', u'resume', u'-', u'the', u'son', u'of', u'a', u'senator,', u'he', u'was', u'the', u'"boy', u'governor"', u'of', u'Indiana', u'before', u'going', u'to', u'the', u'Senate', u'-', u'was', u'as', u'handicap.', u'Americans', u'always', u'are', u'dubious', u'about', u'the', u'capital,', u'but', u'that', u'sentiment', u'seems', u'particularly', u'strong.', u'Bayh', u'decided', u'not', u'to', u'run.', u'"`', u'Washington', u"'", u"doesn't", u'make', u'the', u'case,"', u'said', u'Dan Pfeiffer', u',', u'who', u'worked', u'for', u'Bayh', u'.', u'No', u'more', u'boomer', u'obsessions', u'Not', u'all', u'elections', u'are', u'about', u'change,', u'but', 
u'2008', u'will', u'be.', u'Americans', u'are', u'moderately', u'upbeat', u'about', u'the', u"country's", u'prospects,', u'but', u'deeply', u'worried', u'about', u'the', u'world', u'-', u'and', u'they', u'have', u'come', u'to', u'realize', u'that', u'they', u"can't", u'separate', u'one', u'from', u'the', u'other.', u'One', u'thing', u'for', u'sure,', u'says', u'Pfeiffer', u',', u'voters', u'are', u'tired', u'of', u'arguing', u'about', u'the', u'culture', u'of', u'the', u'1960s', u'and', u'other', u'Boomer', u'issues.', u'"There', u'is', u'a', u'sense', u'that', u'the', u'2004', u'election', u'was', u'too', u'much', u'about', u'who', u'did', u'or', u'did', u'not', u'do', u'what', u'in', u'Vietnam', u',"', u'said', u'Pfeiffer', u',', u'referring', u'to', u'the', u'Bush campaign', u'against', u'Sen.', u'John Kerry', u'.', u'In', u'2000,', u'Bush', u'won', u'in', u'part', u'by', u'selling', u'himself', u'as', u'a', u'"grown', u'up"', u'Boomer', u'answer', u'to', u'Bill Clinton', u'.', u'"Voters', u'are', u'tired', u'of', u'that', u'era', u'and', u'its', u'concerns,"', u'said', u'Pfeiffer', u'said.', u'"They', u'want', u'to', u'move', u'on."', u'Know', u'the', u'middle', u'class', u'Bushes', u'have', u'a', u'congenital', u'family', u'problem', u'with', u'this,', u'and', u'it', u'leaves', u'an', u'opening', u'for', u'someone', u'-', u'of', u'either', u'party', u'-', u'who', u'can', u'prove', u'that', u'he', u'or', u'she', u'really', u'understands', u'the', u'strains', u'of', u'middle', u'class', u'life.', u"It's", u'not', u'just', u'about', u'money,', u'but', u'about', u'cultural', u'assaults', u'and', u'the', u'lack', u'of', u'time', u'for', u'family', u'in', u'an', u'era', u'when', u'both', u'parents', u'or', u'partners', u'need', u'to', u'work.', u'In', u'his', u'forthcoming', u'book,', u'Positively', u'American,', u'Sen.', u'Charles Schumer', u'of', u'New York', u'imagines', u'the', u'hard', u'life', u'of', u'a', u'fictitious', u'middle', u'class', u'family', u'-', 
u'and', u'offers', u'a', u'series', u'of', u'governmental', u'proposals', u'to', u'address', u'them.', u'A', u'shrewd', u'student', u'of', u'the', u'American', u'mood,', u'Schumer', u'is', u'aiming', u'in', u'the', u'right', u'direction.', u'The', u'next', u'president', u'will', u'need', u'to', u'show', u'that', u'he', u'or', u'she', u'understands', u'that', u'family.'], u'mentions': [[15, u'Washington,_D.C.', 0, 106, 116], [81, u'George_W._Bush', 0, 459, 483], [123, u'Iraq', 0, 743, 747], [145, u'George_W._Bush', 0, 853, 857], [151, u'Iraq', 0, 884, 888], [199, u'Gerald_Ford', 0, 1191, 1202], [209, u'Richard_Nixon', 0, 1247, 1260], [214, u'Gerald_Ford', 0, 1291, 1295], [216, u'Richard_Nixon', 0, 1305, 1310], [227, u'Jimmy_Carter', 0, 1370, 1382], [237, u'Washington,_D.C.', 0, 1425, 1435], [247, u'Demographics_of_the_United_States', 0, 1475, 1490], [281, u'Washington,_D.c.', 0, 1685, 1695], [293, u'Washington,_D.c.', 0, 1761, 1771], [314, u'Evan_Bayh', 0, 1898, 1907], [344, u'Indiana', 0, 2065, 2072], [349, u'United_States_Senate', 0, 2093, 2099], [367, u'Evan_Bayh', 0, 2213, 2217], [373, u'Washington,_D.C.', 0, 2241, 2251], [380, u'Dan_Pfeiffer', 0, 2283, 2295], [385, u'Evan_Bayh', 0, 2313, 2317], [435, u'Dan_Pfeiffer', 0, 2600, 2608], [450, u'Boomer', 0, 2680, 2686], [472, u'Vietnam_War', 0, 2785, 2792], [475, u'Dan_Pfeiffer', 0, 2801, 2809], [480, u'George_W._Bush_presidential_campaign,_2004', 0, 2829, 2842], [483, u'John_Kerry', 0, 2856, 2866], [487, u'George_W._Bush', 0, 2878, 2882], [501, u'Bill_Clinton', 0, 2947, 2959], [513, u'Dan_Pfeiffer', 0, 3016, 3024], [593, u'Charles_Schumer', 0, 3459, 3474], [595, u'New_York', 0, 3478, 3486], [624, u'Charles_Schumer', 0, 3650, 3657]]}
# Collect the word list and the context-based wikification result as [words, results].
wikified = [phrase['text']]
# Generate candidates for the mentions in phrase (7 is the value wikifyEval passes as its candidate cap).
cands = generateCandidates(phrase, 7)
# NOTE(review): wikifyEval calls wikifyContext with `window = 7`, while this call uses
# `ctxBrchSz` -- confirm which keyword the current wikifyContext accepts.
wikified.append(wikifyContext(phrase, cands, ctxBrchSz = len(phrase['text'])))

# Replace element 1 of every result entry with the title id2title returns for it.
# NOTE(review): if the results are [start, end, entityId] entries (as wikifyEval documents),
# index 1 is the end offset rather than an id -- confirm the intended index.
for mention in wikified[1]:
    mention[1] = id2title(mention[1])
    
# Show the re-joined text, then the full [words, results] structure.
print (" ".join(wikified[0])).encode('utf-8').strip()
print wikified


In [None]:
# Quick interactive lookup of the title for id 33509 (the trailing L is a Python 2 long literal).
id2title(33509L)

In [None]:
import requests
import json

text = " ".join(["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."])
print text

text = text.replace("\\", "\\\\\\")
text = text.replace('+', r'\+')
text = text.replace("-", "\-")
text = text.replace("&&", "\&&")
text = text.replace("||", "\||")
text = text.replace("!", "\!")
text = text.replace("(", "\(")
text = text.replace(")", "\)")
text = text.replace("{", "\{")
text = text.replace("}", "\}")
text = text.replace("[", "\[")
text = text.replace("]", "\]")
text = text.replace("^", "\^")
text = text.replace("\"", "\\\"")
text = text.replace("~", "\~")
text = text.replace("*", "\*")
text = text.replace("?", "\?")
text = text.replace(":", "\:")

text = text.decode('string_escape')

print text + '\n\n'

addr = 'http://localhost:8983/solr/enwikianchors20160305/tag'
params={'overlaps':'LONGEST_DOMINANT_RIGHT', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
r = requests.post(addr, params=params, data=text)
textData = r.json()['tags']

print textData

In [None]:
phraseData = {"text": ["David", "and", "Victoria", "named", "their", "children", "Brooklyn", ",", "Romeo", ",", "Cruz", ",", "and", "Harper Seven", "."], "mentions": [[0, "David_Beckham"], [2, "Victoria_Beckham"]]}
print str(phraseData) + '\n'
phraseData = mentionStartsAndEnds(phraseData)
print phraseData

In [None]:
for item in sorted(anchor2concept('Muller'), key=itemgetter(1), reverse = True):
    print id2title(item[0]) + ' ----- ' + str(item[1])

In [5]:
import tagme

tagme.GCUBE_TOKEN = "f6c2ba6c-751b-4977-a94c-c140c30e9b92-843339462"

antns = tagme.annotate("I definitely like ice cream better than tomatoes.")

for an in antns.get_annotations(0.1):
    print an

ice cream -> Ice cream (score: 0.368746161461)
tomatoes -> Tomato (score: 0.277118563652)
