In [None]:
from wikipedia import *
from operator import itemgetter
import requests
# Scratch check: look up the anchor text 'Hello'. anchor2concept is not defined
# in this notebook; presumably it comes from the `wikipedia` star-import above.
anchor2concept('Hello')

In [None]:
# Scratch check: presumably maps a Wikipedia page id to its title -- confirm
# against the `wikipedia` module. The trailing `L` is a Python 2 long literal.
id2title(48324759L)

In [11]:
import requests
import json
from wikipedia import *
from __future__ import division
import nltk
from nltk.corpus import stopwords
from unidecode import unidecode

"""
Testing the Solr splitting
"""

def get_solr_count(s):
    """ Gets the number of documents the string occurs in
        NOTE: The string is always wrapped in double quotes here, so it is
        sent to Solr as a single phrase query on the `text` field.
    Arg:
        s: the string to look for
    Returns:
        The number of documents (Solr's numFound), or 0 when the response is
        not valid JSON or does not carry a response/numFound entry.
    """

    q = '+text:(\"%s\")' % (s,)
    qstr = 'http://localhost:8983/solr/enwiki20160305/select'
    # rows=0: only the hit count is needed, not the documents themselves
    params = {'indent':'on', 'wt':'json', 'q':q, 'rows':0}
    r = requests.get(qstr, params=params)
    try:
        # Decode the body once instead of re-parsing it for every lookup.
        payload = r.json()
        return payload['response']['numFound']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        # Deliberately narrower than a bare except so that Ctrl-C
        # (KeyboardInterrupt) still stops a long evaluation run.
        return 0

def get_mention_count(s):
    """
    Description:
        Adds up the second field of every entry that anchor2concept returns for
        the given string (presumably the per-concept link frequency, i.e. how
        often the string is used as a mention in wikipedia).
    Args:
        s: the anchor text to look up
    Return:
        The sum of entry[1] over all entries, 0 when there are none.
    """

    return sum(entry[1] for entry in anchor2concept(s))

def getTextMentions(line, isWiki):
    """
    Description:
        A helper method to get the mentions in an evaluable format, includes the mentions'
        start and end.
    Args:
        line: The json data that has info that needs to be converted.
        isWiki: Whether the inputted line is from the wiki 5000 dataset, which needs alternate
            handling: its mentions are read from the JSON string stored under
            'opening_annotation' ('from'/'to' fields).
    Return:
        The mentions in the form [[start, end],...], in the order they are listed in the input.
        For non-wiki lines the offsets are character positions in " ".join(line['text']).
    """

    if isWiki:
        return [[item['from'], item['to']]
                for item in json.loads(line['opening_annotation'])]

    # Character offset at which every word starts once the words are joined by
    # single spaces. Computing all of them up front keeps the result correct
    # even if the mentions are not listed in increasing word order.
    words = line['text']
    wordStarts = []
    offset = 0
    for word in words:
        wordStarts.append(offset)
        offset += len(word) + 1

    mentions = []
    for mention in line['mentions']:
        wordIndex = mention[0]  # first field of a mention is its word index
        start = wordStarts[wordIndex]
        mentions.append([start, start + len(words[wordIndex])])

    return mentions

def getSolrMentions(text):
    """
    Description:
        Sends the text to the Solr tagger, POS-tags the tagged spans with nltk and
        keeps the spans that look like mentions.
    Args:
        text: The text to find mentions in.
    Return:
        A list of [start, end] character offsets, one per span that is linked
        often enough relative to how often it occurs and is tagged NNP or NNPS.
    """

    # echo the text being processed
    print(unidecode(text.decode('utf-8')))

    tagUrl = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    tagParams = {'overlaps':'LONGEST_DOMINANT_RIGHT', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
    tags = requests.post(tagUrl, params=tagParams, data=text).json()['tags']

    # POS-tag the surface form of every span; in each tag entry, index 1 is
    # used as the start offset and index 3 as the end offset
    taggedSurfaces = nltk.pos_tag([text[tag[1]:tag[3]] for tag in tags])
    for tag, tagged in zip(tags, taggedSurfaces):
        tag.append(tagged) # read back below as tag[6]; tag[6][1] is the POS label

    print(taggedSurfaces)

    minMentionRatio = 0.005
    properNounTags = ('NNP', 'NNPS')

    found = []
    for tag in tags:
        surface = text[tag[1]:tag[3]]
        linkCount = get_mention_count(surface)
        docCount = get_solr_count(surface.replace(".", ""))
        if docCount <= 0:
            continue
        if (linkCount/docCount) < minMentionRatio:
            continue
        if tag[6][1] in properNounTags:
            found.append([tag[1], tag[3]])

    return found

def precision(trueMentions, otherMentions):
    """
    Description:
        Calculates the precision of otherMentions against the trueMentions.
        A found mention counts as correct only when its [start, end] pair is
        identical to a true one. Both lists are walked front to back only, so
        they are expected to be ordered by start offset.
    Args:
        trueMentions: The 'right' answers for what the mentions are.
        otherMentions: Our mentions obtained through some means.
    Return:
        The precision: (# of correct mentions)/(# of found mentions),
        or 0 when nothing was found.
    """

    numFound = len(otherMentions)
    numCorrect = 0

    trueIndex = 0
    otherIndex = 0

    while trueIndex < len(trueMentions) and otherIndex < numFound:
        trueMention = trueMentions[trueIndex]
        otherMention = otherMentions[otherIndex]
        if trueMention == otherMention:
            # same start and end
            numCorrect += 1
            trueIndex += 1
            otherIndex += 1
        elif trueMention[0] < otherMention[0]:
            # the true mention starts first and can no longer be matched
            trueIndex += 1
        else:
            # the other mention starts first (or at the same spot with a different end)
            otherIndex += 1

    # single-argument print(...) produces the same output on Python 2 and 3
    print('correct: ' + str(numCorrect) + '\nfound: ' + str(numFound))
    if numFound == 0:
        return 0
    # float() keeps this a true division even where the cell-level
    # `from __future__ import division` is not in effect
    return float(numCorrect) / numFound

def recall(trueMentions, otherMentions):
    """
    Description:
        Calculates the recall of otherMentions against the trueMentions.
        A true mention counts as recovered only when an identical [start, end]
        pair was found. Both lists are walked front to back only, so they are
        expected to be ordered by start offset.
    Args:
        trueMentions: The 'right' answers for what the mentions are.
        otherMentions: Our mentions obtained through some means.
    Return:
        The recall: (# of correct mentions)/(# of actual mentions),
        or 0 when there are no actual mentions.
    """

    numActual = len(trueMentions)
    numCorrect = 0

    trueIndex = 0
    otherIndex = 0

    while trueIndex < numActual and otherIndex < len(otherMentions):
        trueMention = trueMentions[trueIndex]
        otherMention = otherMentions[otherIndex]
        if trueMention == otherMention:
            # same start and end
            numCorrect += 1
            trueIndex += 1
            otherIndex += 1
        elif trueMention[0] < otherMention[0]:
            # the true mention starts first and can no longer be matched
            trueIndex += 1
        else:
            # the other mention starts first (or at the same spot with a different end)
            otherIndex += 1

    # single-argument print(...) produces the same output on Python 2 and 3
    print('correct: ' + str(numCorrect) + '\nactual: ' + str(numActual))
    if numActual == 0:
        return 0
    # float() keeps this a true division even where the cell-level
    # `from __future__ import division` is not in effect
    return float(numCorrect) / numActual

import os  # used for the dataset paths below; previously only in scope by accident

# Root folder of the datasets; set WSD_DATASETS_DIR to run on another machine.
pathStrt = os.environ.get('WSD_DATASETS_DIR', '/users/cs/amaral/wsd-datasets')
#pathStrt = 'C:\\Temp\\wsd-datasets'

# the data sets for performing on
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')},
            {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')},
            {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},
            {'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki.5000.json')}]

# short for quick tests
#datasets = [{'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
#datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki.5000.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]

performances = {}

# for each dataset, run the Solr mention extraction on every line
for dataset in datasets:
    performances[dataset['name']] = {}
    # the wiki5000 records keep their text and annotations under different keys
    isWiki = dataset['name'] == 'wiki5000'

    # get the data from dataset: one JSON document per line, each parsed once.
    # A former filter that skipped lines with non-ASCII characters is disabled,
    # so every non-blank line is kept.
    dataLines = []
    with open(dataset['path'], 'r') as dataFile:
        for line in dataFile:
            line = line.decode('utf-8').strip()
            if line:
                dataLines.append(json.loads(line))

    # reset counters
    totalPrec = 0
    totalRec = 0
    totalLines = 0

    for line in dataLines:

        print(str(totalLines + 1))

        trueMentions = getTextMentions(line, isWiki)
        if isWiki:
            solrMentions = getSolrMentions(unidecode(line['opening_text']))
        else:
            solrMentions = getSolrMentions(unidecode(" ".join(line['text'])))

        ## get statistical results from true mentions and solr mentions

        prec = precision(trueMentions, solrMentions)
        rec = recall(trueMentions, solrMentions)
        print(str(prec) + ' ' + str(rec) + '\n')

        # track results
        totalPrec += prec
        totalRec += rec
        totalLines += 1

    # record the averaged results on this dataset (an empty dataset scores 0
    # instead of raising ZeroDivisionError)
    if totalLines > 0:
        performances[dataset['name']] = {'Precision':totalPrec/float(totalLines),
                                         'Recall':totalRec/float(totalLines)}
    else:
        performances[dataset['name']] = {'Precision':0, 'Recall':0}

print(performances)

1
David and Victoria named their children Brooklyn , Romeo , Cruz , and Harper Seven .
[('David', 'NNP'), ('and', 'CC'), ('Victoria', 'NNP'), ('named', 'VBD'), ('their children', 'JJ'), ('Brooklyn', 'NNP'), ('Romeo', 'NNP'), ('Cruz', 'NNP'), ('and', 'CC'), ('Harper', 'NNP'), ('Seven', 'NNP')]
correct: 2
found: 5
correct: 2
actual: 2
0.4 1.0

2
David and Victoria added spice to their marriage .
[('David', 'NNP'), ('and', 'CC'), ('Victoria', 'NNP'), ('added', 'VBD'), ('spice', 'NN'), ('to', 'TO'), ('their', 'PRP$'), ('marriage', 'NN')]
correct: 2
found: 2
correct: 2
actual: 2
1.0 1.0

3
Tiger was lost in the woods when he got divorced from Elin .
[('Tiger', 'NN'), ('was lost', 'NN'), ('in', 'IN'), ('the woods', 'NNS'), ('when', 'WRB'), ('he', 'PRP'), ('got', 'VBD'), ('divorced', 'VBN'), ('from', 'IN'), ('Elin', 'NNP')]
correct: 1
found: 1
correct: 1
actual: 2
1.0 0.5

4
Tiger lost the US Open .
[('Tiger', 'NN'), ('lost', 'VBD'), ('the', 'DT'), ('US Open', 'NNP')]
correct: 1
found: 1
corr

KeyboardInterrupt: 

In [None]:
text = "Dalhousie University"

# Query each service once and reuse the numbers (the original issued every
# lookup twice).
mentionCount = get_mention_count(text)
solrCount = get_solr_count(text)

print(mentionCount)
print(solrCount)

# share of occurrences that are links; 0 when Solr reports no occurrence at all
print(float(mentionCount)/solrCount if solrCount else 0)

```bash
curl -X POST \
  'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'Hello New York City'
```
 

In [16]:
"""
Wikification for evaluation purposes
"""

from wikipedia import *
from operator import itemgetter
import requests
import json
from __future__ import division
from nltk.corpus import stopwords
import nltk

# Both thresholds are only applied when wikifyEval is called with strict=True.
# MIN_MENTION_LENGTH is compared with the character length of the mention's word;
# MIN_FREQUENCY is compared with field [2] of each wikified result.
MIN_MENTION_LENGTH = 3 # mentions must be at least this long
MIN_FREQUENCY = 20 # anchor with frequency below is ignored

def get_solr_count(s):
    """ Gets the number of documents the string occurs in
        NOTE: The string is always wrapped in double quotes here, so it is
        sent to Solr as a single phrase query on the `text` field.
    Arg:
        s: the string to look for
    Returns:
        The number of documents (Solr's numFound), or 0 when the response is
        not valid JSON or does not carry a response/numFound entry.
    """

    q = '+text:(\"%s\")' % (s,)
    qstr = 'http://localhost:8983/solr/enwiki20160305/select'
    # rows=0: only the hit count is needed, not the documents themselves
    params = {'indent':'on', 'wt':'json', 'q':q, 'rows':0}
    r = requests.get(qstr, params=params)
    try:
        # Decode the body once instead of re-parsing it for every lookup.
        payload = r.json()
        return payload['response']['numFound']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        # Deliberately narrower than a bare except so that Ctrl-C
        # (KeyboardInterrupt) still stops a long evaluation run.
        return 0

def get_mention_count(s):
    """
    Description:
        Adds up the second field of every entry that anchor2concept returns for
        the given string (presumably the per-concept link frequency, i.e. how
        often the string is used as a mention in wikipedia).
    Args:
        s: the anchor text to look up
    Return:
        The sum of entry[1] over all entries, 0 when there are none.
    """

    return sum(entry[1] for entry in anchor2concept(s))

def mentionStartsAndEnds(phraseData):
    """
    Description:
        Takes in a phraseData object and appends to each of its mentions a frequency
        placeholder plus the start and end index of that mention in the original
        string (the words joined by single spaces).
    Args:
        phraseData: {'text': ['words','split','like','this'], 'mentions': [[wordId, entityId], ...]}
    Return:
        The same phraseData object (modified in place), each mention now ending in
        0, start, end. Calling it twice on the same object appends the fields twice.
    """

    # Character offset at which every word starts. Computing all of them up
    # front keeps the result correct even if the mentions are not listed in
    # increasing word order.
    words = phraseData['text']
    wordStarts = []
    offset = 0
    for word in words:
        wordStarts.append(offset)
        offset += len(word) + 1

    for mention in phraseData['mentions']:
        wordIndex = mention[0]
        start = wordStarts[wordIndex]  # looked up first so a bad index leaves the mention untouched
        mention.append(0) # frequency placeholder
        mention.append(start) # start of the mention
        mention.append(start + len(words[wordIndex])) # end of the mention

    return phraseData
     
def splitWords(phrase):
    """
    Description:
        Sends the phrase to the Solr tagger and turns the tagged spans into the
        split form used by the datasets.
    Args:
        phrase: The text to be split.
    Return:
        {'text': [span, ...], 'mentions': [[spanIndex, '0', 0, start, end], ...]}
        where 'text' holds every tagged span and 'mentions' only those spans that
        are linked often enough, are not stop words and are tagged NNP, NNPS or NN.
    """

    tagUrl = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    tagParams = {'overlaps':'LONGEST_DOMINANT_RIGHT', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
    tags = requests.post(tagUrl, params=tagParams, data=phrase).json()['tags']

    # POS-tag the surface form of every span; in each tag entry, index 1 is
    # used as the start offset and index 3 as the end offset
    surfaces = [phrase[tag[1]:tag[3]] for tag in tags]
    for tag, tagged in zip(tags, nltk.pos_tag(surfaces)):
        tag.append(tagged) # read back below as tag[6]; tag[6][1] is the POS label

    minMentionRatio = 0.005
    nounTags = ('NNP', 'NNPS', 'NN')
    englishStopWords = set(stopwords.words('english'))

    words = []
    candidateMentions = []
    for position, tag in enumerate(tags):
        surface = surfaces[position]
        linkCount = get_mention_count(surface)
        docCount = get_solr_count(surface.replace(".", ""))
        isMention = (docCount > 0
                     and (linkCount/docCount) >= minMentionRatio
                     and surface not in englishStopWords
                     and tag[6][1] in nounTags)
        if isMention:
            # entity id and frequency are placeholders at this stage
            candidateMentions.append([position, '0', 0, tag[1], tag[3]])
        words.append(surface)

    # same format as the dataset provided data
    return {'text':words, 'mentions':candidateMentions}

def generateCandidates(phrase, maxC):
    """
    Description:
        Looks up, for every mention in phrase, the concepts its word can refer to
        and keeps the maxC entries with the largest second field.
    Args:
        phrase: A phrase in split form along with its suspected mentions.
        maxC: The largest number of candidates to keep per mention.
    Return:
        One list per mention (same order as phrase['mentions']) holding at most maxC
        anchor2concept entries, sorted by their second field in descending order.
    """
    return [sorted(anchor2concept(phrase['text'][mention[0]]),
                   key = itemgetter(1), reverse = True)[:maxC]
            for mention in phrase['mentions']]

def precision(truthSet, mySet):
    """
    Description:
        Calculates the precision of mySet against the truthSet. Both lists are
        walked front to back; whenever an entry of each overlaps in the text
        (fields [3] and [4] are the start and end), the pair is correct if
        title2id of the truth's field [1] equals field [1] of mine.
    Args:
        truthSet: The 'right' answers for what the entities are.
        mySet: My code's output for what it thinks the right entities are.
    Return:
        The precision: (# of correct entities)/(# of found entities),
        or 0 when nothing was found.
    """

    numFound = len(mySet)
    numCorrect = 0

    truthIndex = 0
    myIndex = 0

    while truthIndex < len(truthSet) and myIndex < numFound:
        truth = truthSet[truthIndex]
        mine = mySet[myIndex]

        if mine[3] < truth[3] and mine[4] <= truth[3]:
            # mine ends before truth even starts
            myIndex += 1
        elif mine[3] > truth[3] and mine[3] >= truth[4]:
            # mine starts beyond truth, move on to the next truth
            truthIndex += 1
        # from here on the two overlap (or start at the same position)
        elif title2id(truth[1]) == mine[1]:
            numCorrect += 1
            truthIndex += 1
            myIndex += 1
        elif truth[4] < mine[4]:
            # truth ends first
            truthIndex += 1
        else:
            # mine ends first
            myIndex += 1

    # single-argument print(...) produces the same output on Python 2 and 3
    print('correct: ' + str(numCorrect) + '\nfound: ' + str(numFound))
    if numFound == 0:
        return 0
    # float() keeps this a true division even where the cell-level
    # `from __future__ import division` is not in effect
    return float(numCorrect) / numFound

def recall(truthSet, mySet):
    """
    Description:
        Calculates the recall of mySet against the truthSet. Both lists are
        walked front to back; whenever an entry of each overlaps in the text
        (fields [3] and [4] are the start and end), the pair is correct if
        title2id of the truth's field [1] equals field [1] of mine.
    Args:
        truthSet: The 'right' answers for what the entities are.
        mySet: My code's output for what it thinks the right entities are.
    Return:
        The recall: (# of correct entities)/(# of actual entities),
        or 0 when there are no actual entities.
    """

    numActual = len(truthSet)
    numCorrect = 0

    truthIndex = 0
    myIndex = 0

    while truthIndex < numActual and myIndex < len(mySet):
        truth = truthSet[truthIndex]
        mine = mySet[myIndex]

        if mine[3] < truth[3] and mine[4] <= truth[3]:
            # mine ends before truth even starts
            myIndex += 1
        elif mine[3] > truth[3] and mine[3] >= truth[4]:
            # mine starts beyond truth, move on to the next truth
            truthIndex += 1
        # from here on the two overlap (or start at the same position)
        elif title2id(truth[1]) == mine[1]:
            numCorrect += 1
            truthIndex += 1
            myIndex += 1
        elif truth[4] < mine[4]:
            # truth ends first
            truthIndex += 1
        else:
            # mine ends first
            myIndex += 1

    if numActual == 0:
        return 0
    # float() keeps this a true division even where the cell-level
    # `from __future__ import division` is not in effect
    return float(numCorrect) / numActual
    
def getSentenceOfMention():
    """
    Description:
        Placeholder that is not implemented: it takes no arguments and does nothing.
    """
    pass
    
def getSurroundingSentences(phrase, axis):
    """
    Description:
        Returns the words as a list that belong to the sentence of this axis, and the surrounding
        ones. A sentence ends at a word whose last character is '.', '?' or '!'.
    Args:
        phrase: A list of words.
        axis: The index of the word that is the center of where to get surrounding sentences.
    Return:
        Two lists: the words of the axis' own sentence without the axis word itself, and the
        words of the sentence before plus the sentence after it: [[w3,w5],[w0,w1,w2,w6,w7,w8]]
    """

    def endsSentence(word):
        # endswith (unlike word[-1]) also copes with an empty token
        return word.endswith(('.', '?', '!'))

    numWords = len(phrase)

    # start of the middle sentence: the word right after the closest terminator
    # strictly BEFORE the axis. The axis itself is skipped because it may be
    # the last word of its own sentence.
    mdlSentenceStart = 0
    for i in range(axis - 1, -1, -1):
        if endsSentence(phrase[i]):
            mdlSentenceStart = i + 1
            break

    # end (exclusive) of the middle sentence: just past the first terminator at
    # or after the axis, or the end of the phrase
    mdlSentenceEnd = numWords
    for i in range(axis, numWords):
        if endsSentence(phrase[i]):
            mdlSentenceEnd = i + 1
            break

    # start of the previous sentence; mdlSentenceStart - 1 is that sentence's
    # own terminator, so the search begins one word before it
    frstSentenceStart = 0
    for i in range(mdlSentenceStart - 2, -1, -1):
        if endsSentence(phrase[i]):
            frstSentenceStart = i + 1
            break

    # end (exclusive) of the following sentence; mdlSentenceEnd is its first
    # word, so the search begins right there
    lstSentenceEnd = numWords
    for i in range(mdlSentenceEnd, numWords):
        if endsSentence(phrase[i]):
            lstSentenceEnd = i + 1
            break

    sentences = [phrase[mdlSentenceStart:axis] + phrase[axis+1:mdlSentenceEnd],
                 phrase[frstSentenceStart:mdlSentenceStart] + phrase[mdlSentenceEnd:lstSentenceEnd]]

    return sentences
    
def getSurroundingWords(phrase, axis, branchSize):
    """
    Description:
        Returns the words as a list that surround the given axis. Expanding out branchSize elements
        on both sides.
    Args:
        phrase: A list of words.
        axis: The index of the word that is the center of where to get surrounding words.
        branchSize: The amount of words to the left and right to get.
    Return:
        Up to branchSize words from each side of the axis (fewer near the edges of the
        phrase), in their original order and without the axis word itself.
    """

    # clamp the window to the phrase; the lower bound matters because a
    # negative slice index would count from the end of the list
    imin = max(axis - branchSize, 0)
    # + 1 because the slice end is exclusive: without it the right side would
    # get one word fewer than the left side
    imax = min(axis + branchSize + 1, len(phrase))

    return (phrase[imin:axis] + phrase[axis+1:imax])

def escapeStringSolr(text):
    """
    Description:
        Puts a backslash in front of the characters and operators that have a
        special meaning in a Solr query.
    Args:
        text: The string to escape.
    Return:
        The escaped text. Every backslash of the input comes out as three
        backslashes.
    """

    # Backslashes go first so that the backslashes added below are not
    # escaped again.
    text = text.replace("\\", "\\" * 3)

    specialTokens = ('+', '-', '&&', '||', '!', '(', ')', '{', '}',
                     '[', ']', '^', '"', '~', '*', '?', ':')
    for token in specialTokens:
        text = text.replace(token, "\\" + token)

    return text

def bestMultiContextMatch(mention, context, contextSurround, candidates):
    """
    Description:
        Uses Solr to find the candidate that gives the highest relevance when given the context.
    Args:
        mention: The mention as it appears in the text
        context: The words in the sentence of the target.
        contextSurround: The words in the sentences that surround the target.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
    Return:
        The index (into candidates) of the candidate with the best relevance score from the
        context. Falls back to 0, the first candidate, whenever Solr gives no usable answer.
    """

    # put texts in right format
    text = escapeStringSolr(" ".join(context))
    textSurround = escapeStringSolr(" ".join(contextSurround))
    mention = escapeStringSolr(mention)

    # restrict the search to the candidate documents
    strIds = ['id:' +  str(strId[0]) for strId in candidates]

    # select all the docs from Solr with the best scores, highest first.
    addr = 'http://localhost:8983/solr/enwiki20160305/select'
    if len(contextSurround) > 0:
        query = ('text:('+text.decode('string_escape')+')^1 text:('
                 +textSurround.decode('string_escape')+')^0 title:('
                 +mention.decode('string_escape')+')^1.35')
    else:
        query = ('text:('+text.decode('string_escape')+') title:('
                 +mention.decode('string_escape')+')^1.35')
    params={'fl':'id score', 'fq':" ".join(strIds), 'indent':'on',
            'q':query,
            'wt':'json'}
    r = requests.get(addr, params = params)

    # decode the body once and reuse it
    payload = r.json()
    if 'response' not in payload:
        return 0 # default to most popular

    results = payload['response']['docs']
    if len(results) == 0:
        return 0 # default to most popular

    bestId = long(results[0]['id'])

    # find which index has bestId
    for index, cand in enumerate(candidates):
        if cand[0] == bestId:
            return index

    # Best document is not among the candidates: use the same fallback as
    # above rather than len(candidates), which callers would index with.
    return 0

def bestContextMatch(mention, context, candidates):
    """
    Description:
        Uses Solr to find the candidate that gives the highest relevance when given the context.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
    Return:
        The index (into candidates) of the candidate with the best relevance score from the
        context. Falls back to 0, the first candidate, whenever Solr gives no usable answer.
    """

    # put text in right format
    text = escapeStringSolr((" ".join(context)).encode('utf-8'))
    mention = escapeStringSolr(mention.encode('utf-8'))

    # restrict the search to the candidate documents
    strIds = ['id:' +  str(strId[0]) for strId in candidates]

    # select all the docs from Solr with the best scores, highest first.
    addr = 'http://localhost:8983/solr/enwiki20160305/select'
    params={'fl':'id score', 'fq':" ".join(strIds), 'indent':'on',
            'q':'text:('+text.decode('string_escape')+') title:(' + mention.decode('string_escape') + ')^0.6',
            'wt':'json'}
    r = requests.get(addr, params = params)

    # decode the body once and reuse it
    payload = r.json()
    if 'response' not in payload:
        return 0 # default to most popular

    results = payload['response']['docs']
    if len(results) == 0:
        return 0 # default to most popular

    bestId = long(results[0]['id'])

    # find which index has bestId
    for index, cand in enumerate(candidates):
        if cand[0] == bestId:
            return index

    # Best document is not among the candidates: use the same fallback as
    # above rather than len(candidates), which callers would index with.
    return 0
    
def wikifyPopular(phrase, candidates):
    """
    Description:
        Picks, for each mention, the first entry of its candidate list.
    Args:
        phrase: A phrase in split form along with its suspected mentions.
        candidates: One candidate list per mention; each candidate holds the entity id and
            its frequency/popularity.
    Return:
        [word index, entity id, entity frequency, mention start, mention end] for every
        mention that has at least one candidate; mentions without candidates are left out.
    """

    winners = []
    for position, mention in enumerate(phrase['mentions']):
        options = candidates[position]
        if len(options) == 0:
            continue # nothing to choose from
        best = options[0]
        winners.append([mention[0], best[0], best[1], mention[3], mention[4]])

    return winners

# the original version, which uses just the surrounding words as context.
def wikifyContexty(phrase, candidates, ctxBrchSz = 5):
    """
    Description:
        Picks, for each mention, the candidate that bestContextMatch ranks highest given the
        ctxBrchSz words around the mention.
    Args:
        phrase: A phrase in split form along with its suspected mentions.
        candidates: One candidate list per mention; each candidate holds the entity id and
            its frequency/popularity.
        ctxBrchSz: How many words on both sides of a mention to search.
    Return:
        One entry per mention: [word index, entity id, mention[2], mention[3]], or
        [word index, 0, -1, -1] when the mention has no candidates.
        NOTE(review): these rows have four fields, while wikifyPopular and wikifyContext
        return five ([..., frequency, start, end]) -- confirm before using this output
        where the five-field form is expected.
    """

    chosen = []
    for position, mention in enumerate(phrase['mentions']):
        options = candidates[position]
        wordIndex = mention[0]
        if len(options) > 0:
            nearbyWords = getSurroundingWords(phrase['text'], wordIndex, ctxBrchSz)
            winner = bestContextMatch(phrase['text'][wordIndex], nearbyWords, options)
            chosen.append([wordIndex, options[winner][0], mention[2], mention[3]])
        else:
            chosen.append([wordIndex, 0, -1, -1]) # a bad mention

    return chosen

# new version with surrounding sentences
def wikifyContext(phrase, candidates, ctxBrchSz = 5):
    """
    Description:
        Picks, for each mention, the candidate that bestMultiContextMatch ranks highest given
        the mention's own sentence and the sentences around it.
    Args:
        phrase: A phrase in split form along with its suspected mentions.
        candidates: One candidate list per mention; each candidate holds the entity id and
            its frequency/popularity.
        ctxBrchSz: Accepted for symmetry with wikifyContexty; not used by this function.
    Return:
        [word index, entity id, entity frequency, mention start, mention end] for every
        mention that has at least one candidate; mentions without candidates are left out.
    """

    chosen = []
    for position, mention in enumerate(phrase['mentions']):
        options = candidates[position]
        if len(options) == 0:
            continue # nothing to choose from
        wordIndex = mention[0]
        # [own sentence, neighbouring sentences]
        sentences = getSurroundingSentences(phrase['text'], wordIndex)
        winner = bestMultiContextMatch(phrase['text'][wordIndex], sentences[0], sentences[1], options)
        chosen.append([wordIndex, options[winner][0], options[winner][1], mention[3], mention[4]])

    return chosen

def wikifyEval(phrase, mentionsGiven, maxC = 20, method='popular', strict = False):
    """
    Description:
        Takes the phrase, and wikifies it for evaluation purposes using the desired method.
    Args:
        phrase: What to wikify. When mentionsGiven is False this is handed to splitWords;
            otherwise it is the pre-split data and is handed to mentionStartsAndEnds.
        mentionsGiven: Whether the mentions are given to us and the text is already split.
        maxC: The max amount of candidates to extract (passed to generateCandidates).
        method: The method used to wikify, either 'popular' or 'context'.
        strict: Whether to apply the extra filters: mentions whose text is shorter than
            MIN_MENTION_LENGTH are dropped before candidate generation, and results whose
            item[2] is below MIN_FREQUENCY are dropped afterwards.
    Return:
        A two element list: [split text, chosen entities], where the chosen entities are
        whatever wikifyPopular / wikifyContext returned (after the strict filter, if any).
    Raises:
        ValueError: if method is not one of the supported names.
    """

    # Fail early on an unknown method: previously the result silently came back with only
    # the text in it (or died with an IndexError when strict was set), after all the
    # candidate generation work had already been done.
    if method not in ('popular', 'context'):
        raise ValueError("unknown wikification method: %r" % (method,))

    if mentionsGiven:
        phrase = mentionStartsAndEnds(phrase)
    else:
        phrase = splitWords(phrase) # words are not in pre-split form

    # get rid of small mentions
    if strict:
        phrase['mentions'] = [item for item in phrase['mentions']
                    if len(phrase['text'][item[0]]) >= MIN_MENTION_LENGTH]

    candidates = generateCandidates(phrase, maxC)

    if method == 'popular':
        entities = wikifyPopular(phrase, candidates)
    else:
        # the whole text is offered as context
        entities = wikifyContext(phrase, candidates, ctxBrchSz = len(phrase['text']))

    # get rid of very unpopular mentions
    if strict:
        entities = [item for item in entities if item[2] >= MIN_FREQUENCY]

    # Duplicate removal is currently disabled; kept here for reference:
    #   idsHad = [] # a list of entities to check for duplicates
    #   newEntities = []
    #   for item in entities:
    #       if item[1] not in idsHad:
    #           newEntities.append(item)
    #           idsHad.append(item[1])
    #   entities = newEntities

    return [phrase['text'], entities]

In [None]:
from IPython.display import clear_output

"""
This is for testing performance of different wikification methods.
"""

def getWiki5000Entities(annotationData):
    """
    Description:
        Converts the wiki5000 annotation JSON into the entity list used for evaluation.
    Args:
        annotationData: A JSON string holding a list of objects that each have the keys
            'url', 'from' and 'to'.
    Return:
        One list per annotation, in input order, of the form
        [None, url with spaces replaced by underscores, 0, from, to].
    """

    return [[None, annotation['url'].replace(' ', '_'), 0, annotation['from'], annotation['to']]
            for annotation in json.loads(annotationData)]

def wikilineLine(inLine):
    """
    Unfinished helper meant to put inLine in the right format if it came from wikipedia.

    At the moment it only creates an empty container with the keys 'text' and 'mentions',
    never reads inLine, and returns None.
    """
    # TODO: fill the container from inLine and return it
    newLine = dict(text=[], mentions=[])
    

# Root folder of the evaluation datasets (machine-specific absolute path).
pathStrt = '/users/cs/amaral/wsd-datasets'
#pathStrt = 'C:\\Temp\\wsd-datasets'

# The data sets to evaluate on, as {'name', 'path'} records.
# A generator inside list() is used so no loop variable leaks into the notebook namespace.
datasets = list({'name': dsName, 'path': os.path.join(pathStrt, dsFile)}
                for dsName, dsFile in (('kore', 'kore.json'),
                                       ('AQUAINT', 'AQUAINT.txt.json'),
                                       ('MSNBC', 'MSNBC.txt.json'),
                                       ('wiki5000', 'wiki.5000.json')))

# shorter selections for quick tests (uncomment one)
#datasets = [{'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
#datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki.5000.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]

# Wikification methods to compare, in the order they are run.
methods = ['context', 'popular']

# Results, keyed by dataset name and then by method name.
performances = dict()

# for each dataset, run all methods
for dataset in datasets:
    performances[dataset['name']] = {}
    # get the data from dataset
    dataFile = open(dataset['path'], 'r')
    dataLines = []
    
    # put in all lines that contain proper ascii
    for line in dataFile:
        doAppend = True
        theLine = json.loads(line.decode('utf-8').strip())
        
        if dataset['name'] == 'wiki5000':
            textName = 'opening_text'
        else:
            textName = 'text'
        
        for i in "".join(theLine[textName]):
            if ord(i) >= 128:
                doAppend = False
                break
                
        if doAppend:
            dataLines.append(json.loads(line.decode('utf-8').strip()))
        
    print dataset['name'] + '\n'
    
    # run each method on the data set
    for mthd in methods:
        print mthd + '\n'
        
        # reset counters
        totalPrecS = 0
        totalPrecM = 0
        totalRecS = 0
        totalRecM = 0
        totalLines = 0
        
        # each method tests all lines
        for line in dataLines:
            
            print str(totalLines + 1)
            
            # different structure for wiki
            if dataset['name'] == 'wiki5000':
                # for unification of format for statistical testing
                trueEntities = getWiki5000Entities(line['opening_annotation'])

                resultS = None # no pre-split text
                resultM = wikifyEval(line['opening_text'].encode('utf-8').strip(), False, method = mthd, maxC = 7)
            else:
                trueEntities = mentionStartsAndEnds(line)['mentions'] # the ground truth
                
                # original split string
                resultS = wikifyEval(line, True, method = mthd, maxC = 7)
                # unsplit string
                resultM = wikifyEval((" ".join(line['text'])).encode('utf-8').strip(), False, method = mthd, maxC = 7)
                
            #resultM = [[],[]]
                
            ## get statistical results from true entities and results S and M
            
            # wiki5000 exception
            if resultS <> None:
                precS = precision(trueEntities, resultS[1]) # precision of pre-split
            else:
                precS = 0
                
            precM = precision(trueEntities, resultM[1]) # precision of manual split
            
            # wiki5000 exception
            if resultS <> None:
                recS = recall(trueEntities, resultS[1]) # recall of pre-split
            else:
                recS = 0
                
            recM = recall(trueEntities, resultM[1]) # recall of manual split
            
            #clear_output() # delete this after
            print str(precS) + ' ' + str(precM) + ' ' + str(recS) + ' ' + str(recM) + '\n'
            #print str(precS) + ' ' + str(recS)
            
            # track results
            totalPrecS += precS
            totalPrecM += precM
            totalRecS += recS
            totalRecM += recM
            totalLines += 1
        
        # record results for this method on this dataset
        # [avg precision split, avg precision manual, avg recall split, avg recall manual]
        performances[dataset['name']][mthd] = {'Pre-Split Precision':totalPrecS/totalLines, 
                                               'Manual Split Precision':totalPrecM/totalLines,
                                              'Pre-Split Recall':totalRecS/totalLines, 
                                               'Manual Split Recall':totalRecM/totalLines}
            
print performances

In [None]:
"""
Test individual text on wikification.
"""

# One sample in the pre-split dataset format: 'text' is the list of words and each entry
# of 'mentions' pairs a word index with the expected title (e.g. word 13 is "Clapton").
data = json.loads("""{"text": ["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."], "mentions": [[13, "Eric_Clapton"], [15, "Jeff_Beck"], [18, "Jimmy_Page"]]}
""".decode('utf-8').strip())

print str(data) + '\n'

print " ".join(data['text']).encode('utf-8').strip()

# Wikify the re-joined sentence (mentions not given) with the 'popular' method.
#results = wikifyEval(data['text'], True, 'popular', True)
results = wikifyEval(" ".join(data['text']).encode('utf-8').strip(), False, method='popular')
print results[0]
# result[1] is handed to id2title, i.e. it is treated as an entity id here
for result in results[1]:
    print id2title(result[1])

# NOTE(review): the evaluation loop uses mentionStartsAndEnds(line)['mentions'] as ground
# truth, while the raw [index, title] mentions are passed here - confirm that precision and
# recall accept this form.
prec = precision(data['mentions'], results[1])
rec = recall(data['mentions'], results[1])

print '\nprecision: ' + str(prec) + ', rec: ' + str(rec) + '\n'

In [18]:
"""
This is for testing if the wikification works.
"""

from IPython.core.display import display, HTML

phrase = 'Three of the greatest guitarists started their career in a single band : Clapton , Beck , and Page'
print phrase + "\n"

anchors = wikify(phrase, False)
for anchor in anchors:
    print anchor['mention'] + '-->' + anchor['wikiTitle']
    
print

anchors = wikify(phrase, True)
for anchor in anchors:
    print anchor['mention'] + '-->' + anchor['wikiTitle']
    
print
    
newText = ""

anchors = sorted(anchors, key=itemgetter('start')) # make sure anchors are sorted
anchorIndex = 0 # keep track of current anchor added
i = 0 
while i < len(phrase):
    if anchorIndex < len(anchors) and i == anchors[anchorIndex]['start']:
        anchor = anchors[anchorIndex]
        newText += ("<a href=\"https://en.wikipedia.org/wiki/" + anchor['wikiTitle']
                   + "\" target=\"_blank\">" + anchor['mention'] + "</a>")
        i = anchors[anchorIndex]['end']
        anchorIndex += 1
    else:
        newText += phrase[i]
        i += 1
    
display(HTML(newText))

Three of the greatest guitarists started their career in a single band : Clapton , Beck , and Page



NameError: name 'wikify' is not defined

In [None]:
"""
Ideas:
    -In wikifyContext make the current sentence worth 1 and each surrounding sentence worth 0.5.
    -anchor frequency adjuster
    -use similarity with other anchors

Sample Queries:
    'I walked down to the park and found a duck and a pebble'
    'I walked into an electronic store and bought a pebble'
    'I walked down to the park and found a duck studying quantum mechanics'
    'I walked down to the park and found a duck studying quantum mechanical systems'
    'I met David in Spain'
    'An entomologist spots what might be a rare subspecies of beetle, due to the pattern on its back'
"""

In [None]:
tmp = sorted(anchor2concept("David Edgar"), key = itemgetter(1), 
                          reverse = True)

for tmpp in tmp:
    print 'id: ' + str(tmpp[0]) + ', title: ' + id2title(tmpp[0])

In [17]:
phrase = {u'text': [u'Voller', u'presidential', u'preferences', u'How', u'will', u'American', u'voters', u'compensate', u'in', u'the', u'next', u'search', u'for', u'a', u'president?', u'WASHINGTON', u'-', u'Now', u'that', u'the', u'38th', u'president', u'has', u'been', u'laid', u'to', u'rest,', u'the', u'capital', u'can', u'take', u'up', u'the', u'main', u'business', u'of', u'2007:', u'trying', u'to', u'figure', u'out', u'who', u'will', u'be', u'the', u'44th.', u'What', u'type', u'of', u'leader', u'does', u'the', u'country', u'want?', u'Here', u'is', u'my', u'sense', u'of', u'it,', u'based', u'on', u'talking', u'to', u'politicians,', u'strategists', u'and', u'voters', u'here', u'and', u'around', u'the', u'nation.', u'No', u'ideologues,', u'please', u'There', u'was', u'a', u'time', u'when', u'President George W. Bush', u"'s", u'ideological', u'certitude', u'was', u'politically', u'appealing', u'and', u'perhaps', u'functionally', u'necessary.', u'That', u'time', u'has', u'long', u'since', u'passed.', u'The', u'country', u'is', u'tired,', u'even', u'fearful,', u'of', u'leaders', u'with', u'fervent', u'beliefs', u'that', u'seem', u'impervious', u'to', u'new', u'(or', u'even', u'old)', u'facts.', u'Voters', u'see', u'the', u'war', u'in', u'Iraq', u'as', u'an', u'"idea,"', u'not', u'a', u'solution', u'-', u'and', u'Americans', u'do', u'not', u'like', u'ideas', u'that', u'do', u'not', u'work.', u'Voters', u'likely', u'will', u'view', u'Bush', u"'s", u'"surge"', u'of', u'troops', u'into', u'Iraq', u'as', u'new', u'evidence', u'of', u'failure,', u'and', u'the', u'dangers', u'of', u'a', u'leader', u'who', u'depends', u'on', u'preconceived', u'ideas.', u'Serious', u'student', u'Presidential', u'elections', u'are', u'a', u'never-ending', u'series', u'of', u'mid-course', u'corrections.', u'Voters', u'look', u'to', u'compensate', u'for', u'the', u'leadership', u'weaknesses', u'of', u'the', u'incumbent.', u'An', u'example', u'comes', u'from', u'the', u'life', u'and', u'career', 
u'of', u'Gerald Ford', u'.', u'In', u'1976,', u'voters', u'wanted', u'a', u'pure', u'antidote', u'to', u'Richard Nixon', u"'s", u'paranoid', u'megalomania.', u'Once', u'Ford', u'pardoned', u'Nixon', u',', u'he', u'could', u'not', u'be', u'that', u'candidate.', u'Instead,', u'Americans', u'chose', u'Jimmy Carter', u',', u'a', u'peanut', u'farmer', u'who', u'had', u'never', u'worked', u'in', u'Washington', u',', u'and', u'who', u'promised', u'never', u'to', u'lie', u'to', u'the', u'American people', u'.', u'The', u'counterpoint', u'thinking', u'continues.', u'Voters', u'in', u'2008', u'are', u'going', u'to', u'want', u'someone', u'who', u'prides', u'himself', u'(or', u'herself)', u'on', u'spending', u'time', u'in', u'the', u'library', u'-', u'who', u'has', u'a', u'hands-on', u'curiosity', u'about', u'the', u'details.', u'Washington', u'experience', u'not', u'necessary', u'Voters', u'these', u'days', u'not', u'only', u'do', u'not', u'value', u'Washington', u'experience', u'-', u'or', u'any', u'office-holding', u'experience', u'-', u'it', u'can', u'make', u'them', u'suspicious.', u'That', u'is', u'what', u'strategists', u'and', u'polltakers', u'for', u'Sen.', u'Evan Bayh', u'found', u'when', u'they', u'studied', u'whether', u'he', u'should', u'run', u'for', u'president.', u'They', u'found', u'that', u'his', u'remarkably', u'deep', u'resume', u'-', u'the', u'son', u'of', u'a', u'senator,', u'he', u'was', u'the', u'"boy', u'governor"', u'of', u'Indiana', u'before', u'going', u'to', u'the', u'Senate', u'-', u'was', u'as', u'handicap.', u'Americans', u'always', u'are', u'dubious', u'about', u'the', u'capital,', u'but', u'that', u'sentiment', u'seems', u'particularly', u'strong.', u'Bayh', u'decided', u'not', u'to', u'run.', u'"`', u'Washington', u"'", u"doesn't", u'make', u'the', u'case,"', u'said', u'Dan Pfeiffer', u',', u'who', u'worked', u'for', u'Bayh', u'.', u'No', u'more', u'boomer', u'obsessions', u'Not', u'all', u'elections', u'are', u'about', u'change,', u'but', 
u'2008', u'will', u'be.', u'Americans', u'are', u'moderately', u'upbeat', u'about', u'the', u"country's", u'prospects,', u'but', u'deeply', u'worried', u'about', u'the', u'world', u'-', u'and', u'they', u'have', u'come', u'to', u'realize', u'that', u'they', u"can't", u'separate', u'one', u'from', u'the', u'other.', u'One', u'thing', u'for', u'sure,', u'says', u'Pfeiffer', u',', u'voters', u'are', u'tired', u'of', u'arguing', u'about', u'the', u'culture', u'of', u'the', u'1960s', u'and', u'other', u'Boomer', u'issues.', u'"There', u'is', u'a', u'sense', u'that', u'the', u'2004', u'election', u'was', u'too', u'much', u'about', u'who', u'did', u'or', u'did', u'not', u'do', u'what', u'in', u'Vietnam', u',"', u'said', u'Pfeiffer', u',', u'referring', u'to', u'the', u'Bush campaign', u'against', u'Sen.', u'John Kerry', u'.', u'In', u'2000,', u'Bush', u'won', u'in', u'part', u'by', u'selling', u'himself', u'as', u'a', u'"grown', u'up"', u'Boomer', u'answer', u'to', u'Bill Clinton', u'.', u'"Voters', u'are', u'tired', u'of', u'that', u'era', u'and', u'its', u'concerns,"', u'said', u'Pfeiffer', u'said.', u'"They', u'want', u'to', u'move', u'on."', u'Know', u'the', u'middle', u'class', u'Bushes', u'have', u'a', u'congenital', u'family', u'problem', u'with', u'this,', u'and', u'it', u'leaves', u'an', u'opening', u'for', u'someone', u'-', u'of', u'either', u'party', u'-', u'who', u'can', u'prove', u'that', u'he', u'or', u'she', u'really', u'understands', u'the', u'strains', u'of', u'middle', u'class', u'life.', u"It's", u'not', u'just', u'about', u'money,', u'but', u'about', u'cultural', u'assaults', u'and', u'the', u'lack', u'of', u'time', u'for', u'family', u'in', u'an', u'era', u'when', u'both', u'parents', u'or', u'partners', u'need', u'to', u'work.', u'In', u'his', u'forthcoming', u'book,', u'Positively', u'American,', u'Sen.', u'Charles Schumer', u'of', u'New York', u'imagines', u'the', u'hard', u'life', u'of', u'a', u'fictitious', u'middle', u'class', u'family', u'-', 
u'and', u'offers', u'a', u'series', u'of', u'governmental', u'proposals', u'to', u'address', u'them.', u'A', u'shrewd', u'student', u'of', u'the', u'American', u'mood,', u'Schumer', u'is', u'aiming', u'in', u'the', u'right', u'direction.', u'The', u'next', u'president', u'will', u'need', u'to', u'show', u'that', u'he', u'or', u'she', u'understands', u'that', u'family.'], u'mentions': [[15, u'Washington,_D.C.', 0, 106, 116], [81, u'George_W._Bush', 0, 459, 483], [123, u'Iraq', 0, 743, 747], [145, u'George_W._Bush', 0, 853, 857], [151, u'Iraq', 0, 884, 888], [199, u'Gerald_Ford', 0, 1191, 1202], [209, u'Richard_Nixon', 0, 1247, 1260], [214, u'Gerald_Ford', 0, 1291, 1295], [216, u'Richard_Nixon', 0, 1305, 1310], [227, u'Jimmy_Carter', 0, 1370, 1382], [237, u'Washington,_D.C.', 0, 1425, 1435], [247, u'Demographics_of_the_United_States', 0, 1475, 1490], [281, u'Washington,_D.c.', 0, 1685, 1695], [293, u'Washington,_D.c.', 0, 1761, 1771], [314, u'Evan_Bayh', 0, 1898, 1907], [344, u'Indiana', 0, 2065, 2072], [349, u'United_States_Senate', 0, 2093, 2099], [367, u'Evan_Bayh', 0, 2213, 2217], [373, u'Washington,_D.C.', 0, 2241, 2251], [380, u'Dan_Pfeiffer', 0, 2283, 2295], [385, u'Evan_Bayh', 0, 2313, 2317], [435, u'Dan_Pfeiffer', 0, 2600, 2608], [450, u'Boomer', 0, 2680, 2686], [472, u'Vietnam_War', 0, 2785, 2792], [475, u'Dan_Pfeiffer', 0, 2801, 2809], [480, u'George_W._Bush_presidential_campaign,_2004', 0, 2829, 2842], [483, u'John_Kerry', 0, 2856, 2866], [487, u'George_W._Bush', 0, 2878, 2882], [501, u'Bill_Clinton', 0, 2947, 2959], [513, u'Dan_Pfeiffer', 0, 3016, 3024], [593, u'Charles_Schumer', 0, 3459, 3474], [595, u'New_York', 0, 3478, 3486], [624, u'Charles_Schumer', 0, 3650, 3657]]}
wikified = [phrase['text']]
cands = generateCandidates(phrase, 7)
wikified.append(wikifyContext(phrase, cands, ctxBrchSz = len(phrase['text'])))

for mention in wikified[1]:
    mention[1] = id2title(mention[1])
    
print (" ".join(wikified[0])).encode('utf-8').strip()
print wikified


Voller presidential preferences How will American voters compensate in the next search for a president? WASHINGTON - Now that the 38th president has been laid to rest, the capital can take up the main business of 2007: trying to figure out who will be the 44th. What type of leader does the country want? Here is my sense of it, based on talking to politicians, strategists and voters here and around the nation. No ideologues, please There was a time when President George W. Bush 's ideological certitude was politically appealing and perhaps functionally necessary. That time has long since passed. The country is tired, even fearful, of leaders with fervent beliefs that seem impervious to new (or even old) facts. Voters see the war in Iraq as an "idea," not a solution - and Americans do not like ideas that do not work. Voters likely will view Bush 's "surge" of troops into Iraq as new evidence of failure, and the dangers of a leader who depends on preconceived ideas. Serious student Presid

In [None]:
# Scratch lookup: call id2title for the id 33509 (Python 2 long literal); the value is
# shown as the cell's output.
id2title(33509L)

In [None]:
import requests
import json

# Build the sample sentence from its pre-split words.
text = " ".join(["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."])
print text

# Backslash-escape a fixed set of characters (these look like the Lucene/Solr query-syntax
# special characters - TODO confirm). The backslash itself is handled first, so that the
# backslashes added by the later replacements are not escaped again; note that it is
# replaced by three backslashes.
text = text.replace("\\", "\\\\\\")
text = text.replace('+', r'\+')
text = text.replace("-", "\-")
text = text.replace("&&", "\&&")
text = text.replace("||", "\||")
text = text.replace("!", "\!")
text = text.replace("(", "\(")
text = text.replace(")", "\)")
text = text.replace("{", "\{")
text = text.replace("}", "\}")
text = text.replace("[", "\[")
text = text.replace("]", "\]")
text = text.replace("^", "\^")
text = text.replace("\"", "\\\"")
text = text.replace("~", "\~")
text = text.replace("*", "\*")
text = text.replace("?", "\?")
text = text.replace(":", "\:")

# Interpret backslash escape sequences in the escaped text (Python 2 only codec).
# NOTE(review): this presumably undoes part of the escaping above (e.g. \" and \\) -
# confirm that this is intended.
text = text.decode('string_escape')

print text + '\n\n'

# POST the text as the request body to the 'tag' handler of the anchors core and keep the
# 'tags' entry of the JSON response.
addr = 'http://localhost:8983/solr/enwikianchors20160305/tag'
params={'overlaps':'LONGEST_DOMINANT_RIGHT', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
r = requests.post(addr, params=params, data=text)
textData = r.json()['tags']

print textData

In [None]:
# Pre-split sample whose mentions only carry [word index, title].
phraseData = {"text": ["David", "and", "Victoria", "named", "their", "children", "Brooklyn", ",", "Romeo", ",", "Cruz", ",", "and", "Harper Seven", "."], "mentions": [[0, "David_Beckham"], [2, "Victoria_Beckham"]]}
print str(phraseData) + '\n'
# Show what mentionStartsAndEnds turns the sample into.
# NOTE(review): presumably it adds the start/end offsets of each mention - confirm.
phraseData = mentionStartsAndEnds(phraseData)
print phraseData

In [24]:
for item in sorted(anchor2concept('Muller'), key=itemgetter(1), reverse = True):
    print id2title(item[0])

Hermann_Joseph_Muller
Müller_(lunar_crater)
Müller_(footballer)
Muller
Müller_(company)
Heiner_Müller
Müller_(surname)
Harold_Muller
Georg_Elias_Müller
Cornelius_Herman_Muller
Muller_automaton
Muller_(restaurant)
Müller_(German_trade_company)
