In [4]:
import math
from functools import partial
from elasticsearch_dsl.connections import connections
from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchWikidataGuesser
from qanta.datasets.quiz_bowl import QuizBowlDataset

In [5]:
# Open an Elasticsearch connection to a node on localhost (presumably the
# backend the guesser loaded below queries — confirm).
connections.create_connection(hosts=['localhost'])
# Take the first enabled guesser spec and resolve its output directory.
# NOTE(review): the spec at index 0 is assumed to describe
# ElasticSearchWikidataGuesser, because that class is hard-coded for loading
# below — confirm against the enabled-guesser configuration.
gspec = AbstractGuesser.list_enabled_guessers()[0]
guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
        gspec.guesser_class, '')
# Load the saved guesser from that directory; later cells use it as a global.
guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
# NOTE(review): the meaning of the positional argument 1 is not visible in this
# notebook — check the QuizBowlDataset signature.
db = QuizBowlDataset(1, guesser_train=True, buzzer_train=True)
# Questions belonging to the 'guessdev' fold; indexed by position further down.
questions = db.questions_in_folds(['guessdev'])



In [6]:
def kl(dict1, dict2):
    '''Kullback-Leibler divergence KL(dict1 || dict2) between two score dicts.

    Both mappings are rescaled so their values sum to 1 before comparison.
    The inputs themselves are never modified, so a caller can safely reuse the
    same dict across many calls. Keys of ``dict1`` that are missing from
    ``dict2`` are smoothed with the smallest normalized value in ``dict2``.

    Args:
        dict1: mapping key -> score (expected non-negative); the reference
            distribution.
        dict2: mapping key -> score (expected non-negative); the distribution
            compared against the reference.

    Returns:
        Sum over the keys k of ``dict1`` of p1[k] * ln(p1[k] / p2[k]).
        Entries of ``dict1`` with zero mass contribute nothing.

    Raises:
        ValueError: if ``dict2`` is empty (no fallback value can be derived).
        ZeroDivisionError: if a non-empty dict sums to zero, or if a key with
            positive mass in ``dict1`` has zero mass in ``dict2``.
    '''
    def normalize(d):
        # Build a fresh dict instead of rewriting `d`, which belongs to the caller.
        total = sum(d.values())
        return {key: value / total for key, value in d.items()}

    x1 = normalize(dict1)
    x2 = normalize(dict2)
    # Fallback probability for keys that dict2 does not contain.
    nil_value = min(x2.values())
    score = 0
    for k, p in x1.items():
        if p == 0:
            # By convention 0 * log(0 / q) == 0; math.log(0) would raise instead.
            continue
        score += p * math.log(p / x2.get(k, nil_value))
    return score

def drop(question, dict1, i):
    '''Score how far the guesses move when token ``i`` is removed.

    Args:
        question: the question, either as one string (split on whitespace)
            or as an already-tokenized sequence of words.
        dict1: reference guess -> score mapping to compare against.
        i: position of the token to leave out.

    Returns:
        ``kl(dict1, guesses)`` where ``guesses`` is what the global
        ``guesser`` returns for the question without its ith token.
    '''
    tokens = question.split() if isinstance(question, str) else question
    remaining = tokens[:i] + tokens[i + 1:]
    reduced_guesses = guesser.guess_single(' '.join(remaining))
    return kl(dict1, dict(reduced_guesses))

def greedy_drop(question, n):
    '''Greedily remove ``n`` words, one per round.

    Each round tries leaving out every remaining word in turn and removes the
    one whose absence gives the lowest ``drop`` score against the guesses for
    the full question (the first such word on ties).

    Args:
        question: the question as a string (split on whitespace) or as a
            list of words.
        n: number of words to remove; must be smaller than the word count.

    Returns:
        A pair ``(words, removed)``: the words that survive, and the
        positions in the original question of the removed words, in the
        order they were removed.
    '''
    words = question.split() if isinstance(question, str) else question
    assert n < len(words)
    removed = []
    # positions[j] is the original index of the word currently at slot j.
    positions = list(range(len(words)))
    reference = dict(guesser.guess_single(' '.join(words)))
    for _ in range(n):
        score_without = partial(drop, words, reference)
        divergences = [score_without(slot) for slot in range(len(words))]
        # Lowest divergence wins; min() keeps the earliest slot on ties.
        victim, _lowest = min(enumerate(divergences), key=lambda pair: pair[1])
        removed.append(positions[victim])
        words = words[:victim] + words[victim + 1:]
        positions = positions[:victim] + positions[victim + 1:]
    return words, removed

In [19]:
# Flatten all sentences of one question from the fold into a flat word list.
before = ' '.join(questions[200].text.values()).split()
# Greedily remove words until only 10 remain.
after, dropped = greedy_drop(before, len(before) - 10)

# Each report section is followed by one blank line, except the last.
print(' '.join(before), end='\n\n')
print(' '.join(after), end='\n\n')
# Removed words, in the order they were removed.
print([before[idx] for idx in dropped], end='\n\n')
# Guesses for the full question, then for the reduced one.
print(guesser.guess_single(' '.join(before)), end='\n\n')
print(guesser.guess_single(' '.join(after)))

Pardoned at the urging of Jaime Nebot in 1990, Abdala Bucara, former mayor of Guyaquil, defeated the latter in July (*) Presidential elections in--for 10 points--what Andean nation of twelve million bordering Peru and Colombia with a capital Quito?

1990, defeated Presidential elections Andean nation Peru Colombia capital Quito?

['Nebot', 'Bucara,', 'Guyaquil,', '(*)', 'a', 'in', 'in', 'and', 'with', 'the', 'Abdala', 'the', 'of', 'of', 'of', 'in--for', 'at', 'July', 'points--what', 'urging', 'former', '10', 'mayor', 'latter', 'Pardoned', 'twelve', 'million', 'Jaime', 'bordering']

[('Ecuador', 1.650645128205128), ('Bolivia', 1.2395446923076925), ('Peru', 1.2148188205128205), ('Panama', 1.1031594871794872), ('Inca_Empire', 0.9085789743589743), ('Chile', 0.8578690769230769), ('Guinea', 0.8022850256410257), ('Nicaragua', 0.7745476153846154), ('El_Salvador', 0.7693789743589744), ('Venezuela', 0.7571482564102564)]

[('Ecuador', 4.7798244), ('Peru', 3.8017757000000003), ('Bolivia', 3.282285