# Step 3 - Predict Relation and Finish

Our goal during this step is to predict the relation and compute the end-to-end accuracy.

In [1]:
import sys
# Put the directory two levels up first on the import path so that the `scripts`
# and `lib` packages imported by later cells resolve (presumably the repository
# root; the path is relative to the notebook's working directory).
sys.path.insert(0, '../../')

In [2]:
from scripts.utils.connect import get_connection 
from scripts.utils.data import FB2M_NAME_TABLE
from scripts.utils.data import FB2M_KG_TABLE

# Open one database connection and cursor. `cursor` is a notebook-level global
# reused by every later cell that queries the knowledge-graph tables.
connection = get_connection()
cursor = connection.cursor()

In [3]:
import pandas as pd
from tqdm import tqdm_notebook

# Register tqdm's pandas integration; later cells rely on `df.progress_apply`.
tqdm_notebook().pandas()

# Load the DataFrame pickled by the previous step of the pipeline.
# NOTE(review): unpickling can execute arbitrary code; only load a file that this
# pipeline produced itself.
df = pd.read_pickle('step_2_generate_candidates.pkl')
# Preview the first five rows.
df[:5]




Unnamed: 0,end_index,object,predicted_question_tokens,predicted_subject_names,question,question_tokens,relation,start_index,subject,subject_name,subject_name_tokens,candidate_mids,predicted_start_index,predicted_end_index,predicted_subject_name
6219,,0bs56bp,"[name, an, american, thoroughbread, racehorse]","[{'name': 'american thoroughbread', 'score': 4...",Name an American Thoroughbread racehorse,,biology/organism_classification/organisms_of_t...,,03k3r,,,"[01z1jf2, 04q7gbh]",2.0,3.0,american
3364,9.0,01sjng,"[what, kind, of, game, is, vision, racing, dri...","[{'name': 'vision racing driving simulator', '...",what kind of game is vision racing driving sim...,"[what, kind, of, game, is, vision, racing, dri...",cvg/computer_videogame/cvg_genre,5.0,02qlppc,vision racing driving simulator,"(vision, racing, driving, simulator)",[02qlppc],5.0,9.0,vision racing driving simulator
9374,6.0,0dlmm88,"[what, tv, program, is, romance, film]","[{'name': 'romance film', 'score': 55.87209701...",what tv program is romance film,"[what, tv, program, is, romance, film]",tv/tv_genre/programs,4.0,02l7c8,romance film,"(romance, film)",[02l7c8],4.0,6.0,romance film
10142,4.0,04rrx,"[what, state, is, polaski, located, in]","[{'name': 'polaski', 'score': 57.6216735839843...",what state is polaski located in,"[what, state, is, polaski, located, in]",location/location/containedby,3.0,049_zj3,polaski,"(polaski,)",[049_zj3],3.0,4.0,polaski
97,8.0,0qcr0,"[what, disease, claimed, the, life, of, fern, ...","[{'name': 'fern emmett', 'score': 69.716217041...",what disease claimed the life of fern emmett,"[what, disease, claimed, the, life, of, fern, ...",people/deceased_person/cause_of_death,6.0,02w9ycr,fern emmett,"(fern, emmett)",[02w9ycr],6.0,8.0,fern emmett


## Generate Facts

Given the candidate mids, we generate the candidate facts.

In [4]:
from collections import defaultdict

def generate_facts(row):
    """Fetch every knowledge-graph fact whose subject is one of the row's candidate mids.

    Args:
        row: mapping with a 'candidate_mids' entry listing the candidate subject mids.

    Returns:
        dict: relation -> {subject_mid -> set of object_mids}; empty when the query
            returns no rows.
    """
    query = """SELECT subject_mid, relation, object_mid
                      FROM {kg}
                      WHERE subject_mid = ANY(%s)""".format(kg=FB2M_KG_TABLE)
    cursor.execute(query, (row['candidate_mids'],))

    grouped = defaultdict(lambda: defaultdict(set))
    for subject_mid, relation, object_mid in cursor.fetchall():
        grouped[relation][subject_mid].add(object_mid)

    # Hand back plain dicts: unlike a defaultdict, indexing a missing key raises
    # instead of silently inserting an empty entry.
    return {relation: dict(by_subject) for relation, by_subject in grouped.items()}

In [5]:
# Attach the candidate facts to every row; `generate_facts` issues one database
# query per row, and `progress_apply` shows a progress bar while it runs.
df['candidate_facts'] = df.progress_apply(generate_facts, axis=1)




In [6]:
# Mean number of distinct candidate relations per question.
relations_per_question = [len(facts) for facts in df['candidate_facts']]
print('Average Number of Relations:', sum(relations_per_question) / df.shape[0])

Average Number of Relations: 17.962010142923006


## Upperbounds

Check the accuracy of the candidates.

In [7]:
from tqdm import tqdm_notebook

# Upper bound for the later steps: count how often the gold relation, subject and
# object are present in the candidate facts. The checks are nested, so a subject
# only counts when its relation was found, and an object only counts when both the
# relation and the subject were found.
correct_object = 0
correct_relation = 0
correct_subject = 0
for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
    facts = row['candidate_facts']

    if row['relation'] not in facts:
        continue
    correct_relation += 1

    subject_to_objects = facts[row['relation']]
    if row['subject'] not in subject_to_objects:
        continue
    correct_subject += 1

    if row['object'] in subject_to_objects[row['subject']]:
        correct_object += 1


print('Object Canditate Accuracy:', correct_object / df.shape[0])
print('Relation Canditate Accuracy:', correct_relation / df.shape[0])
print('Subject Canditate Accuracy:', correct_subject / df.shape[0])


Object Canditate Accuracy: 0.9550023052097741
Relation Canditate Accuracy: 0.9674504379898571
Subject Canditate Accuracy: 0.9550023052097741


## Models

Load models to be used in our final step.

In [8]:
from lib.checkpoint import Checkpoint

def load_checkpoint(path):
    """Load the checkpoint stored at `path` and switch its model out of training mode.

    Args:
        path (str): location of the saved checkpoint file.

    Returns:
        Checkpoint: the loaded checkpoint, created with `device=0`.
    """
    def _flatten_if_supported(module):
        # Only some submodules expose `flatten_parameters`; leave the others alone.
        if hasattr(module, 'flatten_parameters'):
            module.flatten_parameters()

    loaded = Checkpoint(checkpoint_path=path, device=0)
    loaded.model.train(mode=False)
    loaded.model.apply(_flatten_if_supported)
    return loaded

def cuda(v):
    """Move `v` to the GPU when CUDA is available; otherwise return `v` unchanged.

    Args:
        v: any object with a `.cuda()` method (the notebook passes torch Variables).

    Returns:
        `v.cuda()` on a CUDA machine, else `v` itself.
    """
    # The CPU branch must return the argument itself so that the notebook also
    # runs on machines without a GPU.
    return v.cuda() if torch.cuda.is_available() else v

In [12]:
def format_question(row):
    """Rewrite the question with the predicted subject span collapsed to one `<e>` token.

    Args:
        row: mapping with 'predicted_question_tokens' (sequence of str),
            'predicted_start_index' and 'predicted_end_index'. The span covers token
            positions from the start index up to, but not including, the end index.

    Returns:
        str: lower-cased, space-joined tokens with the span replaced by `<e>`.
            If no position equals the start index, no placeholder is inserted.
    """
    span_start = row['predicted_start_index']
    span_end = row['predicted_end_index']

    pieces = []
    for position, token in enumerate(row['predicted_question_tokens']):
        if position == span_start:
            pieces.append('<e>')
        elif span_start < position < span_end:
            # The remaining tokens of the subject span are dropped.
            continue
        else:
            pieces.append(token.lower().strip())
    return ' '.join(pieces).strip()

### Softmax Relation Model

Load the softmax relation model.

In [10]:
from torch.autograd import Variable
import math
import torch

def get_softmax_relation_score(checkpoint, question, relations):
    """Score candidate relations for a question with a softmax relation classifier.

    Args:
        checkpoint: loaded checkpoint exposing `relation_encoder`, `text_encoder` and `model`.
        question (str): question text handed to `checkpoint.text_encoder.encode`.
        relations: iterable of relation names to score.

    Returns:
        list: one score per entry of `relations`, in the same order.
    """
    # The first element of each encoding is used as the relation's index in the
    # model output.
    relation_indices = [checkpoint.relation_encoder.encode(r)[0] for r in relations]

    # Mask over the relation vocabulary with 1 for each candidate. A set keeps the
    # membership test constant-time for every vocabulary entry.
    allowed = set(relation_indices)
    mask = [1 if i in allowed else 0 for i in range(checkpoint.relation_encoder.vocab_size)]
    mask = cuda(Variable(torch.FloatTensor(mask), volatile=True))

    question = checkpoint.text_encoder.encode(question)
    # `unsqueeze(1)` adds a second dimension of size one (presumably the batch
    # dimension; confirm against the model).
    question = cuda(Variable(torch.LongTensor(question).unsqueeze(1), volatile=True))

    # The output is exponentiated in place (presumably the model returns
    # log-probabilities; confirm against the model).
    output_batch = checkpoint.model(question, mask).exp_().data
    output_batch = output_batch.squeeze(0)
    return [output_batch[r] for r in relation_indices]
    
## Test ##
# Smoke test: score two candidate relations for one hand-written question.
# Checkpoint with 0.880
checkpoint = load_checkpoint('../../results/relation_classifier.02_02_07:59:28/1746.pt')
question = 'where was <e> born?'
smoke_test_relations = ['people/person/place_of_birth',
                        'location/location/people_born_here']
print('Question:', question)
print('Scores:')
print(get_softmax_relation_score(checkpoint, question, smoke_test_relations))

Question: where was <e> born?
Scores:
[0.9917193651199341, 0.00010895365267060697]


In [27]:
# Single softmax model behind the `softmax_scores` column. `add_softmax_scores`
# reads this notebook-level `checkpoint` when it is called.
# 1212212=seed, 88.38%=dev_accuracy
checkpoint = load_checkpoint('../../experiments/relation_classifier.02_02_13:31:11/189.pt')
def add_softmax_scores(row):
    """Score the row's candidate relations with the notebook-level `checkpoint` model.

    Args:
        row: DataFrame row with 'candidate_facts' and the fields `format_question` reads.

    Returns:
        list of scores ordered like `row['candidate_facts'].keys()`, or None when the
        row has no candidate facts.
    """
    if len(row['candidate_facts']) == 0:
        return None
    candidate_relations = list(row['candidate_facts'].keys())
    return get_softmax_relation_score(checkpoint, format_question(row), candidate_relations)
    
# Score every row with the single model; rows without candidate facts get None.
df['softmax_scores'] = df.progress_apply(add_softmax_scores, axis=1)

In [28]:
import torch
from functools import partial

def add_softmax_ensemble_scores(row, checkpoint):
    """Add `checkpoint`'s relation scores to the row's running ensemble totals.

    Args:
        row: DataFrame row with 'candidate_facts', 'softmax_ensemble_scores' and the
            fields `format_question` reads.
        checkpoint: loaded checkpoint of the ensemble member to add.

    Returns:
        list with the element-wise sum of the previous totals and the new scores, or
        None when the row has no candidate facts.
    """
    if len(row['candidate_facts']) == 0:
        return None
    candidate_relations = list(row['candidate_facts'].keys())
    member_scores = get_softmax_relation_score(checkpoint, format_question(row), candidate_relations)
    # Sum together...
    running_totals = row['softmax_ensemble_scores']
    return [total + score for total, score in zip(running_totals, member_scores)]

# Start the ensemble totals from the single-model scores, then add each further
# member's scores on top. Note that the loop rebinds the notebook-level
# `checkpoint` to the last ensemble member.
df['softmax_ensemble_scores'] = df['softmax_scores']
ensemble = [
    '../../experiments/relation_classifier.02_02_13:23:02/223.pt', # 457=seed, 88.26%=dev_accuracy
    '../../results/relation_classifier.02_02_07:59:28/1746.pt', # 123=seed, 88.25%=dev_accuracy
]

for path in ensemble:
    checkpoint = load_checkpoint(path)
    add_member_scores = partial(add_softmax_ensemble_scores, checkpoint=checkpoint)
    df['softmax_ensemble_scores'] = df.progress_apply(add_member_scores, axis=1)
    torch.cuda.empty_cache()

### Relation Model Yu et al.

Load the predictor for the Yu et al. relation model.

In [None]:
import re
import torch
import pprint
from torch.autograd import Variable
from functools import lru_cache
from lib.checkpoint import Checkpoint
from lib.utils import pad_batch

# Checkpoint of the trained Yu et al. relation model.
RELATION_CLASSIFIER = '../../scripts/Simple QA Models/logs/yu_relation_model.01-29_17:00:01/7196.pt'

pretty_printer = pprint.PrettyPrinter(indent=2)

# Load the model, flatten the parameters of its recurrent layers and switch it out
# of training mode.
relation_classifier = Checkpoint(checkpoint_path=RELATION_CLASSIFIER, device=0)
relation_classifier.model.relation_word_rnn.flatten_parameters()
relation_classifier.model.text_rnn_layer_one.flatten_parameters()
relation_classifier.model.text_rnn_layer_two.flatten_parameters()
relation_classifier.model.relation_rnn.flatten_parameters()
relation_classifier.model.train(mode=False)

# Move a value to the GPU when CUDA is available; otherwise return it unchanged.
# The CPU branch must return the argument itself so that the notebook also runs
# without a GPU.
cuda = lambda v: v.cuda() if torch.cuda.is_available() else v
# Stack a list of equally sized tensors, transpose the result in place and wrap it
# in a volatile Variable (presumably yielding a sequence-first batch; confirm
# against the model).
to_variable = lambda b: cuda(Variable(torch.stack(b).t_().contiguous(), volatile=True))
    
def get_relation_scores(question, relations):
    """Score each candidate relation against the question with the Yu et al. model.

    Args:
        question (str): question text; it is encoded once per candidate relation.
        relations (list of str): candidate relation names.

    Returns:
        The `.data` of the model output for the batch of (question, relation) pairs.
    """
    def _encode_and_pad(encoder, texts):
        # `pad_batch` returns a pair; only its first element is used here.
        padded, _ = pad_batch([encoder.encode(text) for text in texts])
        return padded

    question_batch = _encode_and_pad(relation_classifier.text_encoder, [question] * len(relations))
    relation_batch = _encode_and_pad(relation_classifier.relation_encoder, relations)
    relation_word_batch = _encode_and_pad(relation_classifier.relation_word_encoder, relations)

    scores = relation_classifier.model(to_variable(question_batch),
                                       to_variable(relation_batch),
                                       to_variable(relation_word_batch))
    return scores.data

# To test this cell: score two candidate relations, one call each, for a
# hand-written question.
question = 'where was <e> born ?'
print('Question:', question)
for smoke_test_relation in ('people/person/place_of_birth',
                            'location/location/people_born_here'):
    print(get_relation_scores(question, [smoke_test_relation]))

## Final End-To-End Metric

Given the candidate facts, we compute the end-to-end metric. The next two code cells define basic utilities shared by every version below.

In [14]:
from collections import defaultdict
from scripts.utils.simple_qa import load_simple_qa 
from tqdm import tqdm_notebook

df_train, = load_simple_qa(train=True)

total_rows_train = df_train.shape[0]
cursor.execute('SELECT count(*) FROM ' + FB2M_KG_TABLE)
total_rows_fb_two_kg = cursor.fetchone()[0]

# `transform_probability_from_kg_to_train[relation]` is the ratio
#     P(relation in the SimpleQuestions training set) / P(relation in the KG),
# so multiplying a KG-based relation frequency by it re-weights that frequency
# towards the training distribution. Relations never seen in training, or with no
# KG facts, keep the default factor of 0.
transform_probability_from_kg_to_train = defaultdict(int)
relation_counts_train = df_train.relation.value_counts()
# The loop visits one entry per distinct relation, so that count is the progress
# bar total.
for relation, n_rows_train in tqdm_notebook(relation_counts_train.items(),
                                            total=len(relation_counts_train)):
    cursor.execute('SELECT count(*) FROM ' + FB2M_KG_TABLE + ' WHERE relation = %s', (relation, ))
    n_facts_fb2m = cursor.fetchone()[0]
    if n_facts_fb2m == 0:
        # The ratio is undefined without KG facts; skip instead of dividing by zero.
        continue

    relation_probability_kg = n_facts_fb2m / total_rows_fb_two_kg
    relation_probability_train = n_rows_train / total_rows_train

    transform_probability_from_kg_to_train[relation] = relation_probability_train / relation_probability_kg

In [15]:
from scripts.utils.table import format_pipe_table

def evaluate(predicted, n_negative_samples=50):
    """Print end-to-end accuracy metrics for `predicted` against the notebook-level `df`.

    Args:
        predicted: sequence aligned positionally with the rows of `df`. Each item is a
            `(relation, subject_mid, object_mids)` tuple, or `(None, None, None)` when
            no prediction was made.
        n_negative_samples (int): maximum number of wrongly predicted rows to show in
            the diagnostic table.

    Prints subject-and-relation, object, relation and subject accuracy, followed by a
    pipe table of the first `n_negative_samples` rows whose subject-and-relation
    prediction was wrong. Returns None.
    """
    total = df.shape[0]
    subject_and_relation_correct = 0 # Official metric from the Simple Questions dataset
    object_correct = 0
    relation_correct = 0
    subject_correct = 0
    negative_samples = []
    # Use the shared table-name constant, as `generate_facts` does for the KG table.
    alias_query = 'SELECT mid, alias FROM {name} WHERE mid = ANY(%s)'.format(name=FB2M_NAME_TABLE)

    for i, (_, row) in tqdm_notebook(enumerate(df.iterrows()), total=total):
        relation, subject, objects = predicted[i]
        if relation == row['relation'] and subject == row['subject']:
            subject_and_relation_correct += 1
        elif len(negative_samples) < n_negative_samples:
            # Only the samples that will be printed are collected, so no alias
            # query is spent on rows that are never shown.
            if relation and relation in row['candidate_facts']:
                subjects = list(row['candidate_facts'][relation].keys())[:3]
                cursor.execute(alias_query, (subjects,))
                subject_aliases = list(cursor.fetchall())[:3]
            else:
                subjects = None
                subject_aliases = None
            negative_samples.append({
                'Candidate Relations (Max 3)': list(row['candidate_facts'].keys())[:3],
                'Predicate': format_question(row),
                'True Relation': row['relation'],
                'Predicted Subject Name': row['predicted_subject_name'],
                'Predicted Subject Aliases (Max 3 MIDs)': subject_aliases,
                'MIDs (Max 3)': subjects,
            })
        if objects and row['object'] in objects:
            object_correct += 1
        if relation == row['relation']:
            relation_correct += 1
        if subject == row['subject']:
            subject_correct += 1

    print('Subject & Relation Accuracy (SOTA 78.7%%): %f [%d of %d]' %
          (subject_and_relation_correct / total, subject_and_relation_correct, total))
    print('Object Accuracy: %f [%d of %d]' %
              (object_correct / total, object_correct, total))
    print('Relation Accuracy (SOTA 88.4%%): %f [%d of %d]' %
              (relation_correct / total, relation_correct, total))
    print('Subject Accuracy (SOTA 79%%): %f [%d of %d]' %
              (subject_correct / total, subject_correct, total))
    print('Negative Sample:')
    print(format_pipe_table(negative_samples[:n_negative_samples]))

### Version 0: Softmax

For version 0 of softmax, we use the highest-scoring predicted relation with no additional considerations.

In [29]:
from tqdm import tqdm_notebook
import random

# Dedicated fixed-seed generator: Version 0 picks the subject at random, so
# without a seed the reported accuracy changes from run to run. Results are
# reproducible as long as the candidate facts come back in the same order.
rng = random.Random(0)

predicted = []
for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
    if len(row['candidate_facts']) != 0:
        candidate_relations = list(row['candidate_facts'].keys())
        max_score = max(row['softmax_scores'])
        # Relations tied on the top score are all kept; the tie is broken at random.
        predicted_relations = [r for i, r in enumerate(candidate_relations)
                               if row['softmax_scores'][i] == max_score]
        predicted_relation = rng.choice(predicted_relations)
        # No subject heuristic in this version: pick uniformly among the subjects
        # that have the predicted relation.
        subject_mid, object_mids = rng.choice(list(row['candidate_facts'][predicted_relation].items()))
        predicted.append((predicted_relation, subject_mid, object_mids))
    else:
        predicted.append((None, None, None))

evaluate(predicted)

Subject & Relation Accuracy (SOTA 78.7%): 0.767358 [8322 of 10845]
Object Accuracy: 0.818718 [8879 of 10845]
Relation Accuracy (SOTA 88.4%): 0.881051 [9555 of 10845]
Subject Accuracy (SOTA 79%): 0.817704 [8868 of 10845]
Negative Sample:
| Index | Candidate Relations (Max 3) | MIDs (Max 3) | Predicate | Predicted Subject Aliases (Max 3 MIDs) | Predicted Subject Name | True Relation |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | ['symbols/namesake/named_after', 'people/ethnicity/languages_spoken', 'dataworld/gardening_hint/split_to'] | ['04q7gbh'] | name an <e> thoroughbread racehorse | [('04q7gbh', 'american'), ('04q7gbh', 'americans')] | american | biology/organism_classification/organisms_of_this_type |
| 1 | ['music/album/artist', 'music/album/release_type', 'music/album/album_content_type'] | ['0dp76p7', '0f34691'] | who was the artist on the album <e> | [('0dp76p7', 'just you, just me'), ('0f34691', 'just you, just me')] | just you , just me | music/album/artist |
| 2 | ['comm

### Version 1: Softmax & Most Facts

For version 1 of softmax, we use the highest-scoring predicted relation and additionally pick the candidate subject with the most objects associated with it for that relation.

In [30]:
from tqdm import tqdm_notebook
import random

# Dedicated fixed-seed generator so that random tie-breaking between equally
# scored relations is reproducible across runs.
rng = random.Random(0)

predicted = []
for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
    if len(row['candidate_facts']) != 0:
        candidate_relations = list(row['candidate_facts'].keys())
        max_score = max(row['softmax_scores'])
        predicted_relations = [r for i, r in enumerate(candidate_relations)
                               if row['softmax_scores'][i] == max_score]
        predicted_relation = rng.choice(predicted_relations)

        # We use the `Better than random guessing` from notebook
        # `HYPOTHESIS - Question Refers to Multiple Subjects`:
        # take the subject with the most objects for the predicted relation.
        # `max` returns the first maximal item, the same pick as a stable
        # descending sort followed by `[0]`.
        subject_mid, object_mids = max(row['candidate_facts'][predicted_relation].items(),
                                       key=lambda item: len(item[1]))
        predicted.append((predicted_relation, subject_mid, object_mids))
    else:
        predicted.append((None, None, None))

evaluate(predicted)

Subject & Relation Accuracy (SOTA 78.7%): 0.787091 [8536 of 10845]
Object Accuracy: 0.839373 [9103 of 10845]
Relation Accuracy (SOTA 88.4%): 0.881051 [9555 of 10845]
Subject Accuracy (SOTA 79%): 0.837805 [9086 of 10845]
Negative Sample:
| Index | Candidate Relations (Max 3) | MIDs (Max 3) | Predicate | Predicted Subject Aliases (Max 3 MIDs) | Predicted Subject Name | True Relation |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | ['symbols/namesake/named_after', 'people/ethnicity/languages_spoken', 'dataworld/gardening_hint/split_to'] | ['04q7gbh'] | name an <e> thoroughbread racehorse | [('04q7gbh', 'american'), ('04q7gbh', 'americans')] | american | biology/organism_classification/organisms_of_this_type |
| 1 | ['common/topic/notable_types', 'computer/computer_processor/used_in_computers', 'computer/computer_processor/processor_family'] | ['0d0gvn'] | what company made <e> ? | [('0d0gvn', 'intel core 2')] | intel core 2 | computer/computer_processor/manufacturers |
| 2 | ['common/t

### Version 2: Softmax & Most Facts & Entity Relation Distribution

The hypothesis of version 2 is that we can use the distribution of relations for a particular set of facts to our advantage. Particular entities favor some relations more than others; therefore, we use that probability in a product of experts model.

In [31]:
from tqdm import tqdm_notebook
import random

# Dedicated fixed-seed generator so that random tie-breaking is reproducible.
# Ties do occur here: when none of a row's candidate relations has a non-zero
# training factor, every score is 0 and the relation is chosen at random.
rng = random.Random(0)

predicted = []
for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
    if len(row['candidate_facts']) != 0:
        candidate_relations = list(row['candidate_facts'].keys())
        # Number of candidate facts per relation (an unnormalised count; a common
        # normaliser would not change which relation scores highest).
        kg_relation_probability = [sum(len(row['candidate_facts'][r][mid]) for mid in row['candidate_facts'][r])
                                     for r in candidate_relations]
        # Product of the model score, the KG-to-train factor and the fact count.
        scores = [(row['softmax_scores'][i] *
                   transform_probability_from_kg_to_train[r] *
                   kg_relation_probability[i]) for i, r in enumerate(candidate_relations)]
        max_score = max(scores)
        predicted_relations = [r for i, r in enumerate(candidate_relations) if scores[i] == max_score]
        predicted_relation = rng.choice(predicted_relations)

        # We use the `Better than random guessing` from notebook
        # `HYPOTHESIS - Question Refers to Multiple Subjects`:
        # take the subject with the most objects for the predicted relation.
        # `max` returns the first maximal item, the same pick as a stable
        # descending sort followed by `[0]`.
        subject_mid, object_mids = max(row['candidate_facts'][predicted_relation].items(),
                                       key=lambda item: len(item[1]))
        predicted.append((predicted_relation, subject_mid, object_mids))
    else:
        predicted.append((None, None, None))

evaluate(predicted)

Subject & Relation Accuracy (SOTA 78.7%): 0.791886 [8588 of 10845]
Object Accuracy: 0.845367 [9168 of 10845]
Relation Accuracy (SOTA 88.4%): 0.886768 [9617 of 10845]
Subject Accuracy (SOTA 79%): 0.839189 [9101 of 10845]
Negative Sample:
| Index | Candidate Relations (Max 3) | MIDs (Max 3) | Predicate | Predicted Subject Aliases (Max 3 MIDs) | Predicted Subject Name | True Relation |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | ['symbols/namesake/named_after', 'people/ethnicity/languages_spoken', 'dataworld/gardening_hint/split_to'] | ['04q7gbh'] | name an <e> thoroughbread racehorse | [('04q7gbh', 'american'), ('04q7gbh', 'americans')] | american | biology/organism_classification/organisms_of_this_type |
| 1 | ['common/topic/notable_types', 'computer/computer_processor/used_in_computers', 'computer/computer_processor/processor_family'] | ['0d0gvn'] | what company made <e> ? | [('0d0gvn', 'intel core 2')] | intel core 2 | computer/computer_processor/manufacturers |
| 2 | ['common/t

### Version 3: Softmax & Most Facts & Entity Relation Distribution & Ensemble

In version 3, we try to ensemble 3 similarly performing softmax models.

In [32]:
from tqdm import tqdm_notebook
import random

# Dedicated fixed-seed generator so that random tie-breaking is reproducible.
# Ties do occur here: when none of a row's candidate relations has a non-zero
# training factor, every score is 0 and the relation is chosen at random.
rng = random.Random(0)

predicted = []
for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
    if len(row['candidate_facts']) != 0:
        candidate_relations = list(row['candidate_facts'].keys())
        # Number of candidate facts per relation (an unnormalised count; a common
        # normaliser would not change which relation scores highest).
        kg_relation_probability = [sum(len(row['candidate_facts'][r][mid]) for mid in row['candidate_facts'][r])
                                     for r in candidate_relations]
        # Same product as Version 2, but using the summed ensemble scores.
        scores = [(row['softmax_ensemble_scores'][i] *
                   transform_probability_from_kg_to_train[r] *
                   kg_relation_probability[i]) for i, r in enumerate(candidate_relations)]
        max_score = max(scores)
        predicted_relations = [r for i, r in enumerate(candidate_relations) if scores[i] == max_score]
        predicted_relation = rng.choice(predicted_relations)

        # We use the `Better than random guessing` from notebook
        # `HYPOTHESIS - Question Refers to Multiple Subjects`:
        # take the subject with the most objects for the predicted relation.
        # `max` returns the first maximal item, the same pick as a stable
        # descending sort followed by `[0]`.
        subject_mid, object_mids = max(row['candidate_facts'][predicted_relation].items(),
                                       key=lambda item: len(item[1]))
        predicted.append((predicted_relation, subject_mid, object_mids))
    else:
        predicted.append((None, None, None))

evaluate(predicted)

Subject & Relation Accuracy (SOTA 78.7%): 0.790779 [8576 of 10845]
Object Accuracy: 0.844813 [9162 of 10845]
Relation Accuracy (SOTA 88.4%): 0.886030 [9609 of 10845]
Subject Accuracy (SOTA 79%): 0.838543 [9094 of 10845]
Negative Sample:
| Index | Candidate Relations (Max 3) | MIDs (Max 3) | Predicate | Predicted Subject Aliases (Max 3 MIDs) | Predicted Subject Name | True Relation |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | ['symbols/namesake/named_after', 'people/ethnicity/languages_spoken', 'dataworld/gardening_hint/split_to'] | ['04q7gbh'] | name an <e> thoroughbread racehorse | [('04q7gbh', 'american'), ('04q7gbh', 'americans')] | american | biology/organism_classification/organisms_of_this_type |
| 1 | ['common/topic/notable_types', 'computer/computer_processor/used_in_computers', 'computer/computer_processor/processor_family'] | ['0d0gvn'] | what company made <e> ? | [('0d0gvn', 'intel core 2')] | intel core 2 | computer/computer_processor/manufacturers |
| 2 | ['common/t