# Relation Ranking Yu et al. 2017 Model Ensemble Evaluation

Here we evaluate several ensemble strategies, using the same ordering as Step 3 in our end-to-end process.

In [1]:
# Record the interpreter version next to the results produced by this notebook.
import sys
print('Python Version:', sys.version)
import pandas as pd
import logging
# Put the directory two levels up at the front of the module search path so the
# `lib` package imported below resolves (presumably the repository root - confirm).
sys.path.insert(0, '../../')

# NOTE(review): `setup_training` is not referenced by any cell visible in this notebook.
from lib.utils import setup_training
# Send INFO-and-above log records to stdout, prefixed with timestamp and level,
# so they appear in the cell output.
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)


Python Version: 3.6.4 (default, Dec 19 2017, 17:29:45) 
[GCC 5.4.0 20160609]


In [16]:
import pandas as pd

# Load the dev split. The file has no header row (header=None), so the four
# columns are named explicitly here.
df = pd.read_table(
    '../../data/relation_ranking/dev.txt',
    header=None,
    names=['True Relation', 'Relation Pool', 'Question', 'Entity'])

# Checkpoint files of the models that form the ensemble. The trailing numbers are
# the author's notes - presumably each model's accuracy recorded at training time
# (TODO confirm; they differ slightly from the accuracies printed by the scoring cell).
ensemble = [
    'logs/6885.01-29_15:03:41.yu_relation_model/01m_29d_16h_43m_01s.pt', # 88.39
    'logs/6223.01-29_13:18:52.yu_relation_model/01m_29d_15h_00m_22s.pt', # 88.2
    'logs/yu_relation_model.01-29_17:00:01/7196.pt', # 88.4
    'logs/yu_relation_model.01-29_17:00:01/1308.pt' # 0.870018
]
print('Ensemble:', ensemble)
# Display the first five rows as a sanity check on the parsed columns.
df[:5]

Ensemble: ['logs/6885.01-29_15:03:41.yu_relation_model/01m_29d_16h_43m_01s.pt', 'logs/6223.01-29_13:18:52.yu_relation_model/01m_29d_15h_00m_22s.pt', 'logs/yu_relation_model.01-29_17:00:01/7196.pt', 'logs/yu_relation_model.01-29_17:00:01/1308.pt']


Unnamed: 0,True Relation,Relation Pool,Question,Entity
0,biology/organism_classification/organisms_of_t...,dining/cuisine/ingredients dataworld/gardening...,name an <e> thoroughbread racehorse,american
1,cvg/computer_videogame/cvg_genre,cvg/computer_videogame/cvg_genre cvg/computer_...,what kind of game is <e> ?,vision racing driving simulator
2,tv/tv_genre/programs,media_common/media_genre/child_genres film/fil...,what tv program is <e>,romance film
3,location/location/containedby,location/location/containedby,what state is <e> located in,polaski
4,people/deceased_person/cause_of_death,people/deceased_person/cause_of_death people/p...,what disease claimed the life of <e>,fern emmett


In [17]:
import torch
from torch.autograd import Variable
from lib.utils import pad_batch
import re

def cuda(t):
    """Return `t` moved to CUDA device 0 when CUDA is available, else `t` itself."""
    if not torch.cuda.is_available():
        return t
    return t.cuda(device=0)


def to_variable(b):
    """Turn a sequence of same-sized tensors into one inference-only batch.

    The tensors in `b` are stacked, the stacked result is transposed in place
    (so the stacking dimension becomes the second one), made contiguous, wrapped
    in a `Variable` created with `volatile=True`, and finally passed through
    `cuda`.
    """
    stacked = torch.stack(b)
    transposed = stacked.t_().contiguous()
    return cuda(Variable(transposed, volatile=True))

def get_relation_scores(checkpoint, question, relations):
    """Score every candidate relation against one question with a single model.

    Args:
        checkpoint: Loaded checkpoint exposing `text_encoder`, `relation_encoder`
            and `relation_word_encoder` (each with an `encode` method) plus a
            callable `model`.
        question (str): Question text; it is paired with every candidate relation.
        relations: Iterable of candidate relation strings (list, set, ...).

    Returns:
        list: The model output converted with `.tolist()`; expected to hold one
        score per relation, in the iteration order of `relations`. An empty list
        is returned when there are no candidates.
    """
    # Materialize once: the candidates are traversed several times below and the
    # returned scores are positional, so every pass must see the same order.
    relations = list(relations)
    if not relations:
        # Nothing to rank; avoid pushing an empty batch through padding/stacking.
        return []

    # The model consumes aligned batches, so the question is repeated per relation.
    questions = [question] * len(relations)
    questions_encoded, _ = pad_batch([checkpoint.text_encoder.encode(q) for q in questions])
    relations_encoded, _ = pad_batch([checkpoint.relation_encoder.encode(r) for r in relations])
    relations_word_encoded, _ = pad_batch(
        [checkpoint.relation_word_encoder.encode(r) for r in relations])

    output = checkpoint.model(
        to_variable(questions_encoded),
        to_variable(relations_encoded),
        to_variable(relations_word_encoded))
    # Copy to host memory as plain Python values; the batch tensors are released
    # automatically when the locals go out of scope on return.
    return output.data.cpu().tolist()

In [18]:
from lib.checkpoint import Checkpoint

def load_checkpoint_path(path):
    """Load the checkpoint stored at `path` and prepare its model for scoring.

    The model is passed through `cuda`, after which `flatten_parameters()` is
    called on its four recurrent sub-modules (presumably so their weights are
    laid out contiguously after the move - TODO confirm).

    Returns:
        The loaded `Checkpoint` object.
    """
    loaded = Checkpoint(path)
    cuda(loaded.model)
    rnn_attributes = (
        'relation_word_rnn',
        'text_rnn_layer_one',
        'text_rnn_layer_two',
        'relation_rnn',
    )
    for attribute in rnn_attributes:
        getattr(loaded.model, attribute).flatten_parameters()
    return loaded

## Score Relations Given Question 

For every question, save the score that each model in the ensemble assigns to each candidate relation.

In [19]:
from tqdm import tqdm_notebook
import pandas as pd
import random

# scores[q] gathers one score list per ensemble model for question q. Each score
# list follows the iteration order of that question's candidate-relation set, so
# later cells must rebuild the set the same way to index into it.
num_questions = df.shape[0]
scores = [[] for _ in range(num_questions)]

for model_path in ensemble:
    print('Model Path', model_path)
    checkpoint = load_checkpoint_path(model_path)

    relation_correct = 0
    progress = tqdm_notebook(enumerate(df.iterrows()), total=num_questions)
    for i, (_, row) in progress:
        candidate_relations = set(row['Relation Pool'].split())
        question = row['Question'].strip()
        model_scores = get_relation_scores(checkpoint, question, candidate_relations)
        scores[i].append(model_scores)

        # Single-model prediction: highest-scoring candidate, ties broken at random.
        best_score = max(model_scores)
        tied_relations = [relation
                          for relation, score in zip(candidate_relations, model_scores)
                          if score == best_score]
        if random.choice(tied_relations) == row['True Relation']:
            relation_correct += 1

    print('Relation Accuracy: %f [%d of %d]' % (relation_correct / num_questions,
                                                relation_correct, num_questions))
    # Drop the model reference and release cached GPU memory before the next load.
    checkpoint = None
    torch.cuda.empty_cache()
    print('------------------------------------------------------------------------------')

Model Path logs/6885.01-29_15:03:41.yu_relation_model/01m_29d_16h_43m_01s.pt
2018-01-29 22:40:11,728 | INFO : Loading checkpoints from logs/6885.01-29_15:03:41.yu_relation_model/01m_29d_16h_43m_01s.pt onto device 0



[logs/6885.01-29_15:03:41.yu_relation_model/01m_29d_16h_43m_01s.pt] Relation Accuracy: 0.876107 [9497 of 10840]
Model Path logs/6223.01-29_13:18:52.yu_relation_model/01m_29d_15h_00m_22s.pt
2018-01-29 22:41:02,995 | INFO : Loading checkpoints from logs/6223.01-29_13:18:52.yu_relation_model/01m_29d_15h_00m_22s.pt onto device 0



[logs/6223.01-29_13:18:52.yu_relation_model/01m_29d_15h_00m_22s.pt] Relation Accuracy: 0.880351 [9543 of 10840]
Model Path logs/yu_relation_model.01-29_17:00:01/7196.pt
2018-01-29 22:41:53,928 | INFO : Loading checkpoints from logs/yu_relation_model.01-29_17:00:01/7196.pt onto device 0



[logs/yu_relation_model.01-29_17:00:01/7196.pt] Relation Accuracy: 0.881458 [9555 of 10840]
Model Path logs/yu_relation_model.01-29_17:00:01/1308.pt
2018-01-29 22:42:48,277 | INFO : Loading checkpoints from logs/yu_relation_model.01-29_17:00:01/1308.pt onto device 0



[logs/yu_relation_model.01-29_17:00:01/1308.pt] Relation Accuracy: 0.870756 [9439 of 10840]


## Ensemble Strategies

Test different strategies for combining the models. 

### Ensemble Version: Sum Scores

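For each candidate relation, sum the scores given by all models and predict the relation with the highest total (ties are broken at random).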

In [20]:
# Ensemble Relation Accuracy: 0.883303 [9575 of 10840]

from tqdm import tqdm_notebook

relation_correct = 0

for i, (_, row) in tqdm_notebook(enumerate(df.iterrows()), total=df.shape[0]):
    # Per-candidate total across models: zip(*...) regroups the per-model score
    # lists by candidate position, and sum() adds them in model order.
    sum_scores = [sum(per_candidate) for per_candidate in zip(*scores[i])]

    # Rebuilt exactly as in the scoring cell so positions line up with the scores.
    candidate_relations = set(row['Relation Pool'].split())
    best_total = max(sum_scores)
    top_relations = [relation
                     for relation, total in zip(candidate_relations, sum_scores)
                     if total == best_total]

    # Ties are broken at random (`random` is imported by the scoring cell).
    if random.choice(top_relations) == row['True Relation']:
        relation_correct += 1

print('Ensemble Relation Accuracy: %f [%d of %d]' % (relation_correct / df.shape[0],
                                                     relation_correct, df.shape[0]))


Ensemble Relation Accuracy: 0.883303 [9575 of 10840]


### Ensemble Version: Sum Rank

Figure out how each model ranked each relation (rank 0 is the best score), sum the ranks across models, and predict the relation with the lowest rank sum.

In [21]:
from tqdm import tqdm_notebook

relation_correct = 0

for i, (_, row) in tqdm_notebook(enumerate(df.iterrows()), total=df.shape[0]):
    candidate_relations = set(row['Relation Pool'].split())
    sum_scores = [0] * len(scores[i][0])
    for model_scores in scores[i]:
        # Candidate positions ordered from best to worst score. The sort is
        # stable, so equal scores keep their positional order (earlier position
        # receives the better rank).
        best_first = sorted(range(len(model_scores)),
                            key=lambda position: model_scores[position],
                            reverse=True)
        # Rank 0 is the model's favourite; accumulate each candidate's rank.
        for rank, position in enumerate(best_first):
            sum_scores[position] += rank

    # The lowest rank sum wins; ties are broken at random.
    lowest_total = min(sum_scores)
    top_relations = [relation
                     for relation, total in zip(candidate_relations, sum_scores)
                     if total == lowest_total]

    if random.choice(top_relations) == row['True Relation']:
        relation_correct += 1

print('Ensemble Relation Accuracy: %f [%d of %d]' % (relation_correct / df.shape[0],
                                                     relation_correct, df.shape[0]))


Ensemble Relation Accuracy: 0.880812 [9548 of 10840]


### Ensemble Version: Sum Scores w/ Normalize Minimum

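Sum the scores after shifting each model's scores so that its minimum equals -1. The distance computed by the model is a cosine distance; therefore, if we normalize the minimum we should be invariant to rotations. Note: the shift subtracts the same per-model constant from every candidate, so it moves all summed scores by the same amount and (up to floating-point rounding) cannot change which relation has the highest total. This is why the accuracy below matches plain score summing.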

In [23]:
# Ensemble Relation Accuracy: 0.883303 [9575 of 10840]

from tqdm import tqdm_notebook

relation_correct = 0

for i, (_, row) in tqdm_notebook(enumerate(df.iterrows()), total=df.shape[0]):
    shifted_per_model = []
    for model_scores in scores[i]:
        min_score = min(model_scores)
        # The shift below assumes no score lies under -1.
        assert min_score >= -1
        # Amount by which this model's minimum sits above -1.
        offset = min_score + 1
        shifted_per_model.append([score - offset for score in model_scores])

    # Per-candidate total of the shifted scores, added in model order.
    sum_scores = [sum(per_candidate) for per_candidate in zip(*shifted_per_model)]

    candidate_relations = set(row['Relation Pool'].split())
    best_total = max(sum_scores)
    top_relations = [relation
                     for relation, total in zip(candidate_relations, sum_scores)
                     if total == best_total]

    # Ties are broken at random.
    if random.choice(top_relations) == row['True Relation']:
        relation_correct += 1

print('Ensemble Relation Accuracy: %f [%d of %d]' % (relation_correct / df.shape[0],
                                                     relation_correct, df.shape[0]))


Ensemble Relation Accuracy: 0.883303 [9575 of 10840]


### Ensemble Version: Top Vote

Each model votes for its top-scoring relation; the relation with the most votes is predicted (ties are broken at random).

In [24]:
from tqdm import tqdm_notebook
import random

relation_correct = 0

for i, (_, row) in tqdm_notebook(enumerate(df.iterrows()), total=df.shape[0]):
    candidate_relations = set(row['Relation Pool'].split())
    votes = [0] * len(scores[i][0])
    for model_scores in scores[i]:
        # Each model casts one vote for the first position holding its top score.
        favourite = max(range(len(candidate_relations)),
                        key=lambda position: model_scores[position])
        votes[favourite] += 1

    # The relation with the most votes wins; ties are broken at random.
    most_votes = max(votes)
    top_relations = [relation
                     for relation, count in zip(candidate_relations, votes)
                     if count == most_votes]

    if random.choice(top_relations) == row['True Relation']:
        relation_correct += 1

print('Ensemble Relation Accuracy: %f [%d of %d]' % (relation_correct / df.shape[0],
                                                     relation_correct, df.shape[0]))


Ensemble Relation Accuracy: 0.880904 [9549 of 10840]
