# Step 1 - Predict Subject Name

In [1]:
import sys
sys.path.insert(0, '../../../allennlp')
sys.path.insert(0, '../../')
from tqdm import tqdm_notebook
from scripts.utils.connect import get_connection 
import random
import numpy as np

# Register tqdm's pandas integration so DataFrames gain `progress_apply`,
# which later cells use for row-wise work with a progress bar.
tqdm_notebook().pandas()
# Open a connection through the project helper and keep a cursor on it; the
# cursor is handed to `edit_distance_link_alias` in the analysis setup cell.
# NOTE(review): presumably a database connection -- confirm in scripts.utils.connect.
connection = get_connection()
cursor = connection.cursor()




In [6]:
# Run the end-to-end pipeline on the development set

from scripts.utils.simple_qa import load_simple_qa 
from sklearn.utils import shuffle

# The trailing-comma unpacking requires `load_simple_qa(dev=True)` to yield
# exactly one item; Python raises ValueError otherwise.
df_dev, = load_simple_qa(dev=True)
# Shuffle with a fixed seed so the row order is reproducible across runs.
df_dev = shuffle(df_dev, random_state=123)
# Preview the first five rows.
df_dev[:5]

Unnamed: 0,subject,relation,object,question
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,Name an American Thoroughbread racehorse
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett


## Load Subject Name Model

In [5]:
import importlib
import scripts.utils.import_notebook
from allennlp.models.archival import load_archive
from allennlp.service.predictors import Predictor

# Load the trained subject-recognition CRF tagger archive.
# NOTE(review): `cuda_device=0` presumably pins the model to the first GPU --
# confirm against the AllenNLP `load_archive` docs before running on a CPU-only host.
archive = load_archive('../../results/0000.01-15_21:34:34.subject_recognition_crf_tagger/model.tar.gz',
                       cuda_device=0)
# Wrap the archive in the 'sentence-tagger' predictor. `archive` itself is also
# read later by `viterbi_tags` for the CRF transitions and the label vocabulary.
predictor = Predictor.from_archive(archive, 'sentence-tagger')

## TEST ##
# Smoke test: tag one question and print the predicted tag for each token.
question = 'what major cities does u.s. route 2 run through ?'
print('Question:', question)
# NOTE(review): the second positional argument (0) is presumably the CUDA device id -- confirm.
print('Predicted Tags:', predictor.predict_json({'sentence': question}, 0)['tags'])

Question: what major cities does u.s. route 2 run through ?
Predicted Tags: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'O', 'O']


## Top K Model Decoder

The best subject name span is not always found in our KG; therefore, we define a top-k Viterbi decoder here. It returns the k highest-scoring tag sequences, which gives us the top-k candidate subject names.

In [7]:
import torch

# FROM: https://gist.github.com/Deepblue129/afaa3613a99a8e7213d2efdd02ae4762 
def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int=5):
    """
    Perform top-k Viterbi decoding in log space over a sequence given a transition
    matrix specifying pairwise (transition) potentials between tags and a matrix of
    shape (sequence_length, num_tags) specifying unary potentials for possible tags
    per timestep.

    Instead of keeping only the single best predecessor per tag, up to ``top_k``
    partial paths ("permutations") are kept per tag at every timestep.

    Parameters
    ----------
    tag_sequence : torch.Tensor, required.
        A tensor of shape (sequence_length, num_tags) representing scores for
        a set of tags over a given sequence.
    transition_matrix : torch.Tensor, required.
        A tensor of shape (num_tags, num_tags) representing the binary potentials
        for transitioning between a given pair of tags; entry ``[i, j]`` is added
        when moving from tag ``i`` to tag ``j``.
    top_k : int, optional (default = 5).
        Integer defining the top number of paths to decode.

    Returns
    -------
    viterbi_paths : List[List[int]]
        Up to ``top_k`` tag-index sequences, ordered from the highest to the lowest
        score. Every element is a plain Python ``int``.
    viterbi_scores : torch.Tensor
        A 1-D tensor holding the score of each returned path, in the same order.
    """
    sequence_length, num_tags = list(tag_sequence.size())

    path_scores = []
    path_indices = []
    # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0)
    # to allow for 1 permutation.
    path_scores.append(tag_sequence[0, :].unsqueeze(0))
    # assert path_scores[0].size() == (n_permutations, num_tags)

    # Evaluate the scores for all possible paths.
    for timestep in range(1, sequence_length):
        # Add pairwise potentials to current scores. The result is indexed as
        # [permutation, previous_tag, next_tag].
        # assert path_scores[timestep - 1].size() == (n_permutations, num_tags)
        summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix
        # Flatten (permutation, previous_tag) into one axis so that row
        # ``permutation * num_tags + previous_tag`` identifies one partial path.
        summed_potentials = summed_potentials.contiguous().view(-1, num_tags)

        # Best pairwise potential path scores from the previous timestep.
        max_k = min(summed_potentials.size()[0], top_k)
        scores, paths = torch.topk(summed_potentials, k=max_k, dim=0)
        # assert scores.size() == (n_permutations, num_tags)
        # assert paths.size() == (n_permutations, num_tags)

        scores = tag_sequence[timestep, :] + scores
        # assert scores.size() == (n_permutations, num_tags)
        path_scores.append(scores)
        # Stored flat so a node id (permutation * num_tags + tag) from the next
        # timestep indexes directly into the back-pointers.
        path_indices.append(paths.contiguous().view(-1))

    # Construct the most likely sequences backwards.
    final_scores = path_scores[-1].contiguous().view(-1)
    max_k = min(final_scores.size()[0], top_k)
    viterbi_scores, best_paths = torch.topk(final_scores, k=max_k, dim=0)
    viterbi_paths = []
    for i in range(max_k):
        # Convert to int so the returned path never contains tensor elements.
        viterbi_path = [int(best_paths[i])]
        for backward_timestep in reversed(path_indices):
            viterbi_path.append(int(backward_timestep[viterbi_path[-1]]))
        # Reverse the backward path.
        viterbi_path.reverse()
        # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo.
        viterbi_path = [j % num_tags for j in viterbi_path]
        viterbi_paths.append(viterbi_path)
    return viterbi_paths, viterbi_scores

## TEST ##
# Three timesteps, four tags; only two transitions carry a non-zero potential.
sequence_logits = torch.FloatTensor([[1, 0, 0, 4], [1, 0, 6, 2], [0, 3, 0, 4]])
transition_matrix = torch.zeros([4, 4])
transition_matrix[0, 0] = 1
transition_matrix[2, 1] = 5
indices, value = viterbi_decode(sequence_logits, transition_matrix)
# Best path is 3 -> 2 -> 1: unary 4 + 6 + 3 plus the 2 -> 1 transition of 5 gives 18.
assert indices[0] == [3, 2, 1]
assert value[0] == 18

In [15]:
from typing import List

import torch
from torch.autograd import Variable

# Originally From:
# https://github.com/allenai/allennlp/blob/master/allennlp/modules/conditional_random_field.py#L162
def viterbi_tags(logits: List[List[float]], mask: List[int], top_k: int) -> tuple:
    """
    Uses the top-k viterbi algorithm to find the most likely tag sequences for the
    given inputs, using the CRF transition parameters of the globally loaded
    ``archive`` model.

    Parameters
    ----------
    logits : List[List[float]], required.
        Per-token tag scores; converted to a tensor of shape (max_seq_length, num_tags).
    mask : List[int], required.
        Its sum is used as the number of tokens to decode (assumed to be 1 for
        real tokens and 0 for padding -- confirm against the predictor output).
    top_k : int, required.
        Maximum number of paths to return.

    Returns
    -------
    viterbi_paths : List[List[str]]
        Up to ``top_k`` label sequences (START / END sentinels removed), best first.
    viterbi_scores : torch.Tensor
        The score of each returned path, in the same order.
    """
    logits = torch.FloatTensor(logits)
    mask = torch.LongTensor(mask)

    max_seq_length, num_tags = logits.size()

    # Augment transitions matrix with start and end transitions
    start_tag = num_tags
    end_tag = num_tags + 1
    transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.)

    crf = archive.model.crf
    transitions[:num_tags, :num_tags] = crf.transitions.data
    transitions[start_tag, :num_tags] = crf.start_transitions.data
    transitions[:num_tags, end_tag] = crf.end_transitions.data

    # Pad the max sequence length by 2 to account for start_tag + end_tag.
    tag_sequence = torch.Tensor(max_seq_length + 2, num_tags + 2)

    # Plain int so it can be used directly in slice bounds and index arithmetic.
    sequence_length = int(torch.sum(mask))

    # Start with everything totally unlikely
    tag_sequence.fill_(-10000.)
    # At timestep 0 we must have the START_TAG
    tag_sequence[0, start_tag] = 0.
    # At steps 1, ..., sequence_length we just use the incoming logits
    tag_sequence[1:(sequence_length + 1), :num_tags] = logits[:sequence_length]
    # And at the last timestep we must have the END_TAG
    tag_sequence[sequence_length + 1, end_tag] = 0.

    # We pass the tags and the transitions to ``viterbi_decode``.
    viterbi_paths, viterbi_scores = viterbi_decode(tag_sequence[:(sequence_length + 2)], transitions, top_k)
    # Get rid of START and END sentinels.
    viterbi_paths = [path[1:-1] for path in viterbi_paths]
    # Translate indexes to labels
    vocab = archive.model.vocab
    viterbi_paths = [
        [vocab.get_token_from_index(i, namespace="labels") for i in path]
        for path in viterbi_paths
    ]
    return viterbi_paths, viterbi_scores

## TEST ##
top_k = 5
# Run the tagger once, then re-decode its logits with the top-k decoder.
predicted = predictor.predict_json({'sentence': 'what major cities does u.s. route 2 run through ?'}, 0)
viterbi_paths, viterbi_scores = viterbi_tags(predicted['logits'], predicted['mask'], top_k)

# The highest-scoring top-k path must match the tags the predictor itself decoded.
assert predicted['tags'] == viterbi_paths[0]

# Print the k decoded paths, best first.
for i in range(top_k):
    print('[Score: %f] Path:' % viterbi_scores[i], viterbi_paths[i])

[Score: 39.895531] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'O', 'O']
[Score: 33.139980] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'O', 'O', 'O', 'O']
[Score: 31.901718] Path: ['O', 'O', 'O', 'O', 'I', 'O', 'I', 'O', 'O', 'O']
[Score: 31.412817] Path: ['O', 'O', 'O', 'O', 'O', 'I', 'I', 'O', 'O', 'O']
[Score: 31.158092] Path: ['O', 'O', 'O', 'O', 'I', 'O', 'O', 'O', 'O', 'O']


## Wrap Up

Put the model and top-k decoder together. Predict the subject name across all the examples in the dataframe.

In [49]:
import math
import re

def predict_subject_name(tokens, top_k=25):
    """
    Predict candidate subject names for a tokenized question.

    The tokens are joined with spaces and tagged by the global ``predictor``; the
    resulting logits are decoded into up to ``top_k`` tag sequences with
    ``viterbi_tags``. Only tag sequences that mark exactly one run of 'I' tags are
    kept, so the result may hold fewer than ``top_k`` entries (possibly none).

    Parameters
    ----------
    tokens : List[str]
        The question tokens.
    top_k : int, optional (default = 25)
        Number of tag sequences to decode before filtering.

    Returns
    -------
    List[dict]
        One dict per kept tag sequence, best score first, with keys ``'name'``
        (the 'I'-tagged tokens joined by spaces), ``'score'`` (float),
        ``'start_index'`` (inclusive) and ``'end_index'`` (exclusive).
    """
    # Predict Tags
    predicted = predictor.predict_json({'sentence': ' '.join(tokens)}, 0)
    viterbi_paths, viterbi_scores = viterbi_tags(predicted['logits'], predicted['mask'], top_k)

    predicted_subject_names = []
    for tags, score in zip(viterbi_paths, viterbi_scores):
        assert len(tags) == len(tokens)
        # Positions where a run of 'I' tags begins.
        span_starts = [i for i, tag in enumerate(tags)
                       if tag == 'I' and (i == 0 or tags[i - 1] == 'O')]
        # Ignore if multiple (or no) subject names are selected
        if len(span_starts) != 1:
            continue
        # Position of the last 'I' of the run; +1 below makes the end exclusive.
        span_last = [i for i, tag in enumerate(tags)
                     if tag == 'I' and (i == len(tags) - 1 or tags[i + 1] == 'O')][0]
        predicted_subject_names.append({
            'name': ' '.join([tokens[i] for i, tag in enumerate(tags) if tag == 'I']),
            # Store a plain float rather than a tensor element.
            'score': float(score),
            'start_index': span_starts[0],
            'end_index': span_last + 1,
        })
    return predicted_subject_names

## TEST ##
# Show the candidate subject names for one pre-tokenized question; the bare
# call on the last line is rendered as the cell output.
print('Sample Output:')
predict_subject_name(['what', 'major', 'cities', 'does', 'u.s.', 'route', '2', 'run', 'through', '?'])

Sample Output:


[{'end_index': 7,
  'name': 'u.s. route 2',
  'score': 39.895530700683594,
  'start_index': 4},
 {'end_index': 6,
  'name': 'u.s. route',
  'score': 33.13998031616211,
  'start_index': 4},
 {'end_index': 7,
  'name': 'route 2',
  'score': 31.412817001342773,
  'start_index': 5},
 {'end_index': 5,
  'name': 'u.s.',
  'score': 31.158092498779297,
  'start_index': 4},
 {'end_index': 7,
  'name': 'does u.s. route 2',
  'score': 30.100318908691406,
  'start_index': 3},
 {'end_index': 7, 'name': '2', 'score': 29.430931091308594, 'start_index': 6},
 {'end_index': 8,
  'name': 'u.s. route 2 run',
  'score': 27.478029251098633,
  'start_index': 4},
 {'end_index': 7,
  'name': 'cities does u.s. route 2',
  'score': 26.6679630279541,
  'start_index': 2},
 {'end_index': 6,
  'name': 'route',
  'score': 24.65726661682129,
  'start_index': 5},
 {'end_index': 6,
  'name': 'does u.s. route',
  'score': 23.34476661682129,
  'start_index': 3},
 {'end_index': 7,
  'name': 'major cities does u.s. route 2'

In [50]:
import pandas as pd
from numpy import nan

preprocess = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").preprocess
tokenize = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").spacy_tokenize

def add_predicted_subject_name(row):
    """
    Tokenize the row's question and attach the predicted subject names.

    Writes two entries on ``row`` -- ``'predicted_subject_names'`` and
    ``'question_tokens'`` -- and returns the same row so the function can be
    used with a row-wise ``apply``.
    """
    tokens = tokenize(preprocess(row['question']))
    row['predicted_subject_names'] = predict_subject_name(tokens)
    row['question_tokens'] = tokens
    return row

# Apply row by row with a progress bar; each row gains `predicted_subject_names`
# and `question_tokens`.
df_dev = df_dev.progress_apply(add_predicted_subject_name, axis=1)
# Preview the first five rows.
df_dev[:5]

Unnamed: 0,end_index,object,predicted_subject_names,question,question_tokens,relation,start_index,subject,subject_name,subject_name_tokens
6219,,0bs56bp,"[{'name': 'american thoroughbread', 'score': 1...",Name an American Thoroughbread racehorse,"[name, an, american, thoroughbread, racehorse]",biology/organism_classification/organisms_of_t...,,03k3r,,
3364,9.0,01sjng,"[{'name': 'vision racing driving simulator', '...",what kind of game is vision racing driving sim...,"[what, kind, of, game, is, vision, racing, dri...",cvg/computer_videogame/cvg_genre,5.0,02qlppc,vision racing driving simulator,"(vision, racing, driving, simulator)"
9374,6.0,0dlmm88,"[{'name': 'romance film', 'score': 28.02931404...",what tv program is romance film,"[what, tv, program, is, romance, film]",tv/tv_genre/programs,4.0,02l7c8,romance film,"(romance, film)"
10142,4.0,04rrx,"[{'name': 'polaski', 'score': 32.1325416564941...",what state is polaski located in,"[what, state, is, polaski, located, in]",location/location/containedby,3.0,049_zj3,polaski,"(polaski,)"
97,8.0,0qcr0,"[{'name': 'fern emmett', 'score': 23.679399490...",what disease claimed the life of fern emmett,"[what, disease, claimed, the, life, of, fern, ...",people/deceased_person/cause_of_death,6.0,02w9ycr,fern emmett,"(fern, emmett)"


## Analysis Setup

Add the True `subject_name` and `start_index` / `end_index` to check the accuracy of our predicted values.

In [38]:
import importlib
from functools import partial

edit_distance_link_alias = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Question Refers to Multiple Subjects").edit_distance_link_alias
normalize = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize

# Create a column with the subject_name linked per example.
# `partial` pre-binds `cursor` and `normalize`, so each row is passed as the
# third positional argument. Later cells skip rows whose value is not a string.
df_dev['subject_name'] = df_dev.progress_apply(partial(edit_distance_link_alias, cursor, normalize), axis=1)

importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Question Refers to Multiple Subjects.ipynb


In [39]:
import importlib
find_subject_name_span = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").find_subject_name_span

# Locate the subject-name span in each question, row by row.
# NOTE(review): presumably this adds the `start_index` / `end_index` columns that the
# span analysis compares against -- confirm in `find_subject_name_span`.
df_dev = df_dev.progress_apply(find_subject_name_span, axis=1)




## Analysis - Correct Span

We determine the correct subject name span and compare it to the predicted subject name.

In [58]:
from tqdm import tqdm_notebook

# accuracies[k - 1] counts the examples judged correct within the top-k
# predictions, for k = 1..10.
accuracies = [0] * 10
# Number of evaluated examples (rows with a string `subject_name`).
total = 0

def is_correct(row, top_k):
    """
    Return True when one of the first ``top_k`` predicted spans matches the
    row's ``start_index`` and ``end_index`` exactly, otherwise False.

    Fewer than ``top_k`` predictions may exist; only the available ones are checked.
    """
    predictions = row['predicted_subject_names']
    n_candidates = min(top_k, len(predictions))
    return any(
        row['start_index'] == predictions[rank]['start_index']
        and row['end_index'] == predictions[rank]['end_index']
        for rank in range(n_candidates))
    

for index, row in tqdm_notebook(df_dev.iterrows(), total=df_dev.shape[0]):
    # Skip examples whose `subject_name` is not a string (no linked name).
    if not isinstance(row['subject_name'], str):
        continue
        
    total += 1
    # is_correct returns a bool, which adds as 0 / 1 to the top-(i + 1) counter.
    accuracies = [count + is_correct(row, i + 1) for i, count in enumerate(accuracies)]

# Report, for each k, the fraction of evaluated examples that were correct.
for i, count in enumerate(accuracies):
    print('Accuracy Top %d: %f [%d of %d]' % (i + 1, count / total, count, total))

Accuracy Top 1: 0.942618 [10037 of 10648]
Accuracy Top 2: 0.973892 [10370 of 10648]
Accuracy Top 3: 0.982438 [10461 of 10648]
Accuracy Top 4: 0.988449 [10525 of 10648]
Accuracy Top 5: 0.991736 [10560 of 10648]
Accuracy Top 6: 0.994083 [10585 of 10648]
Accuracy Top 7: 0.995586 [10601 of 10648]
Accuracy Top 8: 0.996056 [10606 of 10648]
Accuracy Top 9: 0.996901 [10615 of 10648]
Accuracy Top 10: 0.997558 [10622 of 10648]


## Analysis - Normalized Link

We normalize the correct subject name and compare it to the predicted name normalized.

We expect this to be lower than the span analysis because in "HYPOTHESIS - Subject Name not in Question", we found that 97.85% of the time the subject name normalized is not in the question; therefore, some spans that are correct will not be equal to the subject name normalized.

In [12]:
import importlib

normalize = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize
tokenize = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").spacy_tokenize

In [61]:
from tqdm import tqdm_notebook
from scripts.utils.table import format_pipe_table

# Examples whose top-1 prediction is wrong, collected for manual inspection.
negative_sample = []
# accuracies[k - 1] counts the examples judged correct within the top-k
# predictions, for k = 1..10.
accuracies = [0] * 10
# Number of evaluated examples (rows with a string `subject_name`).
total = 0

def is_correct(row, top_k):
    """
    Return True when one of the first ``top_k`` predicted names, once normalized,
    equals the row's tokenized-and-normalized ``subject_name``; otherwise False.

    Fewer than ``top_k`` predictions may exist; only the available ones are checked.
    """
    target = normalize(' '.join(tokenize(row['subject_name'])))
    candidates = row['predicted_subject_names']
    n_candidates = min(top_k, len(candidates))
    return any(normalize(candidates[rank]['name']) == target
               for rank in range(n_candidates))

for index, row in tqdm_notebook(df_dev.iterrows(), total=df_dev.shape[0]):
    # Skip examples whose `subject_name` is not a string (no linked name).
    if not isinstance(row['subject_name'], str):
        continue

    total += 1
    # Evaluate every k once; the top-1 result is reused for the negative sample.
    hits = [is_correct(row, i + 1) for i in range(len(accuracies))]
    accuracies = [count + hit for count, hit in zip(accuracies, hits)]
    if not hits[0]:
        negative_sample.append({
            'Subject Name': row['subject_name'],
            # Slice rather than index 0..4: a row can hold fewer than five
            # predictions, and indexing past the end would raise IndexError.
            'Predicted Subject Name': [prediction['name'] for prediction in
                                       row['predicted_subject_names'][:5]],
        })

# Report, for each k, the fraction of evaluated examples that were correct.
for i, count in enumerate(accuracies):
    print('Accuracy Top %d: %f [%d of %d]' % (i + 1, count / total, count, total))
print('Negative Sample:\n')
# Show at most the first 50 top-1 misses.
print(format_pipe_table(negative_sample[:50]))

Accuracy Top 1: 0.935105 [9957 of 10648]
Accuracy Top 2: 0.963749 [10262 of 10648]
Accuracy Top 3: 0.971638 [10346 of 10648]
Accuracy Top 4: 0.977273 [10406 of 10648]
Accuracy Top 5: 0.980372 [10439 of 10648]
Accuracy Top 6: 0.982532 [10462 of 10648]
Accuracy Top 7: 0.983847 [10476 of 10648]
Accuracy Top 8: 0.984316 [10481 of 10648]
Accuracy Top 9: 0.984974 [10488 of 10648]
Accuracy Top 10: 0.985631 [10495 of 10648]
Negative Sample:

| Index | Predicted Subject Name | Subject Name |
| --- | --- | --- |
| 0 | ['documentary film', 'documentary', 'short documentary film', 'short documentary', 'documentary film released'] | short |
| 1 | ['krgy station', 'krgy', 'station', 'does krgy station', 'krgy station play'] | krgy |
| 2 | ['austrailian screenwriter', 'screenwriter', 'austrailian', 'famous austrailian screenwriter', 'austrailian screenwriter ?'] | screenwriter |
| 3 | ['the water in the canche', 'water in the canche', 'in the canche', 'the canche', 'canche'] | canche |
| 4 | ['the re

## Save Data

In [63]:
# Persist the dataframe, including predictions and analysis columns, to a pickle
# in the current working directory.
# NOTE(review): presumably consumed by the next pipeline step -- confirm.
df_dev.to_pickle('step_1_predict_subject_name.pkl')