# Step 1 - Predict Subject Name

In [1]:
import sys
sys.path.insert(0, '../../../allennlp')
sys.path.insert(0, '../../')
from tqdm import tqdm_notebook
from lib.connect import get_connection 
import random
import numpy as np

# Register tqdm's pandas integration so the `progress_apply` calls in later cells are available.
tqdm_notebook().pandas()
# Connection obtained from the project helper `lib.connect.get_connection`; the cursor is
# passed to the subject-name linking step further down in this notebook.
connection = get_connection()
cursor = connection.cursor()




In [2]:
# Run the end-to-end pipeline on the development set

from lib.simple_qa import load_simple_qa 
from sklearn.utils import shuffle

# `load_simple_qa` returns a one-element sequence here, hence the single-target unpacking.
# NOTE(review): the loader is called with `test=True` although the frame is named `df_dev`
# and the cell comment mentions the development set -- confirm which split is loaded.
df_dev, = load_simple_qa(test=True)
# Fixed `random_state` keeps the row order reproducible across runs.
df_dev = shuffle(df_dev, random_state=123)
# Preview the first five rows.
df_dev[:5]

Unnamed: 0,subject,relation,object,question
17188,02_286,location/place_with_neighborhoods/neighborhoods,075s73,which town is in new york city
4793,01bg1k,baseball/baseball_player/position_s,017drs,does pee wee reese play shortstop or power for...
21187,06tw28,music/artist/track,0vp3fq,What is a track by lutricia mcneal?
18730,0slws_1,music/release_track/recording,0wzyx1,Name a recording by nelson mandela
10014,01m1y,media_common/netflix_genre/titles,0crryw4,Name a film in the netflix genre celtic music.


## Load Subject Name Model

In [3]:
import importlib
import lib.import_notebook
from allennlp.models.archival import load_archive
from allennlp.service.predictors import Predictor
        
# Load the pretrained subject-recognition archive and wrap it in a 'sentence-tagger' predictor.
# NOTE(review): `cuda_device=0` presumably requires a GPU to be present -- confirm before
# running on a CPU-only machine.
ARCHIVE = load_archive('../../pretrained_models/subject_recognition_grid_search_2.02_11_20:56:18/model.tar.gz',
                       cuda_device=0)
PREDICTOR = Predictor.from_archive(ARCHIVE, 'sentence-tagger')

## TEST ##
# Smoke test: tag a single pre-tokenized (space separated) question and print the tags.
question = 'what major cities does u.s. route 2 run through ?'
print('Question:', question)
# The second positional argument (0) is presumably the CUDA device id -- TODO confirm
# against the AllenNLP version in use.
print('Predicted Tags:', PREDICTOR.predict_json({'sentence': question}, 0)['tags'])

Question: what major cities does u.s. route 2 run through ?
Predicted Tags: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'O', 'O']


## Top K Model Decoder

The best subject name span is not always found in our KG; therefore, we define a top-k Viterbi decoder here. This allows us to retrieve the top k candidate subject names.

In [4]:
import torch

# FROM: https://gist.github.com/Deepblue129/afaa3613a99a8e7213d2efdd02ae4762 
def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int=5):
    """
    Perform top-k Viterbi decoding in log space over a sequence given a transition matrix
    specifying pairwise (transition) potentials between tags and a matrix of shape
    (sequence_length, num_tags) specifying unary potentials for possible tags per
    timestep.
    Parameters
    ----------
    tag_sequence : torch.Tensor, required.
        A tensor of shape (sequence_length, num_tags) representing scores for
        a set of tags over a given sequence.
    transition_matrix : torch.Tensor, required.
        A tensor of shape (num_tags, num_tags) representing the binary potentials
        for transitioning between a given pair of tags.
    top_k : int, optional (default = 5).
        Maximum number of paths to decode. Fewer paths are returned when fewer
        candidates exist.
    Returns
    -------
    viterbi_paths : List[List[int]]
        Up to ``top_k`` tag-index sequences (plain Python ints), best first.
    viterbi_scores : torch.Tensor
        1-D tensor with the score of each returned path, in the same order.
    """
    sequence_length, num_tags = list(tag_sequence.size())

    path_scores = []
    path_indices = []
    # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0)
    # to allow for 1 permutation.
    path_scores.append(tag_sequence[0, :].unsqueeze(0))
    # assert path_scores[0].size() == (n_permutations, num_tags)

    # Evaluate the scores for all possible paths.
    for timestep in range(1, sequence_length):
        # Add pairwise potentials to current scores.
        # assert path_scores[timestep - 1].size() == (n_permutations, num_tags)
        summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix
        # Row index now encodes (permutation * num_tags + previous_tag).
        summed_potentials = summed_potentials.view(-1, num_tags)

        # Keep the k best (permutation, previous_tag) candidates for every current tag.
        max_k = min(summed_potentials.size()[0], top_k)
        scores, paths = torch.topk(summed_potentials, k=max_k, dim=0)
        # assert scores.size() == (n_permutations, num_tags)
        # assert paths.size() == (n_permutations, num_tags)

        scores = tag_sequence[timestep, :] + scores
        # assert scores.size() == (n_permutations, num_tags)
        path_scores.append(scores)
        # Store back-pointers flattened so that a node id (permutation * num_tags + tag)
        # from the following timestep indexes them directly.
        path_indices.append(paths.contiguous().view(-1))

    # Construct the most likely sequences backwards.
    final_scores = path_scores[-1].view(-1)
    max_k = min(final_scores.size()[0], top_k)
    viterbi_scores, best_paths = torch.topk(final_scores, k=max_k, dim=0)
    viterbi_paths = []
    for i in range(max_k):
        # Convert to a Python int so every element of the returned path is an int
        # (indexing a tensor may otherwise yield a 0-dim tensor).
        viterbi_path = [int(best_paths[i])]
        for backward_timestep in reversed(path_indices):
            viterbi_path.append(int(backward_timestep[viterbi_path[-1]]))
        # Reverse the backward path.
        viterbi_path.reverse()
        # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo.
        viterbi_path = [j % num_tags for j in viterbi_path]
        viterbi_paths.append(viterbi_path)
    return viterbi_paths, viterbi_scores

## TEST ##
# Three timesteps over four tags; the only non-zero transitions reward 0 -> 0 (+1) and 2 -> 1 (+5).
sequence_logits = torch.FloatTensor([[1, 0, 0, 4], [1, 0, 6, 2], [0, 3, 0, 4]])
transition_matrix = torch.zeros([4, 4])
transition_matrix[0, 0] = 1
transition_matrix[2, 1] = 5
indices, value = viterbi_decode(sequence_logits, transition_matrix)
# Best path 3 -> 2 -> 1: unary 4 + 6 + 3, plus the 2 -> 1 transition bonus of 5, gives 18.
assert indices[0] == [3, 2, 1]
assert value[0] == 18

In [5]:
from typing import List

import torch
from torch.autograd import Variable

# Originally From:
# https://github.com/allenai/allennlp/blob/master/allennlp/modules/conditional_random_field.py#L162
def viterbi_tags(logits: List[List[float]], mask: List[int], top_k: int):
    """
    Decode the ``top_k`` best label sequences for one sentence using the CRF
    transitions stored on ``ARCHIVE.model.crf``.

    Parameters
    ----------
    logits : List[List[float]]
        Per-token tag scores of shape (max_seq_length, num_tags).
    mask : List[int]
        Token mask; its sum is used as the number of real tokens, and the first
        ``sum(mask)`` rows of ``logits`` are decoded.
        NOTE(review): this assumes all padding sits at the end -- confirm with the predictor.
    top_k : int
        Maximum number of paths to request from ``viterbi_decode``.

    Returns
    -------
    viterbi_paths : List[List[str]]
        Label sequences (looked up in the model vocabulary, namespace "labels"),
        best first, excluding any path that passes through the synthetic START/END tags.
    viterbi_scores : torch.Tensor
        Scores aligned one-to-one with ``viterbi_paths``.
    """
    logits = torch.FloatTensor(logits)
    mask = torch.LongTensor(mask)

    max_seq_length, num_tags = logits.size()

    # Augment transitions matrix with start and end transitions
    start_tag = num_tags
    end_tag = num_tags + 1
    transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.)

    transitions[:num_tags, :num_tags] = ARCHIVE.model.crf.transitions.data
    transitions[start_tag, :num_tags] = ARCHIVE.model.crf.start_transitions.data
    transitions[:num_tags, end_tag] = ARCHIVE.model.crf.end_transitions.data

    # Pad the max sequence length by 2 to account for start_tag + end_tag.
    tag_sequence = torch.Tensor(max_seq_length + 2, num_tags + 2)

    # Plain int so it can be used for slicing regardless of the torch version.
    sequence_length = int(torch.sum(mask))

    # Start with everything totally unlikely
    tag_sequence.fill_(-10000.)
    # At timestep 0 we must have the START_TAG
    tag_sequence[0, start_tag] = 0.
    # At steps 1, ..., sequence_length we just use the incoming logits
    tag_sequence[1:(sequence_length + 1), :num_tags] = logits[:sequence_length]
    # And at the last timestep we must have the END_TAG
    tag_sequence[sequence_length + 1, end_tag] = 0.

    # We pass the tags and the transitions to ``viterbi_decode``.
    viterbi_paths, viterbi_scores = viterbi_decode(tag_sequence[:(sequence_length + 2)], transitions, top_k)
    # Get rid of START and END sentinels; coerce to int so vocabulary lookups work
    # even when the decoder hands back tensor elements.
    viterbi_paths = [[int(tag) for tag in path[1:-1]] for path in viterbi_paths]
    # Drop paths that use the synthetic START/END tags in the middle of the sentence.
    # Use the computed sentinel indices (not literals) so this holds for any label set.
    kept = [i for i, path in enumerate(viterbi_paths)
            if start_tag not in path and end_tag not in path]
    if len(kept) != len(viterbi_paths):
        # Filter the scores with the same indices so paths and scores stay aligned.
        viterbi_scores = viterbi_scores[torch.LongTensor(kept)]
        viterbi_paths = [viterbi_paths[i] for i in kept]
    # Translate indexes to labels
    viterbi_paths = [
        [ARCHIVE.model.vocab.get_token_from_index(i, namespace="labels")
         for i in paths] for paths in viterbi_paths
    ]
    return viterbi_paths, viterbi_scores

## TEST ##
# Decode the five best tag sequences for the sample question.
top_k = 5
predicted = PREDICTOR.predict_json({'sentence': 'what major cities does u.s. route 2 run through ?'}, 0)
viterbi_paths, viterbi_scores = viterbi_tags(predicted['logits'], predicted['mask'], top_k)

# The best top-k path should be identical to the tags the predictor itself returns.
assert predicted['tags'] == viterbi_paths[0]

# NOTE(review): this indexes `top_k` entries; it assumes `viterbi_tags` returned at least
# that many paths.
for i in range(top_k):
    print('[Score: %f] Path:' % viterbi_scores[i], viterbi_paths[i])

[Score: 101.382729] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'O', 'O']
[Score: 83.410507] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'I', 'O']
[Score: 82.760178] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'O', 'O', 'O', 'O']
[Score: 82.228653] Path: ['O', 'O', 'O', 'O', 'I', 'O', 'I', 'O', 'O', 'O']
[Score: 81.733086] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'O', 'I']


## Wrap Up

Put the model and top-k decoder together. Predict the subject name across all the examples in the dataframe.

In [6]:
import math
import re

def predict_subject_name(tokens, top_k=500):
    """
    Predict candidate subject-name spans for a tokenized question.

    The tokens are joined with spaces, tagged by ``PREDICTOR``, and the ``top_k``
    best tag sequences are decoded with ``viterbi_tags``. Only sequences that
    contain exactly one contiguous run of 'I' tags are kept.

    Parameters
    ----------
    tokens : list of str
        Question tokens; the tagger must return one tag per token (asserted).
    top_k : int, optional (default = 500)
        Number of tag sequences to decode.

    Returns
    -------
    list of dict
        One dict per kept sequence, in decoder order, with keys ``name`` (the
        span tokens joined by spaces), ``score`` (float), ``start_index``
        (inclusive) and ``end_index`` (exclusive).
    """
    # Predict Tags
    predicted = PREDICTOR.predict_json({'sentence': ' '.join(tokens)}, 0)
    viterbi_paths, viterbi_scores = viterbi_tags(predicted['logits'], predicted['mask'], top_k)

    predicted_subject_names = []
    for tags, score in zip(viterbi_paths, viterbi_scores):
        assert len(tags) == len(tokens)
        last_index = len(tags) - 1
        # A span starts at an 'I' that is first in the sentence or preceded by 'O'.
        span_starts = [i for i, tag in enumerate(tags)
                       if tag == 'I' and (i == 0 or tags[i - 1] == 'O')]
        # Ignore if no subject name or multiple subject names are selected
        if len(span_starts) != 1:
            continue
        # A span ends at an 'I' that is last in the sentence or followed by 'O'.
        span_ends = [i for i, tag in enumerate(tags)
                     if tag == 'I' and (i == last_index or tags[i + 1] == 'O')]
        predicted_subject_names.append({
            'name': ' '.join(token for token, tag in zip(tokens, tags) if tag == 'I'),
            # Store a plain float: iterating a score tensor may yield tensor elements.
            'score': float(score),
            'start_index': span_starts[0],
            'end_index': span_ends[0] + 1,
        })
    return predicted_subject_names

## TEST ##
# Show the candidate spans produced for the sample question (cell output is the returned list).
print('Sample Output:')
predict_subject_name(['what', 'major', 'cities', 'does', 'u.s.', 'route', '2', 'run', 'through', '?'])

Sample Output:


[{'end_index': 7,
  'name': 'u.s. route 2',
  'score': 101.38272857666016,
  'start_index': 4},
 {'end_index': 6,
  'name': 'u.s. route',
  'score': 82.76017761230469,
  'start_index': 4},
 {'end_index': 7,
  'name': 'does u.s. route 2',
  'score': 80.73089599609375,
  'start_index': 3},
 {'end_index': 7,
  'name': 'route 2',
  'score': 80.72879791259766,
  'start_index': 5},
 {'end_index': 8,
  'name': 'u.s. route 2 run',
  'score': 79.83016967773438,
  'start_index': 4},
 {'end_index': 5,
  'name': 'u.s.',
  'score': 65.86043548583984,
  'start_index': 4},
 {'end_index': 9,
  'name': 'u.s. route 2 run through',
  'score': 64.1122817993164,
  'start_index': 4},
 {'end_index': 7, 'name': '2', 'score': 63.829063415527344, 'start_index': 6},
 {'end_index': 6,
  'name': 'does u.s. route',
  'score': 62.10835266113281,
  'start_index': 3},
 {'end_index': 6,
  'name': 'route',
  'score': 62.10625457763672,
  'start_index': 5},
 {'end_index': 7,
  'name': 'cities does u.s. route 2',
  'score

In [8]:
import pandas as pd
from numpy import nan

# Both helpers live in the same notebook module, so resolve it once and pull the
# two functions off it.
# NOTE(review): the module path has no separator between "notebooks" and
# "Simple QA Models" -- confirm this is what `lib.import_notebook` expects.
_SUBJECT_RECOGNITION_DATA = importlib.import_module(
                "notebooksSimple QA Models.Subject Recognition Data")
PREPROCESS = _SUBJECT_RECOGNITION_DATA.preprocess
TOKENIZE = _SUBJECT_RECOGNITION_DATA.spacy_tokenize

def add_predicted_subject_name(row):
    """
    Attach subject-name predictions to one dataframe row.

    The row's 'question' is preprocessed and tokenized, then the row gains
    'predicted_subject_names' (output of ``predict_subject_name``) and
    'predicted_question_tokens' (the tokens that were tagged). The same row
    object is modified and returned.
    """
    tokens = TOKENIZE(PREPROCESS(row['question']))
    row['predicted_subject_names'] = predict_subject_name(tokens)
    row['predicted_question_tokens'] = tokens
    return row

# Run the tagger and top-k decoder on every row (row-wise apply with a progress bar);
# this rebinds `df_dev` to the frame with the two prediction columns added.
df_dev = df_dev.progress_apply(add_predicted_subject_name, axis=1)
# Preview the first five rows.
df_dev[:5]

Unnamed: 0,subject,relation,object,question,predicted_subject_names,predicted_question_tokens
17188,02_286,location/place_with_neighborhoods/neighborhoods,075s73,which town is in new york city,"[{'name': 'new york city', 'score': 64.1403045...","[which, town, is, in, new, york, city]"
4793,01bg1k,baseball/baseball_player/position_s,017drs,does pee wee reese play shortstop or power for...,"[{'name': 'pee wee reese', 'score': 91.0824661...","[does, pee, wee, reese, play, shortstop, or, p..."
21187,06tw28,music/artist/track,0vp3fq,What is a track by lutricia mcneal?,"[{'name': 'lutricia mcneal', 'score': 79.92938...","[what, is, a, track, by, lutricia, mcneal, ?]"
18730,0slws_1,music/release_track/recording,0wzyx1,Name a recording by nelson mandela,"[{'name': 'nelson mandela', 'score': 57.336029...","[name, a, recording, by, nelson, mandela]"
10014,01m1y,media_common/netflix_genre/titles,0crryw4,Name a film in the netflix genre celtic music.,"[{'name': 'celtic music', 'score': 106.7388153...","[name, a, film, in, the, netflix, genre, celti..."


## Analysis Setup

Add the true `subject_name` and its `start_index` / `end_index` so that we can check the accuracy of our predicted values.

In [9]:
import importlib
from functools import partial

# Pull the helper functions out of two other notebooks, loaded as modules by name.
edit_distance_link_alias = importlib.import_module(
                "notebooksSimple QA Numbers.HYPOTHESIS - Question Refers to Multiple Subjects").edit_distance_link_alias
normalize = importlib.import_module(
                "notebooksSimple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize

# Create a column with the subject_name linked per example.
# `partial` pre-binds the cursor and the normalizer, so the function applied to each row
# only receives the row itself.
df_dev['subject_name'] = df_dev.progress_apply(partial(edit_distance_link_alias, cursor, normalize), axis=1)

importing Jupyter notebook from ../../notebooks/Simple QA Numbers/HYPOTHESIS - Question Refers to Multiple Subjects.ipynb


In [10]:
import importlib
# `find_subject_name_span` adds these columns to each row:
# - question_tokens
# - subject_name
# - start_index
# - end_index
find_subject_name_span = importlib.import_module(
                "notebooksSimple QA Models.Subject Recognition Data").find_subject_name_span

# Apply it row by row (with a progress bar) and rebind `df_dev` to the result.
df_dev = df_dev.progress_apply(find_subject_name_span, axis=1)
# Preview the first five rows.
df_dev[:5]

Unnamed: 0,end_index,object,predicted_question_tokens,predicted_subject_names,question,question_tokens,relation,start_index,subject,subject_name,subject_name_tokens
17188,7.0,075s73,"[which, town, is, in, new, york, city]","[{'name': 'new york city', 'score': 64.1403045...",which town is in new york city,"[which, town, is, in, new, york, city]",location/place_with_neighborhoods/neighborhoods,4.0,02_286,new york city,"(new, york, city)"
4793,4.0,017drs,"[does, pee, wee, reese, play, shortstop, or, p...","[{'name': 'pee wee reese', 'score': 91.0824661...",does pee wee reese play shortstop or power for...,"[does, pee, wee, reese, play, shortstop, or, p...",baseball/baseball_player/position_s,1.0,01bg1k,pee wee reese,"(pee, wee, reese)"
21187,7.0,0vp3fq,"[what, is, a, track, by, lutricia, mcneal, ?]","[{'name': 'lutricia mcneal', 'score': 79.92938...",What is a track by lutricia mcneal?,"[what, is, a, track, by, lutricia, mcneal, ?]",music/artist/track,5.0,06tw28,lutricia mcneal,"(lutricia, mcneal)"
18730,6.0,0wzyx1,"[name, a, recording, by, nelson, mandela]","[{'name': 'nelson mandela', 'score': 57.336029...",Name a recording by nelson mandela,"[name, a, recording, by, nelson, mandela]",music/release_track/recording,4.0,0slws_1,nelson mandela,"(nelson, mandela)"
10014,9.0,0crryw4,"[name, a, film, in, the, netflix, genre, celti...","[{'name': 'celtic music', 'score': 106.7388153...",Name a film in the netflix genre celtic music.,"[name, a, film, in, the, netflix, genre, celti...",media_common/netflix_genre/titles,7.0,01m1y,celtic music,"(celtic, music)"


In [11]:
# Checkpoint the frame with predictions and gold spans so the analysis cells below can be
# re-run without repeating the prediction step. The path is relative to the working directory.
df_dev.to_pickle('step_1_predict_subject_name.pkl')

In [12]:
import pandas as pd

# Reload the checkpoint written by the previous cell.
# NOTE: unpickling executes arbitrary code; only load pickle files produced by this notebook.
df_dev = pd.read_pickle('step_1_predict_subject_name.pkl')

## Analysis - Correct Span

We determine the correct subject name span and compare it to the predicted subject name.

In [13]:
from tqdm import tqdm_notebook

# accuracies[k - 1] counts the examples judged correct within the top k predictions (k = 1..100).
accuracies = [0] * 100
# Number of examples that were scored (rows skipped by the loop below are not counted).
total = 0

def is_correct(row, top_k):
    """
    Return True when one of the first ``top_k`` predicted spans of ``row`` has
    exactly the row's gold ``start_index`` and ``end_index``; otherwise False.
    """
    candidates = row['predicted_subject_names']
    return any(
        row['start_index'] == candidates[rank]['start_index']
        and row['end_index'] == candidates[rank]['end_index']
        for rank in range(min(top_k, len(candidates)))
    )
    

# For every k in 1..len(accuracies), count the linked examples whose gold span is among
# their first k predicted spans.
for index, row in tqdm_notebook(df_dev.iterrows(), total=df_dev.shape[0]):
    # Only rows whose gold subject name is a string are scored.
    if isinstance(row['subject_name'], str):
        total += 1
        for k in range(len(accuracies)):
            accuracies[k] += is_correct(row, k + 1)

for k, n_correct in enumerate(accuracies, start=1):
    print('Accuracy Top %d: %f [%d of %d]' % (k, n_correct / total, n_correct, total))

Accuracy Top 1: 0.955093 [20311 of 21266]
Accuracy Top 2: 0.973526 [20703 of 21266]
Accuracy Top 3: 0.980720 [20856 of 21266]
Accuracy Top 4: 0.984905 [20945 of 21266]
Accuracy Top 5: 0.987633 [21003 of 21266]
Accuracy Top 6: 0.989138 [21035 of 21266]
Accuracy Top 7: 0.991066 [21076 of 21266]
Accuracy Top 8: 0.992006 [21096 of 21266]
Accuracy Top 9: 0.992946 [21116 of 21266]
Accuracy Top 10: 0.993793 [21134 of 21266]
Accuracy Top 11: 0.994545 [21150 of 21266]
Accuracy Top 12: 0.994968 [21159 of 21266]
Accuracy Top 13: 0.995298 [21166 of 21266]
Accuracy Top 14: 0.995674 [21174 of 21266]
Accuracy Top 15: 0.995862 [21178 of 21266]
Accuracy Top 16: 0.996191 [21185 of 21266]
Accuracy Top 17: 0.996285 [21187 of 21266]
Accuracy Top 18: 0.996802 [21198 of 21266]
Accuracy Top 19: 0.997414 [21211 of 21266]
Accuracy Top 20: 0.997555 [21214 of 21266]
Accuracy Top 21: 0.997884 [21221 of 21266]
Accuracy Top 22: 0.997978 [21223 of 21266]
Accuracy Top 23: 0.998072 [21225 of 21266]
Accuracy Top 24: 0.9

## Analysis - Normalized Link

We normalize the correct subject name and compare it to the normalized predicted name.

We expect this to be lower than the span analysis because in "HYPOTHESIS - Subject Name not in Question", we found that 97.85% of the time the subject name normalized is not in the question; therefore, some spans that are correct will not be equal to the subject name normalized.

In [14]:
import importlib

# Helpers taken from two other notebooks (loaded as modules by name); both are used by the
# normalized-name comparison in the next cell.
normalize = importlib.import_module(
                "notebooksSimple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize
tokenize = importlib.import_module(
                "notebooksSimple QA Models.Subject Recognition Data").spacy_tokenize

In [33]:
from tqdm import tqdm_notebook
from lib.table import format_pipe_table

# Examples whose top-1 prediction is wrong, collected for manual inspection.
negative_sample = []
# accuracies[k - 1] counts the examples judged correct within the top k predictions (k = 1..10).
accuracies = [0] * 10
# Number of examples that were scored (rows skipped by the loop below are not counted).
total = 0

def is_correct(row, top_k):
    """
    Return True when the normalized name of one of the first ``top_k``
    predictions equals the row's gold subject name after tokenizing,
    re-joining with spaces and normalizing; otherwise False.
    """
    gold_name = normalize(' '.join(tokenize(row['subject_name'])))
    candidates = row['predicted_subject_names']
    return any(
        normalize(candidates[rank]['name']) == gold_name
        for rank in range(min(top_k, len(candidates)))
    )

# For every k in 1..len(accuracies), count the linked examples whose normalized gold name
# matches one of their first k predictions; keep the top-1 misses for inspection.
for index, row in tqdm_notebook(df_dev.iterrows(), total=df_dev.shape[0]):
    # Only rows whose gold subject name is a string are scored.
    if isinstance(row['subject_name'], str):
        total += 1
        for k in range(len(accuracies)):
            accuracies[k] += is_correct(row, k + 1)
        if not is_correct(row, 1):
            candidates = row['predicted_subject_names']
            negative_sample.append({
                'Subject Name': row['subject_name'],
                # Show at most the five best predicted names.
                'Predicted Subject Name': [candidate['name'] for candidate in candidates[:5]],
            })

for k, n_correct in enumerate(accuracies, start=1):
    print('Accuracy Top %d: %f [%d of %d]' % (k, n_correct / total, n_correct, total))
print('Negative Sample:\n')
print(format_pipe_table(negative_sample[:50]))

Accuracy Top 1: 0.943081 [19982 of 21188]
Accuracy Top 2: 0.959836 [20337 of 21188]
Accuracy Top 3: 0.966774 [20484 of 21188]
Accuracy Top 4: 0.970880 [20571 of 21188]
Accuracy Top 5: 0.973428 [20625 of 21188]
Accuracy Top 6: 0.974986 [20658 of 21188]
Accuracy Top 7: 0.976543 [20691 of 21188]
Accuracy Top 8: 0.977534 [20712 of 21188]
Accuracy Top 9: 0.978573 [20734 of 21188]
Accuracy Top 10: 0.979281 [20749 of 21188]
Negative Sample:

| Index | Predicted Subject Name | Subject Name |
| --- | --- | --- |
| 0 | ['album', 'cover album', 'cover', 'album by', 'cover album by'] | ken hirai |
| 1 | ['green whiskers', 'the green whiskers', 'soldier with the green whiskers', 'with the green whiskers', 'whiskers'] | soldier with the green whiskers |
| 2 | ['marco ( animorphs )', 'marco ( animorphs', 'marco (', 'marco', 'marco ( animorphs ) appear'] | marco |
| 3 | ['st helens rlfc', 'st helens', 'helens rlfc', 'the st helens rlfc', 'st'] | st helens rfc |
| 4 | ['river dee , aberdeenshire', 'dee