# Step 1 - Predict Subject Name

In [1]:
import sys
sys.path.insert(0, '../../../allennlp')
sys.path.insert(0, '../../')
from tqdm import tqdm_notebook
from scripts.utils.connect import get_connection 
import random
import numpy as np

tqdm_notebook().pandas()
connection = get_connection()
cursor = connection.cursor()




In [6]:
# Run the end-to-end pipeline on the development set

from scripts.utils.simple_qa import load_simple_qa 
from sklearn.utils import shuffle

df_dev, = load_simple_qa(dev=True)
df_dev = shuffle(df_dev, random_state=123)
df_dev[:5]

Unnamed: 0,subject,relation,object,question
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,Name an American Thoroughbread racehorse
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett


## Load Subject Name Model

In [5]:
import importlib
import scripts.utils.import_notebook
from allennlp.models.archival import load_archive
from allennlp.service.predictors import Predictor

archive = load_archive('../../results/0000.01-15_21:34:34.subject_recognition_crf_tagger/model.tar.gz',
                       cuda_device=0)
predictor = Predictor.from_archive(archive, 'sentence-tagger')

## TEST ##
question = 'what major cities does u.s. route 2 run through ?'
print('Question:', question)
print('Predicted Tags:', predictor.predict_json({'sentence': question}, 0)['tags'])

Question: what major cities does u.s. route 2 run through ?
Predicted Tags: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'O', 'O']


## Top K Model Decoder

The best subject name span is not always found in our KG; therefore, here we define a top k viterbi decoder. This allows us to get the top k subject names.

In [7]:
import torch

# FROM: https://gist.github.com/Deepblue129/afaa3613a99a8e7213d2efdd02ae4762 
def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int=5):
    """
    Perform Viterbi decoding in log space over a sequence given a transition matrix
    specifying pairwise (transition) potentials between tags and a matrix of shape
    (sequence_length, num_tags) specifying unary potentials for possible tags per
    timestep.
    Parameters
    ----------
    tag_sequence : torch.Tensor, required.
        A tensor of shape (sequence_length, num_tags) representing scores for
        a set of tags over a given sequence.
    transition_matrix : torch.Tensor, required.
        A tensor of shape (num_tags, num_tags) representing the binary potentials
        for transitioning between a given pair of tags.
    top_k : int, required.
        Integer defining the top number of paths to decode.
    Returns
    -------
    viterbi_path : List[int]
        The tag indices of the maximum likelihood tag sequence.
    viterbi_score : float
        The score of the viterbi path.
    """
    sequence_length, num_tags = list(tag_sequence.size())

    path_scores = []
    path_indices = []
    # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0)
    # to allow for 1 permutation.
    path_scores.append(tag_sequence[0, :].unsqueeze(0))
    # assert path_scores[0].size() == (n_permutations, num_tags)

    # Evaluate the scores for all possible paths.
    for timestep in range(1, sequence_length):
        # Add pairwise potentials to current scores.
        # assert path_scores[timestep - 1].size() == (n_permutations, num_tags)
        summed_potentials = path_scores[timestep - 1].unsqueeze(2) + transition_matrix
        summed_potentials = summed_potentials.view(-1, num_tags)

        # Best pairwise potential path score from the previous timestep. 
        max_k = min(summed_potentials.size()[0], top_k)
        scores, paths = torch.topk(summed_potentials, k=max_k, dim=0)
        # assert scores.size() == (n_permutations, num_tags)
        # assert paths.size() == (n_permutations, num_tags)

        scores = tag_sequence[timestep, :] + scores
        # assert scores.size() == (n_permutations, num_tags)
        path_scores.append(scores)
        path_indices.append(paths.squeeze())

    # Construct the most likely sequence backwards.
    path_scores = path_scores[-1].view(-1)
    max_k = min(path_scores.size()[0], top_k)
    viterbi_scores, best_paths = torch.topk(path_scores, k=max_k, dim=0)
    viterbi_paths = []
    for i in range(max_k):
        viterbi_path = [best_paths[i]]
        for backward_timestep in reversed(path_indices):
            viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]]))
        # Reverse the backward path.
        viterbi_path.reverse()
        # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo.
        viterbi_path = [j % num_tags for j in viterbi_path]
        viterbi_paths.append(viterbi_path)
    return viterbi_paths, viterbi_scores

## TEST ##
sequence_logits = torch.FloatTensor([[1, 0, 0, 4], [1, 0, 6, 2], [0, 3, 0, 4]])
transition_matrix = torch.zeros([4, 4])
transition_matrix[0, 0] = 1
transition_matrix[2, 1] = 5
indices, value = viterbi_decode(sequence_logits, transition_matrix)
assert indices[0] == [3, 2, 1]
assert value[0] == 18

In [15]:
from typing import List

import torch
from torch.autograd import Variable

# Originally From:
# https://github.com/allenai/allennlp/blob/master/allennlp/modules/conditional_random_field.py#L162
def viterbi_tags(logits: List[List[int]], mask: List[int], top_k: int) -> List[List[int]]:
    """
    Uses viterbi algorithm to find most likely tags for the given inputs.
    """
    logits = torch.FloatTensor(logits)
    mask = torch.LongTensor(mask)
    
    max_seq_length, num_tags = logits.size()

    # Augment transitions matrix with start and end transitions
    start_tag = num_tags
    end_tag = num_tags + 1
    transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.)

    transitions[:num_tags, :num_tags] = archive.model.crf.transitions.data
    transitions[start_tag, :num_tags] = archive.model.crf.start_transitions.data
    transitions[:num_tags, end_tag] = archive.model.crf.end_transitions.data

    # Pad the max sequence length by 2 to account for start_tag + end_tag.
    tag_sequence = torch.Tensor(max_seq_length + 2, num_tags + 2)

    sequence_length = torch.sum(mask)

    # Start with everything totally unlikely
    tag_sequence.fill_(-10000.)
    # At timestep 0 we must have the START_TAG
    tag_sequence[0, start_tag] = 0.
    # At steps 1, ..., sequence_length we just use the incoming logits
    logits[:sequence_length]
    tag_sequence[1:(sequence_length + 1), :num_tags] = logits[:sequence_length]
    # And at the last timestep we must have the END_TAG
    tag_sequence[sequence_length + 1, end_tag] = 0.

    # We pass the tags and the transitions to ``viterbi_decode``.
    viterbi_paths, viterbi_scores = viterbi_decode(tag_sequence[:(sequence_length + 2)], transitions, top_k)
    # Get rid of START and END sentinels and append.
    viterbi_paths = [path[1:-1] for path in viterbi_paths]
    # Translate indexes to labels
    viterbi_paths = [
        [archive.model.vocab.get_token_from_index(i, namespace="labels")
         for i in paths] for paths in viterbi_paths
    ]
    return viterbi_paths, viterbi_scores

## TEST ##
top_k = 5
predicted = predictor.predict_json({'sentence': 'what major cities does u.s. route 2 run through ?'}, 0)
viterbi_paths, viterbi_scores = viterbi_tags(predicted['logits'], predicted['mask'], top_k)

# Best viterbi_paths should equal to predicted tags
assert predicted['tags'] == viterbi_paths[0]

for i in range(top_k):
    print('[Score: %f] Path:' % viterbi_scores[i], viterbi_paths[i])

[Score: 39.895531] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'I', 'O', 'O', 'O']
[Score: 33.139980] Path: ['O', 'O', 'O', 'O', 'I', 'I', 'O', 'O', 'O', 'O']
[Score: 31.901718] Path: ['O', 'O', 'O', 'O', 'I', 'O', 'I', 'O', 'O', 'O']
[Score: 31.412817] Path: ['O', 'O', 'O', 'O', 'O', 'I', 'I', 'O', 'O', 'O']
[Score: 31.158092] Path: ['O', 'O', 'O', 'O', 'I', 'O', 'O', 'O', 'O', 'O']


## Wrap Up

Put the model and top k decoder together. Predict the subject name accross all the examples in the dataframe.

In [23]:
import math
import re

def predict_subject_name(tokens):
    # Predict Tags
    predicted = predictor.predict_json({'sentence': ' '.join(tokens)}, 0)
    viterbi_paths, viterbi_scores = viterbi_tags(predicted['logits'], predicted['mask'], 20)
    
    predicted_subject_names = []
    for tags, score in zip(viterbi_paths, viterbi_scores):
        assert len(tags) == len(tokens)
        # Ignore if multiple subject names are selected
        n_subjects = sum(tags[i] == 'I' and (i - 1 == -1 or tags[i - 1] == 'O') for i in range(len(tags)))
        if n_subjects == 1:
            predicted_subject_name = ' '.join([tokens[i] for i, tag in
                                               enumerate(tags) if tag == 'I'])
            start_index = [i for i, tag in enumerate(tags) 
                           if tag == 'I' and (i == 0 or tags[i - 1] == 'O')][0]
            end_index = [i for i, tag in enumerate(tags) 
                         if tag == 'I' and (i == len(tags) - 1 or tags[i + 1] == 'O')][0] + 1
            predicted_subject_names.append({
                'name': predicted_subject_name,
                'score': score,
                'start_index': start_index,
                'end_index': end_index,
            })
    return predicted_subject_names

## TEST ##
print('Sample Output:')
predict_subject_name(['what', 'major', 'cities', 'does', 'u.s.', 'route', '2', 'run', 'through', '?'])

Sample Output:


[{'end_index': 7,
  'name': 'u.s. route 2',
  'score': 39.895530700683594,
  'start_index': 4},
 {'end_index': 6,
  'name': 'u.s. route',
  'score': 33.13998031616211,
  'start_index': 4},
 {'end_index': 7,
  'name': 'route 2',
  'score': 31.412817001342773,
  'start_index': 5},
 {'end_index': 5,
  'name': 'u.s.',
  'score': 31.158092498779297,
  'start_index': 4},
 {'end_index': 7,
  'name': 'does u.s. route 2',
  'score': 30.100318908691406,
  'start_index': 3},
 {'end_index': 7, 'name': '2', 'score': 29.430931091308594, 'start_index': 6},
 {'end_index': 8,
  'name': 'u.s. route 2 run',
  'score': 27.478029251098633,
  'start_index': 4},
 {'end_index': 7,
  'name': 'cities does u.s. route 2',
  'score': 26.6679630279541,
  'start_index': 2},
 {'end_index': 6,
  'name': 'route',
  'score': 24.65726661682129,
  'start_index': 5},
 {'end_index': 6,
  'name': 'does u.s. route',
  'score': 23.34476661682129,
  'start_index': 3}]

In [8]:
import pandas as pd
from numpy import nan

preprocess = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").preprocess
tokenize = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").spacy_tokenize

def add_predicted_subject_name(row):
    tokens = tokenize(preprocess(text))
    predicted_subject_name, start_index, end_index, question_tokens, confidence = predict_subject_name(row['question'])
    if len(predicted_subject_name) > 0:
        row['predicted_subject_name'] = predicted_subject_name
        row['predicted_start_index'] = start_index
        row['predicted_end_index'] = end_index
        row['question_tokens'] = question_tokens
        row['tag_confidence'] = confidence
    return row

df_dev = df_dev.progress_apply(add_predicted_subject_name, axis=1)
df_dev[:5]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[[6.663516044616699, -6.592164993286133], [3.103424310684204, -3.147894859313965], [3.1990606784820557, -3.2045555114746094], [-2.539365291595459, 2.3958916664123535], [5.994741439819336, -5.983532428741455], [1.9079399108886719, -1.9976216554641724], [4.952872276306152, -4.911064624786377], [-1.5801552534103394, 1.357745885848999], [0.5502100586891174, -0.6919055581092834]]
['who', "'s", 'a', 'forward', 'that', 'plays', 'for', 'manchester', 'united']

['O', 'O', 'O', 'I', 'O', 'O', 'O', 'I', 'O']
[[6.663516044616699, -6.592164993286133], [3.103424310684204, -3.147894859313965], [3.1990606784820557, -3.2045555114746094], [-2.539365291595459, 2.3958916664123535], [5.994741439819336, -5.983532428741455], [1.9079399108886719, -1.9976216554641724], [4.952872276306152, -4.911064624786377], [-1.5801552534103394, 1.357745885848999], [0.5502100586891174, -0.6919055581092834]]
['who', "'s", 'a', 'forward', 'that', 'plays', 'for', 'manchester', 'unit

['O', 'O', 'O', 'I', 'I', 'I', 'I', 'O', 'I', 'I']
[[6.783242225646973, -6.782492160797119], [0.9214878678321838, -1.0263453722000122], [4.241297721862793, -4.1707892417907715], [0.12111178040504456, -0.08436067402362823], [-0.3803764283657074, 0.32167553901672363], [-0.2856900095939636, 0.17400622367858887], [-0.5369302034378052, 0.43304139375686646], [4.236837387084961, -4.23635196685791], [-0.5917673707008362, 0.47474634647369385], [-0.9950591325759888, 0.8786640167236328]]
['what', 'property', 'is', 'gender', 'neutral', 'public', 'restroom', 'in', 'san', 'francisco']

['O', 'O', 'O', 'O', 'O', 'O']
[[7.039474964141846, -7.176802635192871], [-0.6737489104270935, 0.48991984128952026], [-1.3386386632919312, 1.2460561990737915], [5.059530735015869, -5.0557379722595215], [-2.5310871601104736, 2.2843270301818848], [4.455630302429199, -4.665682315826416]]
['what', 'film', 'festival', 'is', 'recurring', '?']

['O', 'I', 'I', 'O', 'I', 'O']
[[7.039474964141846, -7.176802635192871], [-0.6737

['O', 'O', 'O']
[[6.651897430419922, -6.64432954788208], [-2.196439504623413, 1.996200442314148], [5.767156600952148, -5.802806377410889]]
['which', 'country', 'contains']

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[[6.809357643127441, -6.81069803237915], [3.8259990215301514, -3.849388360977173], [5.012765884399414, -4.969763278961182], [-2.3294007778167725, 2.0512211322784424], [0.28657323122024536, -0.3587338924407959], [2.642854928970337, -2.579658031463623], [1.0905508995056152, -1.09858238697052], [-2.2446892261505127, 2.040522813796997], [5.578249931335449, -5.725537300109863]]
['what', 'is', 'a', 'documentary', 'film', 'about', 'the', 'titanic', 'called']

['O', 'O', 'O', 'I', 'O', 'O', 'O', 'I', 'O']
[[6.809357643127441, -6.81069803237915], [3.8259990215301514, -3.849388360977173], [5.012765884399414, -4.969763278961182], [-2.3294007778167725, 2.0512211322784424], [0.28657323122024536, -0.3587338924407959], [2.642854928970337, -2.579658031463623], [1.0905508995056152, -1.09

Unnamed: 0,subject,relation,object,question,predicted_subject_name,predicted_start_index,predicted_end_index,question_tokens,tag_confidence
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,Name an American Thoroughbread racehorse,american thoroughbread,2,3,"[name, an, american, thoroughbread, racehorse]","[0.0019413890604032915, 0.005350851396586186, ..."
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...,vision racing driving simulator,5,8,"[what, kind, of, game, is, vision, racing, dri...","[0.0006516680742489636, 0.010718954821512583, ..."
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film,romance film,4,5,"[what, tv, program, is, romance, film]","[0.000369574122630813, 0.03523506175438226, 0...."
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in,polaski,3,3,"[what, state, is, polaski, located, in]","[0.0005542748870903849, 0.05355539388993934, 0..."
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett,fern emmett,6,7,"[what, disease, claimed, the, life, of, fern, ...","[0.0009087953114309601, 0.11962091976879304, 0..."


## Analysis - Correct Span

We determine the correct subject name span and compare it to the predicted subject name.

In [9]:
import importlib
from functools import partial

edit_distance_link_alias = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Question Refers to Multiple Subjects").edit_distance_link_alias
normalize = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize

# Create a column with the subject_name linked per example
df_dev['subject_name'] = df_dev.progress_apply(partial(edit_distance_link_alias, cursor, normalize), axis=1)

importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Question Refers to Multiple Subjects.ipynb





In [10]:
import importlib
find_subject_name_span = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").find_subject_name_span

# Create a column with the subject_name linked per example
df_dev = df_dev.progress_apply(find_subject_name_span, axis=1)




In [11]:
from tqdm import tqdm_notebook
from scripts.utils.table import format_pipe_table

negative_sample = []
correct = 0
total = 0

for index, row in tqdm_notebook(df_dev.iterrows(), total=df_dev.shape[0]):
    if not isinstance(row['subject_name'], str):
        continue
        
    total += 1
    subject_name = row['question_tokens'][int(row['start_index']):int(row['end_index'])]
    subject_name = ' '.join(subject_name)
    if row['predicted_subject_name'] == subject_name:
        correct += 1
    else:
        negative_sample.append({
            'Subject Name': subject_name,
            'Predicted Subject Name': row['predicted_subject_name'],
        })
        
print('Accuracy: %f [%d of %d]' % (correct / total, correct, total))
print('Negative Sample:\n')
print(format_pipe_table(negative_sample[:50]))


Accuracy: 0.942337 [10034 of 10648]
Negative Sample:

| Index | Predicted Subject Name | Subject Name |
| --- | --- | --- |
| 0 | documentary film | short |
| 1 | krgy station | krgy |
| 2 | austrailian screenwriter | screenwriter |
| 3 | the water in the canche | canche |
| 4 | the red clouds war | red clouds war |
| 5 | nation book | corporation nation |
| 6 | o.k . ken | o.k . ken ? |
| 7 | the world museum liverpool | world museum liverpool |
| 8 | it must have been years be heard | it must have been years |
| 9 | through dundas , ontario | dundas , ontario |
| 10 | : so long to broadway | so long to broadway |
| 11 | aaron carter | album |
| 12 | pillows & prayers : cherry red 1982 - 1983 | pillows & prayers : cherry red 1982 |
| 13 | luxembourg | commune of luxembourg |
| 14 | a golden hour of ... | a golden hour of |
| 15 | the eye of the eagle | eye of the eagle |
| 16 | the blues collection 5 : jungle music | jungle music |
| 17 | hits album 6 | the hits album 6 |
| 18 | two 

## Analysis - Normalized Link

We normalize the correct subject name and compare it to the predicted name normalized.

We expect this to be lower than the span analysis because in "HYPOTHESIS - Subject Name not in Question", we found that 97.85% of the time the subject name normalized is not in the question; therefore, some spans that are correct will not be equal to the subject name normalized.

In [12]:
import importlib

normalize = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize
tokenize = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").spacy_tokenize

In [13]:
from tqdm import tqdm_notebook
from scripts.utils.table import format_pipe_table

negative_sample = []
correct = 0
total = 0

for index, row in tqdm_notebook(df_dev.iterrows(), total=df_dev.shape[0]):
    if not isinstance(row['subject_name'], str):
        continue
        
    total += 1
    subject_name = normalize(' '.join(tokenize(row['subject_name'])))
    predicted_subject_name = normalize(row['predicted_subject_name'])
    if predicted_subject_name == subject_name:
        correct += 1
    else:
        negative_sample.append({
            'Subject Name': subject_name,
            'Predicted Subject Name': predicted_subject_name,
        })
        
print('Accuracy: %f [%d of %d]' % (correct / total, correct, total))
print('Negative Sample:\n')
print(format_pipe_table(negative_sample[:50]))


Accuracy: 0.934823 [9954 of 10648]
Negative Sample:

| Index | Predicted Subject Name | Subject Name |
| --- | --- | --- |
| 0 | documentary film | short |
| 1 | krgy station | krgy |
| 2 | austrailian screenwriter | screenwriter |
| 3 | the water in the canche | canche |
| 4 | the red clouds war | red cloud s war |
| 5 | nation book | corporation nation |
| 6 | the world museum liverpool | world museum liverpool |
| 7 | peters point plantation | peter s point plantation |
| 8 | it must have been years be heard | it must have been years |
| 9 | through dundas ontario | dundas ontario |
| 10 | aaron carter | album |
| 11 | pillows prayers cherry red 1982 1983 | pillows prayers cherry red 19821983 |
| 12 | luxembourg | commune of luxembourg |
| 13 | the eye of the eagle | eye of the eagle |
| 14 | the blues collection 5 jungle music | jungle music |
| 15 | hits album 6 | the hits album 6 |
| 16 | two women | between two women |
| 17 | battle of hudsons bay | battle of hudson s bay |
| 1

## Save Data

In [14]:
df_dev.to_csv('step_1_predict_subject_name.csv')