# Subject Recognition Data

The goal of this notebook is to create data for subject recognition tagging. For every token in a question, we tag it with a I for inside subject or O for outside subject.

In [1]:
import sys
sys.path.insert(0, '../../')
import pandas as pd
from tqdm import tqdm_notebook
import scripts.utils.import_notebook
from scripts.utils.connect import get_connection 

tqdm_notebook().pandas()

connection = get_connection()
cursor = connection.cursor()




In [2]:
from scripts.utils.simple_qa import load_simple_qa 

# Destination Filename
DEST_TRAIN = './../../data/subject_recognition/train.txt'
DEST_DEV = './../../data/subject_recognition/dev.txt'

df_dev, = load_simple_qa(dev=True)
print('Dev:')
display(df_dev[:5])
df_train, = load_simple_qa(train=True)
print('Train:')
display(df_train[:5])

Dev:


Unnamed: 0,subject,relation,object,question
0,0f3xg_,symbols/namesake/named_after,0cqt90,Who was the trump ocean club international hot...
1,07f3jg,people/person/place_of_birth,0565d,where was sasha vujačić born
2,031j8nn,music/release/region,07ssc,What is a region that dead combo was released in
3,0c1cyhd,film/director/film,0wxsz5y,What is a film directed by wiebke von carolsfeld?
4,0fvhc0g,music/release/region,0345h,what country was music for stock exchange rel...


Train:


Unnamed: 0,subject,relation,object,question
0,04whkz5,book/written_work/subjects,01cj3p,what is the book e about
1,0tp2p24,music/release_track/release,0sjc7c1,to what release does the release track cardiac...
2,04j0t75,film/film/country,07ssc,what country was the film the debt from
3,0ftqr,music/producer/tracks_produced,0p600l,what songs have nobuo uematsu produced?
4,036p007,music/release/producers,0677ng,Who produced eve-olution?


## Step 1 - Link Question to Subject Name

In [3]:
import importlib
from functools import partial
edit_distance_link_alias = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Question Refers to Multiple Subjects").edit_distance_link_alias
normalize = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize

# Create a column with the subject_name linked per example
df_dev['subject_name'] = df_dev.progress_apply(partial(edit_distance_link_alias, cursor, normalize), axis=1)
print('Dev Linked', sum(df_dev.subject_name.notnull()), 'examples')
display(df_dev[:5])
df_train['subject_name'] = df_train.progress_apply(partial(edit_distance_link_alias, cursor, normalize), axis=1)
print('Train Linked', sum(df_train.subject_name.notnull()), 'examples')
display(df_train[:5])

importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Question Refers to Multiple Subjects.ipynb
importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Subject Name not in Question.ipynb



Dev Linked 10648 examples


Unnamed: 0,subject,relation,object,question,subject_name
0,0f3xg_,symbols/namesake/named_after,0cqt90,Who was the trump ocean club international hot...,trump ocean club international hotel and tower
1,07f3jg,people/person/place_of_birth,0565d,where was sasha vujačić born,sasha vujacic
2,031j8nn,music/release/region,07ssc,What is a region that dead combo was released in,dead combo
3,0c1cyhd,film/director/film,0wxsz5y,What is a film directed by wiebke von carolsfeld?,wiebke von carolsfeld
4,0fvhc0g,music/release/region,0345h,what country was music for stock exchange rel...,music for stock exchange



Train Linked 74520 examples


Unnamed: 0,subject,relation,object,question,subject_name
0,04whkz5,book/written_work/subjects,01cj3p,what is the book e about,e
1,0tp2p24,music/release_track/release,0sjc7c1,to what release does the release track cardiac...,cardiac arrest
2,04j0t75,film/film/country,07ssc,what country was the film the debt from,the debt
3,0ftqr,music/producer/tracks_produced,0p600l,what songs have nobuo uematsu produced?,nobuo uematsu
4,036p007,music/release/producers,0677ng,Who produced eve-olution?,eve-olution


## Step 2 - Determine the Span of the Subject Name

During the first step, we determined the subject name that best fits the question. We now need to determine the span of the subject name inside the question

In [1]:
import importlib
import re

# Otherwise, importing this notebook the strip_accents wont be picked up as a definition
def strip_accents(*args, **kwargs):
    return importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question"
    ).strip_accents(*args, **kwargs)

def preprocess(s):
    """
    Preprocess before tagging with IO.
    """
    # Represent characters in ASCII
    s = strip_accents(s)
    s = s.strip()
    s = s.lower()
    # Normalize quatations
    s = s.replace('“', '"').replace('”', '"').replace('’', "'").replace('‘', "'")
    # Substitue multiple spaces with one
    s = re.sub('\s+', ' ', s)
    return s

def normalize_abbreviations(s):
    # Only remove punctuation from words with punctuation
    if sum([c.isalnum() for c in s]) > 0 and s.count('.') > 0:
        s = s.replace('.', '')
    return s

normalize_abbreviations('A.C.D.C.')

'ACDC'

In [5]:
import spacy

NLP = spacy.load('en_core_web_sm')
    
def spacy_tokenize(s):
    doc = NLP(s, disable=['parser', 'tagger', 'ner'])
    return [w.text for w in doc]

In [6]:
import math
from numpy import nan
from scripts.utils.edit_distance import edit_token_distance

def find_subject_name_span(row):
    if not isinstance(row['subject_name'], str):
        return row
    
    # Get the start character and end character that best match
    question_tokens = spacy_tokenize(preprocess(row['question']))
    question_tokens_no_abbreviations = tuple([normalize_abbreviations(t) for t in question_tokens])
    best_token_distance = math.inf
    best_start_index = None
    best_end_index = None
    best_subject_name_tokens = None

    # NOTE: Try multiple tokenization techiniques to line up the subject name with question
    for tokenize in [lambda s: s.split(), spacy_tokenize]:
        subject_name_tokens = tokenize(preprocess(row['subject_name']))
        subject_name_tokens = tuple([normalize_abbreviations(t) for t in subject_name_tokens])
        token_distance, start_index, end_index = edit_token_distance(subject_name_tokens,
                                                                     question_tokens_no_abbreviations)
            
        assert start_index <= end_index
        if token_distance < best_token_distance:
            best_token_distance = token_distance
            best_start_index = start_index
            best_end_index = end_index
            best_subject_name_tokens = subject_name_tokens
            
    # Edge case
    # Include the closing parentheses if one was opened
    if best_end_index < len(question_tokens) and question_tokens[best_end_index] == ')':
        best_end_index += 1
        
    row['start_index'] = best_start_index
    row['end_index'] = best_end_index
    row['subject_name_tokens'] = best_subject_name_tokens
    row['question_tokens'] = question_tokens
    return row
    

df_dev = df_dev.progress_apply(find_subject_name_span, axis=1)
display(df_dev[:5])




Unnamed: 0,end_index,object,question,question_tokens,relation,start_index,subject,subject_name,subject_name_tokens
0,10.0,0cqt90,Who was the trump ocean club international hot...,"[who, was, the, trump, ocean, club, internatio...",symbols/namesake/named_after,3.0,0f3xg_,trump ocean club international hotel and tower,"(trump, ocean, club, international, hotel, and..."
1,4.0,0565d,where was sasha vujačić born,"[where, was, sasha, vujacic, born]",people/person/place_of_birth,2.0,07f3jg,sasha vujacic,"(sasha, vujacic)"
2,7.0,07ssc,What is a region that dead combo was released in,"[what, is, a, region, that, dead, combo, was, ...",music/release/region,5.0,031j8nn,dead combo,"(dead, combo)"
3,9.0,0wxsz5y,What is a film directed by wiebke von carolsfeld?,"[what, is, a, film, directed, by, wiebke, von,...",film/director/film,6.0,0c1cyhd,wiebke von carolsfeld,"(wiebke, von, carolsfeld)"
4,7.0,0345h,what country was music for stock exchange rel...,"[what, country, was, music, for, stock, exchan...",music/release/region,3.0,0fvhc0g,music for stock exchange,"(music, for, stock, exchange)"


## Analysis - Worst Spans

Empirical analysis of the worst question tokens to subject name tokens link. 

In [7]:
from Levenshtein import distance
from scripts.utils.table import format_pipe_table

print_data = []

for index, row in tqdm_notebook(df_dev.iterrows(), total=df_dev.shape[0]):  
    if not isinstance(row['subject_name'], str):
        continue

    # Print the worst fit subject_name's just in case
    tagged_subject_name = row['question_tokens'][int(row['start_index']):int(row['end_index'])]
    tagged_subject_name = ' '.join([normalize_abbreviations(t) for t in tagged_subject_name])
    edit_distance = distance(tagged_subject_name, ' '.join(row['subject_name_tokens']))
    max_length = max(len(' '.join(row['subject_name_tokens'])), len(tagged_subject_name))
    normalized_edit_distance = (max_length - edit_distance) / max_length
    print_data.append({
        'Predicted Subject Name': tagged_subject_name,
        'Original Subject Name': row['subject_name'],
        'Normalized Edit Distance': normalized_edit_distance,
        'Question Tokens': row['question_tokens'],
    })

exact_match = [d for d in print_data if d['Normalized Edit Distance'] == 1.0]
not_matched = [d for d in print_data if d['Normalized Edit Distance'] != 1.0]
print('Exact Match: %f [%d of %d]' % (len(exact_match) / len(print_data), len(exact_match), len(print_data)))
print('Not Matched:\n')
print(format_pipe_table(not_matched))


Exact Match: 0.976991 [10403 of 10648]
Not Matched:

| Index | Normalized Edit Distance | Original Subject Name | Predicted Subject Name | Question Tokens |
| --- | --- | --- | --- | --- |
| 0 | 0.9655172413793104 | roman-parthian war of 58–63 | roman – parthian war of 58–63 | ['what', 'entity', 'was', 'involved', 'in', 'the', 'roman', '–', 'parthian', 'war', 'of', '58–63'] |
| 1 | 0.8333333333333334 | mecca | meccas | ['what', 'is', 'the', 'name', 'for', 'meccas', 'canonical', 'version'] |
| 2 | 0.9310344827586207 | assassin's creed: revelations | assassins creed : revelations | ['what', 'kind', 'of', 'game', 'is', 'assassins', 'creed', ':', 'revelations'] |
| 3 | 0.9285714285714286 | lim, hyung joo | lim hyung joo | ['what', 'kind', 'of', 'music', 'does', 'lim', 'hyung', 'joo', 'do'] |
| 4 | 0.9444444444444444 | this pud's for you | this puds for you | ['what', 'is', 'the', 'series', 'where', 'the', 'episode', 'this', 'puds', 'for', 'you', 'comes', 'from'] |
| 5 | 0.6666666666666666

### Discussion

#### Numbers:

Exact Match: 0.976991 [10403 of 10648]

We find that most of the time, we are able to determine exactly the span in the question that refers to the linked subject name.

#### Error Bucket:

**Discussion:**

We are able to determine the correct span 98% of time that there is not an exact match; therefore, we approximate that overall 10643/10648 (0.9995%). This makes sense due to the fact that subject name was only linked if it had a high similarity with the question.

**Buckets:**
- (49 / 50) Correct: The span correctly locations the subject name in the question.
- (1 / 50) Other: The alias is incorrect; therefore, the span is incorrect.

| Index | Bucket | Normalized Edit Distance | Original Subject Name | Predicted Subject Name | Question Tokens |
| --- | --- | --- | --- | --- | --- |
| 0 | Correct | 0.9655172413793104 | roman-parthian war of 58–63 | roman – parthian war of 58–63 | ['what', 'entity', 'was', 'involved', 'in', 'the', 'roman', '–', 'parthian', 'war', 'of', '58–63'] |
| 1 | Correct | 0.8333333333333334 | mecca | meccas | ['what', 'is', 'the', 'name', 'for', 'meccas', 'canonical', 'version'] |
| 2 | Correct | 0.9310344827586207 | assassin's creed: revelations | assassins creed : revelations | ['what', 'kind', 'of', 'game', 'is', 'assassins', 'creed', ':', 'revelations'] | 
| 3 | Correct | 0.9285714285714286 | lim, hyung joo | lim hyung joo | ['what', 'kind', 'of', 'music', 'does', 'lim', 'hyung', 'joo', 'do'] |
| 4 | Correct | 0.9444444444444444 | this pud's for you | this puds for you | ['what', 'is', 'the', 'series', 'where', 'the', 'episode', 'this', 'puds', 'for', 'you', 'comes', 'from'] |
| 5 | Other | 0.6666666666666666 | t-town | town | ['what', 'newspaper', 'circulates', 'in', 'the', 'town', 'of', 'kearny'] |
| 6 | Correct | 0.9259259259259259 | who want's to live forever? | who wants to live forever | ['who', 'is', 'the', 'producer', 'of', 'who', 'wants', 'to', 'live', 'forever'] |
| 7 | Correct | 0.9166666666666666 | paranoia: 1.0 | paranoia 10 | ['which', 'language', 'is', 'paranoia', '1.0', 'filmed', 'in', '?'] |
| 8 | Correct | 0.8571428571428571 | ¡three amigos! | three amigos | ['who', 'did', 'the', 'music', 'for', 'three', 'amigos'] |
| 9 | Correct  | 0.9545454545454546 | mexican-american war | mexican – american war | ['who', 'was', 'involved', 'in', 'mexican', '–', 'american', 'war'] |
| 10 | Correct | 0.9722222222222222 | the best of daniel o'donnell on film | the best of daniel odonnell on film | ['what', 'kind', 'of', 'music', 'is', 'the', 'film', 'the', 'best', 'of', 'daniel', 'odonnell', 'on', 'film', 'about', '?'] |
| 11 | Correct | 0.9333333333333333 | philip pullman | phillip pullman | ['what', 'works', 'written', 'by', 'phillip', 'pullman', '?'] |
| 12 | Correct | 0.975 | harry potter and the philosopher's stone | harry potter and the philosophers stone | ['what', 'genre', 'is', 'harry', 'potter', 'and', 'the', 'philosophers', 'stone'] |
| 13 | Correct | 0.95 | .977 the '80s channel | 977 the 80s channel | ['which', 'soul', 'artist', 'is', 'featured', 'on', 'the', '.977', 'the', '80s', 'channel'] |
| 14 | Correct | 0.9166666666666666 | kim hee-chul | kim heechul | ['which', 'city', 'is', 'kim', 'heechul', 'from'] |
| 15 | Correct | 0.9772727272727273 | intégrale, volume 2: the guitar don't lie | integrale , volume 2 : the guitar do nt lie | ['what', "'s", 'a', 'song', 'released', 'on', 'integrale', ',', 'volume', '2', ':', 'the', 'guitar', 'do', 'nt', 'lie'] |
| 16 | Correct | 0.9615384615384616 | bill & ted's bogus journey | bill & teds bogus journey | ['what', 'foreign', 'language', 'is', 'an', 'option', 'for', 'the', 'film', 'bill', '&', 'teds', 'bogus', 'journey'] |
| 17 | Correct | 0.8846153846153846 | single-player video game | single - player mode game | ['what', 'is', 'a', 'single', '-', 'player', 'mode', 'game', '?'] |
| 18 | Correct | 0.9473684210526315 | eve's beach fantasy | eves beach fantasy | ['what', 'language', 'is', 'spoken', 'in', 'eves', 'beach', 'fantasy'] |
| 19 | Correct | 0.9473684210526315 | live in denmark '72 | live in denmark 72 | ['what', 'are', 'songs', 'of', 'the', 'live', 'in', 'denmark', '72', 'album'] |
| 20 | Correct | 0.9230769230769231 | bonnie's kids | bonnies kids | ['what', 'is', 'the', 'genre', 'of', 'the', 'film', 'bonnies', 'kids', '?'] |
| 21 | Correct | 0.9166666666666666 | men's badminton, singles | mens badminton , singles | ['what', 'olympic', 'games', 'was', 'mens', 'badminton', ',', 'singles', 'apart', 'of'] |
| 22 | Correct | 0.8571428571428571 | germans | german | ['who', 'is', 'german', '?'] |
| 23 | Correct | 0.9444444444444444 | action-adventure | action / adventure | ['what', 'is', 'an', 'action', '/', 'adventure', 'netflix', 'title'] |
| 24 | Correct | 0.9 | love's labour's lost | loves labours lost | ['who', 'produced', 'the', 'film', 'loves', 'labours', 'lost'] |
| 25 | Correct | 0.9565217391304348 | rogers-o'daniel house | rogers - odaniel house | ['what', 'county', 'is', 'rogers', '-', 'odaniel', 'house', 'apart', 'of'] |
| 26 | Correct | 0.9285714285714286 | curtis pulley | curtis pulleys | ['what', 's', 'curtis', 'pulleys', 'profession'] |
| 27 | Correct | 0.9743589743589743 | manos hajidakis - eidolo ston kathrefti | manos hajidakis : eidolo ston kathrefti | ['what', 'country', 'is', 'manos', 'hajidakis', ':', 'eidolo', 'ston', 'kathrefti', 'from'] |
| 28 | Correct | 0.9 | supernews! | supernews | ['what', 'genre', 'of', 'program', 'is', 'supernews'] |
| 29 | Correct | 0.9375 | showgirls 2: penny's from heaven | showgirls 2 : pennys from heaven | ['what', 'language', 'is', 'showgirls', '2', ':', 'pennys', 'from', 'heaven', 'in', '?'] |
| 30 | Correct | 0.9230769230769231 | pop rock 80's | pop rock 80s | ['what', 'artists', 'broadcast', "'s", 'pop', 'rock', '80s', 'music'] |
| 31 | Correct | 0.7777777777777778 | macedonia | macedon | ['what', 'invasions', 'happened', 'in', 'macedon'] |
| 32 | Correct | 0.9565217391304348 | mockingbird don't sing | mockingbird do nt sing | ['what', 'kind', 'of', 'film', 'is', 'mockingbird', 'do', 'nt', 'sing'] |
| 33 | Correct | 0.9285714285714286 | zalman shazar | zalman shazars | ['what', 'is', 'zalman', 'shazars', 'area', 'of', 'activism'] |
| 34 | Correct | 0.9696969696969697 | atomic - the very best of blondie | atomic : the very best of blondie | ['what', 'type', 'of', 'release', 'is', 'atomic', ':', 'the', 'very', 'best', 'of', 'blondie', '?'] |
| 35 | Correct | 0.8333333333333334 | album | albums | ['what', 'albums', 'were', 'released', 'in', '2000', '?'] |
| 36 | Correct | 0.9 | zeke's pad | zekes pad | ['is', 'zekes', 'pad', 'an', 'action', ',', 'romance', ',', 'or', 'comedy', 'tv', 'program'] |
| 37 | Correct | 0.9565217391304348 | raffi's christmas album | raffis christmas album | ['what', 'kind', 'of', 'album', 'is', 'raffis', 'christmas', 'album'] |
| 38 | Correct | 0.8928571428571429 | u.s. office of war information | office of war information | ['which', 'film', 'did', 'the', 'office', 'of', 'war', 'information', 'help', 'produce'] |
| 39 | Correct | 0.967741935483871 | phillipa lord: she's the shit | phillipa lord : she s the shit | ['which', 'country', 'released', 'phillipa', 'lord', ':', 'she', 's', 'the', 'shit'] |
| 40 | Correct | 0.875 | pillows & prayers: cherry red 1982–1983 | pillows & prayers : cherry red 1982 | ['what', 'is', 'the', 'name', 'of', 'the', 'track', 'list', 'for', 'the', 'release', 'pillows', '&', 'prayers', ':', 'cherry', 'red', '1982', '-', '1983', '?'] |
| 41 | Correct | 0.9375 | wichita [kansas | wichita , kansas | ['who', 'was', 'born', 'in', 'wichita', ',', 'kansas'] |
| 42 | Correct | 0.8636363636363636 | victoria, australia | victoria ( australia ) | ['which', 'places', 'are', 'located', 'in', 'victoria', '(', 'australia', ')', '?'] |
| 43 | Correct | 0.8666666666666667 | eat, pray, love | eat pray love | ['who', 'composed', 'the', 'music', 'for', 'eat', 'pray', 'love'] |
| 44 | Correct | 0.8888888888888888 | lost (12\" inch version) | lost ( 12 inch version ) | ['what', 's', 'the', 'canonical', 'version', 'of', 'lost', '(', '12', 'inch', 'version', ')'] |
| 45 | Correct | 0.8 | “you don’t dream in cryo. ....” | " you do n't dream in cryo " | ['what', 'artist', 'recorded', '"', 'you', 'do', "n't", 'dream', 'in', 'cryo', '"', '?'] |
| 46 | Correct | 0.8 | drums | drum | ['which', 'musician', 'plays', 'the', 'drum', 'kit'] |
| 47 | Correct | 0.75 | the crystal city | crystal city | ['what', 'genre', 'is', 'crystal', 'city'] |
| 48 | Correct | 0.9130434782608695 | william e. mcanulty jr. | william e mcanulty , jr | ['what', 'is', 'william', 'e.', 'mcanulty', ',', 'jr', '.', "'s", 'gender'] |
| 49 | Correct | 0.8571428571428571 | somalis | somali | ['who', "'s", 'somebody', 'that', 'identifies', 'with', 'the', 'somali', 'people'] |

## Step 2 - Continued

In [8]:
df_train = df_train.progress_apply(find_subject_name_span, axis=1)
display(df_train[:5])




Unnamed: 0,end_index,object,question,question_tokens,relation,start_index,subject,subject_name,subject_name_tokens
0,5.0,01cj3p,what is the book e about,"[what, is, the, book, e, about]",book/written_work/subjects,4.0,04whkz5,e,"(e,)"
1,9.0,0sjc7c1,to what release does the release track cardiac...,"[to, what, release, does, the, release, track,...",music/release_track/release,7.0,0tp2p24,cardiac arrest,"(cardiac, arrest)"
2,7.0,07ssc,what country was the film the debt from,"[what, country, was, the, film, the, debt, from]",film/film/country,5.0,04j0t75,the debt,"(the, debt)"
3,5.0,0p600l,what songs have nobuo uematsu produced?,"[what, songs, have, nobuo, uematsu, produced, ?]",music/producer/tracks_produced,3.0,0ftqr,nobuo uematsu,"(nobuo, uematsu)"
4,5.0,0677ng,Who produced eve-olution?,"[who, produced, eve, -, olution, ?]",music/release/producers,2.0,036p007,eve-olution,"(eve, -, olution)"


## Step 3 - Format Examples

In [9]:
from tqdm import tqdm_notebook


def get_formatted_examples(df):
    examples = []
    for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
        if not isinstance(row['subject_name'], str):
            continue

        ret =  ''
        for i, token in enumerate(row['question_tokens']):
            ret += token
            if i >= row['start_index'] and i < row['end_index']:
                ret += '/I '
            else:
                ret += '/O ' # IO – Inside Outside tagging schema
        examples.append(ret.strip())
    return examples

train_examples = get_formatted_examples(df_train)
print('Train:')
print(train_examples[:5])
dev_examples = get_formatted_examples(df_dev)
print('Dev:')
print(dev_examples[:5])


Train:
['what/O is/O the/O book/O e/I about/O', 'to/O what/O release/O does/O the/O release/O track/O cardiac/I arrest/I come/O from/O', 'what/O country/O was/O the/O film/O the/I debt/I from/O', 'what/O songs/O have/O nobuo/I uematsu/I produced/O ?/O', 'who/O produced/O eve/I -/I olution/I ?/O']



Dev:
['who/O was/O the/O trump/I ocean/I club/I international/I hotel/I and/I tower/I named/O after/O', 'where/O was/O sasha/I vujacic/I born/O', 'what/O is/O a/O region/O that/O dead/I combo/I was/O released/O in/O', 'what/O is/O a/O film/O directed/O by/O wiebke/I von/I carolsfeld/I ?/O', 'what/O country/O was/O music/I for/I stock/I exchange/I released/O in/O']


## Step 4 - Write

In [10]:
file_ = open(DEST_TRAIN, 'w')
file_.write('\n'.join(train_examples))

file_ = open(DEST_DEV, 'w')
file_.write('\n'.join(dev_examples))

633151