# Step 2 - Generate Candidates

Our goal during this step is to generate candidate relations and aliases.

In [1]:
import sys
sys.path.insert(0, '../../')

In [9]:
from scripts.utils.connect import get_connection 
from scripts.utils.data import FB2M_NAME_TABLE

connection = get_connection()
cursor = connection.cursor()

In [3]:
import pandas as pd
from tqdm import tqdm_notebook

tqdm_notebook().pandas()

df = pd.read_csv('step_1_predict_subject_name.csv', index_col=0)
df[:5]




Unnamed: 0,end_index,object,predicted_end_index,predicted_start_index,predicted_subject_name,question,question_tokens,relation,start_index,subject,subject_name,subject_name_tokens,tag_confidence
6219,,0bs56bp,3,2,american thoroughbread,Name an American Thoroughbread racehorse,"['name', 'an', 'american', 'thoroughbread', 'r...",biology/organism_classification/organisms_of_t...,,03k3r,,,"[0.0019413890604032915, 0.005350851396586186, ..."
3364,9.0,01sjng,8,5,vision racing driving simulator,what kind of game is vision racing driving sim...,"['what', 'kind', 'of', 'game', 'is', 'vision',...",cvg/computer_videogame/cvg_genre,5.0,02qlppc,vision racing driving simulator,"('vision', 'racing', 'driving', 'simulator')","[0.0006516680742489636, 0.010718954821512583, ..."
9374,6.0,0dlmm88,5,4,romance film,what tv program is romance film,"['what', 'tv', 'program', 'is', 'romance', 'fi...",tv/tv_genre/programs,4.0,02l7c8,romance film,"('romance', 'film')","[0.000369574122630813, 0.03523506175438226, 0...."
10142,4.0,04rrx,3,3,polaski,what state is polaski located in,"['what', 'state', 'is', 'polaski', 'located', ...",location/location/containedby,3.0,049_zj3,polaski,"('polaski',)","[0.0005542748870903849, 0.05355539388993934, 0..."
97,8.0,0qcr0,7,6,fern emmett,what disease claimed the life of fern emmett,"['what', 'disease', 'claimed', 'the', 'life', ...",people/deceased_person/cause_of_death,6.0,02w9ycr,fern emmett,"('fern', 'emmett')","[0.0009087953114309601, 0.11962091976879304, 0..."


Define text preprocessing the same as the training data and step 1.

In [4]:
import importlib
import scripts.utils.import_notebook
import re

preprocess = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").preprocess
tokenize = importlib.import_module(
                "scripts.Simple QA Models.Subject Recognition Data").spacy_tokenize

def text_preprocess(s):
    # Define `text_preprocess` the way the input text was preprocessed before step 1
    s = preprocess(s)
    s = tokenize(s)
    s = ' '.join(s)
    return s

def text_normalize(s):
    s = text_preprocess(s)
    # In `Normalized Reference Resolution#HYPOTHESIS - Subject Name not in Question.ipynb` we found that
    # aliases and questions match up more if punctuation is removed.
    
    # Remove punctuation
    s = re.sub(r'[^\w\s]','',s)
    # Removing characters can create gaps of multiple spaces
    # Substitue multiple spaces with one
    s = re.sub('\s+', ' ', s)
    s = s.strip()
    return s

importing Jupyter notebook from ../../scripts/Simple QA Models/Subject Recognition Data.ipynb


## Index Subject Aliases

Create an index of subject aliases that are preprocessed similar to the predicted subect name. Allowing for a database lookup.

In [None]:
cursor.execute('ALTER TABLE ' + FB2M_NAME_TABLE + ' ADD COLUMN normalized_alias varchar')

In [None]:
cursor.execute('ALTER TABLE ' + FB2M_NAME_TABLE + ' ADD COLUMN preprocessed_alias varchar')

In [None]:
from tqdm import tqdm_notebook
import psycopg2

chunk_size = 10000

def update_chunk(rows):
    query = ('UPDATE ' + FB2M_NAME_TABLE + ' SET normalized_alias = %s, preprocessed_alias = %s' +
            ' WHERE mid = %s and alias = %s')
    psycopg2.extras.execute_batch(cursor, query, rows)

cursor.execute('SELECT mid, alias FROM ' + FB2M_NAME_TABLE)
rows = []
for mid, alias in tqdm_notebook(cursor.fetchall()):
    normalized_alias = text_normalize(alias)
    preprocessed_alias = text_preprocess(alias)
    rows.append(tuple([normalized_alias, preprocessed_alias, mid, alias]))
    
    # Insert Chunk
    if len(rows) > chunk_size:
        update_chunk(rows)
        rows = []
        
update_chunk(rows)

importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Subject Name not in Question.ipynb


In [15]:
connection.commit()

In [None]:
cursor.execute('CREATE INDEX ' + FB2M_NAME_TABLE + '_normalized_alias ON ' + 
               FB2M_NAME_TABLE + '(normalized_alias);')
connection.commit()

In [16]:
cursor.execute('CREATE INDEX ' + FB2M_NAME_TABLE + '_preprocessed_alias ON ' + 
               FB2M_NAME_TABLE + '(preprocessed_alias);')
connection.commit()

In [None]:
cursor.execute('CREATE INDEX ' + FB2M_NAME_TABLE + '_normalized_alias_trgm ON ' + 
               FB2M_NAME_TABLE + ' USING gist(normalized_alias gist_trgm_ops);')
connection.commit()

## Generate Candidates

If subject name is null, then the question does not refer to the true alias. The example is then unanswerable.

In [5]:
df_answerable = df[df.subject_name.notnull()]

Metrics used to evaluate different versions.

In [6]:
def evaluate_candidates(candidates_mids):
    correct = 0
    skipped = 0
    expected_accuracy = 0
    n_examples = df_answerable.shape[0]

    for i, (_, row) in enumerate(df_answerable.iterrows()):
        mids = candidates_mids[i]
        if len(mids) == 0:
            skipped += 1
        elif row['subject'] in mids:
            correct += 1
            expected_accuracy += 1 / len(mids)
        
    print('Precision: %f [%d of %d]' %
              (correct / (n_examples - skipped), correct, (n_examples - skipped)))
    print('Recall: %f [%d of %d]' %
              ((n_examples - skipped) / n_examples, (n_examples - skipped), n_examples))
    print('Expected Guessing Accuracy: %f [%d of %d]' % 
              (expected_accuracy / n_examples, expected_accuracy, n_examples))

Basic helper functions to run experiments quickly.

In [7]:
from functools import lru_cache

@lru_cache(maxsize=65536)
def cached_alias_to_mid(text):
    cursor.execute("""SELECT mid FROM fb_two_subject_name 
                  WHERE alias = %s""", (text,))
    return list([r[0] for r in cursor.fetchall()])

def cached_aliases_to_mids(aliases):
    mids = []
    for alias in aliases:
        mids.extend(cached_alias_to_mid(alias))
    return mids

@lru_cache(maxsize=65536)
def cached_normalized_alias_to_alias(text):
    cursor.execute("""SELECT DISTINCT alias FROM fb_two_subject_name 
                  WHERE normalized_alias = %s""", (text,))
    return list([r[0] for r in cursor.fetchall()])

@lru_cache(maxsize=65536)
def cached_preprocessed_alias_to_alias(text):
    cursor.execute("""SELECT DISTINCT alias FROM fb_two_subject_name 
                  WHERE preprocessed_alias = %s""", (text,))
    return list([r[0] for r in cursor.fetchall()])

# @lru_cache(maxsize=65536)
# def cached_normalized_alias_to_alias(text):
#     cursor.execute("""SELECT DISTINCT alias FROM fb_two_subject_name 
#                   WHERE normalized_alias = %s
#                   OR normalized_alias LIKE %s""", (text, '%' + text + '%'))
#     return list([r[0] for r in cursor.fetchall()])

## Generate Candidates - Baseline

In [10]:
from scripts.utils.edit_distance import edit_token_distance
from Levenshtein import distance
from scripts.utils.table import format_pipe_table

negative_sample = []
candidates_mids = []

for index, row in tqdm_notebook(df_answerable.iterrows(), total=df_answerable.shape[0]):
    predicted_subject_name = row['predicted_subject_name']
    candidate_aliases = cached_preprocessed_alias_to_alias(predicted_subject_name)
    candidates_mids.append(cached_aliases_to_mids(candidate_aliases))

evaluate_candidates(candidates_mids)

Precision: 0.982345 [9904 of 10082]
Recall: 0.946844 [10082 of 10648]
Expected Guessing Accuracy: 0.637027 [6783 of 10648]


### Version 1

For the first version, we will try to follow the strategy in `Normalized Reference Resolution#HYPOTHESIS - Subject Name not in Question.ipynb` to link more aliases to questions.

In [11]:
from scripts.utils.table import format_pipe_table

negative_sample = []
candidates_mids = []

for index, row in tqdm_notebook(df_answerable.iterrows(), total=df_answerable.shape[0]):
    predicted_subject_name = row['predicted_subject_name']
    candidate_aliases = cached_preprocessed_alias_to_alias(predicted_subject_name)
    if len(candidate_aliases) == 0:
        # NOTE: Normalized alias has a broader reach; therefore, we only use it if the first check failed.
        # We found this increased precision and expected guessing accuracy to add the check.
        # Past Numbers:
        # - Precision: 0.982796 [9997 of 10172]
        # - Recall: 0.955297 [10172 of 10648]
        # - Expected Guessing Accuracy: 0.636317 [6775 of 10648]
        candidate_aliases = cached_normalized_alias_to_alias(text_normalize(predicted_subject_name))
    candidates_mids.append(cached_aliases_to_mids(candidate_aliases))

    if len(candidate_aliases) == 0:
        negative_sample.append({
            'Predicted Subject': predicted_subject_name,
            'Normalized Predicted Subject': text_normalize(predicted_subject_name),
            'Correct Alias': row['subject_name'],
            'Normalized Correct Alias': text_normalize(row['subject_name']),
            'Question': row['question']
        })
    
evaluate_candidates(candidates_mids)
print(format_pipe_table(negative_sample[100:150], columns=['Predicted Subject',
                                                           'Correct Alias',
                                                           'Question']))

importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Subject Name not in Question.ipynb



Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




Precision: 0.982206 [9991 of 10172]
Recall: 0.955297 [10172 of 10648]
Expected Guessing Accuracy: 0.643142 [6848 of 10648]
| Index | Predicted Subject | Correct Alias | Question |
| --- | --- | --- | --- |
| 0 | riot grrrl music | riot grrrl | what artist creates riot grrrl music |
| 1 | fundamentalist | the reluctant fundamentalist | who was the female actress that directed the reluctant fundamentalist? |
| 3 | the the champs | the champs | What is the the champs known as? |
| 4 | lable century media records | century media records | what artist is on the lable century media records? |
| 5 | the mihinthale | mihinthale | what religion is practiced at the mihinthale |
| 6 | black coat color | black | which dog breed has black coat color |
| 7 | the division i ( ncaa ) | division i (ncaa) | what is a team in the division i (ncaa)? |
| 8 | the karađorđevic dynasty | karađorđević dynasty | what country is the karađorđević dynasty from |
| 9 | the shadow play | shadow play | what subject 

#### Analysis

##### Numbers:

Version 0
- Precision: 0.982345 [9904 of 10082]
- Recall: 0.946844 [10082 of 10648]
- Expected Guessing Accuracy: 0.637027 [6783 of 10648]

Version 1
- Precision: 0.982206 [9991 of 10172]
- Recall: 0.955297 [10172 of 10648]
- Expected Guessing Accuracy: 0.643142 [6848 of 10648]

Recall increased by 0.008453.
Precision decreased by 0.000139.

##### Error Bucket:

**Discussion:**

Most of the cases the correct span a subset of the predicted subject; therefore, we need to follow up with some method for ngrams. 

An easier case to tackle is Longer and Shorter, Article. We can query for longer aliases and attempt to remove a article.

**Buckets:**
- Shorter (29 / 50) The predicted subject is longer than the correct alias.
    - Article (10 / 50) Part of the difference is due to an extra article.
- Similar (8 / 50) The predicted subject has the correct span but the correct alias is a bit different.
- Longer (10 / 50) The predicted subject is longer than the predicted subject.
    - Article (2 / 50) Part of the difference is due to an extra article.
- Misaligned (3 / 50) The alias and the predicted subject intersect but not shorter or longer.


| Index | Bucket | Predicted Subject | Correct Alias | Question |
| --- | --- | --- | --- | --- |
| 0 | Shorter | riot grrrl music | riot grrrl | what artist creates riot grrrl music |
| 1 | Shorter | fundamentalist | the reluctant fundamentalist | who was the female actress that directed the reluctant fundamentalist? |
| 2 | Shorter | stormwarning classifed | stormwarning | What genre is the album stormwarning classifed under |
| 3 | Shorter, Article | the the champs | the champs | What is the the champs known as? |
| 4 | Shorter | lable century media records | century media records | what artist is on the lable century media records? |
| 5 | Shorter, Article | the mihinthale | mihinthale | what religion is practiced at the mihinthale |
| 6 | Shorter | black coat color | black | which dog breed has black coat color |
| 7 | Shorter, Article | the division i ( ncaa ) | division i (ncaa) | what is a team in the division i (ncaa)? |
| 8 | Shorter, Article | the karađorđevic dynasty | karađorđević dynasty | what country is the karađorđević dynasty from |
| 9 | Shorter, Article | the shadow play | shadow play | what subject is the shadow play written about |
| 10 | Shorter | author and historian camil muresanu | camil mureșanu | Author and historian camil mureşanu was influenced by which person? |
| 11 | Shorter | the theater production of the island | the island | Where was the theater production of the island performed? |
| 12 | Longer | godbout v. longueuil | godbout v longueuil (city of) | what court handled the godbout v. longueuil case? |
| 13 | Misaligned | good breakfast | breakfast food | what's a good breakfast food |
| 14 | Similar | raymond meier | raymond a. meier | Which city was raymond meier born in |
| 15 | Shorter | brothers in arms ' language | brothers in arms | What is brothers in arms' language filmed in? |
| 16 | Shorter, Article | the afterglow fil | afterglow | who did the music for the afterglow fil, |
| 17 | Shorter | diet of veganism | veganism | What kind of alcohol is allowed on diet of veganism? |
| 18 | Shorter, Article | the cosmic egg | cosmic egg | which country was the cosmic egg released in |
| 19 | Shorter | latin pop music | latin pop | who is an artist that creates latin pop music |
| 20 | Longer | pat sin leng wildfire | 1996 pat sin leng wildfire | what did the 1996 pat sin leng wildfire happen |
| 21 | Shorter | star wars : jedi reading game | star wars: jedi reading | which company created the star wars: jedi reading game |
| 22 | Similar | prince georges county | prince george's county | what is in prince georges county |
| 23 | Longer | i shall be | i shall be released | what release is i shall be released on  |
| 24 | Shorter | verve music group | verve | Name an artist under the verve music group record label |
| 25 | Similar | single - player mode | single-player video game | what is a single-player mode game? |
| 26 | Similar | the godfathers revenge | the godfather's revenge | what is the genre for the book the godfathers revenge |
| 27 | Shorter | wwj ( am ) radio station | wwj | What kind of content does wwj (am) radio station offer? |
| 28 | Similar | kevin walshs | kevin walsh | what is  kevin walshs nationality |
| 29 | Shorter | dance - pop musical | dance-pop | What album is classified under the Dance-Pop musical genre? |
| 30 | Shorter | k - pop singer | k-pop | who is the most influential female k-pop singer? |
| 31 | Shorter | deconstructivism sructure | deconstructivism | What kind of architectural style is exhibited by the deconstructivism sructure |
| 32 | Longer |985 | area code 985 | which city as the area code 985 |
| 33 | Shorter, Article |the saint novel | saint | What type of book is the saint novel? |
| 34 | Misaligned | surrey county | north carolina | What is a city in Surrey County, north carolina? |
| 35 | Longer | immunocompromised | immunocompromised host | Being a immunocompromised host can lead to what disease? |
| 36 | Similar | indie films | indie film | what are indie films? |
| 37 | Shorter, Article | the the stranglers | the stranglers | The the stranglers could be best called |
| 38 | Shorter, Article | the piano concerto no . 1 | piano concerto no. 1 | which country was the piano concerto no. 1 released in |
| 39 | Shorter, Similar | dance - pop music | dance-pop | what artist makes dance-pop music? |
| 40 | Shorter | river paraul noroios | pârâul noroios | Which country is the river pârâul noroios in |
| 41 | Longer | battle of james island | first battle of james island | Name a soldier involved in the battle of james island. |
| 42 | Longer | endocrine neoplasia i | multiple endocrine neoplasia i | which genome is the gene multiple endocrine neoplasia i in |
| 43 | Shorter | military topics | military | What is a book that deals with military topics |
| 44 | Similar | woodys gone | woody's gone | who recorded woodys gone |
| 45 | Longer, Article | new york shakespeare festival | the new york shakespeare festival | which 1983 plays were in the new york shakespeare festival? |
| 46 | Shorter | the realm of science fiction | science fiction | what is an example of a book i the realm of science fiction |
| 47 | Longer, Article | register - herald | the register-herald | Who is the owner of the the register-herald newspaper? |
| 48 | Misaligned | beast game | altered beast | who is the creator of the altered beast game |
| 49 | Longer | christian nation | letter to a christian nation | What did the letter to a christian nation dispute the value of? |


## Generate Candidates - Version 2

Here we tackle the error buckets "Longer" and "Shorter, Article" (20 / 50).

In [12]:
@lru_cache(maxsize=65536)
def cached_super_normalized_alias_to_alias(text):
    cursor.execute("""SELECT DISTINCT alias FROM fb_two_subject_name 
                  WHERE normalized_alias LIKE %s""", ('%' + text + '%',))
    return list([r[0] for r in cursor.fetchall()])

In [13]:
candidates_mids = []

for index, row in tqdm_notebook(df_answerable.iterrows(), total=df_answerable.shape[0]):
    # Exact Match
    predicted_subject_name = row['predicted_subject_name']
    candidate_aliases = cached_preprocessed_alias_to_alias(predicted_subject_name)
    
    # Punctuation Differences
    if len(candidate_aliases) == 0:
        candidate_aliases = cached_normalized_alias_to_alias(text_normalize(predicted_subject_name))
        
    # Longer Aliases
    if len(candidate_aliases) == 0:
        candidate_aliases = cached_super_normalized_alias_to_alias(text_normalize(predicted_subject_name))
        # Same strategy as Normalized Reference Resolution#HYPOTHESIS - Subject Name not in Question.ipynb
        # Precision: 0.982235 [10063 of 10245]
        # Recall: 0.962153 [10245 of 10648]
        # Expected Guessing Accuracy: 0.648522 [6905 of 10648]
        normalized_question = text_normalize(row['question'])
        candidate_aliases = [a for a in candidate_aliases if text_normalize(a) in normalized_question]
        
    # Shorter, Article Aliases
    if len(candidate_aliases) == 0:
        predicted_subject_name_tokens = predicted_subject_name.split()
        if predicted_subject_name_tokens[0].lower() == 'the':
            shorter_article_predicted_subject_name = ' '.join(predicted_subject_name_tokens[1:])
            shorter_article_predicted_subject_name = text_normalize(shorter_article_predicted_subject_name)
            candidate_aliases = cached_normalized_alias_to_alias(shorter_article_predicted_subject_name)

    candidates_mids.append(cached_aliases_to_mids(candidate_aliases))

evaluate_candidates(candidates_mids)


Precision: 0.982252 [10128 of 10311]
Recall: 0.968351 [10311 of 10648]
Expected Guessing Accuracy: 0.653240 [6955 of 10648]


#### Analysis

##### Numbers:

Version 0
- Precision: 0.982345 [9904 of 10082]
- Recall: 0.946844 [10082 of 10648]
- Expected Guessing Accuracy: 0.637027 [6783 of 10648]

Version 1
- Precision: 0.982206 [9991 of 10172]
- Recall: 0.955297 [10172 of 10648]
- Expected Guessing Accuracy: 0.643142 [6848 of 10648]
    
Version 2
- Precision: 0.982252 [10128 of 10311]
- Recall: 0.968351 [10311 of 10648]
- Expected Guessing Accuracy: 0.653240 [6955 of 10648]

Recall increased by 0.013054 from Version 1 to 2.
Precision increased by .000046 from Version 1 to 2.

**Discussion:**

The aditions only provided positive results. We now move on to shorter aliases.

## Generate Candidates - Version 3

Before we look for shorter substrings, it's important to look for similar length substrings that are similar (10 / 50)

In [61]:
# Helper method to pay with the metric
def pg_trgm_similarity(text, other_text):
    cursor.execute('SELECT similarity(%s, %s);', (text, other_text))
    similarity = cursor.fetchall()[0][0]
    return similarity
                   
pg_trgm_similarity('hi', 'hey')

0.166667

In [86]:
@lru_cache(maxsize=65536)
def cached_similar_normalized_alias_to_alias(text):
    cursor.execute("""SELECT set_limit(0.8);
                    SELECT DISTINCT alias FROM fb_two_subject_name 
                  WHERE normalized_alias %% %s""", (text,))
    return list([r[0] for r in cursor.fetchall()])

In [87]:
from scripts.utils.edit_distance import edit_token_distance
from Levenshtein import distance
from scripts.utils.table import format_pipe_table
import json

candidates_mids = []

def lookup_alias_fuzzy(question, text):
    candidate_aliases = cached_similar_normalized_alias_to_alias(text_normalize(text))
    normalized_question = text_normalize(question)
#     candidate_aliases = [a for a in candidate_aliases if text_normalize(a) in normalized_question]
    # TODO: Look into using edit_token_distance
    # TODO: Look into set_limit
    # TODO: Try ranking by difference from predicted subject name
    if len(candidate_aliases) > 0:
        max_length = len(max(candidate_aliases, key=lambda a: len(a)))
        candidate_aliases = [a for a in candidate_aliases if len(a) == max_length]
    return candidate_aliases


for index, row in tqdm_notebook(df_answerable.iterrows(), total=df_answerable.shape[0]):
    # Exact Match
    predicted_subject_name = row['predicted_subject_name']
    candidate_aliases = cached_preprocessed_alias_to_alias(predicted_subject_name)
    
    # Punctuation Differences
    if len(candidate_aliases) == 0:
        candidate_aliases = cached_normalized_alias_to_alias(text_normalize(predicted_subject_name))
        
        
    # Longer Aliases
    if len(candidate_aliases) == 0:
        candidate_aliases = cached_super_normalized_alias_to_alias(text_normalize(predicted_subject_name))
        normalized_question = text_normalize(row['question'])
        candidate_aliases = [a for a in candidate_aliases if text_normalize(a) in normalized_question]
        
    # Shorter, Article Aliases
    if len(candidate_aliases) == 0:
        predicted_subject_name_tokens = predicted_subject_name.split()
        if predicted_subject_name_tokens[0] in 'the':
            shorter_article_predicted_subject_name = ' '.join(predicted_subject_name_tokens[1:])
            candidate_aliases = lookup_alias_fuzzy(row['question'], shorter_article_predicted_subject_name)
            
    # Similar
    if len(candidate_aliases) == 0:
        candidate_aliases = lookup_alias_fuzzy(row['question'], text_normalize(predicted_subject_name))
            
# Precision: 0.981792 [10191 of 10380]
# Recall: 0.974831 [10380 of 10648]
# Expected Guessing Accuracy: 0.658173 [7008 of 10648]

# Precision: 0.981792 [10191 of 10380]
# Recall: 0.974831 [10380 of 10648]
# Expected Guessing Accuracy: 0.658300 [7009 of 10648]

# Precision: 0.981708 [10197 of 10387]
# Recall: 0.975488 [10387 of 10648]
# Expected Guessing Accuracy: 0.658023 [7006 of 10648]

# Precision: 0.981605 [10192 of 10383]
# Recall: 0.975113 [10383 of 10648]
# Expected Guessing Accuracy: 0.658551 [7012 of 10648]

# Precision: 0.981605 [10192 of 10383]
# Recall: 0.975113 [10383 of 10648]
# Expected Guessing Accuracy: 0.658551 [7012 of 10648]

# Precision: 0.981264 [10213 of 10408]
# Recall: 0.977461 [10408 of 10648]
# Expected Guessing Accuracy: 0.660366 [7031 of 10648]

# Precision: 0.974048 [10284 of 10558]
# Recall: 0.991548 [10558 of 10648]
# Expected Guessing Accuracy: 0.664358 [7074 of 10648]

# Allow for etc.
# Precision: 0.980584 [10202 of 10404]
# Recall: 0.977085 [10404 of 10648]
# Expected Guessing Accuracy: 0.659773 [7025 of 10648]
    

    candidates_mids.append(cached_aliases_to_mids(candidate_aliases))

evaluate_candidates(candidates_mids)


Precision: 0.980584 [10202 of 10404]
Recall: 0.977085 [10404 of 10648]
Expected Guessing Accuracy: 0.659773 [7025 of 10648]


In [None]:
    # Shorter
    if len(candidate_aliases) == 0:
        predicted_subject_name_tokens = predicted_subject_name.split()
        start_index = row['predicted_start_index']
        end_index = row['predicted_end_index']
        assert end_index - start_index == len(predicted_subject_name_tokens)
        tag_confidence = json.loads(row['tag_confidence'])
        for i in range(len(predicted_subject_name_tokens) - 1):
            if tag_confidence[start_index] < tag_confidence[end_index - 1]:
                predicted_subject_name_tokens = predicted_subject_name_tokens[1:]
                start_index += 1
            else:
                predicted_subject_name_tokens = predicted_subject_name_tokens[:-1]
                end_index -= 1
            shorter_predicted_subject_name = ' '.join(predicted_subject_name_tokens)
            shorter_predicted_subject_name = text_normalize(shorter_predicted_subject_name)
            candidate_aliases = cached_normalized_alias_to_alias(shorter_predicted_subject_name)
            candidate_aliases = [a for a in candidate_aliases if text_normalize(a) in normalized_question]
            if len(candidate_aliases) > 0:
                if row['subject_name'] not in candidate_aliases:
                    negative_samples.append({
                        'Similarity': pg_trgm_similarity(text_normalize(predicted_subject_name), text_normalize(row['subject_name'])),
                        'Shorter Predicted Subject Name': shorter_predicted_subject_name,
                        'Predicted Subject': predicted_subject_name,
                        'Tag Confidence': [round(c, 2) for c in tag_confidence],
                        'Correct Alias': row['subject_name'],
                        'Question': row['question']
                    })
                break;

In [26]:
from scripts.utils.edit_distance import edit_token_distance
from Levenshtein import distance
from scripts.utils.table import format_pipe_table
import importlib
import scripts.utils.import_notebook

normalize = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize

correct = 0
skipped = 0
expected_accuracy = 0
total = 0
negative_sample = []

for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
    if not isinstance(row['subject_name'], str):
        continue
        
    predicted_subject_name = row['predicted_subject_name']
    # Normalized aliases between question and alias for exact linking
    candidate_aliases = cached_normalized_alias_to_alias(normalize(predicted_subject_name))
    # Same strategy as Normalized Reference Resolution#HYPOTHESIS - Subject Name not in Question.ipynb
    normalized_question = normalize(row['question'])
    candidate_aliases = [a for a in candidate_aliases if normalize(a) in normalized_question]
    if len(candidate_aliases) > 0:
        scores = [len(normalize(a)) for a in candidate_aliases]
        max_score = max(scores)
        # Without this filter:
        # - Precision: 0.988872 [10130 of 10244]
        # - Recall: 0.962059 [10244 of 10648]
        # - Expected Guessing Accuracy: 0.643938 [6856 of 10648]
        # Post filter:
        # - Precision: 0.978914 [10028 of 10244]
        # - Recall: 0.962059 [10244 of 10648]
        # - Expected Guessing Accuracy: 0.647982 [6899 of 10648]
        top_aliases = [a for i, a in enumerate(candidate_aliases) if scores[i] == max_score]
        mids = []
        for alias in top_aliases:
            mids.extend(cached_alias_to_mid(alias))
    else:
        skipped += 1
        top_aliases = ''
        mids = []
    
    total += 1
    if row['subject'] in mids:
        correct += 1
        expected_accuracy += 1 / len(mids)
    elif len(candidate_aliases) > 0:
        negative_sample.append({
            'Predicted Subject': predicted_subject_name,
            'Predicted Aliases': top_aliases,
            'Correct Alias': row['subject_name'],
            'Question': row['question'],
            'Number of Candidates': len(candidate_aliases),  
        })
    
print('Precision: %f [%d of %d]' % (correct / (total - skipped), correct, (total - skipped)))
print('Recall: %f [%d of %d]' % ((total - skipped) / total, (total - skipped), total))
print('Expected Guessing Accuracy: %f [%d of %d]' % (expected_accuracy / total, expected_accuracy, total))
print(format_pipe_table(negative_sample[100:150], columns=['Number of Candidates', 'Predicted Aliases',
                                                           'Correct Alias',
                                                           'Predicted Subject', 
                                                           'Question']))
print()


Precision: 0.978914 [10028 of 10244]
Recall: 0.962059 [10244 of 10648]
Expected Guessing Accuracy: 0.647982 [6899 of 10648]
| Index | Number of Candidates | Predicted Aliases | Correct Alias | Predicted Subject | Question |
| --- | --- | --- | --- | --- | --- |
| 0 | 1 | ['la folie du doute'] | male | la folie du doute | Which male character is from the film la Folie Du Doute? |
| 1 | 2 | ['1967 re'] | 1967 | 1967 | in what region was 1967 released |
| 2 | 3 | ['comedy film'] | todd solondz | comedy | What is the name of the comedy film created by todd solondz?  |
| 3 | 1 | ['religion'] | theudigisel | religion | theudigisel's religion can be said to be? |
| 4 | 1 | ['dallas'] | post-hardcore | dallas | what is the name of an American post-hardcore band formed in 2010 in Dallas |
| 5 | 2 | ['icelandic'] | iceland | iceland | what icelandic artist sings the song iceland |
| 6 | 2 | ['st. paris'] | paris | paris | What is a track done by the artist paris |
| 7 | 1 | ['diamonds are forev

## Analysis - Version 1


### Discussion

#### Numbers:

- Precision: 0.978914 [10028 of 10244]
- Recall: 0.962059 [10244 of 10648]
- Expected Guessing Accuracy: 0.647982 [6899 of 10648]

#### Error Bucket:

**Discussion:**


**Buckets:**
- Wrong Span 
- Correct Span
- Wrong Alias
- Substring
- Extra Article
- Not in Question

| Index | Bucket | Number of Candidates | Predicted Aliases | Correct Alias | Predicted Subject | Question |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | Wrong Span | 1 | ['la folie du doute'] | male | la folie du doute | Which male character is from the film la Folie Du Doute? |
| 1 | Substring | 2 | ['1967 re'] | 1967 | 1967 | in what region was 1967 released |
| 2 | Wrong Span | 3 | ['comedy film'] | todd solondz | comedy | What is the name of the comedy film created by todd solondz?  |
| 3 | Wrong Span | 1 | ['religion'] | theudigisel | religion | theudigisel's religion can be said to be? |
| 4 | Wrong Span | 1 | ['dallas'] | post-hardcore | dallas | what is the name of an American post-hardcore band formed in 2010 in Dallas |
| 5 | Correct Span, Wrong Alias | 2 | ['icelandic'] | iceland | iceland | what icelandic artist sings the song iceland |
| 6 | Substring | 2 | ['st. paris'] | paris | paris | What is a track done by the artist paris |
| 7 | Wrong Span | 1 | ['diamonds are forever'] | female | diamonds are forever | Name a female character in the film Diamonds are Forever |
| 8 | Wrong Span | 1 | ['compilation album'] | album | compilation album | what is a compilation album |
| 9 | Extra Article | 2 | ['a song'] | song | song | What's an example of a song |
| 10 | Extra Article | 2 | ['the korean war'] | korean war | korean war | what Army veteran recipient of the Medal of Honor fought in the korean war |
| 11 | Wrong Span | 1 | ['midway games'] | sports game | midway games | What's a sports game made by midway games |
| 12 | Extra Article | 3 | ['the thriller'] | thriller | thriller | what film belongs to the  thriller (genre) |
| 13 | Correct? | 1 | ['chancellor of germany'] | chancellor | chancellor of germany | if someone is the chancellor of germany, what is their title  |
| 14 | Substring | 2 | ['e album'] | album | album | Which album was also a release album? |
| 15 | Extra Article | 2 | ['the battle of gettysburg'] | battle of gettysburg | battle of gettysburg | Who was a general involved in the battle of gettysburg |
| 16 | Not in Question | 1 | ['german'] | germans | german | who is german? |
| 17 | Substring | 4 | ['t drama'] | drama | drama | What drama came out in 2011 |
| 18 | Substring | 2 | ['navigator i'] | navigator | navigator | what release is navigator in? |
| 19 | Wrong Span | 1 | ['wyoming'] | national park | wyoming | what is a national park in Wyoming  |
| 20 | Wrong Span | 1 | ['deep purple'] | album | deep purple | which albums were released by deep purple? |
| 21 | Substring | 2 | ['david b.'] | david | david | What album can david be found on? |
| 22 | Extra Article | 2 | ['a western'] | western | western | what is the tittle of a western (genre) movie on netflix |
| 23 | Extra Article | 2 | ['the korean war'] | korean war | korean war | Who was a notable person in the korean war? |
| 24 | Wrong Span | 1 | ['recurring'] | film festival | recurring | what film festival is recurring? |
| 25 | Wrong Alias  | 2 | ['work of fiction'] | fiction | fiction | what book is a work of fiction? |
| 26 | Wrong Alias | 2 | ['radio program'] | radio | radio | what show is a radio program? |
| 27 | Extra Article | 2 | ['in copenhagen'] | copenhagen | copenhagen | what actor was born in copenhagen |
| 28 | Wrong Span | 1 | ['the sun'] | solar system | the sun | What is the name of a solar system that orbits the sun? |
| 29 | Substring | 2 | ['s. william green'] | william green | william green | what is william green's sex |
| 30 | Wrong Span | 1 | ['l7'] | compilation album | l7 | what's a compilation album focusing on the band L7 |
| 31 | Wrong Span | 1 | ['so hard to forget'] | sao paulo | so hard to forget | Name a Brazilian actress born in sao paulo who played a role in the 2010 So Hard to Forget movie |
| 32 | Wrong Span | 1 | ['piano jazz'] | jazz | piano jazz | what is the name of the live solo piano jazz album  |
| 33 | Substring | 2 | ['claude debussy (i)'] | claude debussy | claude debussy | Which Russian composer did claude debussy influence? |
| 34 | Extra Article | 2 | ['the korean war'] | korean war | korean war | which military personnel was involved in the korean war |
| 35 | Correct? | 2 | ['william walls'] | william wall | william wall | What is william wall's gender? |
| 36 | Extra Article | 2 | ['in chicago'] | chicago | chicago | whose existence began in chicago |
| 37 | Wrong Span | 1 | ['daz dillinger'] | album | daz dillinger | What's an album by daz dillinger |
| 38 | Wrong Span | 1 | ['compilation album'] | album | compilation album | which compilation album was released in 2005? |
| 39 | Extra Article | 1 | ['the bahamas'] | bahamas | the bahamas | who was the ruler of the bahamas? |
| 40 | Other | 2 | ['about religion'] | religion | religion | what work is about religion? |
| 41 | Correct? | 2 | ['ken owens'] | ken owen | ken owen | what is ken owen's nationality  |
| 42 | Substring | 2 | ['f. richard jones'] | richard jones | richard jones | Who is the child of richard jones (bassist)? |
| 43 | Extra Article | 2 | ['in ireland'] | harp | ireland | What type of harp is traditionally found in ireland |
| 44 | Extra Article |  3 | ['a history'] | history | history | What is an example of a history film? |
| 45 | Extra Article |  4 | ['a méxico'] | mexico | mexico | What is the title of a mexico netflix film? |
| 46 | Extra Article |  2 | ['the fantasy'] | fantasy | fantasy | what is a film in the fantasy genre? |
| 47 | Wrong Span | 1 | ['bob saget'] | album | bob saget | which comedy album was recorded by bob saget? |
| 48 | Extra Article | 5 | ['in washington, dc'] | washington, d.c. | washington , d.c . | WHo is someone that was born in washington, d.c. |
| 49 | Wrong Span | 1 | ['2005'] | album | 2005 | what is an album from 2005 |

## Generate Candidates - Version 2

In [33]:
from scripts.utils.edit_distance import edit_token_distance
from Levenshtein import distance
from scripts.utils.table import format_pipe_table
import importlib
import scripts.utils.import_notebook

normalize = importlib.import_module(
                "scripts.Simple QA Numbers.HYPOTHESIS - Subject Name not in Question").normalize

correct = 0
skipped = 0
expected_accuracy = 0
total = 0
negative_sample = []

def get_mids(aliases):
    mids = []
    for alias in aliases:
        mids.extend(cached_alias_to_mid(alias))
    return mids

for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
#     if not isinstance(row['subject_name'], str):
#         continue
        
    mids = []
    predicted_subject_name = row['predicted_subject_name']
    # Normalized aliases between question and alias for exact linking
    candidate_aliases = cached_normalized_alias_to_alias(normalize(predicted_subject_name))
    if predicted_subject_name in candidate_aliases:
        top_aliases = [predicted_subject_name]
        mids = get_mids(top_aliases)
    # Without ELSE Case
    # Pecision: 0.979686 [8874 of 9058]
    # Recall: 0.850676 [9058 of 10648]
    # Expected Guessing Accuracy: 0.558521 [5947 of 10648]
    else:
        # Same strategy as Normalized Reference Resolution#HYPOTHESIS - Subject Name not in Question.ipynb
        normalized_question = ' ' + normalize(row['question']) + ' '
        candidate_aliases = [a for a in candidate_aliases if ' ' + normalize(a) + ' ' in normalized_question]
        if len(candidate_aliases) > 0:
            scores = [len(normalize(a)) for a in candidate_aliases]
            max_score = max(scores)
            # Without this filter:
            # Precision: 0.981485 [10019 of 10208]
            # Recall: 0.958678 [10208 of 10648]
            # Expected Guessing Accuracy: 0.648091 [6900 of 10648]
            
            # Post filter:
            # Precision: 0.981289 [10017 of 10208]
            # Recall: 0.958678 [10208 of 10648]
            # Expected Guessing Accuracy: 0.648178 [6901 of 10648]
            top_aliases = [a for i, a in enumerate(candidate_aliases) if scores[i] == max_score]
            mids = get_mids(top_aliases)

    if len(mids) == 0:
        skipped += 1
    
    total += 1
    if row['subject'] in mids:
        correct += 1
        expected_accuracy += 1 / len(mids)
    elif len(candidate_aliases) > 0:
        negative_sample.append({
            'Predicted Subject': predicted_subject_name,
            'Predicted Aliases': top_aliases,
            'Correct Alias': row['subject_name'],
            'Question': row['question'],
            'Number of Candidates': len(candidate_aliases),  
        })
    
print('Precision: %f [%d of %d]' % (correct / (total - skipped), correct, (total - skipped)))
print('Recall: %f [%d of %d]' % ((total - skipped) / total, (total - skipped), total))
print('Expected Guessing Accuracy: %f [%d of %d]' % (expected_accuracy / total, expected_accuracy, total))
print(format_pipe_table(negative_sample[150:200], columns=['Number of Candidates', 'Predicted Aliases',
                                                           'Correct Alias',
                                                           'Predicted Subject', 
                                                           'Question']))
print()


Precision: 0.974891 [10017 of 10275]
Recall: 0.947441 [10275 of 10845]
Expected Guessing Accuracy: 0.636404 [6901 of 10845]
| Index | Number of Candidates | Predicted Aliases | Correct Alias | Predicted Subject | Question |
| --- | --- | --- | --- | --- | --- |
| 0 | 1 | ['minotaur: the labyrinths of crete'] | nan | minotaur : the labyrinths of crete | What game is a version of minotaur: the labyrinths of crete |
| 1 | 22 | ['lara croft'] | nan | lara croft | Name a music album from lara croft |
| 2 | 811 | ['ireland'] | harp | ireland | What type of harp is traditionally found in ireland |
| 3 | 1267 | ['never'] | never known | never | The composition never known is performed in which popular language? |
| 4 | 1 | ['elemental: the power of illuminated love'] | nan | elemental : the power of illuminated love | what type of novel is elemental: the power of illuminated love? |
| 5 | 436 | ['drama'] | korean drama | drama | what is a korean drama? |
| 6 | 315 | ['vay'] | nan | vay | who 