# Step 2 - Generate Candidates

Our goal during this step is to generate candidate relations and aliases.

In [1]:
import sys
sys.path.insert(0, '../../')

In [2]:
from scripts.utils.connect import get_connection 
from scripts.utils.data import FB2M_NAME_TABLE

# Open the project's database connection and a single cursor.
# Later cells reuse this module-level `cursor` for every query and call
# `connection.commit()` explicitly after writes.
connection = get_connection()
cursor = connection.cursor()

In [12]:
import pandas as pd
from tqdm import tqdm_notebook

# Register tqdm's progress-bar hooks on pandas objects.
tqdm_notebook().pandas()

# Load the predictions pickled by step 1 and preview the first five rows.
# NOTE(review): unpickling executes arbitrary code; only load files produced by this pipeline.
df = pd.read_pickle('step_1_predict_subject_name.pkl')
df.head(5)




Unnamed: 0,end_index,object,predicted_subject_names,question,question_tokens,relation,start_index,subject,subject_name,subject_name_tokens
6219,,0bs56bp,"[{'name': 'american thoroughbread', 'score': 1...",Name an American Thoroughbread racehorse,"[name, an, american, thoroughbread, racehorse]",biology/organism_classification/organisms_of_t...,,03k3r,,
3364,9.0,01sjng,"[{'name': 'vision racing driving simulator', '...",what kind of game is vision racing driving sim...,"[what, kind, of, game, is, vision, racing, dri...",cvg/computer_videogame/cvg_genre,5.0,02qlppc,vision racing driving simulator,"(vision, racing, driving, simulator)"
9374,6.0,0dlmm88,"[{'name': 'romance film', 'score': 28.02931404...",what tv program is romance film,"[what, tv, program, is, romance, film]",tv/tv_genre/programs,4.0,02l7c8,romance film,"(romance, film)"
10142,4.0,04rrx,"[{'name': 'polaski', 'score': 32.1325416564941...",what state is polaski located in,"[what, state, is, polaski, located, in]",location/location/containedby,3.0,049_zj3,polaski,"(polaski,)"
97,8.0,0qcr0,"[{'name': 'fern emmett', 'score': 23.679399490...",what disease claimed the life of fern emmett,"[what, disease, claimed, the, life, of, fern, ...",people/deceased_person/cause_of_death,6.0,02w9ycr,fern emmett,"(fern, emmett)"


Define text preprocessing identical to the preprocessing applied to the training data and in step 1.

In [4]:
import importlib
import scripts.utils.import_notebook
import re

# The notebook name contains spaces, so it cannot be imported with a plain
# `import` statement; resolve it once through importlib and pull both helpers
# off the resulting module object.
_subject_recognition_data = importlib.import_module(
    "scripts.Simple QA Models.Subject Recognition Data")
preprocess = _subject_recognition_data.preprocess
tokenize = _subject_recognition_data.spacy_tokenize

def text_preprocess(s):
    """Apply the step 1 input preprocessing to `s`.

    Runs `preprocess`, tokenizes the result with `tokenize`, and returns the
    tokens joined by single spaces.
    """
    tokens = tokenize(preprocess(s))
    return ' '.join(tokens)

def text_normalize(s):
    """Preprocess `s`, then drop punctuation and collapse whitespace.

    The text first goes through `text_preprocess`. Every character that is
    neither a word character nor whitespace is then removed, runs of
    whitespace are replaced by a single space, and the result is stripped.

    Returns the normalized string.
    """
    s = text_preprocess(s)
    # In `Normalized Reference Resolution#HYPOTHESIS - Subject Name not in Question.ipynb` we found that
    # aliases and questions match up more if punctuation is removed.

    # Remove punctuation (`\w` also matches `_`, so underscores are kept)
    s = re.sub(r'[^\w\s]', '', s)
    # Removing characters can create gaps of multiple spaces
    # Substitute multiple spaces with one
    # (raw string: a plain '\s+' literal is an invalid escape sequence in Python source)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

importing Jupyter notebook from ../../scripts/Simple QA Models/Subject Recognition Data.ipynb


## Index Subject Aliases

Create an index of subject aliases that are preprocessed in the same way as the predicted subject name, so that candidates can be found with a database lookup.

In [None]:
# `IF NOT EXISTS` keeps this cell re-runnable: without it a second run raises
# and leaves the connection's transaction aborted.
cursor.execute('ALTER TABLE ' + FB2M_NAME_TABLE + ' ADD COLUMN IF NOT EXISTS normalized_alias varchar')

In [None]:
# `IF NOT EXISTS` keeps this cell re-runnable: without it a second run raises
# and leaves the connection's transaction aborted.
cursor.execute('ALTER TABLE ' + FB2M_NAME_TABLE + ' ADD COLUMN IF NOT EXISTS preprocessed_alias varchar')

In [None]:
from tqdm import tqdm_notebook
import psycopg2

# Number of rows sent to the database per batched UPDATE.
chunk_size = 10000

def update_chunk(rows):
    """Write one batch of alias columns back to the name table.

    Args:
        rows: list of `(normalized_alias, preprocessed_alias, mid, alias)`
            tuples; the last two values identify the row to update.
    """
    # A bare `import psycopg2` does not load the `extras` submodule, so import
    # it explicitly before using `execute_batch`.
    import psycopg2.extras

    if not rows:
        # Nothing to send (e.g. the trailing flush after an exact multiple of chunk_size).
        return
    query = ('UPDATE ' + FB2M_NAME_TABLE + ' SET normalized_alias = %s, preprocessed_alias = %s' +
            ' WHERE mid = %s and alias = %s')
    psycopg2.extras.execute_batch(cursor, query, rows)

cursor.execute('SELECT mid, alias FROM ' + FB2M_NAME_TABLE)
rows = []
# `fetchall()` materialises the result set first, so reusing `cursor` for the
# updates inside the loop does not disturb the iteration.
for mid, alias in tqdm_notebook(cursor.fetchall()):
    normalized_alias = text_normalize(alias)
    preprocessed_alias = text_preprocess(alias)
    rows.append((normalized_alias, preprocessed_alias, mid, alias))

    # Flush as soon as the batch holds exactly `chunk_size` rows
    if len(rows) >= chunk_size:
        update_chunk(rows)
        rows = []

# Flush the final partial batch
update_chunk(rows)

importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Subject Name not in Question.ipynb


In [15]:
# Commit the current transaction so the statements executed on `cursor` so far are persisted.
connection.commit()

In [None]:
# Index for exact lookups on `normalized_alias`.
# `IF NOT EXISTS` makes the cell safe to re-run once the index has been built.
cursor.execute('CREATE INDEX IF NOT EXISTS ' + FB2M_NAME_TABLE + '_normalized_alias ON ' + 
               FB2M_NAME_TABLE + '(normalized_alias);')
connection.commit()

In [16]:
# Index for exact lookups on `preprocessed_alias`.
# `IF NOT EXISTS` makes the cell safe to re-run once the index has been built.
cursor.execute('CREATE INDEX IF NOT EXISTS ' + FB2M_NAME_TABLE + '_preprocessed_alias ON ' + 
               FB2M_NAME_TABLE + '(preprocessed_alias);')
connection.commit()

In [None]:
# Trigram GiST index used by the `%` similarity lookup on `normalized_alias`.
# NOTE(review): `gist_trgm_ops` presumably requires the `pg_trgm` extension to be
# installed in this database — confirm before running on a fresh instance.
# `IF NOT EXISTS` makes the cell safe to re-run once the index has been built.
cursor.execute('CREATE INDEX IF NOT EXISTS ' + FB2M_NAME_TABLE + '_normalized_alias_trgm ON ' + 
               FB2M_NAME_TABLE + ' USING gist(normalized_alias gist_trgm_ops);')
connection.commit()

## Generate Candidates

If subject name is null, then the question does not refer to the true alias. The example is then unanswerable.

In [15]:
# Keep only the rows that have a subject name.
df_answerable = df.loc[df['subject_name'].notnull()]

Metrics used to evaluate different versions.

In [6]:
def evaluate_candidates(candidates_mids, examples=None):
    """Print precision, recall and expected guessing accuracy for candidate MIDs.

    Args:
        candidates_mids: one list of candidate MIDs per example, in the same
            order as the rows of `examples`. An empty list means the example
            was skipped (no candidate was generated).
        examples: DataFrame with a `subject` column holding the true MID of
            each example. Defaults to the notebook-level `df_answerable`.

    Raises:
        ValueError: if `candidates_mids` and `examples` differ in length, since
            the metrics would otherwise be computed on misaligned rows.

    Metrics printed:
        Precision: examples whose true subject is among the candidates, over
            the examples that were not skipped.
        Recall: examples that were not skipped, over all examples.
        Expected Guessing Accuracy: mean over all examples of
            `1 / len(candidates)` for examples whose candidates contain the
            true subject (0 otherwise).
    """
    if examples is None:
        examples = df_answerable

    n_examples = examples.shape[0]
    if len(candidates_mids) != n_examples:
        raise ValueError('Expected %d candidate lists, got %d' %
                         (n_examples, len(candidates_mids)))

    correct = 0
    skipped = 0
    expected_accuracy = 0

    for mids, subject in zip(candidates_mids, examples['subject']):
        if len(mids) == 0:
            skipped += 1
        elif subject in mids:
            correct += 1
            # NOTE(review): duplicates in `mids` lower this value; confirm whether
            # candidate lists are expected to be de-duplicated upstream.
            expected_accuracy += 1 / len(mids)

    attempted = n_examples - skipped
    # Guard the ratios so an all-skipped or empty run reports 0 instead of raising
    precision = correct / attempted if attempted else 0.0
    recall = attempted / n_examples if n_examples else 0.0
    guessing_accuracy = expected_accuracy / n_examples if n_examples else 0.0

    print('Precision: %f [%d of %d]' % (precision, correct, attempted))
    print('Recall: %f [%d of %d]' % (recall, attempted, n_examples))
    print('Expected Guessing Accuracy: %f [%d of %d]' %
              (guessing_accuracy, expected_accuracy, n_examples))

Basic helper functions to run experiments quickly.

In [8]:
from functools import lru_cache

@lru_cache(maxsize=65536)
def cached_alias_to_mid(text):
    """Return the list of MIDs whose `alias` equals `text` (memoized per `text`)."""
    # NOTE(review): the table name is hard-coded here while the indexing cells use
    # FB2M_NAME_TABLE — confirm both refer to the same table.
    query = """SELECT mid FROM fb_two_subject_name 
                  WHERE alias = %s"""
    cursor.execute(query, (text,))
    matches = cursor.fetchall()
    return [match[0] for match in matches]

def cached_aliases_to_mids(aliases):
    """Return the MIDs of every alias in `aliases`, concatenated in input order.

    Each alias is resolved through `cached_alias_to_mid`; a fresh list is
    returned, so duplicates are kept.
    """
    return [mid for alias in aliases for mid in cached_alias_to_mid(alias)]

@lru_cache(maxsize=65536)
def cached_normalized_alias_to_alias(text):
    """Return the distinct aliases whose `normalized_alias` equals `text` (memoized)."""
    query = """SELECT DISTINCT alias FROM fb_two_subject_name 
                  WHERE normalized_alias = %s"""
    cursor.execute(query, (text,))
    matches = cursor.fetchall()
    return [match[0] for match in matches]

@lru_cache(maxsize=65536)
def cached_preprocessed_alias_to_alias(text):
    """Return the distinct aliases whose `preprocessed_alias` equals `text` (memoized)."""
    query = """SELECT DISTINCT alias FROM fb_two_subject_name 
                  WHERE preprocessed_alias = %s"""
    cursor.execute(query, (text,))
    matches = cursor.fetchall()
    return [match[0] for match in matches]

@lru_cache(maxsize=65536)
def cached_similar_normalized_alias_to_alias(text):
    """Return the distinct aliases whose `normalized_alias` is trigram-similar to `text`.

    The statement first calls `set_limit(0.8)` and then filters with the `%`
    operator (written `%%` so it is not read as a query placeholder).
    Results are memoized per `text`.
    """
    query = """SELECT set_limit(0.8);
                    SELECT DISTINCT alias FROM fb_two_subject_name 
                  WHERE normalized_alias %% %s"""
    cursor.execute(query, (text,))
    matches = cursor.fetchall()
    return [match[0] for match in matches]

In [49]:
# Helper method to play with the metric
def pg_trgm_similarity(text, other_text):
    """Return the database's `similarity(text, other_text)` score for two strings."""
    cursor.execute('SELECT similarity(%s, %s);', (text, other_text))
    first_row = cursor.fetchall()[0]
    return first_row[0]
                   
# Query the score with the arguments in both orders to compare the results.
print(pg_trgm_similarity('hi', 'hey'))
print(pg_trgm_similarity('hey', 'hi'))

0.166667
0.166667


## Generate Candidates - Baseline

Look up the top k predicted subject names in order, stopping at the first one that matches an alias in the database.

In [25]:
from scripts.utils.edit_distance import edit_token_distance
from Levenshtein import distance
from scripts.utils.table import format_pipe_table
import json

negative_sample = []  # not used by the baseline; kept so the cell still defines the name
candidates_mids = []

# Build exactly one candidate list per answerable example, in row order, so
# that `evaluate_candidates` can match the lists to rows by position.
for index, row in tqdm_notebook(df_answerable.iterrows(), total=df_answerable.shape[0]):
    for predicted in row['predicted_subject_names']:
        candidate_aliases = cached_preprocessed_alias_to_alias(predicted['name'])

        # Stop at the first predicted name that links to at least one alias
        if len(candidate_aliases) > 0:
            candidates_mids.append(cached_aliases_to_mids(candidate_aliases))
            break
    else:
        # No predicted name linked to an alias, or there were no predictions at
        # all. `for ... else` avoids re-reading `candidate_aliases` after the
        # loop, which would be stale from the previous row when the prediction
        # list is empty.
        candidates_mids.append([])


evaluate_candidates(candidates_mids)


Precision: 0.964420 [10246 of 10624]
Recall: 0.997746 [10624 of 10648]
Expected Guessing Accuracy: 0.659801 [7025 of 10648]


### Version 1

For the first version, we will try to follow the strategy in `Normalized Reference Resolution#HYPOTHESIS - Subject Name not in Question.ipynb` to link more aliases to questions.

In [47]:
from scripts.utils.edit_distance import edit_token_distance
from Levenshtein import distance
from scripts.utils.table import format_pipe_table
import json

negative_samples = []
candidates_mids = []

# Build exactly one candidate list per answerable example, in row order, so
# that `evaluate_candidates` can match the lists to rows by position.
for index, row in tqdm_notebook(df_answerable.iterrows(), total=df_answerable.shape[0]):
    for i, predicted in enumerate(row['predicted_subject_names']):
        strategy = 'PREPROCESSED'
        candidate_aliases = cached_preprocessed_alias_to_alias(predicted['name'])

        # Punctuation Differences
        if len(candidate_aliases) == 0:
            # NOTE: Normalized alias has a broader reach; therefore, we only use it if the first check failed.
            # We found this increased precision and expected guessing accuracy to add the check.
            strategy = 'NORMALIZED'
            candidate_aliases = cached_normalized_alias_to_alias(text_normalize(predicted['name']))

        if len(candidate_aliases) == 0:
            # Neither strategy linked this predicted name; try the next one
            continue

        candidates_mids.append(cached_aliases_to_mids(candidate_aliases))
        if row['subject'] not in candidates_mids[-1]:
            # Record a miss together with every predicted name tried so far
            # (ranks 0..i), for the error analysis table.
            considered_aliases = [p['name'] for p in row['predicted_subject_names'][:i + 1]]
            negative_samples.append({
                'Preprocessed Subject Name': text_preprocess(row['subject_name']),
                'Considered Aliases': considered_aliases,
                'Max Similarity': max(pg_trgm_similarity(row['subject_name'], a)
                                      for a in considered_aliases),
                'Predicted Alias': predicted['name'],
                'Strategy': strategy,
                'Question': row['question'],
            })
        break
    else:
        # No predicted name linked to an alias, or there were no predictions at
        # all. `for ... else` avoids re-reading `candidate_aliases` after the
        # loop, which would be stale from the previous row when the prediction
        # list is empty.
        candidates_mids.append([])

evaluate_candidates(candidates_mids)
print('Negative Sample:')
print(format_pipe_table(negative_samples[:50], columns=['Strategy', 'Max Similarity',
                                                        'Preprocessed Subject Name',
                                                        'Predicted Alias',
                                                        'Considered Aliases', 'Question']))


Precision: 0.968524 [10308 of 10643]
Recall: 0.999530 [10643 of 10648]
Expected Guessing Accuracy: 0.664496 [7075 of 10648]
Negative Sample:
| Index | Strategy | Max Similarity | Preprocessed Subject Name | Predicted Alias | Considered Aliases | Question |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | PREPROCESSED | 0.0 | short | documentary film | ['documentary film'] | Name a short documentary film released in 2011 |
| 1 | PREPROCESSED | 0.722222 | red cloud 's war | the red | ['the red clouds war', 'red clouds war', 'the red clouds', 'clouds war', 'red clouds', 'the red'] | what was involved in the red clouds war? |
| 2 | PREPROCESSED | 0.75 | corporation nation | nation | ['nation book', 'corporation nation book', 'the corporation nation book', 'nation'] | what subject is the corporation nation book about |
| 3 | PREPROCESSED | 0.8 | peter 's point plantation | peters | ['peters point plantation', 'peters point', 'point plantation', 'peters'] | What is peters point plantation'

#### Analysis

##### Numbers:

Version 0
- Precision: 0.964420 [10246 of 10624]
- Recall: 0.997746 [10624 of 10648]
- Expected Guessing Accuracy: 0.659801 [7025 of 10648]

Version 1
- Precision: 0.968524 [10308 of 10643]
- Recall: 0.999530 [10643 of 10648]
- Expected Guessing Accuracy: 0.664496 [7075 of 10648]

Recall increased by 0.001784.
Precision increased by 0.004104.


##### Error Bucket:

**Discussion:**

Handling possessives would fix 10 / 50 errors. Handling the `Similar` bucket would be difficult because those errors are typically caused by extra words in the subject name that are not present in the question.

**Buckets:**
- Wrong Span (29 / 50): The wrong span in the question was selected
- Suffix (12 / 50): The correct subject name was not linked due to a suffix.
- Extra Article (3 / 50): The correct subject name was not linked due to an article.
- Similar (7 / 50): The correct subject name was similar but not exact to the predicted subject name.
- Other (1 / 50): Deeper reason that the correct subject name was not linked.

| Index | Similarity | Bucket | Strategy | Preprocessed Subject Name | Predicted Alias | Considered Aliases | Question |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 0.0 | Wrong Span | PREPROCESSED | short | documentary film | ['documentary film'] | Name a short documentary film released in 2011 |
| 1 | 0.722222 | Suffix | PREPROCESSED | red cloud 's war | the red | ['the red clouds war', 'red clouds war', 'the red clouds', 'clouds war', 'red clouds', 'the red'] | what was involved in the red clouds war? |
| 2 | 0.75 | Wrong Span | PREPROCESSED | corporation nation | nation | ['nation book', 'corporation nation book', 'the corporation nation book', 'nation'] | what subject is the corporation nation book about |
| 3 | 0.8 | Suffix | PREPROCESSED | peter 's point plantation | peters | ['peters point plantation', 'peters point', 'point plantation', 'peters'] | What is peters point plantation's architectural style |
| 4 | 0.0555556 | Wrong Span | PREPROCESSED | album | aaron carter | ['aaron carter'] | Name an album released by aaron carter |
| 5 | 1.0 | Similar | PREPROCESSED | pillows & prayers : cherry red 1982–1983 | pillows & prayers : cherry red 1982 - 1983 | ['pillows & prayers : cherry red 1982 - 1983'] | What is the name of the track list for the release pillows & prayers: cherry red 1982-1983? |
| 6 | 0.5 | Wrong Span | PREPROCESSED | commune of luxembourg | luxembourg | ['luxembourg'] | which country is the commune of luxembourg in |
| 7 | 0.764706 | Extra Article | PREPROCESSED | the hits album 6 | 6 | ['hits album 6', '6'] | what song was included in the hits album 6 |
| 8 | 0.588235 | Wrong Span | PREPROCESSED | between two women | two women | ['two women'] | what is about between two women |
| 9 | 0.782609 | Suffix | PREPROCESSED | battle of hudson 's bay | bay | ['battle of hudsons bay', 'of hudsons bay', 'battle of hudsons', 'the battle of hudsons bay', 'hudsons bay', 'did the battle of hudsons bay', 'battle of', 'bay'] | where did the battle of hudsons bay take place |
| 10 | 0.0 | Wrong Span | PREPROCESSED | tablet | hypertension | ['hypertension'] | what is a tablet used to treat hypertension  |
| 11 | 0.0 | Wrong Span | PREPROCESSED | compilation album | frank zappa | ['frank zappa'] | what compilation album did frank zappa release? |
| 12 | 0.0 | Wrong Span | PREPROCESSED | soundtrack | anthony marinelli | ['anthony marinelli'] | What's a soundtrack written by anthony marinelli |
| 13 | 0.0 | Wrong Span | PREPROCESSED | album | george canyon | ['george canyon'] | name an album by George Canyon |
| 14 | 0.0 | Wrong Span | PREPROCESSED | album | portal | ['portal'] | What's an album by the band portal |
| 15 | 0.785714 | Suffix | PREPROCESSED | megan cheng | megan | ['megan chengs', 'megan'] | whats  megan chengs ethnicity |
| 16 | 0.705882 | Wrong Span | PREPROCESSED | martial arts film | martial arts | ['martial arts'] | what is the name of the netflix martial arts film? |
| 17 | 0.0222222 | Wrong Span | PREPROCESSED | creedence clearwater revival | compilation album | ['compilation album'] | What is a compilation album by creedence clearwater revival |
| 18 | 0.227273 | Wrong Span | PREPROCESSED | topical medication | medicine | ['medicine'] | Name a topical medicine |
| 19 | 0.636364 | Wrong Span | PREPROCESSED | master | the master | ['the master'] | what is one of the master's powers  |
| 20 | 0.0 | Other | PREPROCESSED | t - town | kearny | ['kearny'] | What newspaper circulates in the town of kearny |
| 21 | 0.571429 | Suffix | PREPROCESSED | drums | drum | ['drum'] | which musician plays the drum kit |
| 22 | 0.84375 | Suffix | PREPROCESSED | dimillo 's floating restaurant | restaurant | ['dimillos floating restaurant', 'dimillos floating', 'floating restaurant', 'dimillos', 'is dimillos floating restaurant', 'dimillos floating restaurant in', 'restaurant'] | what state is dimillos floating restaurant in? |
| 23 | 0.0 | Wrong Span | PREPROCESSED | ragtime | denmark | ['denmark'] | who is the ragtime artist born in denmark? |
| 24 | 0.8 | Similar, Extra Article | PREPROCESSED | the regatta mystery | mystery | ['regatta mystery', 'mystery'] | what theme is in the piece regatta mystery |
| 25 | 0.0 | Wrong Span | PREPROCESSED | album | jack | ['jack dejohnrette', 'jack'] | What is the name of Jack DeJohnrette's album? |
| 26 | 0.0 | Wrong Span | PREPROCESSED | bollywood | tamil | ['tamil'] | what bollywood Tamil film was released in 2004  |
| 27 | 0.0 | Wrong Span | PREPROCESSED | animated cartoon | ducks | ['ducks'] | what animated cartoon was about ducks? |
| 28 | 0.0 | Wrong Span | PREPROCESSED | photography | visual art | ['visual art'] | which artist uses photography as their preferred visual art form |
| 29 | 0.761905 | Suffix | PREPROCESSED | this pud 's for you | for you | ['this puds for you comes', 'this puds for you', 'this puds for you comes from', 'puds for you comes', 'puds for you', 'this puds for', 'this puds', 'episode this puds for you comes', 'for you comes', 'puds for you comes from', 'episode this puds for you', 'for you'] | what is the series where the episode this puds for you comes from |
| 30 | 0.826087 | Suffix | NORMALIZED | chet 's speech , part ii | , part ii | ['chets speech , part ii', 'speech , part ii', 'chets speech , part', 'chets speech ,', 'chets speech', ', part ii'] | who sings chets speech, part ii |
| 31 | 0.764706 | Wrong Span | PREPROCESSED | large family car | family | ['large family', 'family'] | What car model is an example of a large family car? |
| 32 | 0.761905 | Suffix | PREPROCESSED | men 's pommel horse | pommel horse | ['mens pommel horse', 'mens pommel', 'pommel horse'] | What olympic games featured mens pommel horse |
| 33 | 0.0526316 | Wrong Span | NORMALIZED | soundtrack | s.cry.ed | ['s.cry.ed'] | What's the soundtrack for s.cry.ed |
| 34 | 0.35 | Wrong Span | PREPROCESSED | sahara ( instrumental ) | sahara | ['sahara'] | who composed sahara (instrumental)? |
| 35 | 0.0625 | Wrong Span | PREPROCESSED | compilation | cema | ['albumby cema', 'cema'] | what album is released as a compilation albumby CEMA |
| 36 | 0.583333 | Wrong Span | PREPROCESSED | arabic name | arabic | ['arabic'] | What is a book that is about arabic name |
| 37 | 0.73913 | Similar | PREPROCESSED | multiplayer video game | game | ['multiplayer game', 'game'] | What's a text based multiplayer game |
| 38 | 0.75 | Extra Article | PREPROCESSED | the crystal city | crystal city | ['crystal city'] | what genre is crystal city |
| 39 | 0.84 | Suffix | PREPROCESSED | men 's badminton , singles | singles | ['mens badminton , singles', 'badminton , singles', 'mens badminton', 'mens badminton ,', 'singles'] | what olympic games was mens badminton, singles apart of |
| 40 | 0.0 | Wrong Span | PREPROCESSED | mercedes lackey | fantasy | ['fantasy'] | which fantasy series were written by mercedes lackey? |
| 41 | 0.666667 | Similar | PREPROCESSED | brian o'shea | brian oshea | ['brian oshea'] | brian oshea performs what type of martial art |
| 42 | 0.857143 | Similar | PREPROCESSED | u.s . office of war information | war | ['office of war information', 'office of war information help', 'the office of war information', 'war information', 'the office of war information help', 'of war information', 'office of war', 'war information help', 'of war information help', 'the office of war', 'office of war information help produce', 'war'] | which film did the office of war information help produce  |
| 43 | 0.0 | Wrong Span | PREPROCESSED | album | sham 69 | ['sham 69'] | which album is released by Sham 69 |
| 44 | 0.777778 | Wrong Span | NORMALIZED | lowthian bell | , 1st baronet | ['sir lowthian bell , 1st baronet', 'lowthian bell , 1st baronet', 'sir lowthian bell , 1st', 'sir lowthian bell ,', 'sir lowthian bell', 'bell , 1st baronet', 'sir lowthian', ', 1st baronet'] | what organization was founded by sir lowthian bell, 1st baronet |
| 45 | 0.862069 | Suffix | PREPROCESSED | st . peter 's episcopal church | st . peters | ['st . peters episcopal church', 'st . peters episcopal', '. peters episcopal church', 'peters episcopal church', 'st . peters'] | what state and city is st. peters episcopal church located in? |
| 46 | 0.851852 | Suffix | PREPROCESSED | richard scarry 's busytown | busytown | ['richard scarrys busytown', 'scarrys busytown', 'richard scarrys', 'busytown'] | what is a gameplay mode featured on richard scarrys busytown |
| 47 | 0.0 | Wrong Span | PREPROCESSED | album | soil | ['soil'] | What's an album by soil |
| 48 | 0.714286 | Similar | PREPROCESSED | texas a&m university school of law | texas wesleyan university | ['texas wesleyan university school of law', 'wesleyan university school of law', 'texas wesleyan university school of', 'texas wesleyan university school', 'university school of law', 'is texas wesleyan university school of law', 'texas wesleyan university'] | Where is texas wesleyan university school of law located? |
| 49 | 0.535714 | Wrong Span | PREPROCESSED | public service announcement | public service | ['public service'] | What is the name of a public service announcement? |


