# Relation Ranking Data

For this experiment, we create a dataset of true and false relations for each question, based on the candidates generated by the end-to-end script.

In [1]:
import sys
sys.path.insert(0, '../../')
import pandas as pd
from tqdm import tqdm_notebook
import importlib
import scripts.utils.import_notebook
from scripts.utils.connect import get_connection 
from scripts.utils.data import FB2M_KG_TABLE

# Register tqdm's pandas integration so the `progress_apply` calls in later
# cells are available on DataFrames.
tqdm_notebook().pandas()

# Open a database connection and keep one cursor at module level; later cells
# pass `cursor` to candidate generation and use it to query FB2M_KG_TABLE.
# NOTE(review): neither the cursor nor the connection is closed in the visible cells.
connection = get_connection()
cursor = connection.cursor()




In [2]:
from scripts.utils.simple_qa import load_simple_qa
from sklearn.utils import shuffle


# Output paths for the relation-ranking files written in Step 4
DEST_TRAIN = './../../data/relation_ranking/train.txt'
DEST_DEV = './../../data/relation_ranking/dev.txt'

# Dev split: load_simple_qa returns a one-element sequence here; the frame is
# then shuffled with a fixed seed so the row order is repeatable.
(df_dev,) = load_simple_qa(dev=True)
df_dev = shuffle(df_dev, random_state=123)
print('Dev:')
display(df_dev.head(5))

# Train split: kept in the order it was loaded (it is not shuffled here).
(df_train,) = load_simple_qa(train=True)
print('Train:')
display(df_train.head(5))

Dev:


Unnamed: 0,subject,relation,object,question
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,Name an American Thoroughbread racehorse
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett


Train:


Unnamed: 0,subject,relation,object,question
0,04whkz5,book/written_work/subjects,01cj3p,what is the book e about
1,0tp2p24,music/release_track/release,0sjc7c1,to what release does the release track cardiac...
2,04j0t75,film/film/country,07ssc,what country was the film the debt from
3,0ftqr,music/producer/tracks_produced,0p600l,what songs have nobuo uematsu produced?
4,036p007,music/release/producers,0677ng,Who produced eve-olution?


# Step 1 - Predict the Subject Name

The predicted subject name will be used to generate candidates.

In [4]:
# Put the sibling allennlp checkout first on the import path.
# NOTE(review): presumably the Step 1 notebook depends on it — confirm.
sys.path.insert(0, '../../../allennlp')
# Load the Step 1 notebook as a module and take its row-level function. The
# module name contains spaces, so it has to go through importlib rather than
# a normal `import` statement.
add_predicted_subject_name = importlib.import_module(
                "scripts.Simple QA End-To-End.Step 1 - Predict Subject Name").add_predicted_subject_name

# Apply the predictor row by row (axis=1) to both splits, replacing each frame
# with the result and previewing the first five rows.
df_dev = df_dev.progress_apply(add_predicted_subject_name, axis=1)
display(df_dev[:5])
df_train = df_train.progress_apply(add_predicted_subject_name, axis=1)
display(df_train[:5])

importing Jupyter notebook from ../../scripts/Simple QA End-To-End/Step 1 - Predict Subject Name.ipynb
importing Jupyter notebook from ../../scripts/Simple QA Models/Subject Recognition Data.ipynb


importing Jupyter notebook from ../../scripts/Simple QA Numbers/HYPOTHESIS - Subject Name not in Question.ipynb



Unnamed: 0,subject,relation,object,question,predicted_subject_names,predicted_question_tokens
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,Name an American Thoroughbread racehorse,"[{'name': 'american thoroughbread', 'score': 4...","[name, an, american, thoroughbread, racehorse]"
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...,"[{'name': 'vision racing driving simulator', '...","[what, kind, of, game, is, vision, racing, dri..."
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film,"[{'name': 'romance film', 'score': 55.87209701...","[what, tv, program, is, romance, film]"
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in,"[{'name': 'polaski', 'score': 57.6216735839843...","[what, state, is, polaski, located, in]"
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett,"[{'name': 'fern emmett', 'score': 69.716217041...","[what, disease, claimed, the, life, of, fern, ..."





Unnamed: 0,subject,relation,object,question,predicted_subject_names,predicted_question_tokens
0,04whkz5,book/written_work/subjects,01cj3p,what is the book e about,"[{'name': 'e', 'score': 60.41775894165039, 'st...","[what, is, the, book, e, about]"
1,0tp2p24,music/release_track/release,0sjc7c1,to what release does the release track cardiac...,"[{'name': 'cardiac arrest', 'score': 99.456245...","[to, what, release, does, the, release, track,..."
2,04j0t75,film/film/country,07ssc,what country was the film the debt from,"[{'name': 'the debt', 'score': 78.223381042480...","[what, country, was, the, film, the, debt, from]"
3,0ftqr,music/producer/tracks_produced,0p600l,what songs have nobuo uematsu produced?,"[{'name': 'nobuo uematsu', 'score': 67.8582611...","[what, songs, have, nobuo, uematsu, produced, ?]"
4,036p007,music/release/producers,0677ng,Who produced eve-olution?,"[{'name': 'eve - olution', 'score': 62.1605491...","[who, produced, eve, -, olution, ?]"


10845


# Step 2 - Generate Candidates

Generate mids that align with the predicted subject name.

In [None]:
from functools import partial 

# Load the Step 2 notebook as a module and take its candidate generator.
generate_candidates = importlib.import_module(
                "scripts.Simple QA End-To-End.Step 2 - Generate Candidates").generate_candidates

# Bind the shared DB cursor as the first argument so the row-wise apply only
# has to supply the row. Each frame is replaced with the result.
df_dev = df_dev.progress_apply(partial(generate_candidates, cursor), axis=1)
display(df_dev[:5])
df_train = df_train.progress_apply(partial(generate_candidates, cursor), axis=1)
display(df_train[:5])
# Row count of the dev frame after candidate generation.
print(len(df_dev))

# Step 3 - Generate Relation Candidates

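Given the predicted mids, we generate a list of possible relations that correspond to those mids. For the training dataset, the intent is to include the true relation in the pool; for the development set, we leave the relation pool untouched. **Note:** as written, `generate_candidate_relations` below returns the queried pool unchanged for both splits, so the true relation is not explicitly added for training.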

In [5]:
def generate_candidate_relations(row):
    """Look up every distinct relation attached to the row's candidate mids.

    Runs one query on the module-level ``cursor`` against ``FB2M_KG_TABLE``,
    selecting the distinct ``relation`` values whose ``subject_mid`` matches
    any entry of ``row['candidate_mids']``.

    Args:
        row: A mapping-like row exposing a ``candidate_mids`` entry.

    Returns:
        A ``set`` holding the first column of every fetched record.

    NOTE(review): the true relation (``row['relation']``) is not added to the
    result here, for either split.
    """
    query = """SELECT DISTINCT relation
                      FROM {kg}
                      WHERE subject_mid = ANY(%s)""".format(kg=FB2M_KG_TABLE)
    # The mids are passed as a single bound parameter, not interpolated.
    cursor.execute(query, (row['candidate_mids'],))
    return {record[0] for record in cursor.fetchall()}

# Add a `candidate_relations` column to both splits by applying the lookup row
# by row (axis=1), then preview the first five rows of each.
df_dev['candidate_relations'] = df_dev.progress_apply(generate_candidate_relations, axis=1)
display(df_dev[:5])
df_train['candidate_relations'] = df_train.progress_apply(generate_candidate_relations, axis=1)
display(df_train[:5])
# Row count of the dev frame.
print(len(df_dev))




Unnamed: 0,subject,relation,object,question,predicted_subject_names,predicted_question_tokens,candidate_mids,predicted_start_index,predicted_end_index,predicted_subject_name,candidate_relations
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,Name an American Thoroughbread racehorse,"[{'name': 'american thoroughbread', 'score': 4...","[name, an, american, thoroughbread, racehorse]","[01z1jf2, 04q7gbh]",2.0,3.0,american,"{people/ethnicity/languages_spoken, dining/cui..."
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...,"[{'name': 'vision racing driving simulator', '...","[what, kind, of, game, is, vision, racing, dri...",[02qlppc],5.0,9.0,vision racing driving simulator,"{cvg/computer_videogame/developer, cvg/compute..."
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film,"[{'name': 'romance film', 'score': 55.87209701...","[what, tv, program, is, romance, film]",[02l7c8],4.0,6.0,romance film,"{tv/tv_genre/programs, media_common/media_genr..."
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in,"[{'name': 'polaski', 'score': 57.6216735839843...","[what, state, is, polaski, located, in]",[049_zj3],3.0,4.0,polaski,{location/location/containedby}
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett,"[{'name': 'fern emmett', 'score': 69.716217041...","[what, disease, claimed, the, life, of, fern, ...",[02w9ycr],6.0,8.0,fern emmett,"{people/deceased_person/cause_of_death, people..."





Unnamed: 0,subject,relation,object,question,predicted_subject_names,predicted_question_tokens,candidate_mids,predicted_start_index,predicted_end_index,predicted_subject_name,candidate_relations
0,04whkz5,book/written_work/subjects,01cj3p,what is the book e about,"[{'name': 'e', 'score': 60.41775894165039, 'st...","[what, is, the, book, e, about]","[0w6s3lm, 0t_6vtt, 092f13t, 02kc15m, 02qpq, 04...",4.0,5.0,e,"{film/film/film_format, music/artist/label, fr..."
1,0tp2p24,music/release_track/release,0sjc7c1,to what release does the release track cardiac...,"[{'name': 'cardiac arrest', 'score': 99.456245...","[to, what, release, does, the, release, track,...","[0m__wsj, 0mn29_b, 01k90ht, 0lntgk_, 0fdq4qy, ...",7.0,9.0,cardiac arrest,"{medicine/symptom/symptom_of, music/release_tr..."
2,04j0t75,film/film/country,07ssc,what country was the film the debt from,"[{'name': 'the debt', 'score': 78.223381042480...","[what, country, was, the, film, the, debt, from]","[0b3wz4j, 04j0t75, 0bwlk1l, 05mqx7d, 0sj0wgb, ...",5.0,7.0,the debt,"{film/film/written_by, music/album/release_typ..."
3,0ftqr,music/producer/tracks_produced,0p600l,what songs have nobuo uematsu produced?,"[{'name': 'nobuo uematsu', 'score': 67.8582611...","[what, songs, have, nobuo, uematsu, produced, ?]",[0ftqr],3.0,5.0,nobuo uematsu,"{music/artist/label, freebase/valuenotation/is..."
4,036p007,music/release/producers,0677ng,Who produced eve-olution?,"[{'name': 'eve - olution', 'score': 62.1605491...","[who, produced, eve, -, olution, ?]","[04cjlrh, 04cjlrq, 034l2mx, 01knvq6, 036p007, ...",2.0,5.0,eve - olution,"{music/album/release_type, music/album/artist,..."


# Step 4 - Write Data File

In [6]:
import random

lines = []

def write_data(df, file_name):
    """Write one tab-separated relation-ranking example per usable row of ``df``.

    Every output line has four tab-separated fields, in this order:

    1. the true relation (``row['relation']``);
    2. the candidate relations, shuffled and joined by single spaces;
    3. the question tokens, lower-cased and stripped, with the predicted
       subject span collapsed into a single ``<e>`` marker (each emitted token
       is followed by a space, so the field ends with a trailing space);
    4. the tokens of the predicted subject span joined by single spaces.

    Rows whose ``predicted_subject_name`` is not a ``str`` are skipped.

    Args:
        df: DataFrame with the columns ``relation``, ``candidate_relations``,
            ``predicted_subject_name``, ``predicted_question_tokens``,
            ``predicted_start_index`` and ``predicted_end_index``.
        file_name: Destination path. The file is opened in ``'w'`` mode, so
            any existing content is replaced.

    Returns:
        None. Lines are joined with ``'\\n'`` with no trailing newline.

    NOTE(review): the candidate order comes from ``random.shuffle`` on the
    global random state, which is not seeded in the visible cells, so the
    written order is not reproducible between runs.
    """
    examples = []
    for _, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
        if not isinstance(row['predicted_subject_name'], str):
            continue

        tokens = row['predicted_question_tokens']
        start = row['predicted_start_index']
        end = row['predicted_end_index']

        formatted_question = ''
        for i, token in enumerate(tokens):
            if i == start:
                # The first token of the subject span becomes the marker.
                formatted_question += '<e>'
            elif start < i < end:
                # Remaining span tokens are dropped (no separator emitted).
                continue
            else:
                formatted_question += token.lower().strip()
            formatted_question += ' '

        # The displayed frames show the indices as floats (e.g. 2.0), so they
        # are cast to int before slicing.
        entity = ' '.join(tokens[int(start):int(end)])

        candidate_relations = list(row['candidate_relations'])
        random.shuffle(candidate_relations)

        relation_candidates = ' '.join(candidate_relations)
        true_relation = row['relation']

        examples.append('\t'.join([true_relation, relation_candidates, formatted_question, entity]))

    # The context manager closes the file, so the data is flushed to disk even
    # if the kernel keeps running; the original never closed the handle.
    with open(file_name, 'w') as file_:
        file_.write('\n'.join(examples))

In [7]:
# Write the dev and train examples to their destination files.
write_data(df_dev, DEST_DEV)
write_data(df_train, DEST_TRAIN)





