# End-to-End Freebase QA

The cell below adds the root of the repository to `sys.path` so that its modules can be imported from this notebook.

Reference: https://stackoverflow.com/questions/34976803/sys-path-different-in-jupyter-and-python-how-to-import-own-modules-in-jupyter

In [1]:
# Put the directory two levels up at the front of the module search path
# so that imports resolve against it first.
import sys
sys.path.insert(0, '../../')

In [57]:
# Checkpoint files (paths relative to this notebook) for the two models.
# Each path is handed to `get_predict` to obtain that model's predict callable.
RELATION_CLASSIFIER = '../../results/0626.11-05_08:38:02.relation_classifier/11m_05d_08h_45m_57s.pt'
OBJECT_RECOGNITION = '../../results/0605.11-05_09:35:18.object_recognition/11m_05d_09h_45m_22s.pt'

import math
import pprint
import re
import unicodedata

from nltk.tokenize.treebank import TreebankWordTokenizer

from lib.checkpoint import Checkpoint

# Shared pretty-printer (2-space indent) for inspecting nested results
pretty_printer = pprint.PrettyPrinter(indent=2)
tokenizer = TreebankWordTokenizer() # Same tokenizer used during training
    
def remove_accents(text):
    """
    Return `text` without accents.

    The text is decomposed with Unicode NFKD normalization and every combining character
    (as reported by `unicodedata.combining`) is dropped from the result.

    Args:
        text (str)
    Returns:
        (str)
    """
    # https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    kept = []
    for char in unicodedata.normalize('NFKD', text):
        if unicodedata.combining(char):
            continue  # Accent / diacritic mark split off by the decomposition
        kept.append(char)
    return u"".join(kept)

def preprocess(s):
    """
    Normalize a question before it is given to the models.

    Removes every occurrence of "'s", strips accents, splits on whitespace, strips
    non-word characters from both ends of every word, lowercases, drops words that
    end up empty and joins the rest with single spaces.

    Args:
        s (str)
    Returns:
        (str)
    """
    s = remove_accents(s.replace("'s", ''))
    words = []
    for w in s.split():
        # Raw strings: `\W` is not a valid string escape and warns in a plain literal
        w = re.sub(r'^\W+', '', w)  # Strip leading non-alphanumeric characters
        w = re.sub(r'\W+$', '', w)  # Strip trailing non-alphanumeric characters
        w = w.lower()
        if w:  # Remove words that were punctuation only
            words.append(w)
    return ' '.join(words)

def get_predict(checkpoint_path):
    """
    Build a `Checkpoint` from `checkpoint_path` and return its `predict` attribute.

    Args:
        checkpoint_path (str): path passed to `Checkpoint` as `checkpoint_path`
    Returns:
        the `predict` callable of the checkpoint
    """
    return Checkpoint(checkpoint_path=checkpoint_path).predict

# Load both checkpoints once; each name is bound to that checkpoint's `predict` callable
relation_classifier_predict = get_predict(RELATION_CLASSIFIER)
object_recognition_predict = get_predict(OBJECT_RECOGNITION)

def get_object(question):
    """
    Given a question return the object in the question using the `OBJECT_RECOGNITION` model.

    Args:
        question (str)
    Returns:
        (str): the words of the question marked 'e' by the model, joined with spaces
        (float): mean of the confidences returned by the model
    """
    cleaned = preprocess(question)
    tags, confidences = object_recognition_predict(cleaned)
    # The final mark is discarded -- presumably an end-of-sequence marker; TODO confirm
    tags = tags.split()[:-1]
    tokens = tokenizer.tokenize(cleaned)
    if len(tokens) != len(tags):
        print('Warning: Marks and Question does not match up. %s, %s.' % (tags, tokens))
    mention = ' '.join(token for tag, token in zip(tags, tokens) if tag == 'e')
    return mention, sum(confidences) / len(confidences)

def get_relation(question, top_k=3):
    """
    Given a question return the predicate in the question using `RELATION_CLASSIFIER` model.

    Args:
        question (str)
        top_k (int): number of predicates requested from the model
    Returns:
        list of (predicate, confidence) tuples, where confidence is the mean of the
        confidences returned by the model for that predicate. The shape is the same
        for every `top_k`, including `top_k=1`.
    """
    question = preprocess(question)
    predicted = list(relation_classifier_predict(question, top_k=top_k))
    if top_k == 1:
        # For top_k=1 the result is a single (class, confidences) pair instead of a
        # sequence of pairs; wrap it so both cases share the same code path below.
        predicted = [predicted]
    return [(class_, sum(confidence) / len(confidence)) for class_, confidence in predicted]

# To test this cell: run both models on a sample question and print
# the extracted object and the top relation with their confidences
print(get_object('Where was Obama born?'))
print(get_relation('Where was Obama born?', 1))

('obama', 0.9997887267225003)
[['www.freebase.com/people/person/place_of_birth', 1.0]]




In [58]:
import pprint

from elasticsearch_dsl.connections import connections
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from fuzzywuzzy import fuzz

# Define a default Elasticsearch connection on localhost, plus an explicit
# client (default settings) that `get_object_link` passes to `Search.using`
connections.create_connection(hosts=['localhost'])
client = Elasticsearch()

def get_object_link(object_, property_, print_info=False):
    """
    Link the object_ (str) to an entity in the `fb5m_entities` index.

    Only entities that have a fact matching `property_` are considered; among those,
    entities are matched on their names against `object_` and the first hit is returned.

    Args:
        object_ (str): object to link
        property_ (str): property used to filter the results; any 'www.freebase.com/'
            in it is removed before querying
        print_info (bool): currently unused
    Return:
        mid (str): MID of the first hit
        score (float): the score assigned to the first hit
        names: the `names` field of the first hit
        facts: the `facts` field of the first hit
        All four are None when the search returns no hits.
    """
    property_ = property_.replace('www.freebase.com/', '')

    # Filter (does not affect scoring): entity must have a fact for the property
    has_property = {
        'nested': {
            'path': 'facts',
            'query': {
                'match': {'facts.property': property_}
            }
        }
    }
    # Scored clause: match the object string against the entity names
    name_matches = {
        'nested': {
            'path': 'names',
            'score_mode': 'max',
            'query': {
                'match': {'names.name': object_}
            }
        }
    }
    body = {
        'size': 10,
        'query': {
            'bool': {
                'filter': has_property,
                'must': [name_matches]
            }
        },
    }

    hits = Search.from_dict(body).using(client).index('fb5m_entities').execute()
    if len(hits) == 0:
        print('WARNING: No Object found.')
        return None, None, None, None
    best = hits[0]
    return best.mid, best.meta.score, best.names, best.facts

# Example: extract the object and the top relation of a question, then link the object.
# The last expression is the cell's result, so the returned tuple is displayed.
object_ = get_object('what major cities does us route 2 run through')[0]
relation = get_relation('what major cities does us route 2 run through')[0][0]
print(object_, relation)
get_object_link(object_, relation, print_info=True)

us route 2 www.freebase.com/transportation/road/major_cities


('0kpw3',
 13.037585,
 [{'name': 'u.s. highway 66'}, {'name': 'the great diagonal way'}, {'name': 'mother road'}, {'name': 'u.s. route 66'}, {'name': 'will rogers highway'}, {'name': 'us route 66'}, {'name': 'main street of america'}, {'name': 'us 66'}],
 [{'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['0b_r0'], 'property': 'transportation/road/highway_system'}, {'subjects': ['01xr_wb'], 'property': 'common/topic/notable_types'},

In [64]:
import ujson as json
import pandas as pd

from IPython.display import display

# Tab-separated questions file read by `main` with `pd.read_table`
SRC = '../../data/simple_qa/dev.tsv'

def answer_question(question):
    """
    Answer a question, printing every intermediate step.

    The top 3 relations and the object are predicted for the question. The object is
    then linked once per relation with `get_object_link`; the link with the highest
    score wins and the subjects of its facts for that relation are the answers.

    Args:
        question (str)
    Returns:
        top_mid (str or None): MID of the best linked object
        top_relation (str or None): relation used for the best link
        top_subjects (set of str or None): subjects of the facts of the linked object
            whose property is exactly the relation
        All three are None when no relation produced a link with a positive score.
    """
    question = preprocess(question)
    print('Question:     ', question)
    top_predicates = get_relation(question, top_k=3)
    print('Relation(s):  ', top_predicates)
    top_mid = None
    top_score = 0
    top_names = None
    top_relation = None
    top_subjects = None
    object_, object_confidence = get_object(question)
    print('Object:       ', object_, '(%.01f)' % object_confidence)
    for relation, _ in top_predicates:
        mid, score, names, facts = get_object_link(object_, relation)
        print('Object FB:    ', names)
        print('Object Score: ', score)
        print('Relation:     ', relation)
        # `mid` is None (and so is `score`) when nothing was linked for this relation
        if mid and score > top_score:
            # Normalize the relation the same way `get_object_link` does and compare
            # exactly: a substring test would also accept a property that is merely a
            # prefix of the relation (e.g. '.../track' vs '.../track_contributions').
            property_ = relation.replace('www.freebase.com/', '')
            top_subjects = set()
            for fact in facts:
                if fact['property'] == property_:
                    top_subjects.update(fact['subjects'])
            top_mid = mid
            top_score = score
            top_names = names
            top_relation = relation
    print('Object MID:   ', top_mid)
    print('Object Score: ', top_score)
    print('Object Names: ', top_names)
    print('Top Relation: ', top_relation)
    print('Top Subjects:  ', top_subjects)
    return top_mid, top_relation, top_subjects

def main():
    """
    Answer every question of the table at SRC and print, per question, whether the
    true subject is among the predicted subjects (with the expected values when not).

    Results recorded for this notebook:

    Object Accuracy: 94%
    Object Linking: 75.7%
    Relation Accuracy: 82%
    Relation Accuracy Top 1: 77% ~> 81%
    Relation Accuracy Top 2: 88% ~> 91%
    Accuracy: 72%
    Total Questions: 1514
    """
    frame = pd.read_table(SRC)
    display(frame.head())

    def report_row(row):
        """ Answer the question of one row and print how it compares to the row's labels. """
        mid, relation, subjects = answer_question(row['Question EN'])
        gold_object = row['Object MID'].replace('www.freebase.com/m/', '').strip()
        gold_subject = row['Subject MID'].replace('www.freebase.com/m/', '').strip()
        gold_relation = row['Freebase Property'].strip()

        if subjects and gold_subject in subjects:
            print('CORRECT!')
        else:
            if gold_object != mid:
                print('Wrong Object MID:    ', mid)
            print('Correct Object MID:  ', gold_object)
            print('Correct Object:      ', row['Object EN'])

            if gold_relation != relation:
                print('Wrong Relation:      ', relation)
            print('Correct Relation:    ', gold_relation)

            # In this branch a non-empty prediction cannot contain the true subject
            if subjects:
                print('Wrong Subjects:      ', subjects)
            print('Correct Subject MID: ', gold_subject)
            print('WHOOPS!')
        print()

    frame = frame.apply(report_row, axis=1)
    print('Done!')

# Run the evaluation over SRC when this cell is executed
main()

Unnamed: 0,Object MID,Freebase Property,Subject MID,Question,WikiData Property,Subject EN,Object EN,Question FR DeepL,Question EN,Object EN Mask
0,www.freebase.com/m/0f3xg_,www.freebase.com/symbols/namesake/named_after,www.freebase.com/m/0cqt90,Who was the trump ocean club international hot...,P138,Donald Trump,Trump Ocean Club International Hotel and Tower,qui était l'hôtelier et la tour internationale...,Who was the trump ocean club international hot...,c c c e e e e e e e c c
1,www.freebase.com/m/07f3jg,www.freebase.com/people/person/place_of_birth,www.freebase.com/m/0565d,where was sasha vujačić born,P19,Maribor,Sasha Vujačić,où est né sasha vuja?i?,where was sasha vujačić born,c c e e c
2,www.freebase.com/m/031j8nn,www.freebase.com/music/release/region,www.freebase.com/m/07ssc,What is a region that dead combo was released in,,United Kingdom,Dead Combo,quelle est la région dans laquelle le combo mo...,What is a region that dead combo was released in,c c c c c e e c c c
3,www.freebase.com/m/0c1cyhd,www.freebase.com/film/director/film,www.freebase.com/m/0wxsz5y,What is a film directed by wiebke von carolsfeld?,inverse:P57,Stay,Wiebke von Carolsfeld,Qu'est-ce qu'un film réalisé par wiebke von ca...,What is a film directed by wiebke von carolsfeld?,c c c c c c e e e
4,www.freebase.com/m/0fvhc0g,www.freebase.com/music/release/region,www.freebase.com/m/0345h,what country was music for stock exchange rel...,,Germany,Music for Stock Exchange,quel pays était la musique de la bourse publié...,what country was music for stock exchange rel...,c c c e e e e c c


Question:      who was the trump ocean club international hotel and tower named after
Relation(s):   [('www.freebase.com/symbols/namesake/named_after', 0.9895257231903946), ('www.freebase.com/organization/organization/advisors', 0.004211182840038119), ('www.freebase.com/music/recording/artist', 0.0020376382557712215)]
Object:        trump ocean club international hotel and tower (1.0)
Object FB:     [{'name': 'trump ocean club international hotel and tower'}]
Object Score:  31.031027
Relation:      www.freebase.com/symbols/namesake/named_after
Object FB:     [{'name': 'isl international sport and leisure'}]
Object Score:  7.9539623
Relation:      www.freebase.com/organization/organization/advisors
Object FB:     [{'name': 'trump tower'}]
Object Score:  18.09557
Relation:      www.freebase.com/music/recording/artist
Object MID:    0f3xg_
Object Score:  31.031027
Object Names:  [{'name': 'trump ocean club international hotel and tower'}]
Top Relation:  www.freebase.com/symbols/namesake/n

Object FB:     [{'name': 'john hunyadi'}]
Object Score:  13.287171
Relation:      www.freebase.com/people/person/nationality
Object FB:     [{'name': 'family'}, {'name': 'masters of horror : john landis : family'}]
Object Score:  10.036607
Relation:      www.freebase.com/film/film/country
Object MID:    01px0j
Object Score:  20.348745
Object Names:  [{'name': 'hunyadi family'}]
Top Relation:  www.freebase.com/people/family/country
Top Subjects:   {'03gj2'}
CORRECT!

Question:      what major cities does u.s route 2 run through
Relation(s):   [('www.freebase.com/transportation/road/major_cities', 0.9999427811780749), ('www.freebase.com/location/location/containedby', 3.880588867578666e-05), ('www.freebase.com/location/location/contains', 1.4279428861668248e-05)]
Object:        u.s route 2 (1.0)
Object FB:     [{'name': 'u.s. route 2'}, {'name': 'u.s. highway 2'}]
Object Score:  17.76996
Relation:      www.freebase.com/transportation/road/major_cities
Object FB:     [{'name': 'u.s. highw

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Object:        australia (1.0)
Object FB:     [{'name': 'australia'}]
Object Score:  10.5068445
Relation:      www.freebase.com/music/composition/recordings
Object FB:     [{'name': 'australia'}]
Object Score:  10.5068445
Relation:      www.freebase.com/music/composition/language
Object FB:     None
Object Score:  None
Relation:      www.freebase.com/music/compositional_form/compositions
Object MID:    0nm2g6l
Object Score:  10.5068445
Object Names:  [{'name': 'australia'}]
Top Relation:  www.freebase.com/music/composition/recordings
Top Subjects:   {'02prsd4'}
Wrong Object MID:     0nm2g6l
Correct Object MID:   0zvwcn5
Correct Object:       nan
Correct Relation:     www.freebase.com/music/composition/recordings
Wrong Subjects:       {'02prsd4'}
Correct Subject MID:  0fw_qg3
WHOOPS!

Question:      what is the film tempo di uccidere about
Relation(s):   [('www.freebase.com/media_common/netflix_title/netflix_genres', 0.4808569935592784), ('www.freebase.com/film/film/subjects', 0.3650900

Object:        thriller (1.0)
Object FB:     [{'name': 'thriller'}]
Object Score:  14.047661
Relation:      www.freebase.com/media_common/netflix_genre/titles
Object FB:     [{'name': 'thriller'}]
Object Score:  14.047661
Relation:      www.freebase.com/film/film_genre/films_in_this_genre
Object FB:     [{'name': 'thriller'}]
Object Score:  14.047661
Relation:      www.freebase.com/tv/tv_genre/programs
Object MID:    01jfsb
Object Score:  14.047661
Object Names:  [{'name': 'thriller'}]
Top Relation:  www.freebase.com/media_common/netflix_genre/titles
Top Subjects:   {'04j2dcg', '0crryp5', '04j1pp5', '0c1n5_z', '04j23yr', '02qpnmm', '030xw6', '04nvmwz', '07k482', '04j33zr', '069m26', '04j3446', '0414gmx', '0crsf9v', '0gy4mh', '02r8_xn', '033qdy', '0gm19g4', '04j1pt9', '05b5x97', '03m9t55', '0crt42p', '091w1s', '04q0svn', '089d0v', '0b6_znh', '02r34wj', '0crx42h', '04j0jxz', '0gyy81x', '096nbp', '0crxx7t', '04j2cx5', '0489v9', '0bd5t90', '07k68tf', '0crvk6q', '0cs5nmx', '0f7tms', '0g9tdd

KeyboardInterrupt: 