# Freebase Seq2Seq

In [16]:
# Directory prefix shared by both checkpoint paths below.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
PREFIX = '/Users/petrochuk/sync/pytorch-seq2seq/notebooks/models/'
# Checkpoint name handed to `get_predict` to build the object-recognition predictor.
OBJECT_CHECKPOINT = PREFIX + 'simple_qa_object_recognition/2017_09_19_10_16_38_[3]'
# Checkpoint name handed to `get_predict` to build the predicate-classification predictor.
PREDICATE_CLASSIFICATION_CHECKPOINT = (PREFIX + 
                            'simple_qa_predicate_classification_freebase/2017_09_19_16_01_16')

import math
import pprint

from seq2seq.controllers import Predictor
from seq2seq.util.checkpoint import Checkpoint

# Shared pretty-printer (2-space indent); `get_object_link` uses it to dump search hits.
pretty_printer = pprint.PrettyPrinter(indent=2)

def preprocess(s):
    """
    Normalise a question string before it is handed to the models.

    `get_object` relies on this step, and the same normalisation is applied
    when the training data is created.

    The string is lower-cased, every '?' and '.' character is removed, and
    leading/trailing whitespace is stripped.

    Args:
        s (str) string to preprocess
    Returns:
        preprocessed string
    """
    normalised = s.lower()
    # Drop the punctuation characters one at a time, in the same order as before.
    for unwanted in ('?', '.'):
        normalised = normalised.replace(unwanted, '')
    return normalised.strip()

def get_predict(checkpoint_name):
    """
    Build a predict callable for the model stored in a checkpoint.

    Args:
        checkpoint_name (str): forwarded to `Checkpoint.get_checkpoint`
    Returns:
        the `predict` method of a `Predictor` constructed from the checkpoint's
        model, input field and output field
    """
    loaded = Checkpoint.get_checkpoint(checkpoint_name=checkpoint_name)
    return Predictor(
        loaded.model,
        loaded.input_field,
        loaded.output_field,
    ).predict

# Build both predict callables once at cell execution; `get_predicate_id` and
# `get_object` call these module-level names.
predicate_classification_predict = get_predict(PREDICATE_CLASSIFICATION_CHECKPOINT)
object_recognition_predict = get_predict(OBJECT_CHECKPOINT)

def get_object(question):
    """
    Given a question return the object in the question using `OBJECT_CHECKPOINT` model.

    The question is preprocessed, then `object_recognition_predict` yields one
    mark per whitespace-separated word; words whose mark is 'e' are joined with
    single spaces to form the object string.

    Args:
        question (str)
    Returns:
        object (str): the words marked 'e', space-joined ('' when none are marked)
        confidence (float): mean of the per-mark confidences, or 0.0 when the
            predictor returned no confidences (avoids a ZeroDivisionError)
    """
    question = preprocess(question)
    marks, confidence = object_recognition_predict(question)
    # zip stops at the shorter sequence, so surplus marks or words are ignored.
    entity = [word for marker, word in zip(marks, question.split()) if marker == 'e']
    if len(confidence) == 0:
        # Nothing was predicted (e.g. empty question): report zero confidence
        # instead of dividing by zero.
        return ' '.join(entity), 0.0
    return ' '.join(entity), sum(confidence) / len(confidence)

def get_predicate_id(question, top=3):
    """
    Given a question return candidate predicates for it using the
    `PREDICATE_CLASSIFICATION_CHECKPOINT` model.

    Args:
        question (str)
        top (int): forwarded to the predictor as its `top` argument
            (presumably the number of highest-ranked predicates to return — TODO confirm)
    Returns:
        list of (predicate, confidence) tuples, taking the first element of each
        class / confidence pair produced by the predictor
    """
    cleaned = preprocess(question)
    ranked = []
    for class_, confidence in predicate_classification_predict(cleaned, top=top):
        ranked.append((class_[0], confidence[0]))
    return ranked

# Smoke test for this cell: run each predictor on a sample question and print the result.
print(get_predicate_id('what area code is 845'))
print(get_object('Who is Obama?'))

[('www.freebase.com/location/location/containedby', 0.3764419095940369), ('www.freebase.com/astronomy/star_system_body/star_system', 0.16490115373806102), ('www.freebase.com/astronomy/asteroid/member_of_asteroid_group', 0.16397528790582935)]
('obama', 0.9999145006698564)


In [141]:
import pprint

from elasticsearch_dsl.connections import connections
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

# Register a default elasticsearch_dsl connection pointing at localhost, and create
# a separate Elasticsearch client (default constructor arguments) that
# `get_object_link` passes to `Search.using`.
connections.create_connection(hosts=['localhost'])
client = Elasticsearch()

def get_object_link(object_, property_, print_info=False):
    """
    Link the object_ (str) to an entity in the 'fb5m_entities' index.

    The search requires the entity name to match `object_` and filters on
    entities that have a nested fact whose property matches `property_`
    (with any 'www.freebase.com/' prefix removed first).

    Args:
        object_ (str): object to link
        property_ (str): property used to filter the results
        print_info (bool): print the returned matches
    Return:
        mid (str): MID of the top hit
        score (float): the score assigned to the top hit
        name (str): name of the top hit
        facts: the `facts` field of the top hit
        All four are None when the search returns no hits.
    """
    property_ = property_.replace('www.freebase.com/', '')
    print(object_, property_)

    # Filter clause: keep only entities with a matching nested fact property.
    property_filter = {
        'nested': {
            'path': 'facts',
            'query': {'match': {'facts.property': property_}},
        }
    }
    # Scoring clause: the entity name must match the object text.
    name_clause = {'match': {'name': object_}}
    request_body = {
        'query': {
            'bool': {
                'filter': property_filter,
                'must': [name_clause],
            }
        },
    }
    hits = Search.from_dict(request_body).using(client).index('fb5m_entities').execute()

    if print_info:
        print('Hits:')
        summary = [(hit.mid, hit.name, hit.meta.score, hit.facts) for hit in hits]
        pretty_printer.pprint(summary)

    if len(hits) == 0:
        print('WARNING: No Object found.')
        return None, None, None, None

    best = hits[0]
    return best.mid, best.meta.score, best.name, best.facts

In [146]:
import ujson as json
import pandas as pd

from IPython.display import display

# Table read by `main` with `pd.read_table`.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
SRC = '/Users/petrochuk/sync/pytorch-seq2seq/data/simple_qa/dev.tsv'

def answer_question(question):
    """
    Answer a question by picking the best-scoring linked object over the top
    three predicted predicates, printing a trace of every step.

    The object text is extracted once with `get_object`; it is then linked with
    `get_object_link` once per candidate predicate, and the candidate whose top
    hit has the highest score (strictly greater than 0 and than any earlier
    candidate) wins.

    Args:
        question (str)
    Returns:
        mid (str or None): MID of the winning linked object, None if nothing linked
        predicate_id (str or None): predicate that produced the winning link
    """
    print()
    question = preprocess(question)
    print('Question:', question)
    candidate_predicates = get_predicate_id(question, top=3)
    print('Predicate QIDs:', candidate_predicates)
    object_, object_confidence = get_object(question)
    print('Object:', object_, '| Confidence:', object_confidence)

    # Best candidate seen so far; the score starts at 0 so only positive scores can win.
    best_mid = best_name = best_predicate_id = best_subject = None
    best_score = 0
    for predicate_id, _predicate_confidence in candidate_predicates:
        mid, score, name, facts = get_object_link(object_, predicate_id)
        print('Object:', name)
        print('Object Score:', score)
        print('Predicate:', predicate_id)
        print('')
        if not (mid and score > best_score):
            continue
        # Keep the facts whose property string occurs inside the predicate id.
        best_subject = [fact for fact in facts if fact['property'] in predicate_id]
        best_mid, best_score, best_name = mid, score, name
        best_predicate_id = predicate_id

    print('Object MID:', best_mid)
    print('Object Score:', best_score)
    print('Object Name:', best_name)
    print('Top Predicate:', best_predicate_id)
    if best_subject:
        print('Top Subject:', best_subject[0]['subjects'])
    return best_mid, best_predicate_id

def main():
    """
    Evaluate `answer_question` on every row of the table at SRC.

    Reads SRC with `pd.read_table`, displays its first rows, then for each row
    answers the 'Question EN' text and prints 'CORRECT!' or a 'FAIL!' report
    comparing the predicted MID / predicate with the row's 'Object MID' and
    'Freebase Property' columns. Nothing is written back to disk.
    """
    data = pd.read_table(SRC)
    display(data.head())
    
    def add_answers(row):
        """ Answer one row's question and print how it compares with the gold labels. """
        question = row['Question EN']
        mid, predicate_id = answer_question(question)
        # Gold MIDs carry a 'www.freebase.com/m/' prefix that predicted MIDs lack.
        object_mid = row['Object MID'].replace('www.freebase.com/m/', '').strip()
        if not mid or mid != object_mid or row['Freebase Property'] != predicate_id:
            print('FAIL!')
            if mid != object_mid:
                print('WRONG MID:', mid)
            if row['Freebase Property'] != predicate_id:
                print('WRONG PROP:', predicate_id)
            print('Correct MID:', object_mid)
            print('Correct Object:', row['Object EN'])
            print('Correct Subject:', row['Subject EN'])
            print('Correct Subject MID:', row['Subject MID'])
            print('Correct Property:', row['Freebase Property'])
        else:
            print('CORRECT!')
    
    # Iterate explicitly rather than via `DataFrame.apply`: the callback only has
    # side effects (its result was an all-None Series that overwrote `data`), and
    # older pandas releases may call an applied function twice on the first row,
    # which would answer and report that question twice.
    for _, row in data.iterrows():
        add_answers(row)
    print('Done!')

main()

Unnamed: 0,Object MID,Freebase Property,Subject MID,Question,WikiData Property,Subject EN,Object EN,Question FR,Question EN,Object EN Mask
0,www.freebase.com/m/0f3xg_,www.freebase.com/symbols/namesake/named_after,www.freebase.com/m/0cqt90,Who was the trump ocean club international hot...,P138,Donald Trump,Trump Ocean Club International Hotel and Tower,qui était l'hôtelier et la tour internationale...,Who was the trump ocean club international hot...,c c c e e e e e e e c c
1,www.freebase.com/m/07f3jg,www.freebase.com/people/person/place_of_birth,www.freebase.com/m/0565d,where was sasha vujačić born,P19,Maribor,Sasha Vujačić,où est né sasha vuja?i?,where was sasha vujačić born,c c e e c
2,www.freebase.com/m/031j8nn,www.freebase.com/music/release/region,www.freebase.com/m/07ssc,What is a region that dead combo was released in,,United Kingdom,Dead Combo,quelle est la région dans laquelle le combo mo...,What is a region that dead combo was released in,c c c c c e e c c c
3,www.freebase.com/m/0c1cyhd,www.freebase.com/film/director/film,www.freebase.com/m/0wxsz5y,What is a film directed by wiebke von carolsfeld?,inverse:P57,Stay,Wiebke von Carolsfeld,Qu'est-ce qu'un film réalisé par wiebke von ca...,What is a film directed by wiebke von carolsfeld?,c c c c c c e e e
4,www.freebase.com/m/0fvhc0g,www.freebase.com/music/release/region,www.freebase.com/m/0345h,what country was music for stock exchange rel...,,Germany,Music for Stock Exchange,quel pays était la musique de la bourse publié...,what country was music for stock exchange rel...,c c c e e e e c c



Question: who was the trump ocean club international hotel and tower named after
Predicate QIDs: [('www.freebase.com/symbols/namesake/named_after', 0.9943095652158973), ('www.freebase.com/symbols/name_source/namesakes', 0.0014407299269798557), ('www.freebase.com/medicine/manufactured_drug_form/manufacturer', 0.0007310642166082501)]
Object: trump ocean club international hotel and tower | Confidence: 0.9900202416305273
trump ocean club international hotel and tower symbols/namesake/named_after
Object: Trump Ocean Club International Hotel and Tower
Object Score: 29.844212
Predicate: www.freebase.com/symbols/namesake/named_after

trump ocean club international hotel and tower symbols/name_source/namesakes
Object: Donald Trump
Object Score: 9.961305
Predicate: www.freebase.com/symbols/name_source/namesakes

trump ocean club international hotel and tower medicine/manufactured_drug_form/manufacturer
Object: Hydrocodone Bitartrate And Acetaminophen 750/7.5 tablet
Object Score: 2.3830202
Pred

Object: Seymour Parker Gilbert
Object Score: 14.147825
Predicate: www.freebase.com/common/topic/notable_types

seymour parker gilbert's people/person/profession
Object: Seymour Parker Gilbert
Object Score: 14.147825
Predicate: www.freebase.com/people/person/profession

seymour parker gilbert's fictional_universe/fictional_character/occupation
Object: Jennifer Parker
Object Score: 7.9854445
Predicate: www.freebase.com/fictional_universe/fictional_character/occupation

Object MID: 02p_vkx
Object Score: 14.147825
Object Name: Seymour Parker Gilbert
Top Predicate: www.freebase.com/common/topic/notable_types
Top Subject: ['02xlh55']
FAIL!
WRONG PROP: www.freebase.com/common/topic/notable_types
Correct MID: 02p_vkx
Correct Object: Seymour Parker Gilbert
Correct Subject: Lawyer
Correct Subject: www.freebase.com/m/04gc2
Correct Property: www.freebase.com/people/person/profession

Question: what does (12385) 1994 uo orbit
Predicate QIDs: [('www.freebase.com/astronomy/orbital_relationship/orbits

Object: Cefotaxime sodium 2 injection
Object Score: 4.2260833
Predicate: www.freebase.com/medicine/drug_formulation/formulation_of

us route 2 medicine/drug_formulation/routes
Object: Benzocaine benzalkonium chloride 2/2 cloth
Object Score: 4.765423
Predicate: www.freebase.com/medicine/drug_formulation/routes

Object MID: 0j9xz
Object Score: 12.089397
Object Name: US Route 101
Top Predicate: www.freebase.com/transportation/road/major_cities
Top Subject: ['0f04v', '0zdx3', '030qb3t', '0fw1y', '0d6lp']
FAIL!
WRONG MID: 0j9xz
Correct MID: 029j_m
Correct Object: U.S. Route 2
Correct Subject: Kalispell
Correct Subject: www.freebase.com/m/0x1vt
Correct Property: www.freebase.com/transportation/road/major_cities

Question: who was a child of mithibai jinnah
Predicate QIDs: [('www.freebase.com/people/person/children', 0.9992166773584613), ('www.freebase.com/fictional_universe/fictional_character/children', 0.00039051100431127607), ('www.freebase.com/people/deceased_person/place_of_death', 9.46

KeyboardInterrupt: 