In [55]:
import os
import psycopg2
import psycopg2.extras

# Load .env file
pass_ = {}
for line in open('../../.pass'):
    split = line.strip().split('=')
    pass_[split[0]] = split[1]

# Connect    
connection = psycopg2.connect(dbname=pass_['DB_NAME'],
                              port=pass_['DB_PORT'],
                              user=pass_['DB_USER'],
                              host=pass_['DB_HOST'],
                              password=pass_['DB_PASS'])
cursor = connection.cursor()

In [11]:
simple_qa = {
    'train': '../../data/SimpleQuestions_v2/annotated_fb_data_train.txt',
    'dev': '../../data/SimpleQuestions_v2/annotated_fb_data_valid.txt',
    'test': '../../data/SimpleQuestions_v2/annotated_fb_data_test.txt',
}

In [18]:
import sys
sys.path.insert(0, '../../')

import math
import pprint
import re
import unicodedata

from lib.checkpoint import Checkpoint


RELATION_CLASSIFIER = '../../results/0626.11-05_08:38:02.relation_classifier/11m_05d_08h_45m_57s.pt'
    
def remove_accents(text):
    # https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    nkfd_form = unicodedata.normalize('NFKD', text)
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])

def preprocess_question(s):
    s = s.replace("'s", '')
    s = remove_accents(s)
    s = s.split()
    s = [re.sub('^\W+', '', w) for w in s]  # Strip prefix none alphanumeric
    s = [re.sub('\W+$', '', w) for w in s]  # Strip leading none alphanumeric
    s = [w.strip().lower() for w in s]
    s = [w for w in s if len(w) != 0] # Remove Empty
    return ' '.join(s)

relation_classifier_predict = Checkpoint(checkpoint_path=RELATION_CLASSIFIER).predict

def get_relation(question, top_k=3):
    """ 
    Given a question return the predicate in the question using `RELATION_CLASSIFIER` model.
    
    Args:
        question (str)
    Returns:
        list of predicates and their confidence
    """
    question = preprocess_question(question)
    predicted = list(relation_classifier_predict(question, top_k=top_k))
    if top_k == 1:
            predicted[1] = sum(predicted[1]) / len(predicted[1]) 
            return [predicted]
    return [(class_, sum(confidence) / len(confidence)) for class_, confidence in predicted]

# To test this cell
print(get_relation('Where was Obama born?', 10))

[('www.freebase.com/people/person/place_of_birth', 1.0), ('www.freebase.com/people/person/nationality', 1.8365256779269813e-08), ('www.freebase.com/location/location/people_born_here', 5.0509226076001125e-09), ('www.freebase.com/music/album/album_content_type', 1.8572093479922836e-09), ('www.freebase.com/music/album/genre', 1.0259027594091375e-09), ('www.freebase.com/fictional_universe/fictional_character/place_of_birth', 4.3435644844782266e-10), ('www.freebase.com/people/deceased_person/place_of_death', 3.6899369947368274e-10), ('www.freebase.com/medicine/drug_formulation/drug_category', 1.7813512957546355e-10), ('www.freebase.com/media_common/netflix_title/netflix_genres', 8.754611452654275e-11), ('www.freebase.com/people/profession/people_with_this_profession', 6.442428900204096e-11)]




In [13]:
from IPython.display import display
import pandas as pd
from sklearn.utils import shuffle

names = ['subject', 'relation', 'object', 'question']
df_train = pd.read_table(simple_qa['train'], header=None, names=names)
df_dev = pd.read_table(simple_qa['dev'], header=None, names=names)
df_train = shuffle(df_train, random_state=123)
df_dev = shuffle(df_dev, random_state=123)
display(df_dev[:5])

Unnamed: 0,subject,relation,object,question
6219,www.freebase.com/m/03k3r,www.freebase.com/biology/organism_classificati...,www.freebase.com/m/0bs56bp,Name an American Thoroughbread racehorse
3364,www.freebase.com/m/02qlppc,www.freebase.com/cvg/computer_videogame/cvg_genre,www.freebase.com/m/01sjng,what kind of game is vision racing driving sim...
9374,www.freebase.com/m/02l7c8,www.freebase.com/tv/tv_genre/programs,www.freebase.com/m/0dlmm88,what tv program is romance film
10142,www.freebase.com/m/049_zj3,www.freebase.com/location/location/containedby,www.freebase.com/m/04rrx,what state is polaski located in
97,www.freebase.com/m/02w9ycr,www.freebase.com/people/deceased_person/cause_...,www.freebase.com/m/0qcr0,what disease claimed the life of fern emmett


In [14]:
from nltk.tokenize.treebank import TreebankWordTokenizer

tokenize = TreebankWordTokenizer().tokenize

def preprocess(row):
    row['subject'] = row['subject'].strip().replace('www.freebase.com/m/', '')
    row['object'] = row['object'].strip().replace('www.freebase.com/m/', '')
    question = row['question'].strip()
    row['question'] = ' '.join(tokenize(question.lower()))
    row['relation'] = row['relation'].strip().replace('www.freebase.com/', '')
    return row

df_dev = df_dev.apply(preprocess, axis=1)
display(df_dev[:5])
df_train = df_train.apply(preprocess, axis=1)

Unnamed: 0,subject,relation,object,question
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,name an american thoroughbread racehorse
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett


In [15]:
from numpy import nan

def add_subject_name(row):
    rows = []
    for mid in [row['subject'], row['object']]:
        sql = """SELECT alias FROM fb_name
                 WHERE mid='{mid}'""".format(mid=mid)
        cursor.execute(sql)
        rows += cursor.fetchall()
    if len(rows) == 0:
        return nan
    # Sort by largest name first
    names = sorted([row[0].strip().lower() for row in rows], key=lambda n: len(n), reverse=True)
    for name in names:
        if name in row['question']:
            return name
    return nan

df_dev['subject_name'] = df_dev.apply(add_subject_name, axis=1)
display(df_dev[:5])
df_train['subject_name'] = df_train.apply(add_subject_name, axis=1)

Unnamed: 0,subject,relation,object,question,subject_name
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,name an american thoroughbread racehorse,horse
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...,vision racing driving simulator
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film,romance film
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in,polaski
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett,fern emmett


In [77]:
from numpy import nan

def add_relations(row):
    relations = get_relation(row['question'], 1)
    return {relation.strip().replace('www.freebase.com/', ''): confidence for relation, confidence in relations}

df_dev['relations'] = df_dev.apply(add_relations, axis=1)
display(df_dev[:5])
df_train['relations'] = df_train.apply(add_relations, axis=1)

Unnamed: 0,subject,relation,object,question,subject_name,relations
6219,03k3r,biology/organism_classification/organisms_of_t...,0bs56bp,name an american thoroughbread racehorse,horse,{'biology/organism_classification/organisms_of...
3364,02qlppc,cvg/computer_videogame/cvg_genre,01sjng,what kind of game is vision racing driving sim...,vision racing driving simulator,{'cvg/computer_videogame/cvg_genre': 1.0}
9374,02l7c8,tv/tv_genre/programs,0dlmm88,what tv program is romance film,romance film,{'tv/tv_genre/programs': 0.9999465956643435}
10142,049_zj3,location/location/containedby,04rrx,what state is polaski located in,polaski,{'location/location/containedby': 1.0}
97,02w9ycr,people/deceased_person/cause_of_death,0qcr0,what disease claimed the life of fern emmett,fern emmett,{'people/deceased_person/cause_of_death': 0.98...


In [78]:
from tqdm import tqdm_notebook
from random import sample
import math

flatten = lambda l: [item for sublist in l for item in sublist]

def get_candidates(df):
    count = 0
    ret = []
    n_rows = df.shape[0]
    for index, row in tqdm_notebook(df.iterrows(), total=n_rows):
        # CHECK: Should dev have the below filter?
        if not pd.isnull(row['subject_name']):
            relation_map = row['relations']
            cursor.execute("""SELECT fb_name.alias, fb_kg.subject_mid, array_agg(fb_kg.relation)
                              FROM fb_kg
                              LEFT JOIN fb_name
                              ON subject_mid = mid
                              WHERE alias LIKE %s
                              GROUP BY fb_kg.subject_mid, fb_name.alias""", (row['subject_name'],))
            rows = cursor.fetchall()
            all_relations = flatten([relations for _, _, relations in rows])
            total_facts = float(len(all_relations))
            total_relations = float(len(set(all_relations)))
            labels = []
            feature_rows = []
            if len(rows) == 1:
                count += 1
            for alias, subject_mid, relations in rows:
                n_facts = len(relations) / total_facts
                n_relations = len(set(relations)) / total_relations
                for relation in set(relations):
                    if relation in relation_map:
                        confidence = math.log(relation_map[relation])
                        label = subject_mid == row['subject'] and relation == row['relation']
                        labels.append(label)
                        feature_rows.append([n_facts, n_relations, confidence])
            assert sum(labels) <= 1, 'n true: %d' % sum(labels) # Only one True
            ret.append({
                'rows': len(rows),
                'total_facts': total_facts,
                'total_relations': total_relations,
                'feature_rows': feature_rows,
                'labels': labels,
            })
    print('Count:', count)
    print('Count:', len([row for row in ret if len(row['labels']) == 1]))
    return ret

dev_candidates = get_candidates(df_dev.head(n=500))
print('Sample Candidates:', dev_candidates[:5])
train_candidates = get_candidates(df_train.head(n=4000))

Count: 300
Count: 330
Sample Candidates: [{'rows': 9, 'feature_rows': [[0.9681150359487339, 0.4, -0.0025396347045898546]], 'total_facts': 3199.0, 'labels': [True], 'total_relations': 65.0}, {'rows': 1, 'feature_rows': [[1.0, 1.0, 0.0]], 'total_facts': 3.0, 'labels': [True], 'total_relations': 3.0}, {'rows': 1, 'feature_rows': [[1.0, 1.0, -5.340576171878735e-05]], 'total_facts': 20198.0, 'labels': [True], 'total_relations': 15.0}, {'rows': 1, 'feature_rows': [[1.0, 1.0, 0.0]], 'total_facts': 3.0, 'labels': [True], 'total_relations': 1.0}, {'rows': 1, 'feature_rows': [[1.0, 1.0, -0.01611614227294927]], 'total_facts': 6.0, 'labels': [True], 'total_relations': 6.0}]





Exception in thread Thread-41:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




Count: 2310
Count: 3129


In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

train = flatten([row['feature_rows'] for row in train_candidates])
train_labels = flatten([row['labels'] for row in train_candidates])
dev = flatten([row['feature_rows'] for row in dev_candidates])
dev_labels = flatten([row['labels'] for row in dev_candidates])
model = LogisticRegression()
model.fit(train, train_labels)
dev_pred = model.predict(dev)
print(classification_report(dev_labels, dev_pred))

             precision    recall  f1-score   support

      False       0.99      1.00      0.99      9088
       True       0.89      0.82      0.86       371

avg / total       0.99      0.99      0.99      9459



In [80]:
correct = 0.0
for row in dev_candidates:
    if not row['feature_rows']:
        continue
    confidences = model.decision_function(row['feature_rows'])
    max_confidence = float("-inf")
    max_index = 0
    for i, confidence in enumerate(confidences):
        if confidence > max_confidence:
            max_confidence = confidence
            max_index = i
    if row['labels'][i]:
        correct += 1
print('Num Correct:', correct)
print('Accuracy:', correct / len(dev_candidates))

Num Correct: 320.0
Accuracy: 0.6625258799171843


In [70]:
len([row for row in dev_candidates if len(row['labels']) == 2])

133