In [1]:
import os
import random
from collections import Counter, defaultdict
from itertools import islice

import rel_ext
import utils

In [2]:
#set all the random seeds for reproducibility
#only system seed relevant here
utils.fix_random_seeds()

In [3]:
rel_ext_data_home = os.path.join('data', 'rel_ext_data')

# ***The Corpus***

In [4]:
corpus = rel_ext.Corpus(os.path.join(rel_ext_data_home, 'corpus.tsv.gz'))

print('Read {0:,} examples'.format(len(corpus)))

Read 331,696 examples


In [5]:
print(corpus.examples[1])

Example(entity_1='New_Mexico', entity_2='Arizona', left='to all Spanish-occupied lands . The horno has a beehive shape and uses wood as the only heat source . The procedure still used in parts of', mention_1='New Mexico', middle='and', mention_2='Arizona', right='is to build a fire inside the Horno and , when the proper amount of time has passed , remove the embers and ashes and insert the', left_POS='to/TO all/DT Spanish-occupied/JJ lands/NNS ./. The/DT horno/NN has/VBZ a/DT beehive/NN shape/NN and/CC uses/VBZ wood/NN as/IN the/DT only/JJ heat/NN source/NN ./. The/DT procedure/NN still/RB used/VBN in/IN parts/NNS of/IN', mention_1_POS='New/NNP Mexico/NNP', middle_POS='and/CC', mention_2_POS='Arizona/NNP', right_POS='is/VBZ to/TO build/VB a/DT fire/NN inside/IN the/DT Horno/NNP and/CC ,/, when/WRB the/DT proper/JJ amount/NN of/IN time/NN has/VBZ passed/VBN ,/, remove/VB the/DT embers/NNS and/CC ashes/NNS and/CC insert/VB the/DT')


In [6]:
#taking a closer look at one of the examples
ex = corpus.examples[1]

' '.join((ex.left, ex.mention_1, ex.middle, ex.mention_2, ex.right))

'to all Spanish-occupied lands . The horno has a beehive shape and uses wood as the only heat source . The procedure still used in parts of New Mexico and Arizona is to build a fire inside the Horno and , when the proper amount of time has passed , remove the embers and ashes and insert the'

In [7]:
#tallying how often each entity appears (in either slot) across the corpus
#and listing the 20 most frequent ones
counter = Counter()
for example in corpus.examples:
    counter.update((example.entity_1, example.entity_2))
print('The corpus contains {} entities'.format(len(counter)))
#(count, entity) pairs, highest count first; ties fall back to the entity name
counts = [(count, key) for key, count in counter.items()]
counts.sort(reverse=True)
print('The most common entities are:')
for count, key in counts[:20]:
    print('{:10d} {}'.format(count, key))

The corpus contains 95909 entities
The most common entities are:
      8137 India
      5240 England
      4121 France
      4040 Germany
      3937 Australia
      3779 Canada
      3633 Italy
      3138 California
      2894 New_York_City
      2745 Pakistan
      2213 New_Zealand
      2183 New_York
      2148 United_Kingdom
      2030 Spain
      2005 Japan
      1891 Russia
      1806 Philippines
      1748 Malaysia
      1721 Indonesia
      1670 China


In [8]:
#finding examples containing 'Elon Musk' and 'Tesla Motors'
#only shows examples where 'Elon Musk' is mentioned first and 'Tesla Motors' is mentioned second
corpus.show_examples_for_pair('Elon_Musk', 'Tesla_Motors')

The first of 5 examples for Elon_Musk and Tesla_Motors is:
Example(entity_1='Elon_Musk', entity_2='Tesla_Motors', left='space for a while , here ’ s what might be launching Americans into space in the next decade . Falcon 9 From sometimes Canadian , South African & American', mention_1='Elon Musk', middle='‘ s company Space X . Musk is a PayPal alumni and', mention_2='Tesla Motors', right='co-founder - remember that latter company name for future trivia questions and/or a remake of Back to the Future . After several successful launches on their Falcon', left_POS="space/NN for/IN a/DT while/NN ,/, here/RB '/'' s/VBZ what/WP might/MD be/VB launching/VBG Americans/NNPS into/IN space/NN in/IN the/DT next/JJ decade/NN ./. Falcon/NNP 9/CD From/IN sometimes/RB Canadian/JJ ,/, South/JJ African/NNP &/CC American/NNP", mention_1_POS='Elon/NNP Musk/NNP', middle_POS='`/`` s/NNS company/NN Space/NN X/NN ./. Musk/NNP is/VBZ a/DT PayPal/NNP alumni/NNS and/CC', mention_2_POS='Tesla/NNP Motors/NNPS', r

In [9]:
#examples that have the above entities in reverse order
corpus.show_examples_for_pair('Tesla_Motors', 'Elon_Musk')

The first of 2 examples for Tesla_Motors and Elon_Musk is:
Example(entity_1='Tesla_Motors', entity_2='Elon_Musk', left='their factory in Hethel . If you want to see one in action , Robert Scoble got a ride in the first production model , driven by', mention_1='Tesla Motors', middle='chairman', mention_2='Elon Musk', right='. Needless to say he got the whole thing on video , and covers a lot of technical details about the car – this is the', left_POS='their/PRP$ factory/NN in/IN Hethel/NNP ./. If/IN you/PRP want/VBP to/TO see/VB one/CD in/IN action/NN ,/, Robert/NNP Scoble/NNP got/VBD a/DT ride/NN in/IN the/DT first/JJ production/NN model/NN ,/, driven/VBN by/IN', mention_1_POS='Tesla/NNP Motors/NNPS', middle_POS='chairman/NN', mention_2_POS='Elon/NNP Musk/NNP', right_POS='./. Needless/JJ to/TO say/VB he/PRP got/VBD the/DT whole/JJ thing/NN on/IN video/NN ,/, and/CC covers/VBZ a/DT lot/NN of/IN technical/JJ details/NNS about/IN the/DT car/NN --/: this/DT is/VBZ the/DT')


# ***The Knowledge Base***

*The data distribution was ultimately derived from https://freebase-easy.cs.uni-freiburg.de/dump/ . The KB is a collection of relation triples of the form (relation, subject, object), such as:*


*   (place_of_birth, Barack_Obama, Honolulu)
*   (has_spouse, Barack_Obama, Michelle_Obama)
*   (author, The_Audacity_of_Hope, Barack_Obama) 

*The class makes it easy and efficient to look up triples in the KB using both relations and entities.*

In [10]:
kb = rel_ext.KB(os.path.join(rel_ext_data_home, 'kb.tsv.gz'))

print('Read {0:,} KB triples'.format(len(kb)))

Read 45,884 KB triples


In [11]:
#how many relations?
len(kb.all_relations)

16

In [12]:
#how many triples does each relation contain?
for rel in kb.all_relations:
    print('{:12d} {}'.format(len(kb.get_triples_for_relation(rel)), rel))

        1702 adjoins
        2671 author
         522 capital
       18681 contains
        3947 film_performance
        1960 founders
         824 genre
        2563 has_sibling
        2994 has_spouse
        2542 is_a
        1598 nationality
        1586 parents
        1097 place_of_birth
         831 place_of_death
        1216 profession
        1150 worked_at


In [13]:
#looking at one example from each relation to see what they mean
for rel in kb.all_relations:
    print(tuple(kb.get_triples_for_relation(rel)[0]))

('adjoins', 'France', 'Spain')
('author', 'Uncle_Silas', 'Sheridan_Le_Fanu')
('capital', 'Panama', 'Panama_City')
('contains', 'Brickfields', 'Kuala_Lumpur_Sentral_railway_station')
('film_performance', 'Colin_Hanks', 'The_Great_Buck_Howard')
('founders', 'Lashkar-e-Taiba', 'Hafiz_Muhammad_Saeed')
('genre', '8_Simple_Rules', 'Sitcom')
('has_sibling', 'Ari_Emanuel', 'Rahm_Emanuel')
('has_spouse', 'Percy_Bysshe_Shelley', 'Mary_Shelley')
('is_a', 'Bhanu_Athaiya', 'Costume_designer')
('nationality', 'Ruben_Rausing', 'Sweden')
('parents', 'Rosanna_Davison', 'Chris_de_Burgh')
('place_of_birth', 'William_Penny_Brookes', 'Much_Wenlock')
('place_of_death', 'Jean_Drapeau', 'Montreal')
('profession', 'Rufus_Wainwright', 'Actor')
('worked_at', 'Brian_Greene', 'Columbia_University')


In [14]:
#kb.get_triples_for_entities() method allows us to look up triples by the entities they contain. 
#using it to see what relation(s) hold between France and Germany
kb.get_triples_for_entities('France', 'Germany')

[KBTriple(rel='adjoins', sbj='France', obj='Germany')]

In [15]:
#most relations in the KB are asymmetric
#but some relations, like 'adjoins' and 'has_sibling', are symmetric, so the reversed triple is also in the KB
kb.get_triples_for_entities('Germany', 'France')

[KBTriple(rel='adjoins', sbj='Germany', obj='France')]

In [16]:
#there might be cases where there is more than one relation, even in one direction
#for instance, Ptolemy XIII was both the brother and the husband of Cleopatra
kb.get_triples_for_entities('Cleopatra', 'Ptolemy_XIII_Theos_Philopator')

[KBTriple(rel='has_sibling', sbj='Cleopatra', obj='Ptolemy_XIII_Theos_Philopator'),
 KBTriple(rel='has_spouse', sbj='Cleopatra', obj='Ptolemy_XIII_Theos_Philopator')]

In [17]:
#the distribution of entities in the KB:
#each triple contributes one count to its subject and one to its object
counter = Counter()
for kbt in kb.kb_triples:
    counter.update((kbt.sbj, kbt.obj))
print('The KB contains {:,} entities'.format(len(counter)))
#(count, entity) pairs, highest count first; ties fall back to the entity name
counts = [(count, key) for key, count in counter.items()]
counts.sort(reverse=True)
print('The most common entities are:')
for count, key in counts[:20]:
    print('{:10d} {}'.format(count, key))

The KB contains 40,141 entities
The most common entities are:
       945 England
       786 India
       438 Italy
       414 France
       412 California
       400 Germany
       372 United_Kingdom
       366 Canada
       302 New_York_City
       247 New_York
       236 Australia
       219 Philippines
       215 Japan
       212 Scotland
       208 Russia
       198 Actor
       172 Pakistan
       170 Ontario
       169 Ireland
       168 New_Zealand


## ***Joining the corpus and the KB***
In order to leverage the distant supervision paradigm, we'll need to connect information in the corpus with information in the KB. There are two possibilities, depending on how we formulate our prediction problem:

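1. Use the KB to generate labels for the corpus. If our problem is to classify a pair of entity mentions in a specific example in the corpus, then we can use the KB to provide labels for training examples.

2. Use the corpus to generate features for entity pairs. If instead our problem is to classify a pair of entities, then we can use all the corpus examples in which the two entities co-occur to build a feature representation of the pair.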

We'll formulate our prediction problem such that the input is a pair of entities, and the goal is to predict what relation(s) the pair belongs to. The KB will provide the labels, and the corpus will provide the features.

We've created a Dataset class which combines a corpus and a KB, and provides a variety of convenience methods for the dataset.

In [18]:
dataset = rel_ext.Dataset(corpus, kb)

In [19]:
#determining how many corpus examples we have for each triple in the KB
#computing averages per relation
dataset.count_examples()

                                             examples
relation               examples    triples    /triple
--------               --------    -------    -------
adjoins                   58854       1702      34.58
author                    11768       2671       4.41
capital                    7443        522      14.26
contains                  75952      18681       4.07
film_performance           8994       3947       2.28
founders                   5846       1960       2.98
genre                      1576        824       1.91
has_sibling                8525       2563       3.33
has_spouse                12013       2994       4.01
is_a                       5112       2542       2.01
nationality                3403       1598       2.13
parents                    3802       1586       2.40
place_of_birth             1657       1097       1.51
place_of_death             1523        831       1.83
profession                 1851       1216       1.52
worked_at                  3

***Negative Instances :***
In order to apply the distant supervision paradigm, we also need some negative instances (entity pairs which do not belong to any known relation).
We assign these entity pairs the special relation called `No_Relation`

In [20]:
#collect the entity pairs that co-occur in the corpus but share no KB relation
unrelated_pairs = dataset.find_unrelated_pairs()
print('Found {0:,} unrelated pairs, including:'.format(len(unrelated_pairs)))
#islice yields the first 10 pairs directly, so the whole collection
#is not copied into a list just to preview a handful of items
for pair in islice(unrelated_pairs, 10):
    print('   ', pair)

Found 247,405 unrelated pairs, including:
    ('Aratus', 'Seneca_the_Younger')
    ('Rocky_Mountains', 'Czech_Republic')
    ('Estate_agent', 'Canada')
    ('Jungfrau', 'Zermatt')
    ('Charles_Eames', 'Gregory_Ain')
    ('Guwahati', 'Ahmedabad')
    ('Sarah', 'Book_of_Genesis')
    ('BRCA1', 'BRCA2')
    ('Cuddalore', 'Chittoor')
    ('Heinz_von_Foerster', 'Hans_Moravec')


Our prediction problem is a multi-label classification. 

There are a number of ways to approach multi-label classification, but the most obvious is the binary relevance method, which factors multi-label classification over n labels into n independent binary classification problems, one for each label.

*Disadvantage:* by treating the binary classification problems as independent, it fails to exploit correlations between labels. 


So our problem will be to take as input an entity pair and a candidate relation (label), and to return a binary prediction as to whether the entity pair belongs to the relation.

# ***Building Datasets***

We'll now have a function to build datasets that are suitable for training and evaluating the predictive models. Characteristics of the datasets:


*   Since our problem has been formulated as multi-label classification, we'll train a separate model for each relation. So instead of a single dataset, we'll build one dataset per relation.

*  The dataset for each relation will consist of two parallel lists:
        1.   A list of candidate `KBTriples` which combine the given relation with a pair of entities.
        2.   A corresponding list of boolean labels indicating whether the given `KBTriple` belongs to the KB.



In [21]:
kbts_by_rel, labels_by_rel = dataset.build_dataset(
    include_positive=True, sampling_rate=0.1, seed=1)

In [22]:
print(kbts_by_rel['adjoins'][0], labels_by_rel['adjoins'][0])

KBTriple(rel='adjoins', sbj='France', obj='Spain') True


In [23]:
print(kbts_by_rel['capital'][637], labels_by_rel['capital'][637])

KBTriple(rel='capital', sbj='Sigmund_Freud', obj='Magnus_Hirschfeld') False


# ***Splitting the Data***
tiny split = 1%

train split= 74%

dev split= 25%

In [24]:
splits = dataset.build_splits(
    split_names=['tiny', 'train', 'dev'],
    split_fracs=[0.01, 0.74, 0.25],
    seed=1)

splits

{'all': Corpus with 331,696 examples; KB with 45,884 triples,
 'dev': Corpus with 79,219 examples; KB with 11,210 triples,
 'tiny': Corpus with 3,474 examples; KB with 445 triples,
 'train': Corpus with 249,003 examples; KB with 34,229 triples}

# ***Evaluating***

In [25]:
def lift(f):
    """Turn a one-item function into one that works on a whole collection.

    The returned callable applies `f` to every element of its argument,
    in iteration order, and returns the results as a list.
    """
    def lifted(xs):
        results = []
        for x in xs:
            results.append(f(x))
        return results
    return lifted

def make_random_classifier(p=0.50):
    """Build a baseline classifier that ignores its input.

    Each item is labelled True when a fresh `random.random()` draw is
    below `p`. The per-item function is wrapped with `lift`, so the
    returned callable takes a collection and returns a list of booleans.
    """
    def random_classify(kb_triple):
        # The triple itself is never inspected; only the random draw matters.
        draw = random.random()
        return draw < p
    return lift(random_classify)

In [26]:
def find_common_middles(split, top_k=3, show_output=False):
    """Find the most frequent `middle` strings for each relation.

    For every triple in the split's KB, the corpus examples for the
    triple's entity pair are looked up in both orders: examples for
    (sbj, obj) count toward 'fwd', examples for (obj, sbj) toward 'rev'.
    The `middle` of each example is tallied per direction and relation.

    Parameters
    ----------
    split : object exposing `corpus` and `kb` attributes
    top_k : int
        Number of most frequent middles kept per relation and direction.
    show_output : bool
        If True, print one line per retained middle.

    Returns
    -------
    dict
        Keys 'fwd' and 'rev'; each value maps a relation to the set of
        its `top_k` most frequent middles.
    """
    corpus, kb = split.corpus, split.kb
    directions = ('fwd', 'rev')
    mids_by_rel = {
        direction: defaultdict(lambda: defaultdict(int))
        for direction in directions}

    # Pass 1: tally the middles per direction and relation.
    for rel in kb.all_relations:
        for kbt in kb.get_triples_for_relation(rel):
            entity_orders = (
                ('fwd', kbt.sbj, kbt.obj),
                ('rev', kbt.obj, kbt.sbj))
            for direction, first, second in entity_orders:
                tally = mids_by_rel[direction][rel]
                for ex in corpus.get_examples_for_entities(first, second):
                    tally[ex.middle] += 1

    def most_frequent(mid_counter):
        # Sorting (count, middle) pairs in reverse puts the highest count
        # first; equal counts are ordered by the middle string, descending.
        ranked = [(cnt, mid) for mid, cnt in mid_counter.items()]
        ranked.sort(reverse=True)
        return ranked[:top_k]

    # Pass 2: replace every tally with the set of its top_k middles.
    for rel in kb.all_relations:
        for direction in directions:
            top = most_frequent(mids_by_rel[direction][rel])
            if show_output:
                for cnt, mid in top:
                    print('{:20s} {:5s} {:10d} {:s}'.format(rel, direction, cnt, mid))
            mids_by_rel[direction][rel] = {mid for _, mid in top}
    return mids_by_rel

_ = find_common_middles(splits['train'], show_output=True)

adjoins              fwd         7667 ,
adjoins              fwd         5134 and
adjoins              fwd          903 , and
adjoins              rev         4582 ,
adjoins              rev         3000 and
adjoins              rev          507 , and
author               fwd         1007 by
author               fwd          124 ,
author               fwd          105 , by
author               rev          816 's
author               rev          210 ‘ s
author               rev          142 ’ s
capital              fwd           33 ,
capital              fwd           17 , after
capital              fwd           14 in
capital              rev         2506 ,
capital              rev          121 in
capital              rev           73 , the capital of
contains             fwd          319 's
contains             fwd          296 ,
contains             fwd          211 (
contains             rev        18511 ,
contains             rev         4160 in
contains             rev          

In [27]:
def train_top_k_middles_classifier(top_k=3):
    """Build a classifier from the most frequent middles of the train split.

    Reads the module-level `splits` and uses `find_common_middles` on
    `splits['train']` to get the `top_k` middles per relation and direction.

    The returned callable (wrapped with `lift`) takes a collection of KB
    triples and returns one boolean per triple. A triple is True when some
    train-corpus example for (sbj, obj) has a middle among the relation's
    'fwd' middles, or some example for (obj, sbj) has a middle among its
    'rev' middles.
    """
    train_split = splits['train']
    train_corpus = train_split.corpus
    top_mids = find_common_middles(split=train_split, top_k=top_k)

    def classify(kb_triple):
        lookups = (
            (kb_triple.sbj, kb_triple.obj, top_mids['fwd'][kb_triple.rel]),
            (kb_triple.obj, kb_triple.sbj, top_mids['rev'][kb_triple.rel]),
        )
        # The forward order is checked first; the reverse lookup only runs
        # when no forward example matched.
        return any(
            ex.middle in accepted
            for first, second, accepted in lookups
            for ex in train_corpus.get_examples_for_entities(first, second))

    return lift(classify)

In [28]:
rel_ext.evaluate(splits, train_top_k_middles_classifier())

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.272      0.285      0.274        407       7057
author                    0.325      0.078      0.198        657       7307
capital                   0.093      0.159      0.101        126       6776
contains                  0.593      0.064      0.223       4487      11137
film_performance          0.625      0.005      0.025        984       7634
founders                  0.148      0.038      0.094        469       7119
genre                     0.000      0.000      0.000        205       6855
has_sibling               0.261      0.176      0.238        625       7275
has_spouse                0.348      0.211      0.308        754       7404
is_a                      0.071      0.024      0.051        618       7268
nationality               0.120      0.036      0.082        386       7036
parents     

0.11179102963297724

# ***Building a Classifier***

**Featurizers**

Finds all the corpus examples containing the two entities in the `KBTriple`, breaks the phrase appearing between the two entity mentions into words, and counts the words.

In [29]:
#a simple bag of words featurizer
# no distinction between 'forward' and 'reverse' examples

def simple_bag_of_words_featurizer(kbt, corpus, feature_counter):
    """Count the words found between the two entity mentions.

    Corpus examples are looked up for the triple's entities in both orders,
    (sbj, obj) and then (obj, sbj), and both orders feed the same feature
    names. Each example's `middle` is split on single spaces, and every
    resulting token adds one to `feature_counter`.

    `feature_counter` is updated in place and is also the return value.
    """
    for first, second in ((kbt.sbj, kbt.obj), (kbt.obj, kbt.sbj)):
        for ex in corpus.get_examples_for_entities(first, second):
            tokens = ex.middle.split(' ')
            for token in tokens:
                feature_counter[token] += 1
    return feature_counter

How the featurizer works on a single example :

In [30]:
kbt = kb.kb_triples[0]

kbt

KBTriple(rel='contains', sbj='Brickfields', obj='Kuala_Lumpur_Sentral_railway_station')

In [31]:
corpus.get_examples_for_entities(kbt.sbj, kbt.obj)[0].middle

'it was just a quick 10-minute walk to'

In [32]:
simple_bag_of_words_featurizer(kb.kb_triples[0], corpus, Counter())

Counter({'10-minute': 1,
         'a': 1,
         'it': 1,
         'just': 1,
         'quick': 1,
         'the': 1,
         'to': 2,
         'walk': 1,
         'was': 1})

Converting the datasets of `KBTriples` into feature matrices so that ML algos provided by `sklearn` can be used :

In [33]:
kbts_by_rel, labels_by_rel = dataset.build_dataset()

featurized = dataset.featurize(kbts_by_rel, featurizers=[simple_bag_of_words_featurizer])

***Experiments***

In [34]:
train_result = rel_ext.train_models(
    splits,
    featurizers=[simple_bag_of_words_featurizer])

In [35]:
predictions, true_labels = rel_ext.predict(
    splits, train_result, split_name='dev')

In [36]:

rel_ext.evaluate_predictions(predictions, true_labels)

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.886      0.383      0.702        407       7057
author                    0.790      0.521      0.716        657       7307
capital                   0.633      0.246      0.481        126       6776
contains                  0.786      0.601      0.740       4487      11137
film_performance          0.815      0.579      0.754        984       7634
founders                  0.836      0.424      0.700        469       7119
genre                     0.547      0.171      0.380        205       6855
has_sibling               0.812      0.242      0.551        625       7275
has_spouse                0.885      0.336      0.666        754       7404
is_a                      0.650      0.210      0.458        618       7268
nationality               0.588      0.174      0.398        386       7036
parents     

0.5656246770981269

In [37]:
_ = rel_ext.experiment(
    splits,
    featurizers=[simple_bag_of_words_featurizer])

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.886      0.383      0.702        407       7057
author                    0.790      0.521      0.716        657       7307
capital                   0.633      0.246      0.481        126       6776
contains                  0.786      0.601      0.740       4487      11137
film_performance          0.815      0.579      0.754        984       7634
founders                  0.836      0.424      0.700        469       7119
genre                     0.547      0.171      0.380        205       6855
has_sibling               0.812      0.242      0.551        625       7275
has_spouse                0.885      0.336      0.666        754       7404
is_a                      0.650      0.210      0.458        618       7268
nationality               0.588      0.174      0.398        386       7036
parents     

# ***Analysis***

In [38]:
rel_ext.examine_model_weights(train_result)

Highest and lowest feature weights for relation adjoins:

     2.457 Taluks
     2.457 Valais
     2.403 Córdoba
     ..... .....
    -1.155 Ireland
    -1.157 America
    -1.166 for

Highest and lowest feature weights for relation author:

     2.852 author
     2.572 books
     2.433 writer
     ..... .....
    -2.463 Alice
    -3.000 Daisy
    -6.948 1865

Highest and lowest feature weights for relation capital:

     3.512 capital
     1.685 km
     1.579 posted
     ..... .....
    -1.328 and
    -1.350 state
    -1.996 Dehradun

Highest and lowest feature weights for relation contains:

     2.764 third-largest
     2.337 bordered
     2.110 attended
     ..... .....
    -2.235 band
    -2.479 who
    -6.034 Bronx

Highest and lowest feature weights for relation film_performance:

     4.073 starring
     3.762 opposite
     3.378 alongside
     ..... .....
    -2.079 spy
    -2.171 Tamil
    -3.571 Mohabbatein

Highest and lowest feature weights for relation founders:

     3.88

***Discovering New Relation Instances***

In [39]:
rel_ext.find_new_relation_instances(
    dataset,
    featurizers=[simple_bag_of_words_featurizer])

Highest probability examples for relation adjoins:

     1.000 KBTriple(rel='adjoins', sbj='Canada', obj='Vancouver')
     1.000 KBTriple(rel='adjoins', sbj='Vancouver', obj='Canada')
     1.000 KBTriple(rel='adjoins', sbj='Lahore', obj='Pakistan')
     1.000 KBTriple(rel='adjoins', sbj='Pakistan', obj='Lahore')
     1.000 KBTriple(rel='adjoins', sbj='Atlantic_Ocean', obj='Mexico')
     1.000 KBTriple(rel='adjoins', sbj='Mexico', obj='Atlantic_Ocean')
     1.000 KBTriple(rel='adjoins', sbj='Blue_Ridge_Mountains', obj='Appalachian_Mountains')
     1.000 KBTriple(rel='adjoins', sbj='Appalachian_Mountains', obj='Blue_Ridge_Mountains')
     1.000 KBTriple(rel='adjoins', sbj='Sicily', obj='Italy')
     1.000 KBTriple(rel='adjoins', sbj='Italy', obj='Sicily')

Highest probability examples for relation author:

     1.000 KBTriple(rel='author', sbj='Dante_Alighieri', obj='Divine_Comedy')
     1.000 KBTriple(rel='author', sbj='Comic_book', obj='Marvel_Comics')
     1.000 KBTriple(rel='author',