In [1]:
import requests
import re
import hashlib
from spacy import Language, util
from typing import List
from spacy.tokens import Doc, Span
from transformers import pipeline
import crosslingual_coreference
import spacy
from os.path import isfile
import os
import ftfy
import json
import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\t1rk9\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
path = "preprocessed-rebel/"

In [3]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})

    return triplets

In [4]:
@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        self.entity_mapping = {}
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def _generate_triplets(self, sent: Span) -> List[dict]:
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        for triplet in triplets:

            # Skip relationships, where the subject is the same as the object
            if triplet['head'] == triplet['tail']:
                continue

            # Regular expression search for subjects and objects
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)

            # Skip relationships, where both subject and object are not in text
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                doc._.rel[index] = {"relation": triplet["type"], "head_span": triplet['head'], "tail_span": triplet['tail']}

    def __call__(self, doc: Doc) -> Doc:
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [5]:
DEVICE = -1

# Define rel extraction model
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE,
    'model_name':'Babelscape/rebel-large'}
    )

<__main__.RebelComponent at 0x199d3031610>

In [6]:
def find_relations(par):
    all_relations = []
    doc = rel_ext(par)  
    for value, rel_dict in doc._.rel.items():
        all_relations.append(rel_dict)
    
    return all_relations

In [67]:
"""
        Function to apply preprocessing on a selection of files and store it in a separate folder
        - path: root folder (preprocessed-rebel), 
        - subf desired subfolder: AA or AB, must be passed as a string (e.g. "AA") 
        - files in subfolder AA: e.g. p_r_wiki_00 
        - files in subfolder AB: e.g. p_r_wiki_00
        - start: start number file, e.g. 0-99 (no need to fill in 00, 0 is fine)
        - end: end number file, e.g. 0-99
        - the range is inclusive which means, e.g. with (0, 0) you select & pre-process file wiki_00,
        - with (32, 50) you select file wiki_32 up till wiki_50
""" 
def rel_extraction_mul_files(path, subf=None, start=None, end=None):
    relations = {}
    if subf:
        # from file start to end
        for i in tqdm(range(start, end+1)):
            # to match the filename p_r_wiki_00 up till p_r_wiki_09 we add a zero in front of the number from user input if necessary
            if i < 10:
                i = "0" + str(i)
            # construct path to file name that falls within range
            f = path + subf + "/p_r_wiki_{}".format(i) 
            
            # if file exists
            if isfile(f):
                # OPEN FILE, GO THROUGH EACH PAR AND PASS THAT INTO COREF FUNCTION
                file = open(f, 'r', encoding='utf-8')
                doc = json.load(file)
                for k, v in doc.items():
                    v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                    # if paragraph has more than one word
                    if len(v.split(" ")) > 1:
                        relations[k] = find_relations(v)
                        
    else:
        for f in glob.glob('preprocessed-rebel/*/*'):
            # OPEN EACH FILE, GO THROUGH EACH PARAGRAPH AND PASS THAT INTO COREF FUNC
            file = open(f, 'r')
            doc = json.load(file)
            for k, v in doc.items():
                v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                # if paragraph has more than one word
                if len(v.split(" ")) > 1:
                    relations[k] = find_relations(v)
    
    # save to file
    with open('relations.txt', 'w') as convert_file:
        convert_file.write(json.dumps(relations))
        
    return relations

In [None]:
# OPT 1: SELECT SPECIFIC FILES TO FEED REBEL IN ONE PARTICULAR MAP / # OPT 2: FEED ALL FILES, in AA and AB BY ONLY KEEPING PATH IN THERE
relations = rel_extraction_mul_files(path, "AB", 0, 68) # rel_extraction_mul_files(path)

  7%|█████▌                                                                       | 5/69 [1:13:38<16:04:05, 903.84s/it]

In [39]:
#relations

LOAD EXTRACTED RELATIONS

In [11]:
new_f = open('example relations.txt', encoding="utf8").readlines()[0]
relations = json.loads(new_f)

REBEL OUTPUT TO DATAFRAME

In [12]:
# load evaluation set
evaluation_df = pd.read_csv('example.csv')
# unique indexes in EVAL set
indexes = evaluation_df['PAR ID'].unique()
# unique predicates in EVAL set
predicates = evaluation_df['PRED'].unique()

In [13]:
indexes

array(['620257-1', '620257-2', '651800-1', '651800-2', '825199-1',
       '825199-2', '825199-4', '892852-1', '892852-3', '892852-13',
       '980748-1', '980748-3', '980748-6', '998591-1', '998591-3',
       '998591-5', '1002526-1', '1002526-3', '1002526-13', '1051825-1',
       nan, '1051825-3', '1051825-4', '1051825-6', '1053071-1',
       '1053071-4', '1147236-1', '1147236-4', '1212157-1', '1212157-2',
       '1212157-4', '1330513-1', '1330513-2', '1368313-1', '1368313-2',
       '1368313-4', '1368313-6', '1370768-1', '1370768-2', '1385119-1',
       '1385119-4', '1385119-6', '1385119-7', '1385119-8', '1385119-10',
       '1396931-1', '1396931-3', '1396931-10', '1448006-1', '1448006-5',
       '1448006-9', '1498731-2', '1498731-3', '1498731-4', '1527871-1',
       '1527871-2', '1527871-3', '1573914-1', '1573914-2', '1581678-1',
       '1581678-4', '1581678-8', '1599940-1', '1599940-2', '1599940-4',
       '1651213-1', '1651213-2', '1651213-5', '1689979-1', '1689979-3',
       '1689

In [14]:
extract = dict()
for i in ['PAR ID', 'SUB', 'PRED', 'OBJ']:
    extract[i] = []


for key, list_of_dicts in relations.items():
    if key in indexes:
        for dic in list_of_dicts:
            extract['PAR ID'].append(key)
            extract['SUB'].append(dic['head_span'])
            extract['PRED'].append(dic['relation'])
            extract['OBJ'].append(dic['tail_span'])

# turn REBEL output to a DF
rebel_df = pd.DataFrame.from_dict(extract)

In [15]:
# all predicates found by REBEL
rebel_df['PRED'].unique()

array(['date of birth', 'occupation', 'educated at', 'field of work',
       'employer', 'country', 'country of citizenship', 'part of',
       'member of', 'parent organization', 'place of birth', 'founded by',
       'located in the administrative territorial entity', 'developer',
       'spouse', 'capital', 'child', 'father', 'award received',
       'sibling', 'date of death', 'participant in', 'winner',
       'headquarters location', 'said to be the same as',
       'diplomatic relation', 'followed by', 'inception', 'follows',
       'subclass of', 'instance of', 'work period (start)', 'author',
       'conferred by', 'designed by', 'used by', 'has part', 'subsidiary'],
      dtype=object)

In [16]:
# all predicates found in EVAL set
predicates

array(['date of birth', 'occupation', 'educated at', 'spouse',
       'place of birth', 'member of', 'founded by', nan, 'date of death'],
      dtype=object)

KEEP ONLY PREDICATES FROM EVAL SET

In [17]:
filtered_rebel_df = rebel_df[rebel_df['PRED'].isin(predicates)]
filtered_rebel_df

# remove par id so that we can filter duplicate triples later
filtered_rebel_df_id = filtered_rebel_df.set_index('PAR ID')
filtered_rebel_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,"University of California, Berkeley"
...,...,...,...
1775773-2,Charles Patrick 'Chuck' Thacker,date of birth,"February 26, 1943"
1775773-2,Ralph Scott Thacker,date of birth,1906
1775773-2,Charles Patrick 'Chuck' Thacker,date of birth,1922
1775773-3,Charles Patrick 'Chuck' Thacker,educated at,"University of California, Berkeley"


In [18]:
evaluation_df

# remove par id so that we can filter duplicate triples later
evaluation_df_id = evaluation_df.set_index('PAR ID')
evaluation_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,University of California
651800-1,Terence Aidan (Terry) Halpin,date of birth,1950s
...,...,...,...
1775773-1,"Charles Patrick ""Chuck"" Thacker",date of death,"June 12, 2017"
1775773-1,"Charles Patrick ""Chuck"" Thacker",occupation,computer designer
1775773-2,"Charles Patrick ""Chuck"" Thacker",place of birth,"Pasadena, California"
1775773-3,"Charles Patrick ""Chuck"" Thacker",educated at,"University of California, Berkeley"


REMOVE DUPLICATE TRIPLES

In [19]:
eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})

  eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})


In [20]:
eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})

  eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})


(OPTION 1) EVALUATION BASED ON CONTAINMENT ON EACH TRIPLET PART

In [21]:
new_true = [[str(element) for element in triple] for triple in eval_list_true_id]
new_pred = [[str(element) for element in triple] for triple in eval_list_pred_id]

In [22]:
correct = 0
for triple_pred in new_pred:
    for triple_true in new_true:
        if (triple_pred[0] in triple_true[0] or triple_true[0] in triple_pred[0]) and (triple_pred[1] in triple_true[1] or triple_true[1] in triple_pred[1]) and (triple_pred[2] in triple_true[2] or triple_true[2] in triple_pred[2]):
            print(triple_true)
            print(triple_pred)
            print()
            correct += 1

recall = correct/len(new_true)
precision = correct/len(new_pred)

['Silvio Micali', 'occupation', 'computer scientist']
['Silvio Micali', 'occupation', 'computer scientist']

['Adobe Systems', 'founded by', 'John Warnock']
['Adobe Systems', 'founded by', 'John Warnock']

['Bernard Chazelle', 'date of birth', 'November 5, 1955']
['Bernard Chazelle', 'date of birth', 'November 5, 1955']

['Gabor Tamas Herman', 'educated at', 'University of California, Berkeley']
['Gabor Tamas Herman', 'educated at', 'University of California, Berkeley']

['Silvio Micali', 'member of', 'American Academy of Arts and Sciences']
['Silvio Micali', 'member of', 'American Academy of Arts and Sciences']

['Thomas S. Ray', 'date of birth', 'September 21, 1954']
['Thomas S. Ray', 'date of birth', 'September 21, 1954']

['Peter Jay Weinberger', 'date of birth', 'August 6, 1942']
['Peter Jay Weinberger', 'date of birth', 'August 6, 1942']

['Silvio Micali', 'date of birth', 'October 13, 1954']
['Silvio Micali', 'date of birth', 'October 13, 1954']

['Silvio Micali', 'educated at',

In [23]:
print('Number of correct predictions: {}'.format(correct), 'Number of evaluation triples: {}'.format(len(new_true)))

print('Recall: {}'.format(recall))
print('Precision: {}'.format(precision))
print('F1: {}'.format(2*precision*recall/(precision+recall)))

Number of correct predictions: 87 Number of evaluation triples: 160
Recall: 0.54375
Precision: 0.7016129032258065
F1: 0.6126760563380281


SHOW ALL EVALUATION TRIPLETS THAT WERE NOT PREDICTED BY REBEL

In [24]:
missing_triples = []
for triple_true in new_true:
    triple_missing = True
    for triple_pred in new_pred:
        if (triple_pred[0] in triple_true[0] or triple_true[0] in triple_pred[0]) and (triple_pred[1] in triple_true[1] or triple_true[1] in triple_pred[1]) and (triple_pred[2] in triple_true[2] or triple_true[2] in triple_pred[2]):
            triple_missing = False
    if triple_missing:
        missing_triples.append(triple_true)

print('Number of articles not predicted by REBEL, but in Evaluation set: {}'.format(len(missing_triples)))
missing_triples

Number of articles not predicted by REBEL, but in Evaluation set: 78


[['Hendrik Pieter (Henk) Barendregt', 'place of birth', 'Amsterdam'],
 ['Andi (Andrei) Gutmans', 'occupation', 'entrepreneur'],
 ['John C. Hull', 'spouse', 'Michelle'],
 ['Diomidis D. Spinellis', 'occupation', 'author'],
 ['Gary Chevsky', 'occupation', 'entrepreneur'],
 ['Keith Oliver Geddes', 'member of', 'Association for Computing Machinery'],
 ['Charles Patrick "Chuck" Thacker', 'date of birth', 'February 26, 1943'],
 ['nan', 'nan', 'nan'],
 ['Terence Aidan (Terry) Halpin', 'educated at', 'University of Queensland'],
 ['Neil Immerman', 'educated at', 'Cornell University'],
 ['Jeffrey Outlaw Shallit', 'occupation', 'number theorist'],
 ['RealNetworks', 'founded by', 'Robert Denis Glaser'],
 ['Robert Stanley "Bob" Barton', 'educated at', 'University of Iowa'],
 ['Charles Patrick "Chuck" Thacker', 'date of death', 'June 12, 2017'],
 ['Charles Patrick "Chuck" Thacker', 'occupation', 'computer designer'],
 ['Frank Thomson "Tom" Leighton',
  'member of',
  'Computer Science and Artificial

SHOW ALL PREDICTED TRIPLETS THAT WERE NOT IN THE EVALUATION SET

In [25]:
missing_triples = []

for triple_pred in new_pred:
    triple_missing = True
    for triple_true in new_true:
        if (triple_pred[0] in triple_true[0] or triple_true[0] in triple_pred[0]) and (triple_pred[1] in triple_true[1] or triple_true[1] in triple_pred[1]) and (triple_pred[2] in triple_true[2] or triple_true[2] in triple_pred[2]):
            triple_missing = False
    if triple_missing:
        missing_triples.append(triple_pred)

print('Number of articles predicted by REBEL, but not in Evaluation set: {}'.format(len(missing_triples)))
missing_triples

Number of articles predicted by REBEL, but not in Evaluation set: 38


[['James Quentin Stafford-Fraser', 'educated at', 'Gonville and Caius'],
 ['Duane Call', 'educated at', 'University of the University of Utah'],
 ['Zeev Suraski', 'spouse', 'Andi Gutmans'],
 ['Bernard Chazelle', 'member of', 'ACM'],
 ['Bernard Chazelle', 'member of', 'NEC'],
 ['Zend Technologies', 'founded by', 'Zeev Suraski'],
 ['Pixar', 'founded by', 'Ed Catmull'],
 ['Andi Gutmans', 'member of', 'Apache Software Foundation'],
 ['Zend Technologies', 'founded by', 'Andi Gutmans'],
 ['Zeev Suraski', 'educated at', 'Technion in Haifa'],
 ['Andi Gutmans', 'educated at', 'Israel Institute of Technology in Haifa'],
 ["Charles Patrick 'Chuck' Thacker", 'date of birth', '1922'],
 ["Robert Stanley 'Bob' Barton", 'date of birth', '1925'],
 ['Samson Abramsky', 'member of', 'Royal Society'],
 ['Silvio Micali', 'educated at', 'Massachusetts Institute of Technology'],
 ['Andi Gutmans', 'educated at', 'Technion'],
 ["Charles Patrick 'Chuck' Thacker",
  'educated at',
  'University of California, Ber

EVALUATION ON EACH PREDICATE BASED ON CONTAINMENT ON EACH TRIPLET PART

In [26]:
predicates = ['date of birth', 'occupation', 'educated at', 'spouse', 'place of birth', 'member of', 'founded by', 'date of death']
recall_per_predicate = dict()
precision_per_predicate = dict()
f1_per_predicate = dict()

for predicate in predicates:
    
    true_predicate = []
    pred_predicate = []
    
    for triple in eval_list_true_id:
        if triple[1] == predicate:
            true_predicate.append(triple)
    
    for triple in eval_list_pred_id:
        if triple[1] == predicate:
            pred_predicate.append(triple)

    new_true = [[str(element) for element in triple] for triple in true_predicate]
    new_pred = [[str(element) for element in triple] for triple in pred_predicate]

    correct = 0
    for triple_pred in new_pred:
        for triple_true in new_true:
            if (triple_pred[0] in triple_true[0] or triple_true[0] in triple_pred[0]) and (triple_pred[1] in triple_true[1] or triple_true[1] in triple_pred[1]) and (triple_pred[2] in triple_true[2] or triple_true[2] in triple_pred[2]):
                correct += 1

    recall = correct/len(new_true)
    recall_per_predicate[predicate] = recall
    
    precision = correct/len(new_pred)
    precision_per_predicate[predicate] = precision
    
    f1 = 2*precision*recall/(precision+recall)
    f1_per_predicate[predicate] = f1

In [27]:
print('Recall per predicate')
recall_per_predicate

Recall per predicate


{'date of birth': 0.9130434782608695,
 'occupation': 0.2,
 'educated at': 0.7083333333333334,
 'spouse': 0.14285714285714285,
 'place of birth': 0.7142857142857143,
 'member of': 0.3333333333333333,
 'founded by': 0.3888888888888889,
 'date of death': 0.75}

In [28]:
print('Precision per predicate')
precision_per_predicate

Precision per predicate


{'date of birth': 0.8076923076923077,
 'occupation': 1.0,
 'educated at': 0.7555555555555555,
 'spouse': 0.16666666666666666,
 'place of birth': 0.8333333333333334,
 'member of': 0.38461538461538464,
 'founded by': 0.5833333333333334,
 'date of death': 0.75}

In [29]:
print('F1 per predicate')
f1_per_predicate

F1 per predicate


{'date of birth': 0.8571428571428572,
 'occupation': 0.33333333333333337,
 'educated at': 0.7311827956989247,
 'spouse': 0.15384615384615383,
 'place of birth': 0.7692307692307692,
 'member of': 0.3571428571428571,
 'founded by': 0.4666666666666666,
 'date of death': 0.75}

(OPTION 2) EVALUATION BASED ON SIMILARITY ON EACH TRIPLET PART (THUS NO NEED FOR PERFECT MATCH)

In [30]:
nlp = spacy.load('en_core_web_sm')
new_true = [[nlp(str(element)) for element in triple] for triple in eval_list_true_id]
new_pred = [[nlp(str(element)) for element in triple] for triple in eval_list_pred_id]

In [31]:
correct = 0
for triple_pred in new_pred:
    for triple_true in new_true:
        if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
            print(triple_true)
            print(triple_pred)
            print()
            correct += 1

recall = correct/len(new_true)
precision = correct/len(new_pred)

[Silvio Micali, occupation, computer scientist]
[Silvio Micali, occupation, computer scientist]

[Adobe Systems, founded by, John Warnock]
[Adobe Systems, founded by, John Warnock]

[Bernard Chazelle, date of birth, November 5, 1955]
[Bernard Chazelle, date of birth, November 5, 1955]

[Gabor Tamas Herman, educated at, University of California, Berkeley]
[Gabor Tamas Herman, educated at, University of California, Berkeley]

[Silvio Micali, member of, American Academy of Arts and Sciences]
[Silvio Micali, member of, American Academy of Arts and Sciences]

[Thomas S. Ray, date of birth, September 21, 1954]
[Thomas S. Ray, date of birth, September 21, 1954]

[Peter Jay Weinberger, date of birth, August 6, 1942]
[Peter Jay Weinberger, date of birth, August 6, 1942]

[Silvio Micali, date of birth, October 13, 1954]
[Silvio Micali, date of birth, October 13, 1954]

[Silvio Micali, educated at, La Sapienza University of Rome]
[Silvio Micali, educated at, La Sapienza University of Rome]

[Gary

[John C. Hull, educated at, Cambridge University]
[John C. Hull, educated at, Cambridge University]

[Samson Abramsky, date of birth, 12 March 1953]
[Samson Abramsky, date of birth, 12 March 1953]

[Christopher Stewart Wallace, date of death, 7 August 2004]
[Christopher Stewart Wallace, date of death, 7 August 2004]

[Martin Richards, occupation, computer scientist]
[Martin Richards, occupation, computer scientist]

[Jeffrey Outlaw Shallit, date of birth, October 17, 1957]
[Jeffrey Outlaw Shallit, date of birth, October 17, 1957]

[Bernard Chazelle, occupation, computer scientist]
[Bernard Chazelle, occupation, computer scientist]

[Peter Jay Weinberger, educated at, Swarthmore College]
[Peter Jay Weinberger, educated at, Swarthmore College]



In [32]:
print('Number of correct predictions: {}'.format(correct), 'Number of evaluation triples: {}'.format(len(new_true)))

print('Recall: {}'.format(recall))
print('Precision: {}'.format(precision))
print('F1: {}'.format(2*precision*recall/(precision+recall)))

Number of correct predictions: 83 Number of evaluation triples: 160
Recall: 0.51875
Precision: 0.6693548387096774
F1: 0.5845070422535212


SHOW ALL EVALUATION TRIPLETS THAT WERE NOT PREDICTED BY REBEL

In [33]:
missing_triples = []
for triple_true in new_true:
    triple_missing = True
    for triple_pred in new_pred:
        if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
            triple_missing = False
    if triple_missing:
        missing_triples.append(triple_true)

print('Number of articles not predicted by REBEL, but in Evaluation set: {}'.format(len(missing_triples)))
missing_triples

Number of articles not predicted by REBEL, but in Evaluation set: 85


[[Hendrik Pieter (Henk) Barendregt, place of birth, Amsterdam],
 [Jeffrey Outlaw Shallit, educated at, University of California],
 [Andi (Andrei) Gutmans, occupation, entrepreneur],
 [John C. Hull, spouse, Michelle],
 [Diomidis D. Spinellis, occupation, author],
 [Gary Chevsky, occupation, entrepreneur],
 [Keith Oliver Geddes, member of, Association for Computing Machinery],
 [Charles Patrick "Chuck" Thacker, date of birth, February 26, 1943],
 [nan, nan, nan],
 [Terence Aidan (Terry) Halpin, educated at, University of Queensland],
 [Neil Immerman, educated at, Cornell University],
 [Jeffrey Outlaw Shallit, occupation, number theorist],
 [RealNetworks, founded by, Robert Denis Glaser],
 [Robert Stanley "Bob" Barton, educated at, University of Iowa],
 [Charles Patrick "Chuck" Thacker, date of death, June 12, 2017],
 [Charles Patrick "Chuck" Thacker, occupation, computer designer],
 [Frank Thomson "Tom" Leighton,
  member of,
  Computer Science and Artificial Intelligence Laboratory (CSA

SHOW ALL PREDICTED TRIPLETS THAT WERE NOT IN THE EVALUATION SET

In [34]:
missing_triples = []

for triple_pred in new_pred:
    triple_missing = True
    for triple_true in new_true:
        if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
            triple_missing = False
    if triple_missing:
        missing_triples.append(triple_pred)

print('Number of articles predicted by REBEL, but not in Evaluation set: {}'.format(len(missing_triples)))
missing_triples

Number of articles predicted by REBEL, but not in Evaluation set: 47


[[James Quentin Stafford-Fraser, educated at, Gonville and Caius],
 [Duane Call, educated at, University of the University of Utah],
 [Zeev Suraski, spouse, Andi Gutmans],
 [Bernard Chazelle, member of, ACM],
 [Ian Tremere Foster, place of birth, Wellington],
 [Bernard Chazelle, member of, NEC],
 [Zend Technologies, founded by, Zeev Suraski],
 [Pixar, founded by, Ed Catmull],
 [Andi Gutmans, member of, Apache Software Foundation],
 [Zend Technologies, founded by, Andi Gutmans],
 [Zeev Suraski, educated at, Technion in Haifa],
 [Andi Gutmans, educated at, Israel Institute of Technology in Haifa],
 [Charles Patrick 'Chuck' Thacker, date of birth, 1922],
 [Robert Stanley 'Bob' Barton, date of birth, 1925],
 [Samson Abramsky, member of, Royal Society],
 [Silvio Micali, educated at, Massachusetts Institute of Technology],
 [Andi Gutmans, educated at, Technion],
 [Charles Patrick 'Chuck' Thacker,
  educated at,
  University of California, Berkeley],
 [Ralph Scott Thacker, date of birth, 1906

EVALUATION ON EACH PREDICATE BASED ON SIMILARITY ON EACH TRIPLET PART

In [35]:
predicates = ['date of birth', 'occupation', 'educated at', 'spouse', 'place of birth', 'member of', 'founded by', 'date of death']
recall_per_predicate = dict()
precision_per_predicate = dict()
f1_per_predicate = dict()

for predicate in predicates:
    
    true_predicate = []
    pred_predicate = []
    
    for triple in eval_list_true_id:
        if triple[1] == predicate:
            true_predicate.append(triple)
    
    for triple in eval_list_pred_id:
        if triple[1] == predicate:
            pred_predicate.append(triple)

    nlp = spacy.load('en_core_web_sm')
    new_true = [[nlp(str(element)) for element in triple] for triple in true_predicate]
    new_pred = [[nlp(str(element)) for element in triple] for triple in pred_predicate]

    correct = 0
    for triple_pred in new_pred:
        for triple_true in new_true:
            if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
                correct += 1

    recall = correct/len(new_true)
    recall_per_predicate[predicate] = recall
    
    precision = correct/len(new_pred)
    precision_per_predicate[predicate] = precision
    
    f1 = 2*precision*recall/(precision+recall)
    f1_per_predicate[predicate] = f1

In [36]:
print('Recall per predicate')
recall_per_predicate

Recall per predicate


{'date of birth': 0.8695652173913043,
 'occupation': 0.2,
 'educated at': 0.7291666666666666,
 'spouse': 0.14285714285714285,
 'place of birth': 0.35714285714285715,
 'member of': 0.4666666666666667,
 'founded by': 0.3333333333333333,
 'date of death': 0.75}

In [37]:
print('Precision per predicate')
precision_per_predicate

Precision per predicate


{'date of birth': 0.7692307692307693,
 'occupation': 1.0,
 'educated at': 0.7777777777777778,
 'spouse': 0.16666666666666666,
 'place of birth': 0.4166666666666667,
 'member of': 0.5384615384615384,
 'founded by': 0.5,
 'date of death': 0.75}

In [38]:
print('F1 per predicate')
f1_per_predicate

F1 per predicate


{'date of birth': 0.8163265306122449,
 'occupation': 0.33333333333333337,
 'educated at': 0.7526881720430108,
 'spouse': 0.15384615384615383,
 'place of birth': 0.3846153846153846,
 'member of': 0.5,
 'founded by': 0.4,
 'date of death': 0.75}