In [22]:
import requests
import re
import hashlib
from spacy import Language, util
from typing import List
from spacy.tokens import Doc, Span
from transformers import pipeline
import crosslingual_coreference
import spacy
from os.path import isfile
import os
import ftfy
import json
import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
path = "preprocessed-rebel/"

In [3]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})

    return triplets

In [4]:
@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        self.entity_mapping = {}
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def _generate_triplets(self, sent: Span) -> List[dict]:
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        for triplet in triplets:

            # Skip relationships, where the subject is the same as the object
            if triplet['head'] == triplet['tail']:
                continue

            # Regular expression search for subjects and objects
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)

            # Skip relationships, where both subject and object are not in text
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                doc._.rel[index] = {"relation": triplet["type"], "head_span": triplet['head'], "tail_span": triplet['tail']}

    def __call__(self, doc: Doc) -> Doc:
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [5]:
DEVICE = -1

# Define rel extraction model
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE,
    'model_name':'Babelscape/rebel-large'}
    )

<__main__.RebelComponent at 0x1341e42a700>

In [6]:
def coreff(par):
    all_relations = []
    doc = rel_ext(par)  
    for value, rel_dict in doc._.rel.items():
        all_relations.append(rel_dict)
    
    return all_relations

In [7]:
"""
        Function to apply preprocessing on a selection of files and store it in a separate folder
        - path: root folder (preprocessed-rebel), 
        - subf desired subfolder: AA or AB, must be passed as a string (e.g. "AA") 
        - files in subfolder AA: e.g. p_r_wiki_00 
        - files in subfolder AB: e.g. p_r_wiki_00
        - start: start number file, e.g. 0-99 (no need to fill in 00, 0 is fine)
        - end: end number file, e.g. 0-99
        - the range is inclusive which means, e.g. with (0, 0) you select & pre-process file wiki_00,
        - with (32, 50) you select file wiki_32 up till wiki_50
""" 
def rel_extraction_mul_files(path, subf=None, start=None, end=None):
    relations = {}
    if subf:
        # from file start to end
        for i in tqdm(range(start, end+1)):
            # to match the filename p_r_wiki_00 up till p_r_wiki_09 we add a zero in front of the number from user input if necessary
            if i < 10:
                i = "0" + str(i)
            # construct path to file name that falls within range
            f = path + subf + "/p_r_wiki_{}".format(i) 
            
            # if file exists
            if isfile(f):
                # OPEN FILE, GO THROUGH EACH PAR AND PASS THAT INTO COREF FUNCTION
                file = open(f, 'r', encoding='utf-8')
                doc = json.load(file)
                for k, v in doc.items():
                    v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                    # if paragraph has more than one word
                    if len(v.split(" ")) > 1:
                        relations[k] = coreff(v)
                        #TODO: FORMAT OUTPUT -> KADIR
    else:
        for f in glob.glob('preprocessed-rebel/*/*'):
            # OPEN EACH FILE, GO THROUGH EACH PARAGRAPH AND PASS THAT INTO COREF FUNC
            file = open(f, 'r')
            doc = json.load(file)
            for k, v in doc.items():
                v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                # if paragraph has more than one word
                if len(v.split(" ")) > 1:
                    relations[k] = coreff(v)
                    #TODO: FORMAT OUTPUT -> KADIR
                    
    return relations

In [10]:
# OPT 1: SELECT SPECIFIC FILES TO FEED REBEL IN ONE PARTICULAR MAP / # OPT 2: FEED ALL FILES, in AA and AB BY ONLY KEEPING PATH IN THERE
relations = rel_extraction_mul_files(path, "AA", 20, 26) # rel_extraction_mul_files(path)

100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [51:48<00:00, 444.13s/it]


In [13]:
relations

{'620257-1': [{'relation': 'date of birth',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'August 6, 1942'},
  {'relation': 'occupation',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'computer scientist'}],
 '620257-2': [{'relation': 'date of birth',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'August 6, 1942'},
  {'relation': 'educated at',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'Swarthmore College'},
  {'relation': 'field of work',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'number theory'},
  {'relation': 'educated at',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'University of California, Berkeley'},
  {'relation': 'field of work',
   'head_span': 'Derrick Henry Lehmer',
   'tail_span': 'number theory'},
  {'relation': 'employer',
   'head_span': 'Derrick Henry Lehmer',
   'tail_span': 'University of California, Berkeley'},
  {'relation': 'employer',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'B

REBEL OUTPUT TO DATAFRAME

In [32]:
# load evaluation set
evaluation_df = pd.read_csv('example.csv')
# unique indexes in EVAL set
indexes = evaluation_df['PAR ID'].unique()
# unique predicates in EVAL set
predicates = evaluation_df['PRED'].unique()

In [36]:
extract = dict()
for i in ['PAR ID', 'SUB', 'PRED', 'OBJ']:
    extract[i] = []


for key, list_of_dicts in relations.items():
    if key in indexes:
        for dic in list_of_dicts:
            extract['PAR ID'].append(key)
            extract['SUB'].append(dic['head_span'])
            extract['PRED'].append(dic['relation'])
            extract['OBJ'].append(dic['tail_span'])

# turn REBEL output to a DF
rebel_df = pd.DataFrame.from_dict(extract)

In [37]:
# all predicates found by REBEL
rebel_df['PRED'].unique()

array(['date of birth', 'occupation', 'educated at', 'field of work',
       'employer', 'owned by', 'country', 'country of citizenship',
       'parent organization', 'subsidiary', 'part of', 'member of',
       'place of birth', 'product or material produced', 'founded by',
       'located in the administrative territorial entity', 'developer',
       'followed by', 'inception', 'publication date', 'author', 'editor',
       'spouse', 'capital', 'child', 'father', 'award received',
       'sibling', 'owner of', 'date of death', 'has part'], dtype=object)

In [38]:
# all predicates found in EVAL set
predicates

array(['date of birth', 'occupation', 'educated at', 'position held',
       'spouse', 'place of birth', 'founded by', 'date of death'],
      dtype=object)

KEEP ONLY PREDICATES FROM EVAL SET

In [39]:
filtered_rebel_df = rebel_df[rebel_df['PRED'].isin(predicates)]
filtered_rebel_df

# remove par id so that we can filter duplicate triples later
filtered_rebel_df_id = filtered_rebel_df.set_index('PAR ID')
filtered_rebel_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,"University of California, Berkeley"
...,...,...,...
1147236-3,Christopher Stewart Wallace,date of birth,26 October 1933
1147236-3,Christopher Stewart Wallace,date of death,7 August 2004
1147236-4,Christopher Stewart Wallace,date of birth,26 October 1933
1147236-4,Christopher Stewart Wallace,date of death,7 August 2004


In [40]:
evaluation_df

# remove par id so that we can filter duplicate triples later
evaluation_df_id = evaluation_df.set_index('PAR ID')
evaluation_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,University of California
620257-5,Peter Jay Weinberger,position held,head of Computer Science Research at Bell Labs
...,...,...,...
1147236-3,Christopher Stewart Wallace,position held,Professor Emeritus
1147236-3,Christopher Stewart Wallace,position held,fellow of the Australian Computer Society
1147236-3,Christopher Stewart Wallace,position held,fellow of the ACM
1147236-4,Christopher Stewart Wallace,educated at,University of Sydney


REMOVE DUPLICATE TRIPLES

In [41]:
eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})

  eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})


In [42]:
eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})

  eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})


(OPTION 1) EVALUATION BY CHECKING IF PREDICTION IS IN EVAL SET (REQUIRES TOTAL MATCH)

In [43]:
new_true = []
for row in eval_list_true_id:
    new_true.append(str(row))
    
new_pred = []
for row in eval_list_pred_id:
    new_pred.append(str(row))

In [44]:
len(new_pred), len(new_true)

(38, 103)

In [27]:
correct = 0
for triple in new_pred:
    if triple in new_true:
        correct += 1
    
print(correct/len(new_true))

0.24271844660194175


SHOW ALL EVALUATION TRIPLETS THAT WERE NOT PREDICTED BY REBEL

In [28]:
for triple in new_true:
    if triple not in new_pred:
        print(triple)

['Gabor Tamas Herman' 'position held' 'full professor']
['Terence Aidan (Terry) Halpin' 'position held'
 'Program Manager in Database Modeling']
['Jeffrey Outlaw Shallit' 'position held'
 'Vice-President of Electronic Frontier Canada']
['Jeffrey Outlaw Shallit' 'spouse' 'Anna Lubiw']
['Samson Abramsky' 'position held'
 'Editorial Boards of the North Holland Studies in Logic and the Foundations of Mathematics, and of the Cambridge Tracts in Theoretical Computer Science']
['Shuman Ghosemajumder' 'position held'
 'North American Public Speaking Champion']
['Bernard Chazelle' 'position held'
 'fellow of the ACM, the American Academy of Arts and Sciences, the John Simon Guggenheim Memorial Foundation, and NEC']
['Shuman Ghosemajumder' 'position held'
 'president of the Canadian University Society for Intercollegiate Debate']
['Samson Abramsky' 'position held' 'Member of Academia Europaea']
['Robert Denis Glaser' 'position held' 'interim CEO']
['Terence Aidan (Terry) Halpin' 'date of birth' 

SHOW ALL PREDICTED TRIPLETS THAT WERE NOT IN THE EVALUATION SET

In [29]:
for triple in new_pred:
    if triple not in new_true:
        print(triple)

['Gary Chevsky' 'place of birth' 'Odessa']
['Peter Jay Weinberger' 'educated at' 'University of California, Berkeley']
['Robert Denis Glaser' 'place of birth' 'New York City']
['Shuman Ghosemajumder' 'place of birth' 'Stuttgart']
['Bernard Chazelle' 'place of birth' 'Clamart']
['Jeffrey Outlaw Shallit' 'educated at'
 'University of California, Berkeley']
['Samson Abramsky' 'educated at' 'Hasmonean Grammar School for Boys']
['Bernard Chazelle' 'place of birth' 'Paris']
['Marilyn Kirsch' 'spouse' 'Gabor Tamas Herman']
['Ask Jeeves' 'founded by' 'Gary Chevsky']
['Denis Glaser' 'date of birth' 'January 16, 1962']
['Denis Glaser' 'educated at' 'Yale University']
['Denis Glaser' 'place of birth' 'New York City']


(OPTION 2) SIMILARITY EVALUATION ON EACH TRIPLET PART (THUS NO NEED FOR PERFECT MATCH)

In [30]:
nlp = spacy.load('en_core_web_sm')
new_true = [[nlp(str(element)) for element in triple] for triple in eval_list_true_id]
new_pred = [[nlp(str(element)) for element in triple] for triple in eval_list_pred_id]

In [31]:
correct = 0
for triple_pred in new_pred:
    for triple_true in new_true:
        if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
            print(triple_true)
            print(triple_pred)
            print()
            correct += 1
    
print(correct/len(new_true))

[Shuman Ghosemajumder, date of birth, 1974]
[Shuman Ghosemajumder, date of birth, 1974]

[Gary Chevsky, educated at, University of California Berkeley]
[Gary Chevsky, educated at, University of California Berkeley]

[TeachAids, founded by, Shuman Ghosemajumder]
[TeachAids, founded by, Shuman Ghosemajumder]

[Samson Abramsky, educated at, King's College, Cambridge]
[Samson Abramsky, educated at, King's College, Cambridge]

[Peter Jay Weinberger, occupation, computer scientist]
[Peter Jay Weinberger, occupation, computer scientist]

[Robert Denis Glaser, place of birth, New York City, New York]
[Robert Denis Glaser, place of birth, New York City]

[Jeffrey Outlaw Shallit, date of birth, October 17, 1957]
[Jeffrey Outlaw Shallit, date of birth, October 17, 1957]

[Shuman Ghosemajumder, educated at, MIT Sloan School of Management]
[Shuman Ghosemajumder, educated at, MIT Sloan School of Management]

[Samson Abramsky, educated at, Hasmonean Grammar School for Boys, Hendon]
[Samson Abramsky, 

(SCRAPPED) SIMILARITY WITH WHOLE TRIPLE INSTEAD OF EVALUATING EACH PART (NOT AS GOOD AS ABOVE)

In [339]:
# new_true = []
# for row in eval_list_true_id:
#     triple = ''
#     for element in row:
#         triple += element + ' '
#     new_true.append(triple[:-1])
    
# new_pred = []
# for row in eval_list_pred_id:
#     triple = ''
#     for element in row:
#         triple += element + ' '
#     new_pred.append(triple[:-1])

In [340]:
# nlp = spacy.load('en_core_web_sm')
# new_true = [nlp(triple) for triple in new_true]
# new_pred = [nlp(triple) for triple in new_pred]

In [341]:
# correct = 0
# for triple_pred in new_pred:
#     for triple_true in new_true:
#         if triple_pred.similarity(triple_true) >= 0.96:
#             print(triple_true)
#             print(triple_pred)
#             print()
#             correct += 1
    
# print(correct/len(new_true))

(SCRAPPED) EVALUATION SET TO DICT SIMILAR TO OUTPUT OF REBEL

In [294]:
#evaluation_dict = dict()
# indexes = []

# evaluation_df = pd.read_csv('example.csv')
# indexes = evaluation_df['PAR ID'].unique()
# predicates = evaluation_df['PRED'].unique()

# df = evaluation_df.set_index('PAR ID')
        
# for index in indexes:
#     evaluation_dict[index] = []
         
# for index, row in df.iterrows():
#     evaluation_dict[index].append({'relation': row['PRED'], 'head_span': row['SUB'], 'tail_span': row['OBJ']})

#evaluation_dict