In [1]:
import requests
import re
import hashlib
from spacy import Language, util
from typing import List
from spacy.tokens import Doc, Span
from transformers import pipeline
import crosslingual_coreference
import spacy
from os.path import isfile
import os
import ftfy
import json
import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\t1rk9\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
path = "preprocessed-rebel/"

In [3]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})

    return triplets

In [4]:
@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        self.entity_mapping = {}
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def _generate_triplets(self, sent: Span) -> List[dict]:
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        for triplet in triplets:

            # Skip relationships, where the subject is the same as the object
            if triplet['head'] == triplet['tail']:
                continue

            # Regular expression search for subjects and objects
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)

            # Skip relationships, where both subject and object are not in text
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                doc._.rel[index] = {"relation": triplet["type"], "head_span": triplet['head'], "tail_span": triplet['tail']}

    def __call__(self, doc: Doc) -> Doc:
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [5]:
DEVICE = -1

# Define rel extraction model
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE,
    'model_name':'Babelscape/rebel-large'}
    )

<__main__.RebelComponent at 0x159255abf40>

In [52]:
def find_relations(par):
    all_relations = []
    doc = rel_ext(par)  
    for value, rel_dict in doc._.rel.items():
        all_relations.append(rel_dict)
    
    return all_relations

In [53]:
"""
        Function to apply preprocessing on a selection of files and store it in a separate folder
        - path: root folder (preprocessed-rebel), 
        - subf desired subfolder: AA or AB, must be passed as a string (e.g. "AA") 
        - files in subfolder AA: e.g. p_r_wiki_00 
        - files in subfolder AB: e.g. p_r_wiki_00
        - start: start number file, e.g. 0-99 (no need to fill in 00, 0 is fine)
        - end: end number file, e.g. 0-99
        - the range is inclusive which means, e.g. with (0, 0) you select & pre-process file wiki_00,
        - with (32, 50) you select file wiki_32 up till wiki_50
""" 
def rel_extraction_mul_files(path, subf=None, start=None, end=None):
    relations = {}
    if subf:
        # from file start to end
        for i in tqdm(range(start, end+1)):
            # to match the filename p_r_wiki_00 up till p_r_wiki_09 we add a zero in front of the number from user input if necessary
            if i < 10:
                i = "0" + str(i)
            # construct path to file name that falls within range
            f = path + subf + "/p_r_wiki_{}".format(i) 
            
            # if file exists
            if isfile(f):
                # OPEN FILE, GO THROUGH EACH PAR AND PASS THAT INTO COREF FUNCTION
                file = open(f, 'r', encoding='utf-8')
                doc = json.load(file)
                for k, v in doc.items():
                    v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                    # if paragraph has more than one word
                    if len(v.split(" ")) > 1:
                        relations[k] = find_relations(v)
                        #TODO: FORMAT OUTPUT -> KADIR
    else:
        for f in glob.glob('preprocessed-rebel/*/*'):
            # OPEN EACH FILE, GO THROUGH EACH PARAGRAPH AND PASS THAT INTO COREF FUNC
            file = open(f, 'r')
            doc = json.load(file)
            for k, v in doc.items():
                v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                # if paragraph has more than one word
                if len(v.split(" ")) > 1:
                    relations[k] = find_relations(v)
                    #TODO: FORMAT OUTPUT -> KADIR
                    
    
    # save to file
    with open('relations.txt', 'w') as convert_file:
        convert_file.write(json.dumps(relations))
        
    return relations

In [8]:
# OPT 1: SELECT SPECIFIC FILES TO FEED REBEL IN ONE PARTICULAR MAP / # OPT 2: FEED ALL FILES, in AA and AB BY ONLY KEEPING PATH IN THERE
relations = rel_extraction_mul_files(path, "AA", 20, 32) # rel_extraction_mul_files(path)

100%|███████████████████████████████████████████████████████████████████████████████| 13/13 [1:37:03<00:00, 447.95s/it]


In [51]:
relations

{'620257-1': [{'relation': 'date of birth',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'August 6, 1942'},
  {'relation': 'occupation',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'computer scientist'}],
 '620257-2': [{'relation': 'date of birth',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'August 6, 1942'},
  {'relation': 'educated at',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'Swarthmore College'},
  {'relation': 'field of work',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'number theory'},
  {'relation': 'educated at',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'University of California, Berkeley'},
  {'relation': 'field of work',
   'head_span': 'Derrick Henry Lehmer',
   'tail_span': 'number theory'},
  {'relation': 'employer',
   'head_span': 'Derrick Henry Lehmer',
   'tail_span': 'University of California, Berkeley'},
  {'relation': 'employer',
   'head_span': 'Peter Jay Weinberger',
   'tail_span': 'B

REBEL OUTPUT TO DATAFRAME

In [28]:
# load evaluation set
evaluation_df = pd.read_csv('example.csv')
# unique indexes in EVAL set
indexes = evaluation_df['PAR ID'].unique()
# unique predicates in EVAL set
predicates = evaluation_df['PRED'].unique()

In [29]:
indexes

array(['620257-1', '620257-2', '651800-1', '651800-2', '825199-1',
       '825199-2', '825199-4', '892852-1', '892852-3', '892852-13',
       '980748-1', '980748-3', '980748-6', '998591-1', '998591-3',
       '998591-5', '1002526-1', '1002526-3', '1002526-13', '1051825-1',
       nan, '1051825-3', '1051825-4', '1051825-6', '1053071-1',
       '1053071-4', '1147236-1', '1147236-4', '1212157-1', '1212157-2',
       '1212157-4', '1330513-1', '1330513-2', '1368313-1', '1368313-2',
       '1368313-4', '1368313-6', '1370768-1', '1370768-2', '1385119-1',
       '1385119-4', '1385119-6', '1385119-7', '1385119-8', '1385119-10',
       '1396931-1', '1396931-3', '1396931-10', '1448006-1', '1448006-5',
       '1448006-9', '1498731-2', '1498731-3', '1498731-4', '1527871-1',
       '1527871-2', '1527871-3', '1573914-1', '1573914-2', '1581678-1',
       '1581678-4', '1581678-8', '1599940-1', '1599940-2', '1599940-4',
       '1651213-1', '1651213-2', '1651213-5', '1689979-1', '1689979-3',
       '1689

In [30]:
extract = dict()
for i in ['PAR ID', 'SUB', 'PRED', 'OBJ']:
    extract[i] = []


for key, list_of_dicts in relations.items():
    if key in indexes:
        for dic in list_of_dicts:
            extract['PAR ID'].append(key)
            extract['SUB'].append(dic['head_span'])
            extract['PRED'].append(dic['relation'])
            extract['OBJ'].append(dic['tail_span'])

# turn REBEL output to a DF
rebel_df = pd.DataFrame.from_dict(extract)

In [31]:
# all predicates found by REBEL
rebel_df['PRED'].unique()

array(['date of birth', 'occupation', 'educated at', 'field of work',
       'employer', 'country', 'country of citizenship', 'part of',
       'member of', 'parent organization', 'place of birth', 'founded by',
       'located in the administrative territorial entity', 'developer',
       'spouse', 'capital', 'child', 'father', 'award received',
       'sibling', 'date of death', 'participant in', 'winner',
       'headquarters location', 'said to be the same as',
       'diplomatic relation', 'followed by', 'inception', 'follows',
       'subclass of', 'instance of', 'work period (start)', 'author',
       'conferred by', 'designed by', 'used by', 'has part', 'subsidiary'],
      dtype=object)

In [32]:
# all predicates found in EVAL set
predicates

array(['date of birth', 'occupation', 'educated at', 'spouse',
       'place of birth', 'member of', 'founded by', nan, 'date of death'],
      dtype=object)

KEEP ONLY PREDICATES FROM EVAL SET

In [33]:
filtered_rebel_df = rebel_df[rebel_df['PRED'].isin(predicates)]
filtered_rebel_df

# remove par id so that we can filter duplicate triples later
filtered_rebel_df_id = filtered_rebel_df.set_index('PAR ID')
filtered_rebel_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,"University of California, Berkeley"
...,...,...,...
1775773-2,"Charles Patrick ""Chuck"" Thacker",date of birth,"February 26, 1943"
1775773-2,Ralph Scott Thacker,date of birth,1906
1775773-2,"Charles Patrick ""Chuck"" Thacker",date of birth,1922
1775773-3,"Charles Patrick ""Chuck"" Thacker",educated at,"University of California, Berkeley"


In [34]:
evaluation_df

# remove par id so that we can filter duplicate triples later
evaluation_df_id = evaluation_df.set_index('PAR ID')
evaluation_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,University of California
651800-1,Terence Aidan (Terry) Halpin,date of birth,1950s
...,...,...,...
1775773-1,"Charles Patrick ""Chuck"" Thacker",date of death,"June 12, 2017"
1775773-1,"Charles Patrick ""Chuck"" Thacker",occupation,computer designer
1775773-2,"Charles Patrick ""Chuck"" Thacker",place of birth,"Pasadena, California"
1775773-3,"Charles Patrick ""Chuck"" Thacker",educated at,"University of California, Berkeley"


REMOVE DUPLICATE TRIPLES

In [46]:
eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})

  eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})


In [36]:
eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})

  eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})


(OPTION 1) ACCURACY EVALUATION BY CHECKING IF PREDICTION IS IN EVAL SET (REQUIRES TOTAL MATCH)

In [37]:
new_true = []
for row in eval_list_true_id:
    new_true.append(str(row))
    
new_pred = []
for row in eval_list_pred_id:
    new_pred.append(str(row))

In [38]:
len(new_pred), len(new_true)

(124, 160)

In [39]:
correct = 0
for triple in new_pred:
    if triple in new_true:
        correct += 1
    
print(correct/len(new_true))

0.475


SHOW ALL EVALUATION TRIPLETS THAT WERE NOT PREDICTED BY REBEL

In [40]:
for triple in new_true:
    if triple not in new_pred:
        print(triple)

['Frank Thomson "Tom" Leighton' 'member of'
 'Computer Science and Artificial Intelligence Laboratory (CSAIL)']
['Diomidis D. Spinellis' 'occupation' 'author']
['Robert A. van de Geijn' 'occupation' 'parallel processing']
['Neil Immerman' 'occupation' 'theoretical computer scientist']
['Samson Abramsky' 'educated at'
 'Hasmonean Grammar School for Boys, Hendon']
['Neil Immerman' 'educated at' 'Yale University']
['Access Project' 'founded by' 'Robert Denis Glaser']
['Thomas S. Ray' 'occupation' 'ecologist']
['Terence Aidan (Terry) Halpin' 'date of birth' '1950s']
['Robert A. van de Geijn' 'occupation' 'numerical analysis']
['Andi (Andrei) Gutmans' 'member of' 'Apache Software Foundation']
['Research Science Institute (RSI)' 'founded by'
 'Admiral Hyman G. Rickover']
['nan' 'nan' 'nan']
['CoreStreet Ltd' 'founded by' 'Silvio Micali']
['Charles Patrick "Chuck" Thacker' 'occupation' 'computer designer']
['John C. Hull' 'occupation' 'researcher']
['Robert A. van de Geijn' 'spouse' 'Netherla

SHOW ALL PREDICTED TRIPLETS THAT WERE NOT IN THE EVALUATION SET

In [41]:
for triple in new_pred:
    if triple not in new_true:
        print(triple)

['Zend Technologies' 'founded by' 'Andi Gutmans']
['Samson Abramsky' 'member of' 'Royal Society']
['Andi Gutmans' 'spouse' 'Zeev Suraski']
['Peter Jay Weinberger' 'educated at' 'University of California, Berkeley']
['Andi Gutmans' 'educated at' 'Technion in Haifa']
['Marilyn Kirsch' 'spouse' 'Gabor Tamas Herman']
['Ian Tremere Foster' 'date of birth' '1959']
['Bernard Chazelle' 'member of' 'NEC']
['Bernard Chazelle' 'member of' 'American Academy of Arts and Sciences']
['Denis Glaser' 'educated at' 'Yale University']
['Zend Technologies' 'founded by' 'Zeev Suraski']
['Robert Stanley "Bob" Barton' 'date of birth' '1925']
['Jeffrey Outlaw Shallit' 'educated at'
 'University of California, Berkeley']
['Bruce Gillies' 'date of death' 'July 17, 1975']
['Ralph Scott Thacker' 'date of birth' '1906']
['Zeev Suraski' 'educated at' 'Technion in Haifa']
['Silvio Micali' 'educated at' 'Massachusetts Institute of Technology']
['Denis Glaser' 'date of birth' 'January 16, 1962']
['Zeev Suraski' 'spous

(OPTION 2) ACCURACY EVALUATION BASED ON SIMILARITY ON EACH TRIPLET PART (THUS NO NEED FOR PERFECT MATCH)

In [42]:
nlp = spacy.load('en_core_web_sm')
new_true = [[nlp(str(element)) for element in triple] for triple in eval_list_true_id]
new_pred = [[nlp(str(element)) for element in triple] for triple in eval_list_pred_id]

In [43]:
correct = 0
for triple_pred in new_pred:
    for triple_true in new_true:
        if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
            print(triple_true)
            print(triple_pred)
            print()
            correct += 1

accuracy = correct/len(new_true)
print(accuracy)

[Donald Bruce Gillies, educated at, University of Illinois]
[Donald Bruce Gillies, educated at, University of Illinois]

[Donald Bruce Gillies, educated at, University of Toronto]
[Donald Bruce Gillies, educated at, University of Illinois]

[Bernard Chazelle, member of, European Academy of Sciences]
[Bernard Chazelle, member of, European Academy of Sciences]

[Peter Jay Weinberger, date of birth, August 6, 1942]
[Peter Jay Weinberger, date of birth, August 6, 1942]

[Shuman Ghosemajumder, educated at, University of Western Ontario]
[Shuman Ghosemajumder, educated at, University of Western Ontario]

[Ian Tremere Foster, educated at, Imperial College London]
[Ian Tremere Foster, educated at, Imperial College London]

[Gabor Tamas Herman, educated at, University of California, Berkeley]
[Gabor Tamas Herman, educated at, University of California, Berkeley]

[Ian Colin Graham Bell, date of birth, 31 October 1962]
[Ian Colin Graham Bell, date of birth, 31 October 1962]

[Silicon Graphics, fo

[Silvio Micali, member of, National Academy of Sciences]
[Silvio Micali, member of, National Academy of Engineering]

[Silvio Micali, member of, National Academy of Engineering]
[Silvio Micali, member of, National Academy of Engineering]

[Adobe Systems, founded by, John Warnock]
[Adobe Systems, founded by, John Warnock]

[Donald Bruce Gillies, educated at, University of Illinois]
[Donald Bruce Gillies, educated at, University of Toronto]

[Donald Bruce Gillies, educated at, University of Toronto Schools]
[Donald Bruce Gillies, educated at, University of Toronto]

[Donald Bruce Gillies, educated at, University of Toronto]
[Donald Bruce Gillies, educated at, University of Toronto]

[Ian Tremere Foster, date of birth, 1 January 1959]
[Ian Tremere Foster, date of birth, 1 January 1959]

[TeachAids, founded by, Shuman Ghosemajumder]
[TeachAids, founded by, Shuman Ghosemajumder]

[Samson Abramsky, educated at, Queen Mary, University of London]
[Samson Abramsky, educated at, Queen Mary, Univ

ACCURACY EVALUATION ON EACH PREDICATE BASED ON SIMILARITY ON EACH TRIPLET PART

In [49]:
predicates = ['date of birth', 'occupation', 'educated at', 'spouse', 'place of birth', 'member of', 'founded by', 'date of death']
accuracy_per_predicate = dict()

for predicate in predicates:
    
    true_predicate = []
    pred_predicate = []
    
    for triple in eval_list_true_id:
        if triple[1] == predicate:
            true_predicate.append(triple)
    
    for triple in eval_list_pred_id:
        if triple[1] == predicate:
            pred_predicate.append(triple)

    nlp = spacy.load('en_core_web_sm')
    new_true = [[nlp(str(element)) for element in triple] for triple in true_predicate]
    new_pred = [[nlp(str(element)) for element in triple] for triple in pred_predicate]

    correct = 0
    for triple_pred in new_pred:
        for triple_true in new_true:
            if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
                correct += 1

    accuracy = correct/len(new_true)
    accuracy_per_predicate[predicate] = accuracy
    
accuracy_per_predicate

{'date of birth': 0.9565217391304348,
 'occupation': 0.2,
 'educated at': 0.7916666666666666,
 'spouse': 0.14285714285714285,
 'place of birth': 0.42857142857142855,
 'member of': 0.4666666666666667,
 'founded by': 0.3333333333333333,
 'date of death': 1.0}

(SCRAPPED) SIMILARITY WITH WHOLE TRIPLE INSTEAD OF EVALUATING EACH PART (NOT AS GOOD AS ABOVE)

In [339]:
# new_true = []
# for row in eval_list_true_id:
#     triple = ''
#     for element in row:
#         triple += element + ' '
#     new_true.append(triple[:-1])
    
# new_pred = []
# for row in eval_list_pred_id:
#     triple = ''
#     for element in row:
#         triple += element + ' '
#     new_pred.append(triple[:-1])

In [340]:
# nlp = spacy.load('en_core_web_sm')
# new_true = [nlp(triple) for triple in new_true]
# new_pred = [nlp(triple) for triple in new_pred]

In [341]:
# correct = 0
# for triple_pred in new_pred:
#     for triple_true in new_true:
#         if triple_pred.similarity(triple_true) >= 0.96:
#             print(triple_true)
#             print(triple_pred)
#             print()
#             correct += 1
    
# print(correct/len(new_true))

(SCRAPPED) EVALUATION SET TO DICT SIMILAR TO OUTPUT OF REBEL

In [294]:
#evaluation_dict = dict()
# indexes = []

# evaluation_df = pd.read_csv('example.csv')
# indexes = evaluation_df['PAR ID'].unique()
# predicates = evaluation_df['PRED'].unique()

# df = evaluation_df.set_index('PAR ID')
        
# for index in indexes:
#     evaluation_dict[index] = []
         
# for index, row in df.iterrows():
#     evaluation_dict[index].append({'relation': row['PRED'], 'head_span': row['SUB'], 'tail_span': row['OBJ']})

#evaluation_dict