In [1]:
import requests
import re
import hashlib
from spacy import Language, util
from typing import List
from spacy.tokens import Doc, Span
from transformers import pipeline
import crosslingual_coreference
import spacy
from os.path import isfile
import os
import ftfy
import json
import glob
from tqdm import tqdm
import pandas as pd

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Sverre\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
path = "preprocessed-rebel/"

In [3]:
def call_wiki_api(item):
    try:
        url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={item}&language=en&format=json"
        data = requests.get(url).json()
        # Return the first id (Could upgrade this in the future)
        return data['search'][0]['id']
    except:
        return 'id-less'

def extract_triplets(text):
    """
    Function to parse the generated text and extract the triplets
    """
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})

    return triplets

In [4]:
@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})
   
    def get_wiki_id(self, item: str):
        mapping = self.entity_mapping.get(item)
        if mapping:
            return mapping
        else:
            res = call_wiki_api(item)
            self.entity_mapping[item] = res
            return res

    def _generate_triplets(self, sent: Span) -> List[dict]:
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        for triplet in triplets:

            # Remove self-loops (relationships that start and end at the entity)
            if triplet['head'] == triplet['tail']:
                continue

            # Use regex to search for entities
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)

            # Skip the relation if both head and tail entities are not present in the text
            # Sometimes the Rebel model hallucinates some entities
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {"relation": triplet["type"], "head_span": {'text': triplet['head'], 'id': self.get_wiki_id(triplet['head'])}, "tail_span": {'text': triplet['tail'], 'id': self.get_wiki_id(triplet['tail'])}}

    def __call__(self, doc: Doc) -> Doc:
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [5]:
DEVICE = -1 # Number of the GPU, -1 if want to use CPU
# Add coreference resolution model
coref = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
coref.add_pipe(
    "xx_coref", config={"chunk_size": 2500, "chunk_overlap": 2, "device": DEVICE})

# Define rel extraction model
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )


error loading _jsonnet (this is expected on Windows), treating C:\Users\Sverre\AppData\Local\Temp\tmpnj2bumph\config.json as plain json
Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large a

<__main__.RebelComponent at 0x1cf17737a30>

In [6]:
def coreff(par):
    all_relations = []
    # process it
    # coref_text = coref(par)._.resolved_text
    # print(coref_text)
    doc = rel_ext(par)  
    print(doc._.rel.items())
    for value, rel_dict in doc._.rel.items():
        all_relations.append(rel_dict)
    
    return all_relations

In [7]:
"""
        Function to apply preprocessing on a selection of files and store it in a separate folder
        - path: root folder (preprocessed-rebel), 
        - subf desired subfolder: AA or AB, must be passed as a string (e.g. "AA") 
        - files in subfolder AA: e.g. p_r_wiki_00 
        - files in subfolder AB: e.g. p_r_wiki_00
        - start: start number file, e.g. 0-99 (no need to fill in 00, 0 is fine)
        - end: end number file, e.g. 0-99
        - the range is inclusive which means, e.g. with (0, 0) you select & pre-process file wiki_00,
        - with (32, 50) you select file wiki_32 up till wiki_50
""" 
def rel_extraction_mul_files(path, subf=None, start=None, end=None):
    relations = {}
    if subf:
        # from file start to end
        for i in tqdm(range(start, end+1)):
            # to match the filename p_r_wiki_00 up till p_r_wiki_09 we add a zero in front of the number from user input if necessary
            if i < 10:
                i = "0" + str(i)
            # construct path to file name that falls within range
            f = path + subf + "/p_r_wiki_{}".format(i) 
            
            # if file exists
            if isfile(f):
                # OPEN FILE, GO THROUGH EACH PAR AND PASS THAT INTO COREF FUNCTION
                file = open(f, 'r', encoding='utf-8')
                doc = json.load(file)
                for k, v in doc.items():
                    v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                    # if paragraph has more than one word
                    if len(v.split(" ")) > 1:
                        relations[k] = coreff(v)
                        #TODO: FORMAT OUTPUT -> KADIR
    else:
        for f in glob.glob('preprocessed-rebel/*/*'):
            # OPEN EACH FILE, GO THROUGH EACH PARAGRAPH AND PASS THAT INTO COREF FUNC
            file = open(f, 'r')
            doc = json.load(file)
            for k, v in doc.items():
                v = ftfy.fix_text(v) # FIX ANY ENCODINGS
                # if paragraph has more than one word
                if len(v.split(" ")) > 1:
                    relations[k] = coreff(v)
                    #TODO: FORMAT OUTPUT -> KADIR
                    
    return relations

In [45]:
# OPT 1: SELECT SPECIFIC FILES TO FEED REBEL IN ONE PARTICULAR MAP / # OPT 2: FEED ALL FILES, in AA and AB BY ONLY KEEPING PATH IN THERE
relations = rel_extraction_mul_files(path, "AA", 0, 0) # rel_extraction_mul_files(path)

100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [34:22<00:00, 294.59s/it]


In [383]:
relations

{'620257-1': [{'relation': 'date of birth',
   'head_span': {'text': 'Peter Jay Weinberger', 'id': 'Q92959'},
   'tail_span': {'text': 'August 6, 1942', 'id': 'Q69275337'}},
  {'relation': 'occupation',
   'head_span': {'text': 'Peter Jay Weinberger', 'id': 'Q92959'},
   'tail_span': {'text': 'computer scientist', 'id': 'Q82594'}}],
 '620257-2': [{'relation': 'date of birth',
   'head_span': {'text': 'Peter Jay Weinberger', 'id': 'Q92959'},
   'tail_span': {'text': 'August 6, 1942', 'id': 'Q69275337'}},
  {'relation': 'educated at',
   'head_span': {'text': 'Peter Jay Weinberger', 'id': 'Q92959'},
   'tail_span': {'text': 'Swarthmore College', 'id': 'Q1378320'}},
  {'relation': 'field of work',
   'head_span': {'text': 'Peter Jay Weinberger', 'id': 'Q92959'},
   'tail_span': {'text': 'number theory', 'id': 'Q12479'}},
  {'relation': 'educated at',
   'head_span': {'text': 'Peter Jay Weinberger', 'id': 'Q92959'},
   'tail_span': {'text': 'University of California, Berkeley',
    'id': '

REBEL OUTPUT TO DATAFRAME

In [384]:
# load evaluation set
evaluation_df = pd.read_csv('example.csv')
# unique indexes in EVAL set
indexes = evaluation_df['PAR ID'].unique()
# unique predicates in EVAL set
predicates = evaluation_df['PRED'].unique()

In [385]:
extract = dict()
for i in ['PAR ID', 'SUB', 'PRED', 'OBJ']:
    extract[i] = []


for k, v in relations.items():
    if k in indexes:
        for dic in v:
            extract['PAR ID'].append(k)
            extract['SUB'].append(dic['head_span']['text'])
            extract['PRED'].append(dic['relation'])
            extract['OBJ'].append(dic['tail_span']['text'])

# turn REBEL output to a DF
rebel_df = pd.DataFrame.from_dict(extract)

In [386]:
# all predicates found by REBEL
rebel_df['PRED'].unique()

array(['date of birth', 'occupation', 'educated at', 'field of work',
       'employer', 'owned by', 'country', 'country of citizenship',
       'parent organization', 'subsidiary', 'part of', 'member of',
       'place of birth', 'product or material produced', 'founded by',
       'located in the administrative territorial entity', 'developer',
       'followed by', 'inception', 'publication date', 'author', 'editor',
       'spouse', 'capital', 'child', 'father', 'award received',
       'sibling', 'owner of', 'date of death', 'has part'], dtype=object)

In [387]:
# all predicates found in EVAL set
predicates

array(['date of birth', 'occupation', 'educated at', 'position held',
       'spouse', 'place of birth', 'founded by', 'date of death'],
      dtype=object)

KEEP ONLY PREDICATES FROM EVAL SET

In [388]:
filtered_rebel_df = rebel_df[rebel_df['PRED'].isin(predicates)]
filtered_rebel_df

# remove par id so that we can filter duplicate triples later
filtered_rebel_df_id = filtered_rebel_df.set_index('PAR ID')
filtered_rebel_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,"University of California, Berkeley"
...,...,...,...
1147236-3,Christopher Stewart Wallace,date of birth,26 October 1933
1147236-3,Christopher Stewart Wallace,date of death,7 August 2004
1147236-4,Christopher Stewart Wallace,date of birth,26 October 1933
1147236-4,Christopher Stewart Wallace,date of death,7 August 2004


In [389]:
evaluation_df

# remove par id so that we can filter duplicate triples later
evaluation_df_id = evaluation_df.set_index('PAR ID')
evaluation_df_id

Unnamed: 0_level_0,SUB,PRED,OBJ
PAR ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620257-1,Peter Jay Weinberger,date of birth,"August 6, 1942"
620257-1,Peter Jay Weinberger,occupation,computer scientist
620257-2,Peter Jay Weinberger,educated at,Swarthmore College
620257-2,Peter Jay Weinberger,educated at,University of California
620257-5,Peter Jay Weinberger,position held,head of Computer Science Research at Bell Labs
...,...,...,...
1147236-3,Christopher Stewart Wallace,position held,Professor Emeritus
1147236-3,Christopher Stewart Wallace,position held,fellow of the Australian Computer Society
1147236-3,Christopher Stewart Wallace,position held,fellow of the ACM
1147236-4,Christopher Stewart Wallace,educated at,University of Sydney


REMOVE DUPLICATE TRIPLES

In [390]:
eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})

  eval_list_true_id = np.vstack({tuple(row) for row in evaluation_df_id.to_numpy()})


In [391]:
eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})

  eval_list_pred_id = np.vstack({tuple(row) for row in filtered_rebel_df_id.to_numpy()})


(OPTION 1) EVALUATION BY CHECKING IF PREDICTION IS IN EVAL SET (REQUIRES TOTAL MATCH)

In [392]:
new_true = []
for row in eval_list_true_id:
    new_true.append(str(row))
    
new_pred = []
for row in eval_list_pred_id:
    new_pred.append(str(row))

In [393]:
len(new_pred), len(new_true)

(38, 103)

In [394]:
correct = 0
for triple in new_pred:
    if triple in new_true:
        correct += 1
    
print(correct/len(new_true))

0.24271844660194175


SHOW ALL EVALUATION TRIPLETS THAT WERE NOT PREDICTED BY REBEL

In [395]:
for triple in new_true:
    if triple not in new_pred:
        print(triple)

['Christopher Stewart Wallace' 'occupation' 'physicist']
['Gabor Tamas Herman' 'occupation' 'professor of computer science']
['Gary Chevsky' 'occupation' 'engineer']
['Christopher Stewart Wallace' 'position held' 'Professor Emeritus']
['Terence Aidan (Terry) Halpin' 'position held'
 'Professor at Neumont University']
['Gabor Tamas Herman' 'position held'
 'Emiritas Professor of Computer Science at The Graduate Center, City University of New York (CUNY)']
['Gary Chevsky' 'occupation' 'entrepreneur']
['Jeffrey Outlaw Shallit' 'place of birth' 'Philadelphia, Pennsylvania']
['Shuman Ghosemajumder' 'position held' 'click fraud czar']
['Robert Denis Glaser' 'educated at' 'Yale University']
['Jeffrey Outlaw Shallit' 'position held'
 'Professor in the School of Computer Science']
['Bernard Chazelle' 'spouse' 'Celia Chazelle']
['Terence Aidan (Terry) Halpin' 'position held'
 'Principal Scientist at LogicBlox']
['Robert Denis Glaser' 'place of birth' 'New York City, New York']
['PutinTrump.org' 

SHOW ALL PREDICTED TRIPLETS THAT WERE NOT IN THE EVALUATION SET

In [396]:
for triple in new_pred:
    if triple not in new_true:
        print(triple)

['Robert Denis Glaser' 'place of birth' 'New York City']
['Samson Abramsky' 'educated at' 'Hasmonean Grammar School for Boys']
['Denis Glaser' 'place of birth' 'New York City']
['Ask Jeeves' 'founded by' 'Gary Chevsky']
['Gary Chevsky' 'place of birth' 'Odessa']
['Peter Jay Weinberger' 'educated at' 'University of California, Berkeley']
['Shuman Ghosemajumder' 'place of birth' 'Stuttgart']
['Marilyn Kirsch' 'spouse' 'Gabor Tamas Herman']
['Bernard Chazelle' 'place of birth' 'Paris']
['Denis Glaser' 'educated at' 'Yale University']
['Jeffrey Outlaw Shallit' 'educated at'
 'University of California, Berkeley']
['Denis Glaser' 'date of birth' 'January 16, 1962']
['Bernard Chazelle' 'place of birth' 'Clamart']


(OPTION 2) SIMILARITY EVALUATION ON EACH TRIPLET PART (THUS NO NEED FOR PERFECT MATCH)

In [397]:
nlp = spacy.load('en_core_web_sm')
new_true = [[nlp(str(element)) for element in triple] for triple in eval_list_true_id]
new_pred = [[nlp(str(element)) for element in triple] for triple in eval_list_pred_id]

In [398]:
correct = 0
for triple_pred in new_pred:
    for triple_true in new_true:
        if triple_pred[0].similarity(triple_true[0]) >= 0.9 and triple_pred[1].similarity(triple_true[1]) >= 0.99 and triple_pred[2].similarity(triple_true[2]) >= 0.9:
            print(triple_true)
            print(triple_pred)
            print()
            correct += 1
    
print(correct/len(new_true))

[Jeffrey Outlaw Shallit, educated at, Princeton University]
[Jeffrey Outlaw Shallit, educated at, Princeton University]

[Peter Jay Weinberger, occupation, computer scientist]
[Peter Jay Weinberger, occupation, computer scientist]

[Samson Abramsky, educated at, King's College, Cambridge]
[Samson Abramsky, educated at, King's College, Cambridge]

[Peter Jay Weinberger, educated at, Swarthmore College]
[Peter Jay Weinberger, educated at, Swarthmore College]

[Robert Denis Glaser, place of birth, New York City, New York]
[Robert Denis Glaser, place of birth, New York City]

[Gabor Tamas Herman, spouse, Marilyn Kirsch]
[Gabor Tamas Herman, spouse, Marilyn Kirsch]

[Samson Abramsky, educated at, Hasmonean Grammar School for Boys, Hendon]
[Samson Abramsky, educated at, Hasmonean Grammar School for Boys]

[TeachAids, founded by, Shuman Ghosemajumder]
[TeachAids, founded by, Shuman Ghosemajumder]

[Robert Denis Glaser, place of birth, New York City, New York]
[Denis Glaser, place of birth, Ne

(SCRAPPED) SIMILARITY WITH WHOLE TRIPLE INSTEAD OF EVALUATING EACH PART (NOT AS GOOD AS ABOVE)

In [339]:
# new_true = []
# for row in eval_list_true_id:
#     triple = ''
#     for element in row:
#         triple += element + ' '
#     new_true.append(triple[:-1])
    
# new_pred = []
# for row in eval_list_pred_id:
#     triple = ''
#     for element in row:
#         triple += element + ' '
#     new_pred.append(triple[:-1])

In [340]:
# nlp = spacy.load('en_core_web_sm')
# new_true = [nlp(triple) for triple in new_true]
# new_pred = [nlp(triple) for triple in new_pred]

In [341]:
# correct = 0
# for triple_pred in new_pred:
#     for triple_true in new_true:
#         if triple_pred.similarity(triple_true) >= 0.96:
#             print(triple_true)
#             print(triple_pred)
#             print()
#             correct += 1
    
# print(correct/len(new_true))

(SCRAPPED) EVALUATION SET TO DICT SIMILAR TO OUTPUT OF REBEL

In [294]:
#evaluation_dict = dict()
# indexes = []

# evaluation_df = pd.read_csv('example.csv')
# indexes = evaluation_df['PAR ID'].unique()
# predicates = evaluation_df['PRED'].unique()

# df = evaluation_df.set_index('PAR ID')
        
# for index in indexes:
#     evaluation_dict[index] = []
         
# for index, row in df.iterrows():
#     evaluation_dict[index].append({'relation': row['PRED'], 'head_span': row['SUB'], 'tail_span': row['OBJ']})

#evaluation_dict