In [1]:
import requests
import re
import hashlib
from spacy import Language, util
from typing import List
from spacy.tokens import Doc, Span
from transformers import pipeline
import crosslingual_coreference
import spacy
from os.path import isfile
import os
import ftfy
import json
import glob
from tqdm import tqdm

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Sverre\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Root folder holding the preprocessed input files; it is the `path` argument
# handed to rel_extraction_mul_files when the extraction is started.
path = "preprocessed-rebel/"

In [3]:
def call_wiki_api(item):
    """Look up ``item`` with the Wikidata entity search and return the first hit's id.

    Args:
        item: Free-text entity label to search for.

    Returns:
        The ``id`` field of the first search result, or the string
        ``'id-less'`` when the request fails, the response is not valid JSON,
        or the search returns no results.
    """
    try:
        # Let requests build the query string so that labels containing
        # '&', '#', '+', spaces, etc. are encoded instead of corrupting the URL.
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={
                "action": "wbsearchentities",
                "search": item,
                "language": "en",
                "format": "json",
            },
            timeout=10,  # never hang the whole extraction run on one lookup
        )
        data = response.json()
        # Return the first id (Could upgrade this in the future)
        return data['search'][0]['id']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: network errors, malformed JSON and empty result
        # lists all map to the sentinel. KeyboardInterrupt / SystemExit are
        # deliberately NOT caught so a long run can still be interrupted.
        return 'id-less'

def extract_triplets(text):
    """
    Parse REBEL's linearised output into a list of relation triplets.

    The generated text is a token stream in which "<triplet>" starts a head
    entity, "<subj>" starts a tail entity and "<obj>" starts a relation name.
    The "<s>", "<pad>" and "</s>" markers are removed before parsing.

    Returns a list of dicts with the keys 'head', 'type' and 'tail'.
    """
    found = []
    head = rel = tail = ''
    mode = 'x'  # 't' = reading head, 's' = reading tail, 'o' = reading relation

    def emit():
        # Snapshot the entity/relation text collected so far.
        found.append({'head': head.strip(), 'type': rel.strip(),'tail': tail.strip()})

    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    for piece in cleaned.split():
        if piece == "<triplet>":
            # A new head begins; flush the triplet that was being built, if any.
            mode = 't'
            if rel:
                emit()
                rel = ''
            head = ''
        elif piece == "<subj>":
            # Another tail for the same head; flush the previous (tail, relation) pair.
            mode = 's'
            if rel:
                emit()
            tail = ''
        elif piece == "<obj>":
            mode = 'o'
            rel = ''
        elif mode == 't':
            head += ' ' + piece
        elif mode == 's':
            tail += ' ' + piece
        elif mode == 'o':
            rel += ' ' + piece

    # Flush the trailing triplet only when all three parts are present.
    if head and rel and tail:
        emit()

    return found

In [4]:
@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """spaCy pipeline component that extracts relation triplets per sentence.

    Each sentence of the Doc is sent through a text2text-generation pipeline,
    the generated text is parsed with ``extract_triplets`` and the resulting
    relations are stored in the custom ``doc._.rel`` extension, keyed by a
    SHA-1 hash of head + tail + relation type.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """Load the generation pipeline and register the ``rel`` Doc extension.

        Args:
            nlp: The pipeline object passed in by the spaCy factory (unused here).
            name: The component name passed in by the spaCy factory (unused here).
            model_name: Model identifier used for both model and tokenizer.
            device: Device index forwarded to the transformers pipeline.
        """
        assert model_name is not None, "model_name must be provided"
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        # Cache of entity text -> Wikidata id so each entity is looked up once.
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        """Return the Wikidata id for ``item``, consulting the local cache first."""
        mapping = self.entity_mapping.get(item)
        if mapping:
            return mapping
        else:
            res = call_wiki_api(item)
            self.entity_mapping[item] = res
            return res

    def _generate_triplets(self, sent: Span) -> List[dict]:
        """Generate and parse the triplets for a single sentence span."""
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """Store the given triplets on ``doc._.rel``.

        Self-loops and triplets whose head or tail text does not occur in the
        document are skipped. Already stored triplets are not looked up again.
        """
        for triplet in triplets:

            # Remove self-loops (relationships that start and end at the entity)
            if triplet['head'] == triplet['tail']:
                continue

            # Search for the entities as literal text. The entity strings come
            # from the model, so they must be escaped: names such as "C++" or
            # "AT&T (company)" would otherwise be read as regex syntax and
            # either raise re.error or match the wrong text.
            head_span = re.search(re.escape(triplet["head"]), doc.text)
            tail_span = re.search(re.escape(triplet["tail"]), doc.text)

            # Skip the relation unless both head and tail entities are present in the text
            # Sometimes the Rebel model hallucinates some entities
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {"relation": triplet["type"], "head_span": {'text': triplet['head'], 'id': self.get_wiki_id(triplet['head'])}, "tail_span": {'text': triplet['tail'], 'id': self.get_wiki_id(triplet['tail'])}}

    def __call__(self, doc: Doc) -> Doc:
        """Annotate ``doc`` sentence by sentence and return it."""
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [5]:
DEVICE = -1 # Number of the GPU, -1 if want to use CPU
# Add coreference resolution model
coref = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
coref.add_pipe(
    "xx_coref", config={"chunk_size": 2500, "chunk_overlap": 2, "device": DEVICE})

# Define rel extraction model.
# The parser is left enabled here because the "rebel" component declares
# requires=["doc.sents"]. The component name is 'attribute_ruler' (as in the
# coref pipeline above); the previous spelling 'attribute_rules' matched no
# component, so it was never actually disabled.
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_ruler', 'tagger'])
rel_ext.add_pipe("rebel", config={
    'device':DEVICE, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )


error loading _jsonnet (this is expected on Windows), treating C:\Users\Sverre\AppData\Local\Temp\tmpnj2bumph\config.json as plain json
Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large a

<__main__.RebelComponent at 0x1cf17737a30>

In [6]:
def coreff(par):
    """Run relation extraction on one paragraph and return its relations.

    The paragraph is passed through the ``rel_ext`` pipeline. The content of
    ``doc._.rel`` is printed, and the relation dicts (without their hash keys)
    are returned as a list.
    """
    # Coreference resolution is currently switched off:
    # coref_text = coref(par)._.resolved_text
    # print(coref_text)
    parsed = rel_ext(par)
    found = parsed._.rel
    print(found.items())
    return list(found.values())

In [7]:
def _extract_relations_from_file(file_path, relations):
    """Run relation extraction on every multi-word paragraph of one JSON file.

    The file is loaded as a JSON object whose items are (key, paragraph text)
    pairs. Results are written into ``relations`` under the same keys.
    """
    # Context manager + explicit UTF-8: the handle is always closed and the
    # decoding does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as handle:
        paragraphs = json.load(handle)
    for key, text in tqdm(paragraphs.items()):
        text = ftfy.fix_text(text) # FIX ANY ENCODINGS
        # if paragraph has more than one word
        if len(text.split(" ")) > 1:
            relations[key] = coreff(text)
            #TODO: FORMAT OUTPUT -> KADIR


def rel_extraction_mul_files(path, subf=None, start=None, end=None):
    """Apply relation extraction to a selection of preprocessed files.

    Args:
        path: Root folder (e.g. "preprocessed-rebel/"); a trailing slash is
            optional.
        subf: Desired subfolder, passed as a string (e.g. "AA" or "AB").
            Files in a subfolder are named like ``p_r_wiki_00``. When omitted,
            every file in every subfolder of ``path`` is processed.
        start: Number of the first file, e.g. 0-99 (no need to write 00, 0 is
            fine). Required when ``subf`` is given.
        end: Number of the last file, e.g. 0-99. Required when ``subf`` is
            given. The range is inclusive: (0, 0) selects ``p_r_wiki_00``,
            (32, 50) selects ``p_r_wiki_32`` up to and including
            ``p_r_wiki_50``.

    Returns:
        A dict mapping each paragraph key to the list of relations returned
        by ``coreff`` for that paragraph. Files in the requested range that do
        not exist are skipped.
    """
    relations = {}
    if subf:
        # from file start to end
        for i in range(start, end+1):
            # "{:02d}" zero-pads single digits so 0 matches p_r_wiki_00
            f = os.path.join(path, subf, "p_r_wiki_{:02d}".format(i))
            if isfile(f):
                _extract_relations_from_file(f, relations)
    else:
        # Honour the `path` argument instead of a hard-coded folder name;
        # sorted() makes the processing order reproducible.
        for f in sorted(glob.glob(os.path.join(path, '*', '*'))):
            if isfile(f):
                _extract_relations_from_file(f, relations)

    return relations

In [8]:
# Option 1: process a specific, inclusive range of files inside one subfolder
#           (here: file 0 of subfolder "AA").
# Option 2: process all files in AA and AB by passing only `path`.
relations = rel_extraction_mul_files(path, "AA", 0, 0) # rel_extraction_mul_files(path)

  0%|          | 0/802 [00:00<?, ?it/s]

Aztlan Underground is a band from Los Angeles, California that combines Hip-Hop, Punk Rock, Jazz, and electronic music with Chicano and Native American themes, and indigenous instrumentation. Aztlan Underground are often cited as progenitors of Chicano rap.


  0%|          | 1/802 [00:08<1:52:14,  8.41s/it]

dict_items([('bdfbd12e40515f8d2f5f3529a81303dbd4598fda', {'relation': 'location of formation', 'head_span': {'text': 'Aztlan Underground', 'id': 'Q4832994'}, 'tail_span': {'text': 'Los Angeles', 'id': 'Q65'}}), ('9bda233d90607745443ab7c58c5f94548e4189f5', {'relation': 'genre', 'head_span': {'text': 'Aztlan Underground', 'id': 'Q4832994'}, 'tail_span': {'text': 'Chicano rap', 'id': 'Q1399695'}})])
The band traces The band's roots to the late-1980s hardcore scene in the Eastside of Los Angeles. The band have played rapcore, with elements of punk, hip hop, rock, funk, jazz, indigenous music, and spoken word. Indigenous drums, flutes, and rattles are also commonly used in The band's music. The band's lyrics often address the family and economic issues faced by the Chicano community, and The band have been noted as activists for the Chicano community.


  0%|          | 2/802 [00:22<2:33:24, 11.51s/it]

dict_items([('6b09b25f619449ce5e14a281d30bce8c72b9bf08', {'relation': 'located in the administrative territorial entity', 'head_span': {'text': 'Eastside', 'id': 'Q55975144'}, 'tail_span': {'text': 'Los Angeles', 'id': 'Q65'}}), ('3f58fc0f84bd2aab94831cfe5551b20269a3076b', {'relation': 'genre', 'head_span': {'text': 'rapcore', 'id': 'Q3930216'}, 'tail_span': {'text': 'hip hop', 'id': 'Q1132127'}}), ('41f616b3d214629c6b8566bd2d1e0a33ace3fe5c', {'relation': 'subclass of', 'head_span': {'text': 'rattles', 'id': 'Q2132068'}, 'tail_span': {'text': 'drum', 'id': 'Q11404'}}), ('5b72f81ba38ee399ed49b2e9ac7bce220c00c2d1', {'relation': 'ethnic group', 'head_span': {'text': 'Chicano', 'id': 'Q581921'}, 'tail_span': {'text': 'Chicano community', 'id': 'Q113243605'}})])
As an example of the politically active and culturally important artists in Los Angeles in the 1990s, Aztlan Underground appeared on "Culture Clash" on Fox in 1993; and was part of "Breaking Out", a concert on pay per view in 1998, 

  0%|          | 3/802 [00:36<2:52:07, 12.93s/it]

dict_items([('d0aa9d65542b23b7f9aa1a2354b01000214c7ba5', {'relation': 'original broadcaster', 'head_span': {'text': 'Culture Clash', 'id': 'Q5193269'}, 'tail_span': {'text': 'Fox', 'id': 'Q166419'}}), ('986517f1163473cba3db92ba8d4306d6603ea2f1', {'relation': 'main subject', 'head_span': {'text': 'BLU Magazine', 'id': 'id-less'}, 'tail_span': {'text': 'underground hip hop', 'id': 'Q965635'}}), ('3a6d69252bc880ea374ea13445d5f3ab6c282555', {'relation': 'author', 'head_span': {'text': 'It is Not About a Salary', 'id': 'id-less'}, 'tail_span': {'text': 'Brian Cross', 'id': 'Q19842375'}})])
Aztlan Underground remains active in the community, lending Aztlan Underground's voice to annual events such as The Farce of July, and the recent movement to recognize Indigenous People's Day in Los Angeles and beyond.


  0%|          | 4/802 [00:41<2:07:32,  9.59s/it]

dict_items([('520fed7029097ee245b6e50f09fd3c2c6709a7fd', {'relation': 'performer', 'head_span': {'text': 'The Farce of July', 'id': 'id-less'}, 'tail_span': {'text': 'Aztlan Underground', 'id': 'Q4832994'}})])
In addition to forming Aztlan Underground's own label, Xicano Records and Film, Aztlan Underground were signed to the Basque record label Esan Ozenki in 1999 which enabled Aztlan Underground to tour Spain extensively and perform in France and Portugal. Aztlan Underground have also performed in Canada, Australia, and Venezuela. Aztlan Underground has been recognized for Aztlan Underground's music with nominations in the "New Times" 1998 "Best Latin Influenced" category, the "BAM Magazine" 1999 "Best Rock en Español" category, and the "LA Weekly" 1999 "Best Hip Hop" category. The release of Aztlan Underground's eponymous third album on August 29, 2009 was met with positive reviews and earned Aztlan Underground four Native American Music Award (NAMMY) nominations in 2010.


  0%|          | 4/802 [00:43<2:24:54, 10.90s/it]


KeyboardInterrupt: 

In [None]:
# OPT 1: SELECT SPECIFIC FILES TO FEED REBEL IN ONE PARTICULAR MAP / # OPT 2: FEED ALL FILES, in AA and AB BY ONLY KEEPING PATH IN THERE
#relations = rel_extraction_mul_files(path, "AA", 0, 0) # rel_extraction_mul_files(path)