In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install transformers
!pip install coreferee
!python3 -m coreferee install en
!python3 -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 30.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 66.7 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting coreferee
  Downloading coreferee-1.3.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 33.4 

In [2]:
import re
import coreferee
from typing import List
import json
import pandas as pd 
import spacy
from spacy import Language, util
from spacy.tokens import Doc, Span
from transformers import pipeline
import time
import string

In [3]:
import re
from typing import List

from spacy import Language, util
from spacy.tokens import Doc, Span
from transformers import pipeline


def extract_triplets(text: str) -> List[str]:
    """
    parses the text to triplets
    1. Split the text into tokens
    2. If the token is <triplet>, <subj>, or <obj>, then set the current variable to the appropriate value
    3. If the token is not one of the above, then append it to the appropriate variable
    4. If the current variable is <subj>, then append the triplet to the list of triplets
    :param text: str - the text to be parsed
    :type text: str
    :return: A list of dictionaries.
    """
    triplets = []
    relation, subject, relation, object_ = "", "", "", ""
    text = text.strip()
    current = "x"
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = "t"
            if relation != "":
                triplets.append(
                    {"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
                )
                relation = ""
            subject = ""
        elif token == "<subj>":
            current = "s"
            if relation != "":
                triplets.append(
                    {"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
                )
            object_ = ""
        elif token == "<obj>":
            current = "o"
            relation = ""
        else:
            if current == "t":
                subject += " " + token
            elif current == "s":
                object_ += " " + token
            elif current == "o":
                relation += " " + token
    if subject != "" and relation != "" and object_ != "":
        triplets.append(
            {"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
        )

    return triplets


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline(
            "text2text-generation", model=model_name, tokenizer=model_name, device=device
        )
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def _generate_triplets(self, sents: List[Span]) -> List[List[dict]]:
        """
        1. We pass the text of the sentence to the triplet extractor.
        2. The triplet extractor returns a list of dictionaries.
        3. We extract the token ids from the dictionaries.
        4. We decode the token ids into text.
        5. We extract the triplets from the text.
        6. We return the triplets.
        The triplet extractor is a model that takes a sentence as input and returns a list of dictionaries.
        Each dictionary contains the token ids of the extracted triplets.
        The token ids are the numbers that represent the words in the sentence.
        For example, the token id of the word "the" is 2.
        The token ids are decoded into text using the tokenizer.
        The tokenizer is a model that takes a list of token ids as input and returns a list of words.
        :param sents: List[Span]
        :type sents: List[Span]
        :return: A list of lists of dicts.
        """
        output_ids = self.triplet_extractor(
            [sent.text for sent in sents], return_tensors=True, return_text=False
        )  # [0]["generated_token_ids"]
        extracted_texts = self.triplet_extractor.tokenizer.batch_decode(
            [out["generated_token_ids"] for out in output_ids]
        )
        extracted_triplets = []
        for text in extracted_texts:
            extracted_triplets.extend(extract_triplets(text))
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """
        The function takes a spacy Doc object and a list of triplets (dictionaries) as input.
        For each triplet, it finds the substring in the Doc object that matches the head and tail of the triplet.
        It then creates a spacy span object for each of the head and tail.
        Finally, it creates a dictionary of the relation type, head span and tail span and adds it to the Doc object
        :param doc: the spacy Doc object
        :type doc: Doc
        :param triplets: List[dict]
        :type triplets: List[dict]
        """
        for triplet in triplets:
            # get substring to spacy span
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)
            # get spacy span
            if head_span is not None:
                head_span = doc.char_span(head_span.start(), head_span.end())
            else:
                #print(f"can't find spacy head span: {triplet['head']}")
                continue
                #head_span = triplet["head"]
            if tail_span is not None:
                tail_span = doc.char_span(tail_span.start(), tail_span.end())
            else:
                #print(f"can't find spacy tail span: {triplet['tail']}")
                continue
                #tail_span = triplet["tail"]
            
            if head_span is not None and tail_span is not None:
              offset = (head_span.start, tail_span.start)
              if offset not in doc._.rel:
                  doc._.rel[offset] = {
                      "relation": triplet["type"],
                      "head_span": head_span,
                      "tail_span": tail_span,
                  }

    def __call__(self, doc: Doc) -> Doc:
        """
        The function takes a doc object and returns a doc object
        :param doc: Doc
        :type doc: Doc
        :return: A Doc object with the sentence triplets added as annotations.
        """
        sentence_triplets = self._generate_triplets(doc.sents)
        self.set_annotations(doc, sentence_triplets)
        return doc

    def pipe(self, stream, batch_size=128):
        """
        It takes a stream of documents, and for each document,
        it generates a list of sentence triplets,
        and then sets the annotations for each sentence in the document
        :param stream: a generator of Doc objects
        :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
        """
        for docs in util.minibatch(stream, size=batch_size):
            sents = []
            for doc in docs:
                sents += doc.sents
            sentence_triplets = self._generate_triplets(sents)
            index = 0
            for doc in docs:
                n_sent = len(list(doc.sents))
                self.set_annotations(doc, sentence_triplets[index : index + n_sent])
                index += n_sent
                yield doc



In [4]:
def resolve_corefs(doc):
  s = set(["his","her","their"])
  mutable_doc = [str(word) for word in doc]
  if doc._.coref_chains:
    for chain in doc._.coref_chains.chains:
      #print(chain)
      for token in chain:
        #print(token)
        if len(token) == 1:
          coref = doc._.coref_chains.resolve(doc[token[0]])
          if coref:
            if len(coref) > 1:
              mutable_doc[token[0]] = " and ".join([str(word) for word in coref])
            else:
              t = str(doc[token[0]])
              c = str(coref[0])
              #print(t)
              if t in s:
                mutable_doc[token[0]] = f"{c}'s"
              else:
                mutable_doc[token[0]] = c
  output = ""
  cnt = 0
  for token in mutable_doc:
    if token not in string.punctuation and cnt>0:
      output+=f" {token}"
    else:
      output+=token
    cnt+=1
  return output

In [7]:

# Using readlines()
file1 = open('/content/drive/MyDrive/Data Science and AI/Text mining_Shared/enwiki20220701-stripped/AB/wiki_63', 'r')
Lines = file1.readlines()
  
content = []
count = 0
# Strips the newline character
for line in Lines:
    content.append(json.loads(line.split('\n')[0]))

In [8]:
df = pd.DataFrame(content)

In [9]:
df

Unnamed: 0,id,revid,url,title,text
0,67248252,18087347,https://en.wikipedia.org/wiki?curid=67248252,Barragem de Idanha,
1,67248254,577301,https://en.wikipedia.org/wiki?curid=67248254,Cyclone Ola (2015),
2,67248256,11292982,https://en.wikipedia.org/wiki?curid=67248256,Romanian-Soviet split,
3,67248259,577301,https://en.wikipedia.org/wiki?curid=67248259,Cyclone Diamondra (2015),
4,67248260,10951369,https://en.wikipedia.org/wiki?curid=67248260,131 Houston Street,
...,...,...,...,...,...
150202,67931250,41195652,https://en.wikipedia.org/wiki?curid=67931250,War of the Rohirrim,
150203,67931254,35936988,https://en.wikipedia.org/wiki?curid=67931254,K33DS-D,
150204,67931256,13892613,https://en.wikipedia.org/wiki?curid=67931256,Entalophoridae,Entalophoridae is a family of bryozoans belong...
150205,67931262,41015179,https://en.wikipedia.org/wiki?curid=67931262,Indonesian theatre,


In [10]:
df = df.loc[df['text']!= ""] # Discard empty pages
print(len(df))

51268


## Demo

In [None]:
preprocess = spacy.load("en_core_web_lg")
preprocess.add_pipe('coreferee')

In [None]:
doc = preprocess("Although he was very busy with his work, Peter had had enough of it. He and his wife Janet decided they needed a holiday. They travelled to Spain because they loved the country very much.")
#print(doc._.coref_chains.pretty_representation)
new_text = resolve_corefs(doc)
print(doc)
print(new_text)


In [None]:
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("rebel", after="senter", config={
    'device':0, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )

In [None]:
idx = 1000
input_text = df["text"].values[idx]
print(input_text) 

In [None]:
%%time
### With preprocessing
start = time.time()
doc = preprocess(input_text)
print([f"{ent}:{ent.label_}" for ent in doc.ents])
print("Preprocessing pipeline took: ",time.time()-start)
start = time.time()
prepped = resolve_corefs(doc)
print("Resolving corefs took: ",time.time()-start)
start = time.time()
#print(prepped)
doc = nlp(prepped)
print("NLP pipeline took:", time.time() - start)
doc_list = nlp.pipe([prepped])
for value, rel_dict in doc._.rel.items():
    print(f"{value}: {rel_dict}")

['Heinrich Stuhlmann:PERSON', '28 December 1803:DATE', 'Hamburg:GPE', 'October 1886:DATE', 'Hamburg:GPE', 'German:NORP', 'Johann Daniel Stuhlmann:PERSON', 'Johann:PERSON', '1814:DATE', 'Heinrich:PERSON', 'Matthias Heinrich Stuhlmann:PERSON', '1774–1822:CARDINAL', "St. Catherine's Church:ORG", 'Gerdt Hardorff:PERSON', 'the Royal Danish Academy of Fine Arts:ORG', 'Christian:NORP', 'David Gebauer:PERSON', '1825:DATE', 'Munich:GPE', 'Newspaper for Horse Lovers:ORG', 'one year:DATE', 'Frederick Francis I:PERSON', 'Mecklenburg-Schwerin:PERSON', 'Berlin:GPE', '1830:DATE', 'Dresden:GPE', 'Johan Christian Clausen Dahl:PERSON', '1832:DATE', 'fourteen:CARDINAL', '1838 and 1839:DATE', 'the Great Fire of 1842:EVENT', 'Künstlerverein:PERSON', '1843:DATE', 'daguerrotypes:ORG', '1863 to 1868:DATE', 'the Hamburger Kunsthalle:FAC', 'Museum:ORG', 'Hamburgische Geschichte:PERSON', 'the Altonaer Museum:ORG', 'the Philadelphia Museum of Art:ORG']
Preprocessing pipeline took:  0.7666091918945312
Resolving co

In [None]:
%%time 
## Baseline
doc = nlp(input_text)
doc_list = nlp.pipe([input_text])
for value, rel_dict in doc._.rel.items():
    print(f"{value}: {rel_dict}")

(0, 3): {'relation': 'date of birth', 'head_span': Heinrich Stuhlmann, 'tail_span': 28 December 1803}
(0, 7): {'relation': 'place of birth', 'head_span': Heinrich Stuhlmann, 'tail_span': Hamburg}
(0, 9): {'relation': 'date of death', 'head_span': Heinrich Stuhlmann, 'tail_span': 23 October 1886}
(29, 27): {'relation': 'part of', 'head_span': work, 'tail_span': Life}
(37, 36): {'relation': 'father', 'head_span': Daniel Stuhlmann, 'tail_span': Johann Daniel Stuhlmann}
(61, 69): {'relation': 'occupation', 'head_span': Matthias Heinrich Stuhlmann, 'tail_span': Pastor}
(98, 95): {'relation': 'field of work', 'head_span': Gerdt Hardorff, 'tail_span': drawing}
(130, 116): {'relation': 'employer', 'head_span': Christian David Gebauer, 'tail_span': Royal Danish Academy of Fine Arts}
(149, 162): {'relation': 'publisher', 'head_span': Zeitung für Pferdeliebhaber, 'tail_span': Major}
(180, 197): {'relation': 'owner of', 'head_span': Frederick Francis I, Grand Duke of Mecklenburg-Schwerin, 'tail_sp

## Actual Pipeline

In [11]:
preprocess = spacy.load("en_core_web_lg")
preprocess.add_pipe('coreferee')
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("rebel", after="senter", config={
    'device':0, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

<__main__.RebelComponent at 0x7f8be4cf8e90>

In [13]:
from collections import Counter
def has_multiple_people(doc):
  people = []
  for entity in doc.ents:
    if entity.label_ == 'PERSON' and entity.text not in people:
      people.append(entity.text)
    if len(people)>1:
      return True
  return False

  # Old way
  # people = [person for person in doc.ents if person.label_ == 'PERSON' and person not in people]
  # ents = Counter([ent.label_ for ent in doc.ents])
  # #print(ents["PERSON"])
  # if ents["PERSON"] > 1:
  #   return True
  # else:
  #   return False

In [12]:
#defining the function to remove punctuation except dot (.)
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation or i == '.'])
    return punctuationfree

In [18]:
%%time
# Keep only rows with non empty text
df = df[df.text != '']
df['text_preprocessed'] = df['text']
relationships = ['spouse','sibling','father','child','employer','family','mother','relative','student of']

# Reindexing
df.index = range(len(df))
test= df[:100] # only use first 100 for test change this later to process everything
test["relations"] = ''
df['text_preprocessed']= df['text_preprocessed'].apply(lambda x: remove_punctuation(x))
for idx,row in test.iterrows():
  start = time.time()
  processed_doc = preprocess(row["text_preprocessed"])
  if has_multiple_people(processed_doc):
    #print(processed_doc)
    resolved = resolve_corefs(processed_doc)
    doc = nlp(resolved)
    doc_list = nlp.pipe([doc])
    test.loc[idx]["relations"] = [rel_dict for _,rel_dict in doc._.rel.items() if rel_dict in relationships]

  print(f"{idx} took {time.time()-start} seconds..")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


0 took 14.277567148208618 seconds..
1 took 4.7197911739349365 seconds..
2 took 15.268502950668335 seconds..
3 took 0.2790069580078125 seconds..
4 took 25.292376041412354 seconds..
5 took 0.007658481597900391 seconds..
6 took 43.38259506225586 seconds..
7 took 28.32498335838318 seconds..
8 took 0.07208514213562012 seconds..
9 took 0.03410840034484863 seconds..
10 took 0.03456473350524902 seconds..
11 took 0.08064150810241699 seconds..
12 took 8.971463441848755 seconds..
13 took 0.04165792465209961 seconds..
14 took 32.11370277404785 seconds..
15 took 15.748425722122192 seconds..




16 took 32.8307900428772 seconds..
17 took 23.226982355117798 seconds..
18 took 7.805516719818115 seconds..
19 took 0.007638454437255859 seconds..
20 took 115.04669857025146 seconds..
21 took 13.841336488723755 seconds..
22 took 0.009171009063720703 seconds..
23 took 0.2914555072784424 seconds..
24 took 6.711658239364624 seconds..
25 took 27.47822618484497 seconds..
26 took 0.18968725204467773 seconds..
27 took 0.009111166000366211 seconds..
28 took 5.4196648597717285 seconds..
29 took 1.5494201183319092 seconds..
30 took 23.829732179641724 seconds..
31 took 11.940603494644165 seconds..
32 took 0.015177011489868164 seconds..
33 took 2.287781000137329 seconds..
34 took 21.7009174823761 seconds..
35 took 5.124451398849487 seconds..
36 took 0.012056112289428711 seconds..
37 took 11.79311490058899 seconds..
38 took 17.660746097564697 seconds..
39 took 24.27408242225647 seconds..
40 took 0.1780698299407959 seconds..
41 took 35.42898774147034 seconds..
42 took 29.38997507095337 seconds..
43 

KeyboardInterrupt: ignored

In [None]:
test

Unnamed: 0,id,revid,url,title,text,preprocessed
5,67248261,41840956,https://en.wikipedia.org/wiki?curid=67248261,Clepardia Kraków,"Clepardia Kraków (""KS Clepardia Kraków"") is a ...","(Clepardia, Kraków, (, "", KS, Clepardia, Krakó..."
9,67248272,12809580,https://en.wikipedia.org/wiki?curid=67248272,Mason Duval,Mason Duval (born 24 August 2001) is a Caymani...,"(Mason, Duval, (, born, 24, August, 2001, ), i..."
15,67248291,754619,https://en.wikipedia.org/wiki?curid=67248291,Metamorfosi railway station,Metamorfosi railway station () is a station on...,"(Metamorfosi, railway, station, (, ), is, a, s..."
21,67248312,754619,https://en.wikipedia.org/wiki?curid=67248312,"3rd Battalion, Yorkshire Volunteers","The 3rd Battalion (West Yorkshire), Yorkshire ...","(The, 3rd, Battalion, (, West, Yorkshire, ), ,..."
29,67248329,3311318,https://en.wikipedia.org/wiki?curid=67248329,Mawa Gare,Mawa Gare (Mawa Station) is a village in the B...,"(Mawa, Gare, (, Mawa, Station, ), is, a, villa..."
...,...,...,...,...,...,...
363,67249704,25829265,https://en.wikipedia.org/wiki?curid=67249704,Jack McKnight,Jack McKnight (born 10 June 1994) is a Turks a...,"(Jack, McKnight, (, born, 10, June, 1994, ), i..."
364,67249708,7098284,https://en.wikipedia.org/wiki?curid=67249708,Yanchan,"Yanchan Rajmohan (born March 8, 1995), known p...","(Yanchan, Rajmohan, (, born, March, 8, ,, 1995..."
365,67249717,42313471,https://en.wikipedia.org/wiki?curid=67249717,Workplace exposure monitoring,Workplace exposure monitoring is the monitorin...,"(Workplace, exposure, monitoring, is, the, mon..."
369,67249744,28786153,https://en.wikipedia.org/wiki?curid=67249744,Irwin Shepard,Irwin Shepard (5 July 1843 – 17 April 1916) wa...,"(Irwin, Shepard, (, 5, July, 1843, –, 17, Apri..."
