In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the data file
# read later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [76]:
# Install the Python packages used below, then the English coreferee model
# and the spaCy en_core_web_lg model that the pipelines load.
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
!pip install transformers
!pip install coreferee
!python3 -m coreferee install en
!python3 -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/explosion/coreferee/raw/master/models/coreferee_model_en.zip
  Using cached https://github.com/explosion/coreferee/raw/master/models/coreferee_model_en.zip (65.4 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 10 kB/s 
Installi

In [33]:
import re
import coreferee
from typing import List
import json
import pandas as pd 
import spacy
from spacy import Language, util
from spacy.tokens import Doc, Span
from transformers import pipeline
import time
import string

In [89]:
import re
from typing import List

from spacy import Language, util
from spacy.tokens import Doc, Span
from transformers import pipeline


def extract_triplets(text: str) -> List[dict]:
    """
    Parse the linearized output of the triplet-extraction model into triplets.

    The text is read token by token (split on whitespace) after removing the
    ``<s>``, ``<pad>`` and ``</s>`` markers:

    1. ``<triplet>`` starts a new head; the tokens that follow are the head text.
    2. ``<subj>`` starts a new tail; the tokens that follow are the tail text.
    3. ``<obj>`` starts a new relation; the tokens that follow are the relation text.
    4. A triplet is stored whenever a ``<triplet>`` or ``<subj>`` marker is reached
       while a relation is pending, and once more at the end of the text if head,
       relation and tail are all non-empty.

    :param text: str - the decoded model output to be parsed
    :type text: str
    :return: A list of dictionaries with the keys ``head``, ``type`` and ``tail``.
    """
    triplets = []
    subject, relation, object_ = "", "", ""
    # Field that plain tokens are currently appended to:
    # "t" -> head (subject), "s" -> tail (object_), "o" -> relation, "x" -> none yet.
    current = "x"
    cleaned = text.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in cleaned.split():
        if token == "<triplet>":
            current = "t"
            if relation != "":
                # A new head begins: flush the triplet collected so far.
                triplets.append(
                    {"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
                )
                relation = ""
            subject = ""
        elif token == "<subj>":
            current = "s"
            if relation != "":
                # Same head, new tail: flush the previous (head, relation, tail).
                triplets.append(
                    {"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
                )
            object_ = ""
        elif token == "<obj>":
            current = "o"
            relation = ""
        elif current == "t":
            subject += " " + token
        elif current == "s":
            object_ += " " + token
        elif current == "o":
            relation += " " + token
    # Flush the last triplet, but only if it is complete.
    if subject != "" and relation != "" and object_ != "":
        triplets.append(
            {"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
        )

    return triplets


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """
    spaCy pipeline component that runs a text2text-generation model on every
    sentence of a Doc and stores the extracted relations in ``doc._.rel``.

    ``doc._.rel`` maps ``(head_span.start, tail_span.start)`` to a dict with the
    keys ``relation``, ``head_span`` and ``tail_span``.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """
        :param nlp: the pipeline object handed in by the spaCy factory (not used here)
        :param name: the component name handed in by the spaCy factory (not used here)
        :param model_name: model identifier, used for both the model and the tokenizer
        :param device: device argument forwarded to the transformers pipeline
        """
        assert model_name is not None, "model_name must not be None"
        self.triplet_extractor = pipeline(
            "text2text-generation", model=model_name, tokenizer=model_name, device=device
        )
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def _generate_triplets(self, sents: List[Span]) -> List[List[dict]]:
        """
        Run the triplet extractor on a batch of sentences.

        1. The text of every sentence is passed to the extractor in one call.
        2. The generated token ids of every output are decoded back to text.
        3. Each decoded text is parsed with ``extract_triplets``.

        The result holds one list per input sentence, in input order, so callers
        can slice it by sentence count to get the triplets of one document.

        :param sents: the sentences to process
        :type sents: List[Span]
        :return: A list with one list of triplet dicts per sentence.
        """
        texts = [sent.text for sent in sents]
        if not texts:
            # Nothing to extract; do not call the model with an empty batch.
            return []
        output_ids = self.triplet_extractor(texts, return_tensors=True, return_text=False)
        extracted_texts = self.triplet_extractor.tokenizer.batch_decode(
            [out["generated_token_ids"] for out in output_ids]
        )
        return [extract_triplets(text) for text in extracted_texts]

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """
        Store the given triplets on ``doc._.rel``.

        For each triplet, the first literal occurrence of the head text and of the
        tail text is located in ``doc.text`` and converted to a spaCy span with
        ``doc.char_span``. Triplets whose head or tail is empty, cannot be found,
        or does not map to a span are skipped. An existing entry for the same
        ``(head_span.start, tail_span.start)`` key is kept, not overwritten.

        :param doc: the spacy Doc object
        :type doc: Doc
        :param triplets: flat list of triplet dicts (keys ``head``, ``type``, ``tail``)
        :type triplets: List[dict]
        """
        for triplet in triplets:
            head_text, tail_text = triplet["head"], triplet["tail"]
            if not head_text or not tail_text:
                continue
            # The texts come from the model and may contain regex metacharacters
            # such as "(" or "."; escape them so they are matched literally.
            head_match = re.search(re.escape(head_text), doc.text)
            tail_match = re.search(re.escape(tail_text), doc.text)
            if head_match is None or tail_match is None:
                continue
            head_span = doc.char_span(head_match.start(), head_match.end())
            tail_span = doc.char_span(tail_match.start(), tail_match.end())
            # char_span can return None, in which case the triplet is skipped.
            if head_span is None or tail_span is None:
                continue
            offset = (head_span.start, tail_span.start)
            if offset not in doc._.rel:
                doc._.rel[offset] = {
                    "relation": triplet["type"],
                    "head_span": head_span,
                    "tail_span": tail_span,
                }

    def __call__(self, doc: Doc) -> Doc:
        """
        Extract the relations of a single Doc.

        :param doc: Doc
        :type doc: Doc
        :return: The same Doc object with the extracted triplets added as annotations.
        """
        sentence_triplets = self._generate_triplets(list(doc.sents))
        doc_triplets = [triplet for sent in sentence_triplets for triplet in sent]
        self.set_annotations(doc, doc_triplets)
        return doc

    def pipe(self, stream, batch_size=128):
        """
        Process a stream of documents in minibatches.

        The sentences of all documents in a minibatch are sent to the extractor
        together; the per-sentence results are then sliced by each document's
        sentence count so every document only receives its own triplets.

        :param stream: a generator of Doc objects
        :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
        """
        for docs in util.minibatch(stream, size=batch_size):
            sents_per_doc = [list(doc.sents) for doc in docs]
            batch_sents = [sent for doc_sents in sents_per_doc for sent in doc_sents]
            sentence_triplets = self._generate_triplets(batch_sents)
            index = 0
            for doc, doc_sents in zip(docs, sents_per_doc):
                n_sent = len(doc_sents)
                doc_triplets = [
                    triplet
                    for sent in sentence_triplets[index : index + n_sent]
                    for triplet in sent
                ]
                self.set_annotations(doc, doc_triplets)
                index += n_sent
                yield doc



TypeError: ignored

In [90]:
def resolve_corefs(doc):
  """
  Rebuild the text of `doc` with single-token coreference mentions replaced by
  the text of the token(s) they resolve to.

  - A mention that resolves to several tokens is replaced by those tokens
    joined with " and ".
  - A mention that resolves to one token is replaced by that token's text; if
    the mention is "his", "her" or "their" (in any casing), "'s" is appended.
  - Mentions spanning more than one token, and mentions that do not resolve,
    are left untouched.

  The tokens are then joined with single spaces, except that no space is put
  before the first token or before a token found in `string.punctuation`.

  :param doc: a Doc that exposes `doc._.coref_chains`
  :return: the rewritten text as a str
  """
  possessives = {"his", "her", "their"}
  words = [str(word) for word in doc]
  if doc._.coref_chains:
    for chain in doc._.coref_chains.chains:
      for mention in chain:
        if len(mention) != 1:
          continue
        position = mention[0]
        referents = doc._.coref_chains.resolve(doc[position])
        if not referents:
          continue
        if len(referents) > 1:
          words[position] = " and ".join(str(word) for word in referents)
          continue
        referent_text = str(referents[0])
        # Compare case-insensitively so a sentence-initial "His"/"Her"/"Their"
        # also gets the possessive "'s".
        if str(doc[position]).lower() in possessives:
          words[position] = f"{referent_text}'s"
        else:
          words[position] = referent_text
  pieces = []
  for position, word in enumerate(words):
    if position > 0 and word not in string.punctuation:
      pieces.append(" ")
    pieces.append(word)
  return "".join(pieces)

In [5]:

# Read the data file line by line; every non-blank line is parsed as one JSON document.
wiki_path = '/content/drive/MyDrive/Text Mining/enwiki20220701-stripped/AB/wiki_63'

content = []
# The context manager closes the file even if a line fails to parse; the
# encoding is explicit so the result does not depend on the platform default.
with open(wiki_path, 'r', encoding='utf-8') as wiki_file:
    for line in wiki_file:
        line = line.strip()  # drops the trailing newline
        if line:  # skip blank lines, which json.loads would reject
            content.append(json.loads(line))

In [6]:
# Build a DataFrame from the parsed JSON records collected in `content`.
df = pd.DataFrame(content)

In [7]:
df

Unnamed: 0,id,revid,url,title,text
0,67248252,18087347,https://en.wikipedia.org/wiki?curid=67248252,Barragem de Idanha,
1,67248254,577301,https://en.wikipedia.org/wiki?curid=67248254,Cyclone Ola (2015),
2,67248256,11292982,https://en.wikipedia.org/wiki?curid=67248256,Romanian-Soviet split,
3,67248259,577301,https://en.wikipedia.org/wiki?curid=67248259,Cyclone Diamondra (2015),
4,67248260,10951369,https://en.wikipedia.org/wiki?curid=67248260,131 Houston Street,
...,...,...,...,...,...
150202,67931250,41195652,https://en.wikipedia.org/wiki?curid=67931250,War of the Rohirrim,
150203,67931254,35936988,https://en.wikipedia.org/wiki?curid=67931254,K33DS-D,
150204,67931256,13892613,https://en.wikipedia.org/wiki?curid=67931256,Entalophoridae,Entalophoridae is a family of bryozoans belong...
150205,67931262,41015179,https://en.wikipedia.org/wiki?curid=67931262,Indonesian theatre,


In [8]:
# Keep only the rows whose `text` is not the empty string (discard empty pages),
# then report how many rows remain.
df = df[df["text"].ne("")]
print(len(df))

51268


## Demo

In [80]:
# Preprocessing pipeline: en_core_web_lg with the coreferee component added.
# `resolve_corefs` reads the `doc._.coref_chains` of the docs this pipeline produces.
preprocess = spacy.load("en_core_web_lg")
preprocess.add_pipe('coreferee')

<coreferee.manager.CorefereeBroker at 0x7f55f11fbc90>

In [81]:
# Demo: run the preprocessing pipeline on a short text, then print the original
# text followed by the version rewritten by `resolve_corefs`.
doc = preprocess("Although he was very busy with his work, Peter had had enough of it. He and his wife Janet decided they needed a holiday. They travelled to Spain because they loved the country very much.")
#print(doc._.coref_chains.pretty_representation)
new_text = resolve_corefs(doc)
print(doc)
print(new_text)


Although he was very busy with his work, Peter had had enough of it. He and his wife Janet decided they needed a holiday. They travelled to Spain because they loved the country very much.
Although Peter was very busy with Peter's work, Peter had had enough of work. Peter and Peter's wife Janet decided Peter and wife and Janet needed a holiday. Peter and wife and Janet travelled to Spain because Peter and wife and Janet loved the Spain very much.


In [82]:
# Relation-extraction pipeline: en_core_web_lg with the custom "rebel" component
# (registered by the RebelComponent factory) inserted after "senter".
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("rebel", after="senter", config={
    'device':0, # GPU index; use -1 to run on the CPU
    'model_name':'Babelscape/rebel-large'} # Model to load; the factory default is 'Babelscape/rebel-large' when omitted
    )

<__main__.RebelComponent at 0x7f53e98cb2d0>

In [84]:
# Pick one article text by position (position 1000 in the `text` column's
# underlying array, not index label 1000) to use as the demo input.
idx = 1000
input_text = df["text"].values[idx]
print(input_text) 

Heinrich Stuhlmann (28 December 1803, Hamburg - 23 October 1886, Hamburg) was a German painter, graphic artist and amateur photographer.
Life and work.
He was born to Johann Daniel Stuhlmann, an insurance and coffee broker. Following Johann's untimely death in 1814, Heinrich was adopted by his uncle, Matthias Heinrich Stuhlmann (1774–1822), the Pastor of St. Catherine's Church, who helped him complete his education. As he developed a desire to become an artist, he took drawing lessons from Gerdt Hardorff. After serving a commercial apprenticeship, rather than go into business, he attended the Royal Danish Academy of Fine Arts, where he worked in the studios of Christian David Gebauer.
In 1825, he returned to Munich and found a job with the "Zeitung für Pferdeliebhaber" (Newspaper for Horse Lovers), published by Major , and made several trips to study horses. For one year, he was employed by Frederick Francis I, Grand Duke of Mecklenburg-Schwerin, and drew horses for the (Redefin Stud F

In [85]:
%%time
### With preprocessing
start = time.time()
doc = preprocess(input_text)
print([f"{ent}:{ent.label_}" for ent in doc.ents])
print("Preprocessing pipeline took: ",time.time()-start)
start = time.time()
prepped = resolve_corefs(doc)
print("Resolving corefs took: ",time.time()-start)
start = time.time()
#print(prepped)
doc = nlp(prepped)
print("NLP pipeline took:", time.time() - start)
doc_list = nlp.pipe([prepped])
for value, rel_dict in doc._.rel.items():
    print(f"{value}: {rel_dict}")

['Heinrich Stuhlmann:PERSON', '28 December 1803:DATE', 'Hamburg:GPE', 'October 1886:DATE', 'Hamburg:GPE', 'German:NORP', 'Johann Daniel Stuhlmann:PERSON', 'Johann:PERSON', '1814:DATE', 'Heinrich:PERSON', 'Matthias Heinrich Stuhlmann:PERSON', '1774–1822:CARDINAL', "St. Catherine's Church:ORG", 'Gerdt Hardorff:PERSON', 'the Royal Danish Academy of Fine Arts:ORG', 'Christian:NORP', 'David Gebauer:PERSON', '1825:DATE', 'Munich:GPE', 'Newspaper for Horse Lovers:ORG', 'one year:DATE', 'Frederick Francis I:PERSON', 'Mecklenburg-Schwerin:PERSON', 'Berlin:GPE', '1830:DATE', 'Dresden:GPE', 'Johan Christian Clausen Dahl:PERSON', '1832:DATE', 'fourteen:CARDINAL', '1838 and 1839:DATE', 'the Great Fire of 1842:EVENT', 'Künstlerverein:PERSON', '1843:DATE', 'daguerrotypes:ORG', '1863 to 1868:DATE', 'the Hamburger Kunsthalle:FAC', 'Museum:ORG', 'Hamburgische Geschichte:PERSON', 'the Altonaer Museum:ORG', 'the Philadelphia Museum of Art:ORG']
Preprocessing pipeline took:  0.7666091918945312
Resolving co

In [86]:
%%time 
## Baseline
doc = nlp(input_text)
doc_list = nlp.pipe([input_text])
for value, rel_dict in doc._.rel.items():
    print(f"{value}: {rel_dict}")

(0, 3): {'relation': 'date of birth', 'head_span': Heinrich Stuhlmann, 'tail_span': 28 December 1803}
(0, 7): {'relation': 'place of birth', 'head_span': Heinrich Stuhlmann, 'tail_span': Hamburg}
(0, 9): {'relation': 'date of death', 'head_span': Heinrich Stuhlmann, 'tail_span': 23 October 1886}
(29, 27): {'relation': 'part of', 'head_span': work, 'tail_span': Life}
(37, 36): {'relation': 'father', 'head_span': Daniel Stuhlmann, 'tail_span': Johann Daniel Stuhlmann}
(61, 69): {'relation': 'occupation', 'head_span': Matthias Heinrich Stuhlmann, 'tail_span': Pastor}
(98, 95): {'relation': 'field of work', 'head_span': Gerdt Hardorff, 'tail_span': drawing}
(130, 116): {'relation': 'employer', 'head_span': Christian David Gebauer, 'tail_span': Royal Danish Academy of Fine Arts}
(149, 162): {'relation': 'publisher', 'head_span': Zeitung für Pferdeliebhaber, 'tail_span': Major}
(180, 197): {'relation': 'owner of', 'head_span': Frederick Francis I, Grand Duke of Mecklenburg-Schwerin, 'tail_sp

## Actual Pipeline

In [77]:
# Build the two pipelines used by the processing loop below.
# NOTE(review): this repeats the setup already done in the Demo section.
# `preprocess`: en_core_web_lg with the coreferee component added.
preprocess = spacy.load("en_core_web_lg")
preprocess.add_pipe('coreferee')
# `nlp`: en_core_web_lg with the custom "rebel" component inserted after "senter".
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("rebel", after="senter", config={
    'device':0, # GPU index; use -1 to run on the CPU
    'model_name':'Babelscape/rebel-large'} # Model to load; the factory default is 'Babelscape/rebel-large' when omitted
    )

<__main__.RebelComponent at 0x7f54da056d90>

In [78]:
from collections import Counter
def has_multiple_people(doc):
  """
  Tell whether `doc` mentions more than one person.

  :param doc: an object whose `ents` are entities carrying a `label_` attribute
  :return: True if more than one entity in `doc.ents` is labelled "PERSON", else False
  """
  person_count = sum(1 for ent in doc.ents if ent.label_ == "PERSON")
  return person_count > 1

In [None]:
%%time
test= df[:100] # only use first 100 for test change this later to process everything
test["relations"] = ''
for idx,row in test.iterrows():
  start = time.time()
  processed_doc = preprocess(row["text"])
  if has_multiple_people(processed_doc):
    #print(processed_doc)
    resolved = resolve_corefs(doc)
    doc = nlp(resolved)
    doc_list = nlp.pipe([prepped])
    test.loc[idx]["relations"] = [rel_dict for _,rel_dict in doc._.rel.items()]

  print(f"{idx} took {time.time()-start} seconds..")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


can't find spacy head span: 2016 Summer Olympics
can't find spacy head span: 2016 Summer Olympics
can't find spacy head span: 2016 Summer Olympics
can't find spacy head span: 2016 Summer Olympics
can't find spacy head span: 2016 Summer Olympics
5 took 7.702909231185913 seconds..
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
9 took 7.403293132781982 seconds..
15 took 0.10424494743347168 seconds..
21 took 0.3404879570007324 seconds..




can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
29 took 7.842551231384277 seconds..
32 took 0.0069844722747802734 seconds..
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
42 took 8.625006675720215 seconds..
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
can't find spacy head span: 2008 Summer Olympics
56 took 8.328632354736328 seconds..
57 took 0.06678605079650879 seconds..
67 took 0.02746129035949707 seconds..
77 took 0.03363180160522461 seconds..
80 took 0.07282900810241699 seconds..
can't find spa

In [46]:
test

Unnamed: 0,id,revid,url,title,text,preprocessed
5,67248261,41840956,https://en.wikipedia.org/wiki?curid=67248261,Clepardia Kraków,"Clepardia Kraków (""KS Clepardia Kraków"") is a ...","(Clepardia, Kraków, (, "", KS, Clepardia, Krakó..."
9,67248272,12809580,https://en.wikipedia.org/wiki?curid=67248272,Mason Duval,Mason Duval (born 24 August 2001) is a Caymani...,"(Mason, Duval, (, born, 24, August, 2001, ), i..."
15,67248291,754619,https://en.wikipedia.org/wiki?curid=67248291,Metamorfosi railway station,Metamorfosi railway station () is a station on...,"(Metamorfosi, railway, station, (, ), is, a, s..."
21,67248312,754619,https://en.wikipedia.org/wiki?curid=67248312,"3rd Battalion, Yorkshire Volunteers","The 3rd Battalion (West Yorkshire), Yorkshire ...","(The, 3rd, Battalion, (, West, Yorkshire, ), ,..."
29,67248329,3311318,https://en.wikipedia.org/wiki?curid=67248329,Mawa Gare,Mawa Gare (Mawa Station) is a village in the B...,"(Mawa, Gare, (, Mawa, Station, ), is, a, villa..."
...,...,...,...,...,...,...
363,67249704,25829265,https://en.wikipedia.org/wiki?curid=67249704,Jack McKnight,Jack McKnight (born 10 June 1994) is a Turks a...,"(Jack, McKnight, (, born, 10, June, 1994, ), i..."
364,67249708,7098284,https://en.wikipedia.org/wiki?curid=67249708,Yanchan,"Yanchan Rajmohan (born March 8, 1995), known p...","(Yanchan, Rajmohan, (, born, March, 8, ,, 1995..."
365,67249717,42313471,https://en.wikipedia.org/wiki?curid=67249717,Workplace exposure monitoring,Workplace exposure monitoring is the monitorin...,"(Workplace, exposure, monitoring, is, the, mon..."
369,67249744,28786153,https://en.wikipedia.org/wiki?curid=67249744,Irwin Shepard,Irwin Shepard (5 July 1843 – 17 April 1916) wa...,"(Irwin, Shepard, (, 5, July, 1843, –, 17, Apri..."
