# Install the relevant libraries

In [11]:
!pip install transformers wikipedia fuzzywuzzy[speedup] wikipedia-api



In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipediaapi
import wikipedia
import IPython
from fuzzywuzzy import fuzz




# Load the REBEL model

In [2]:
# Tokenizer and seq2seq model are both loaded from the same pretrained checkpoint.
REBEL_CHECKPOINT = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)

In [3]:
# Entity names of interest: an extracted relation is kept only when its head or
# tail fuzzy-matches one of these names.
nodes = ["Genetics"]

In [24]:
async def similar(a, b):
    """Return the fuzzywuzzy token-set ratio between ``a`` and ``b``.

    Declared as a coroutine, so callers must ``await`` it to obtain the score.
    """
    score = fuzz.token_set_ratio(a, b)
    return score

async def extract_relations_from_model_output(text):
    """Parse one decoded model output into relations and keep those touching ``nodes``.

    The decoded text is scanned token by token (whitespace split). The marker
    ``<triplet>`` starts a new head, ``<subj>`` starts a new tail and ``<obj>``
    starts a new relation type; the words that follow a marker are accumulated
    into the corresponding field. ``<s>``, ``<pad>`` and ``</s>`` are stripped
    before parsing.

    Args:
        text: A single decoded sequence, special tokens included.

    Returns:
        A list of dicts with keys ``'head'``, ``'type'`` and ``'tail'``. Only
        relations whose head or tail has a ``similar`` score above 80 against
        at least one (lower-cased) entry of the module-level ``nodes`` list
        are returned.
    """
    relations = []
    subject, relation, object_ = '', '', ''
    current = 'x'
    cleaned = text.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")

    def _flush():
        # Record the triplet accumulated so far.
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })

    for token in cleaned.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                _flush()
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            # A new tail for the same head: emit the previous (head, type, tail) first.
            if relation != '':
                _flush()
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        _flush()

    # Keep only relations that mention one of the nodes of interest. Both
    # `similar` calls must be awaited: comparing an un-awaited coroutine with
    # an int raises TypeError.
    kept = []
    for rel in relations:
        head = rel['head'].lower()
        tail = rel['tail'].lower()
        for name in nodes:
            target = name.lower()
            if await similar(head, target) > 80 or await similar(tail, target) > 80:
                kept.append(rel)
                break

    return kept

# Split spans: from long text to KB

In [28]:
import asyncio
import wikipediaapi
import wikipedia

class KB():
    """In-memory knowledge base built from extracted relations.

    ``entities`` maps an entity title to a dict of its remaining fields
    (``url`` and ``summary`` when Wikipedia data was found, otherwise empty).
    ``relations`` is a list of relation dicts with keys ``head``, ``type``,
    ``tail`` and ``meta`` (``meta["spans"]`` lists the spans a relation was
    found in).
    """

    def __init__(self):
        self.entities = {}
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    async def merge_relations(self, r1):
        """Add the spans of ``r1`` that are new to the stored relation equal to it.

        Raises ``IndexError`` when no equal relation is stored.
        """
        r2 = [r for r in self.relations
              if self.are_relations_equal(r1, r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    async def check_wikipedia_entity(self, entity):
        """Return whether an English Wikipedia page named ``entity`` exists."""
        def _page_exists():
            # 'Entity validation' is passed as the first argument, 'en' as the language.
            wiki_wiki = wikipediaapi.Wikipedia('Entity validation', 'en')
            return wiki_wiki.page(entity).exists()

        # The lookup is blocking network I/O; run it in a worker thread so the
        # event loop stays free, the same way get_wikipedia_data does.
        return await asyncio.to_thread(_page_exists)

    async def get_wikipedia_data(self, candidate_entity):
        """Fetch title, url and summary of the Wikipedia page for ``candidate_entity``.

        Returns ``None`` when the lookup fails for any reason (best effort).
        """
        try:
            page = await asyncio.to_thread(wikipedia.page, candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Deliberately best-effort, but do not swallow task cancellation or
            # KeyboardInterrupt the way a bare `except:` would.
            return None

    def add_entity(self, e):
        """Store entity ``e`` under its ``title``, keeping its other fields as the value."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    async def add_relation(self, r, require_wikipedia_page=False):
        """Register relation ``r`` and its two entities in the knowledge base.

        Each of ``r["head"]`` and ``r["tail"]`` is looked up once on Wikipedia.
        When data is found the entity is stored with it and the relation end is
        renamed to the page title; otherwise the entity is stored under its
        original name with no extra fields. A relation equal to a stored one
        only contributes its new spans.

        Args:
            r: Relation dict with ``head``, ``type``, ``tail`` and ``meta``;
                ``head`` and ``tail`` are rewritten in place.
            require_wikipedia_page: When True, the relation is ignored unless
                ``check_wikipedia_entity`` succeeds for both entities. The
                default (False) keeps every relation, which is what the
                previous implementation effectively did because its existence
                check kept the entity on both branches.
        """
        candidate_entities = [r["head"], r["tail"]]

        if require_wikipedia_page:
            for ent in candidate_entities:
                if not await self.check_wikipedia_entity(ent):
                    return

        # One lookup per entity; fall back to the bare name when nothing is found.
        entities = []
        for ent in candidate_entities:
            data = await self.get_wikipedia_data(ent)
            entities.append(data if data else {'title': ent})

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            await self.merge_relations(r)

    def print(self):
        """Print every stored entity and relation, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")


In [30]:
async def from_text_to_kb(text, span_length=128, verbose=False):
    """Build a ``KB`` from ``text`` by running the model over fixed-length token spans.

    The text is tokenized once with the module-level ``tokenizer``, cut into
    ``span_length``-token windows that overlap just enough to cover the whole
    input, and each window is passed to the module-level ``model`` with beam
    search (3 beams, 3 returned sequences per window). Every decoded sequence
    is parsed with ``extract_relations_from_model_output`` and the resulting
    relations are added to the knowledge base, tagged with the boundaries of
    the window they came from.

    Args:
        text: Input text.
        span_length: Number of tokens per window.
        verbose: When True, print token count, window count and boundaries.

    Returns:
        The populated ``KB`` instance.
    """
    # tokenize whole text
    encoded = tokenizer([text], return_tensors="pt")
    ids_row = encoded["input_ids"][0]
    mask_row = encoded["attention_mask"][0]

    # compute span boundaries
    token_count = len(ids_row)
    if verbose:
        print(f"Input has {token_count} tokens")
    span_count = math.ceil(token_count / span_length)
    if verbose:
        print(f"Input has {span_count} spans")
    overlap = math.ceil((span_count * span_length - token_count) /
                        max(span_count - 1, 1))
    # Each window starts (span_length - overlap) tokens after the previous one.
    stride = span_length - overlap
    spans_boundaries = [[stride * idx, stride * idx + span_length]
                        for idx in range(span_count)]
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    batch = {
        "input_ids": torch.stack([ids_row[lo:hi] for lo, hi in spans_boundaries]),
        "attention_mask": torch.stack([mask_row[lo:hi] for lo, hi in spans_boundaries]),
    }

    # generate relations
    num_return_sequences = 3
    generated_tokens = model.generate(
        **batch,
        max_length=256,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=num_return_sequences,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb: consecutive groups of `num_return_sequences` predictions
    # belong to the same span
    kb = KB()
    for pred_index, sentence_pred in enumerate(decoded_preds):
        span = spans_boundaries[pred_index // num_return_sequences]
        for relation in await extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {
                "spans": [span]
            }
            await kb.add_relation(relation)

    return kb

# Filter and normalize entities with Wikipedia

- remove all entities that don't have a page on Wikipedia
- merge entities if they have the same Wikipedia page

In [7]:
# Guarded so that re-running this cell does not add "Kafka" more than once.
if "Kafka" not in nodes:
    nodes.append("Kafka")

In [31]:
# Sample passage on biochemistry, molecular biology and genetics; input for from_text_to_kb.
text1 = """
Biochemistry and molecular biology are used interchangeably in this book, and they are increasingly recognized as a unified discipline, rather than as two distinct disciplines or subdisciplines. Probably this is due to widespread adoption of the techniques of both disciplines by all biologists. However, historically molecular biology emerged from a convergence between biochemistry and genetics. Biochemistry is centred in chemistry and principally deals with understanding the function of proteins. Genetics is principally concerned with the function of genes and inheritance. Molecular biology emerged later, once the molecular mechanisms (biochemistry) of gene function were determined, and focuses on the relationship between genes and functional gene products (which are in most cases proteins). So molecular biology could be viewed as the nexus between biochemistry and genetics
"""

# Sample passage on Apache Kafka; input for from_text_to_kb.
text2 = """Apache Kafka is an open-source distributed event streaming platform used for building real-time data pipelines and streaming applications. It's designed to handle high volumes of data in a fault-tolerant and scalable manner. At its core, Kafka is built around the concept of a distributed commit log. It allows you to publish and subscribe to streams of records, store these records in a fault-tolerant way, and process them in real time or batch. Kafka operates on a publish-subscribe model, where producers publish data to topics, and consumers subscribe to those topics to process the data.
"""

# return_values = []

# import threading
# thread1 = threading.Thread(target=lambda: return_values.append(from_text_to_kb(text1)))
# thread2 = threading.Thread(target=lambda: return_values.append(from_text_to_kb(text2)))

# # Start both threads
# thread1.start()
# thread2.start()

# # Join both threads to the main program
# thread1.join()
# thread2.join()

# return_values
# return_values[0].print()
# return_values[1].print()

import asyncio

async def main():
    task1 = asyncio.create_task(from_text_to_kb(text1))
    task2= asyncio.create_task(from_text_to_kb(text2))

    return await task1, await task2

return_values = await main()

return_values[0].print()
return_values[1].print()



  lis = BeautifulSoup(html).find_all('li')


Entities:
  ('Genetics', {'url': 'https://en.wikipedia.org/wiki/Genetics', 'summary': 'Genetics is the study of genes, genetic variation, and heredity in organisms. It is an important branch in biology because heredity is vital to organisms\' evolution. Gregor Mendel, a Moravian Augustinian friar working in the 19th century in Brno, was the first to study genetics scientifically. Mendel studied "trait inheritance", patterns in the way traits are handed down from parents to offspring over time. He observed that organisms (pea plants) inherit traits by way of discrete "units of inheritance". This term, still used today, is a somewhat ambiguous definition of what is referred to as a gene.\nTrait inheritance and molecular inheritance mechanisms of genes are still primary principles of genetics in the 21st century, but modern genetics has expanded to study the function and behavior of genes. Gene structure and function, variation, and distribution are studied within the context of the cell,