In [45]:
!pip install transformers wikipedia neo4j kuzu langchain > /dev/null

In [46]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
import IPython
import pandas as pd

# Load the REBEL model

In [47]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [48]:
test_text = "This notebook shows how to use LLMs to provide a natural language interface to Kùzu database."

In [49]:
# Tokenizer text
model_inputs = tokenizer(test_text,
                          max_length=512,
                          padding=True,
                          truncation=True,
                        return_tensors='pt')

print(f"Num tokens: {len(model_inputs['input_ids'][0])}")

# Generate
gen_kwargs = {
    "max_length": 216,
    "length_penalty": 0,
    "num_beams": 5,
    "num_return_sequences": 4
}
generated_tokens = model.generate(
    **model_inputs,
    **gen_kwargs,
)
decoded_preds = tokenizer.batch_decode(generated_tokens,
                                        skip_special_tokens=False)

decoded_preds

Num tokens: 23


['<s><triplet> LLMs <subj> natural language interface <obj> use</s>',
 '<s><triplet> LLMs <subj> natural language <obj> use</s><pad>',
 '<s><triplet> LLM <subj> natural language interface <obj> use</s>',
 '<s><triplet> LLMs <subj> natural language <obj> subclass of</s>']

In [50]:
def extract_relations_from_model_output(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return relations

In [51]:
class KB():
    def __init__(self):
        self.relations = []

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relation(self, r):
        if not self.exists_relation(r):
            self.relations.append(r)

    def print(self):
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

In [52]:
def from_small_text_to_kb(text, verbose=False):
    kb = KB()

    # Tokenizer text
    model_inputs = tokenizer(text,
                             max_length=512,
                             padding=True,
                             truncation=True,
                            return_tensors='pt')
    if verbose:
        print(f"Num tokens: {len(model_inputs['input_ids'][0])}")

    # Generate
    gen_kwargs = {
        "max_length": 216,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3
    }
    generated_tokens = model.generate(
        **model_inputs,
        **gen_kwargs,
    )
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    for sentence_pred in decoded_preds:
        relations = extract_relations_from_model_output(sentence_pred)
        for r in relations:
            kb.add_relation(r)

    return kb

In [53]:
small_kb = from_small_text_to_kb(test_text)

In [54]:
small_kb.print()

Relations:
  {'head': 'LLMs', 'type': 'use', 'tail': 'natural language interface'}
  {'head': 'LLMs', 'type': 'use', 'tail': 'natural language'}
  {'head': 'LLM', 'type': 'use', 'tail': 'natural language interface'}


In [55]:
text = """If you are just getting started,
and you have relatively simple apis,
you should get started with chains.
Chains are a sequence of predetermined steps,
so they are good to get started with as they
give you more control and let you understand
what is happening better."""

kb = from_small_text_to_kb(text,
                           verbose=True)
kb.print()

Num tokens: 62
Relations:
  {'head': 'Get started with chains', 'type': 'subclass of', 'tail': 'Chain'}
  {'head': 'Get started with chains', 'type': 'uses', 'tail': 'Chain'}
  {'head': 'Get started with chains', 'type': 'instance of', 'tail': 'Chain'}


# Split spans: from long text to KB

In [56]:
class SpankB():
    def __init__(self):
        self.relations = []

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head",
                                                     "type",
                                                     "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r1):
        r2 = [r for r in self.relations
              if self.are_relations_equal(r1, r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def add_relation(self, r):
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

    def save_csv(self,file_name):
        print(f"Saving to file {file_name}")
        reln_df = pd.DataFrame(self.relations)
        reln_df.to_csv(file_name,index=False)

In [57]:
def from_text_to_kb(text,
                    span_length=50,
                    verbose=False):
    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * span_length - num_tokens) /
                        max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        spans_boundaries.append([start + span_length * i,
                                 start + span_length * (i + 1)])
        start -= overlap
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = SpankB()
    i = 0
    for sentence_pred in decoded_preds:
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_model_output(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                "spans": [spans_boundaries[current_span_index]]
            }
            kb.add_relation(relation)
        i += 1

    return kb

In [58]:
halo_text = """The Didact, born Shadow-of-Sundered-Star,[3] is a Forerunner Promethean who held an extremely high status in the Forerunner society as protector of the ecumene,[9] head of the Warrior-Servant rate and supreme commander of the entire Forerunner military.[10] He wholeheartedly believed in the Mantle the Forerunners held to protect life, and fervently opposed the Halo Array as a sin beyond measure. He was also the lover and husband of the Librarian.

Originally thought to have been killed by the Master Builder, Faber,[11] the Didact effectively existed as two individuals during the final days of the Forerunner-Flood war; his original self, as well as his implanted consciousness within a young Forerunner known as Bornstellar Makes Eternal Lasting. To differentiate these two incarnations, the original Didact was referred to as the Ur-Didact, while his other incarnation was known as the IsoDidact.[12] The Ur-Didact was eventually exiled on Requiem, not to be awakened until 2557;[13] meanwhile, the IsoDidact served until the final days of the Flood conflict and was responsible for activating the Halo Array.[14]

After being released from his Cryptum many millennia later, the Ur-Didact resumed his war against humanity. Spartan John-117 and Cortana fought him on his ship, and sent him falling into slipspace, seemingly defeating him. After surviving slipspace, John-117 faced him again, this time with the rest of Blue Team. After a long and perilous battle, the Ur-Didact's biological form was destroyed under the combined power of several Composers. While he is considered "contained" by the Office of Naval Intelligence, his digitized consciousness apparently survives.[2] """

In [59]:
kb = from_text_to_kb(halo_text,
                     verbose=True)

Input has 375 tokens
Input has 8 spans
Span boundaries are [[0, 50], [46, 96], [92, 142], [138, 188], [184, 234], [230, 280], [276, 326], [322, 372]]


In [60]:
kb.print()

Relations:
  {'head': 'Didact', 'type': 'instance of', 'tail': 'Promethean', 'meta': {'spans': [[0, 50]]}}
  {'head': 'Didact', 'type': 'instance of', 'tail': 'Forerunner Promethean', 'meta': {'spans': [[0, 50]]}}
  {'head': 'Forerunner', 'type': 'subclass of', 'tail': 'Promethean', 'meta': {'spans': [[0, 50]]}}
  {'head': 'Warrior-Servant', 'type': 'instance of', 'tail': 'Mantle', 'meta': {'spans': [[46, 96]]}}
  {'head': 'Warrior-Servant', 'type': 'instance of', 'tail': 'rate', 'meta': {'spans': [[46, 96]]}}
  {'head': 'Warrior-Servant rate', 'type': 'instance of', 'tail': 'Mantle', 'meta': {'spans': [[46, 96]]}}
  {'head': 'Didact', 'type': 'spouse', 'tail': 'Librarian', 'meta': {'spans': [[92, 142]]}}
  {'head': 'Librarian', 'type': 'spouse', 'tail': 'Didact', 'meta': {'spans': [[92, 142]]}}
  {'head': 'Faber', 'type': 'occupation', 'tail': 'Master Builder', 'meta': {'spans': [[92, 142]]}}
  {'head': 'Bornstellar Makes Eternal Lasting', 'type': 'instance of', 'tail': 'Forerunner', 

In [61]:
kb.save_csv("relations.csv")

Saving to file relations.csv
