In [1]:
!pip install transformers wikipedia neo4j kuzu langchain > /dev/null

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
import IPython
import pandas as pd

# Load the REBEL model

In [3]:
# Load the tokenizer and the seq2seq model from one checkpoint name so the two
# cannot drift apart when the checkpoint is changed.
REBEL_CHECKPOINT = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)

Downloading (…)okenizer_config.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json: 0.00B [00:00, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [4]:
# Single-sentence sample used below to try out the model and the relation parser.
test_text = "This notebook shows how to use LLMs to provide a natural language interface to Kùzu database."

In [5]:
# Tokenize the sample text into PyTorch tensors (inputs are truncated at 512 tokens)
model_inputs = tokenizer(
    test_text,
    max_length=512,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

print(f"Num tokens: {len(model_inputs['input_ids'][0])}")

# Beam search with 5 beams, returning 4 candidate sequences for the input
gen_kwargs = dict(
    max_length=216,
    length_penalty=0,
    num_beams=5,
    num_return_sequences=4,
)
generated_tokens = model.generate(**model_inputs, **gen_kwargs)

# Special tokens are kept on purpose: the <triplet>/<subj>/<obj> markers are
# what the relation parser splits on.
decoded_preds = tokenizer.batch_decode(
    generated_tokens,
    skip_special_tokens=False,
)

decoded_preds

Num tokens: 23


['<s><triplet> LLMs <subj> natural language interface <obj> use</s>',
 '<s><triplet> LLMs <subj> natural language <obj> use</s><pad>',
 '<s><triplet> LLM <subj> natural language interface <obj> use</s>',
 '<s><triplet> LLMs <subj> natural language <obj> subclass of</s>']

In [6]:
def extract_relations_from_model_output(text):
    """Parse one decoded model prediction into a list of relation dicts.

    The parser walks the whitespace-separated tokens of ``text`` and switches
    state on three markers:

    * ``<triplet>`` - the following tokens form the head entity;
    * ``<subj>``    - the following tokens form the tail entity;
    * ``<obj>``     - the following tokens form the relation type.

    Several ``<subj> tail <obj> type`` groups may follow one ``<triplet> head``;
    each group yields one relation with that same head.

    Args:
        text: Decoded sequence that still contains the special tokens
            (``<s>``, ``</s>`` and ``<pad>`` are stripped here).

    Returns:
        A list of ``{'head': str, 'type': str, 'tail': str}`` dicts in order of
        appearance. Only complete triples are returned: a group that lacks a
        head, a type or a tail is dropped instead of being emitted with an
        empty field or with a value left over from the previous group.
    """
    relations = []
    subject, relation, object_ = '', '', ''
    # 'x' = no marker seen yet; tokens in that state are ignored.
    current = 'x'

    def flush():
        """Append the triple collected so far, but only if it is complete."""
        head, type_, tail = subject.strip(), relation.strip(), object_.strip()
        if head and type_ and tail:
            relations.append({'head': head, 'type': type_, 'tail': tail})

    cleaned = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    # Pad the markers so they are recognised even when the decoder glued them
    # to the neighbouring word.
    for marker in ("<triplet>", "<subj>", "<obj>"):
        cleaned = cleaned.replace(marker, f" {marker} ")

    for token in cleaned.split():
        if token == "<triplet>":
            # A new head starts: emit the pending triple and start from scratch
            # so nothing from the previous triple can leak into the next one.
            flush()
            subject, relation, object_ = '', '', ''
            current = 't'
        elif token == "<subj>":
            # A new tail for the current head: emit the pending triple and
            # forget its type, otherwise a tail without its own type would be
            # reported with the previous one.
            flush()
            relation, object_ = '', ''
            current = 's'
        elif token == "<obj>":
            relation = ''
            current = 'o'
        elif current == 't':
            subject += ' ' + token
        elif current == 's':
            object_ += ' ' + token
        elif current == 'o':
            relation += ' ' + token

    # Emit the last triple of the sequence.
    flush()
    return relations

In [7]:
class KB:
    """Minimal knowledge base: an ordered list of unique relation dicts.

    A relation is a dict with at least the keys ``head``, ``type`` and
    ``tail``; two relations are duplicates when those three values match.
    """

    def __init__(self):
        # Relations in insertion order, without duplicates.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        for key in ("head", "type", "tail"):
            if r1[key] == r2[key]:
                continue
            return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        for stored in self.relations:
            if self.are_relations_equal(r1, stored):
                return True
        return False

    def add_relation(self, r):
        """Store ``r`` unless an equal relation is already present."""
        if self.exists_relation(r):
            return
        self.relations.append(r)

    def print(self):
        """Print every stored relation, one per line, under a header."""
        print("Relations:")
        for stored in self.relations:
            print(f"  {stored}")

In [8]:
def from_small_text_to_kb(text, verbose=False):
    """Extract relations from a short text with one model call.

    The text is tokenized as a single input (truncated at 512 tokens), three
    candidate sequences are generated with beam search, and every relation
    parsed from any candidate is collected, without duplicates, in a ``KB``.

    Args:
        text: The text to extract relations from.
        verbose: When True, print the number of input tokens.

    Returns:
        A ``KB`` holding the extracted relations.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    if verbose:
        print(f"Num tokens: {len(encoded['input_ids'][0])}")

    # Beam search: 3 beams, all 3 returned as candidate linearizations.
    beam_outputs = model.generate(
        **encoded,
        max_length=216,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=3,
    )

    # Special tokens are kept because the parser relies on the markers.
    predictions = tokenizer.batch_decode(beam_outputs,
                                         skip_special_tokens=False)

    kb = KB()
    for prediction in predictions:
        for relation in extract_relations_from_model_output(prediction):
            kb.add_relation(relation)
    return kb

In [9]:
# Run the short-text pipeline on the sample sentence.
small_kb = from_small_text_to_kb(test_text)

In [10]:
# Show the de-duplicated relations collected from all returned candidates.
small_kb.print()

Relations:
  {'head': 'LLMs', 'type': 'use', 'tail': 'natural language interface'}
  {'head': 'LLMs', 'type': 'use', 'tail': 'natural language'}
  {'head': 'LLM', 'type': 'use', 'tail': 'natural language interface'}


In [11]:
# A slightly longer, multi-sentence passage that still fits in one model call.
text = """If you are just getting started,
and you have relatively simple apis,
you should get started with chains.
Chains are a sequence of predetermined steps,
so they are good to get started with as they
give you more control and let you understand
what is happening better."""

# verbose=True also prints the token count of the passage.
kb = from_small_text_to_kb(text, verbose=True)
kb.print()

Num tokens: 62
Relations:
  {'head': 'Get started with chains', 'type': 'subclass of', 'tail': 'Chain'}
  {'head': 'Get started with chains', 'type': 'uses', 'tail': 'Chain'}
  {'head': 'Get started with chains', 'type': 'instance of', 'tail': 'Chain'}


# Split spans: from long text to KB

In [12]:
class SpankB:
    """Knowledge base whose relations remember the text spans they came from.

    A relation is a dict with the keys ``head``, ``type``, ``tail`` and
    ``meta``, where ``meta["spans"]`` lists the spans in which the relation
    was found. Adding a relation that is already stored merges its spans into
    the stored one instead of creating a duplicate.
    """

    def __init__(self):
        # Relations in insertion order, unique by (head, type, tail).
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        for key in ("head", "type", "tail"):
            if r1[key] == r2[key]:
                continue
            return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        for stored in self.relations:
            if self.are_relations_equal(r1, stored):
                return True
        return False

    def merge_relations(self, r1):
        """Add the spans of ``r1`` to the first stored relation equal to it.

        Spans already recorded on the stored relation are not added again.
        Raises ``IndexError`` when no equal relation is stored.
        """
        matches = [stored for stored in self.relations
                   if self.are_relations_equal(r1, stored)]
        existing = matches[0]
        known_spans = existing["meta"]["spans"]
        new_spans = [span for span in r1["meta"]["spans"]
                     if span not in known_spans]
        existing["meta"]["spans"] += new_spans

    def add_relation(self, r):
        """Store ``r``, or merge its spans when an equal relation exists."""
        if self.exists_relation(r):
            self.merge_relations(r)
        else:
            self.relations.append(r)

    def print(self):
        """Print every stored relation, one per line, under a header."""
        print("Relations:")
        for stored in self.relations:
            print(f"  {stored}")

    def save_csv(self, file_name):
        """Write the relations to ``file_name`` as CSV, one row per relation."""
        print(f"Saving to file {file_name}")
        pd.DataFrame(self.relations).to_csv(file_name, index=False)

In [13]:
def _compute_span_boundaries(num_tokens, span_length):
    """Return ``[start, end]`` token windows of ``span_length`` covering the input.

    The window starts are spread evenly between 0 and
    ``num_tokens - span_length``, so the last window always ends exactly at
    ``num_tokens`` and no trailing token is left out. An input that fits in
    one window yields the single window ``[0, span_length]``.
    """
    num_spans = max(math.ceil(num_tokens / span_length), 1)
    if num_spans == 1:
        return [[0, span_length]]
    last_start = num_tokens - span_length
    starts = [(i * last_start) // (num_spans - 1) for i in range(num_spans)]
    return [[start, start + span_length] for start in starts]


def from_text_to_kb(text,
                    span_length=50,
                    verbose=False):
    """Extract relations from a long text by running the model on token spans.

    The whole text is tokenized once, cut into overlapping windows of
    ``span_length`` tokens, and all windows are passed to the model as one
    batch. Every relation parsed from the generated sequences is stored in a
    ``SpankB`` together with the boundaries of the window it came from.

    Args:
        text: The text to extract relations from.
        span_length: Number of tokens per window; must be positive.
        verbose: When True, print the token count, the number of windows and
            the window boundaries.

    Returns:
        A ``SpankB`` whose relations carry ``meta["spans"]``, a list of
        ``[start, end]`` token boundaries.

    Raises:
        ValueError: If ``span_length`` is not positive.
    """
    if span_length <= 0:
        raise ValueError("span_length must be a positive integer")

    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")
    input_ids = inputs["input_ids"][0]
    attention_mask = inputs["attention_mask"][0]

    # compute span boundaries
    num_tokens = len(input_ids)
    if verbose:
        print(f"Input has {num_tokens} tokens")
    spans_boundaries = _compute_span_boundaries(num_tokens, span_length)
    if verbose:
        print(f"Input has {len(spans_boundaries)} spans")
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans: one batch row per window
    span_inputs = {
        "input_ids": torch.stack([input_ids[start:end]
                                  for start, end in spans_boundaries]),
        "attention_mask": torch.stack([attention_mask[start:end]
                                       for start, end in spans_boundaries])
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **span_inputs,
        **gen_kwargs,
    )

    # decode relations (special tokens are kept: the parser relies on them)
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb: the predictions are treated as consecutive groups of
    # num_return_sequences per window, so integer division recovers the
    # window a prediction belongs to
    kb = SpankB()
    for i, sentence_pred in enumerate(decoded_preds):
        span = spans_boundaries[i // num_return_sequences]
        for relation in extract_relations_from_model_output(sentence_pred):
            # copy the boundary so relations do not share one list object
            relation["meta"] = {"spans": [list(span)]}
            kb.add_relation(relation)

    return kb

In [14]:
# Multi-paragraph news excerpt, long enough to be split into several spans.
# Source: https://www.freethink.com/science/prescribed-and-cultural-burns
cal_text = """in an effort to curb the wildfires that regularly engulf the state, California is now fighting fire with … well, you know.

So far in 2023, only 6,700 acres of Golden State land has burned, but fire season has barely begun — since 2017, the state has been averaging 1.8 million acres burned per year, overwhelmingly concentrated in the back half of the year.

To deal with the rising threats from fire and smoke, the state is sponsoring a program to encourage more prescribed burns and cultural burns. The “Prescribed Fire Liability Claims Fund Pilot” will allocate $20 million to cover losses that may occur if a prescribed or cultural burn ends up slipping control.

Setting fires intentionally may sound like a paradoxical approach to fire control, but if they are done correctly, controlled burns reduce the fuel available for wildfires, helping to prevent the catastrophically large, hot fires that overwhelm firefighting resources. They also are beneficial for a healthy ecosystem."""

In [15]:
# Default span_length (50 tokens); verbose=True prints the span layout.
kb = from_text_to_kb(cal_text, verbose=True)

Input has 202 tokens
Input has 5 spans
Span boundaries are [[0, 50], [38, 88], [76, 126], [114, 164], [152, 202]]


Source of the sample text above: https://www.freethink.com/science/prescribed-and-cultural-burns

In [16]:
# List the extracted relations together with the token spans they were found in.
kb.print()

Relations:
  {'head': 'regularly engulf the state', 'type': 'subclass of', 'tail': 'wildfires', 'meta': {'spans': [[0, 50]]}}
  {'head': 'wildfires regularly engulf the state', 'type': 'located in the administrative territorial entity', 'tail': 'California', 'meta': {'spans': [[0, 50]]}}
  {'head': 'fighting fire with … well, you know.', 'type': 'facet of', 'tail': 'wildfire', 'meta': {'spans': [[0, 50]]}}
  {'head': '2017', 'type': 'point in time', 'tail': '2017', 'meta': {'spans': [[38, 88]]}}
  {'head': 'fire season', 'type': 'point in time', 'tail': '2017', 'meta': {'spans': [[38, 88]]}}
  {'head': 'fire season has barely begun', 'type': 'point in time', 'tail': '2017', 'meta': {'spans': [[38, 88]]}}
  {'head': 'cultural burns', 'type': 'subclass of', 'tail': 'prescribed burns', 'meta': {'spans': [[76, 126]]}}
  {'head': 'cultural burn', 'type': 'subclass of', 'tail': 'prescribed burn', 'meta': {'spans': [[76, 126]]}}
  {'head': 'cultural burn', 'type': 'subclass of', 'tail': 'pres

In [17]:
# Persist the relations as CSV; the path is relative to the current working directory.
kb.save_csv("relations.csv")

Saving to file relations.csv
