# Load the Rebel model and tokenizer

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
from docx import Document
import os
import pickle
import subprocess
import re
import pickle
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the Rebel model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

corenlp_directory = 'stanford-corenlp-4.5.7'
temp_txt_files_directory = 'temp_txt_files'



# Define a class to manage the knowledge base (KB)

In [4]:
# Define a class to manage the knowledge base (KB)
class KB:
    def __init__(self):
        self.entities = {}
        self.relations = []
        self.sources = {}

    def merge_with_kb(self, kb2):
        for r in kb2.relations:
            self.add_relation(r)

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        r1 = [r for r in self.relations if self.are_relations_equal(r2, r)][0]
        spans_to_add = [span for span in r2["meta"]["spans"]
                        if span not in r1["meta"]["spans"]]
        r1["meta"]["spans"] += spans_to_add

    def add_entity(self, e):
        self.entities[e["title"]] = {k: v for k, v in e.items() if k != "title"}

    def add_relation(self, r):
        candidate_entities = [r["head"], r["tail"]]
        entities = [{"title": ent} for ent in candidate_entities]

        for e in entities:
            self.add_entity(e)

        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")


# Function to extract relations from model output
def extract_relations_from_text(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return relations










# Function to process a single document file
def process_document(file_path):
    try:
        doc = Document(file_path)
        
        # Extract text
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)

        # Print text type
        print(f"Text type: {type(text)}")  # Should be <class 'str'>

        # Convert text to KB
        kb = from_text_to_kb(text, span_length=128, verbose=False)
        return kb

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None


# Function to convert text to KB (Knowledge Base)


def from_text_to_kb(text, span_length=128, verbose=False, extend_kb=None):
    
    # Tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")
    
    # Compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * span_length - num_tokens) / max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        end = start + span_length
        # Ensure end does not exceed the number of tokens
        if end > num_tokens:
            end = num_tokens
        spans_boundaries.append([start, end])
        start = end - overlap
        # Ensure start does not go negative
        if start < 0:
            start = 0
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # Transform input with spans
    tensor_ids = []
    tensor_masks = []
    for boundary in spans_boundaries:
        start, end = boundary
        if start < num_tokens and end <= num_tokens:
            tensor_ids.append(inputs["input_ids"][0][start:end])
            tensor_masks.append(inputs["attention_mask"][0][start:end])

    # Check if we have tensors to stack
    if not tensor_ids or not tensor_masks:
        raise ValueError("No valid spans found to create tensors.")

    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # Generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 512,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(**inputs, **gen_kwargs)
    
    # Decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # Create KB
    if extend_kb is None:
        kb = KB()
    else:
        kb = extend_kb

    i = 0
    for sentence_pred in decoded_preds:
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_text(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                "spans": [spans_boundaries[current_span_index]]
            }
            kb.add_relation(relation)
        i += 1

    return kb




# Function to save KB to a pickle file

In [5]:
# Function to save KB to a pickle file
def save_kb(kb, filename):
    save_folder = "Saved_Knowledge/"
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Print entities and relations before saving

    print("Relations before saving:", kb.relations)
    
    with open(os.path.join(save_folder, filename), "wb") as f:
        pickle.dump(kb, f)
    print(f"KB saved to '{filename}'")

# Main script to process all .docx files in the current directory
def main():
    # Update this path to the directory wheremarie your .docx files are stored
    directory = 'Marie Curie doc/'  # Replace with your directory path

    for filename in os.listdir(directory):
        if filename.endswith('.docx'):
            file_path = os.path.join(directory, filename)
            kb = process_document(file_path)
            if kb:
                save_kb(kb, f"{filename.split('.')[0]}.pkl")
                print(f"Processed '{filename}' and saved KB.")


if __name__ == "__main__":
    main()


Token indices sequence length is longer than the specified maximum sequence length for this model (1384 > 1024). Running this sequence through the model will result in indexing errors


Text type: <class 'str'>
Relations before saving: [{'head': 'Marie Curie', 'type': 'spouse', 'tail': 'Pierre Curie', 'meta': {'spans': [[0, 128]]}}, {'head': 'Pierre Curie', 'type': 'spouse', 'tail': 'Marie Curie', 'meta': {'spans': [[0, 128]]}}, {'head': 'Kingdom of Poland', 'type': 'capital', 'tail': 'Warsaw', 'meta': {'spans': [[125, 253]]}}, {'head': 'Kingdom of Poland', 'type': 'country', 'tail': 'Russian Empire', 'meta': {'spans': [[125, 253]]}}, {'head': 'Kingdom of Poland', 'type': 'located in the administrative territorial entity', 'tail': 'Russian Empire', 'meta': {'spans': [[125, 253]]}}, {'head': 'Pierre Curie', 'type': 'place of death', 'tail': 'Paris', 'meta': {'spans': [[250, 378]]}}, {'head': 'Henri Becquerel', 'type': 'place of death', 'tail': 'Paris', 'meta': {'spans': [[250, 378]]}}, {'head': 'Pierre Curie', 'type': 'place of death', 'tail': 'Paris street accident', 'meta': {'spans': [[250, 378]]}}, {'head': 'Henri Becquerel', 'type': 'place of death', 'tail': 'Paris

In [6]:
import stanza

# Initialize the Stanford NLP pipeline
stanza_nlp = stanza.Pipeline('en')

def extract_relations_with_stanza(text):
    doc = stanza_nlp(text)
    relations = []
    
    # Extract entities
    entities = {}
    for sentence in doc.sentences:
        for entity in sentence.ents:
            entities[entity.text] = entity.type
    
    # Extract relations using dependency parsing
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.deprel == 'nsubj':  # Subject
                subject = word.text
                relation = next((w for w in sentence.words if w.head == word.id), None)
                if relation:
                    relation_type = relation.deprel
                    obj = next((w for w in sentence.words if w.head == relation.id), None)
                    if obj:
                        relations.append({
                            'head': subject,
                            'type': relation_type,
                            'tail': obj.text,
                            'meta': {'spans': []}  # Placeholder for span data
                        })
    
    return relations

def extend_kb_with_stanza(kb, text):
    relations = extract_relations_with_stanza(text)
    for relation in relations:
        kb.add_relation(relation)
    return kb

2024-08-29 11:26:25 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-08-29 11:26:25 INFO: Downloaded file to C:\Users\ASUS\stanza_resources\resources.json
2024-08-29 11:26:26 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-08-29 11:26:26 INFO: Using device: cpu
2024-08-29 11:26:26 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-08-29 11:26:26 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-08-29 11:26:26 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load

In [7]:
def load_kb(file_path):
    # Assuming your KB is stored as a pickle file
    with open(file_path, 'rb') as f:
        kb = pickle.load(f)
    return kb

def saved_kb(kb, file_path):
    # Save the extended KB as a pickle file
    with open(file_path, 'wb') as f:
        pickle.dump(kb, f)

for i in range(1, 9):  # Assuming you have 8 text files
    # Load the text file from the temp_txt_files directory
    file_path = f"temp_txt_files/Marie Curie {i}.txt"
    with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
        text = file.read()
        
        # Load the corresponding knowledge base (KB) from the saved_knowledge directory
        kb_path = f"Saved_Knowledge/Marie Curie {i}.pkl"
        kb = load_kb(kb_path)
        
        # Extend the KB using the Stanza processing method
        kb = extend_kb_with_stanza(kb, text)
        
        # Save the extended KB back to the saved_knowledge directory as a new .pkl file
        extended_kb_path = f"Saved_Knowledge/Marie Curie_Extended {i}.pkl"
        saved_kb(kb, extended_kb_path)
        
        # Optional: Print the KB (or any other desired output)
        print(f"KB {i} has been extended and saved to {extended_kb_path}.")

KB 1 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 1.pkl.
KB 2 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 2.pkl.
KB 3 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 3.pkl.
KB 4 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 4.pkl.
KB 5 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 5.pkl.
KB 6 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 6.pkl.
KB 7 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 7.pkl.
KB 8 has been extended and saved to Saved_Knowledge/Marie Curie_Extended 8.pkl.
