# Raptor

# Setup

In [1]:
# Environment Variables
from dotenv import load_dotenv
import yaml
import os

# Load variables from a .env file into the process environment
# (the value returned by load_dotenv() is displayed as the cell output)
load_dotenv()

True

In [2]:
# Torch config
from torch import cuda, bfloat16, float16
import torch

# Torch options: turn off the memory-efficient and flash scaled-dot-product
# attention backends for CUDA.
# NOTE(review): the reason is not recorded in this notebook — presumably a
# compatibility workaround for the model/GPU stack; confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [3]:
# Enable the ipywidgets notebook extension (shell command run from the notebook)
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


# Parameters

In [4]:
# Load run parameters from the YAML configuration file.
# The encoding is given explicitly so the file is decoded as UTF-8 regardless of
# the platform's default locale encoding.
with open('config.yaml', 'r', encoding = 'utf-8') as file:
    # safe_load only builds plain Python objects (dicts, lists, scalars)
    config = yaml.safe_load(file)

In [5]:
# Flag read from the "use_optimum" entry of config.yaml
# NOTE(review): no other cell visible in this notebook reads use_optimum — confirm it is still needed
use_optimum = config["use_optimum"]

# Show the loaded value as the cell output
use_optimum

False

# Reference

- https://github.com/parthsarthi03/raptor/tree/master

# Directory

In [6]:
# Pin the working directory to the folder the kernel is currently running in.
# NOTE: this resolves the current working directory, not the notebook file's own
# path, so the chdir below only makes the location explicit.
from pathlib import Path
import sys

notebook_location = Path(os.getcwd())
os.chdir(notebook_location)

# Keep the working directory as a string and show it as the cell output
current_directory = os.getcwd()
current_directory

'/home/jovyan/workspace/LawGPT'

# Libraries

In [7]:
# General libraries
import pandas as pd
import numpy as np
import time
import yaml
import csv
import sys
import gc
import os

# Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import transformers
import accelerate

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from nltk.corpus import stopwords
import torch
import spacy
import nltk

# Raptor
from raptor import RetrievalAugmentation, BaseSummarizationModel, BaseQAModel, BaseEmbeddingModel, RetrievalAugmentationConfig

# Tokenizer
from nltk.tokenize import word_tokenize

# Optimization
import xformers

# Cloud
from google.cloud import storage

# Other
from tqdm.notebook import tqdm

# Local
from functions import *

2024-03-26 10:52:31,080 - Loading faiss with AVX2 support.
2024-03-26 10:52:31,171 - Successfully loaded faiss with AVX2 support.


In [8]:
# Warnings: suppress every warning raised from this point on
# (no category, message or module filter is given, so the rule matches all of them)
import warnings
warnings.filterwarnings("ignore")

In [9]:
# Record the wall-clock start time of the notebook run
# NOTE(review): no cell visible in this notebook reads start_time to report the elapsed time
start_time = time.time()

# Device

In [10]:
# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# GPU report for device 0; every figure is converted from bytes to GB
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')

    gb = 1024**3
    allocated_memory = torch.cuda.memory_allocated(0) / gb
    cached_memory = torch.cuda.memory_reserved(0) / gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / gb
    # "Available" is what is left after the memory reserved by the caching allocator
    available_memory = total_memory - cached_memory

    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

Using device: cuda

NVIDIA A10G
Memory Usage:
Allocated:    0.0 GB
Cached:       0.0 GB
Available:   22.0 GB
Total:       22.0 GB


# Load data

In [11]:
# CSV with the prepared input data (one row per text chunk, judging by the
# text_id values shown in the cell output)
file_path = 'prepared_data/splitted_input_base.csv'

# Read the data into a DataFrame
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which usually means
# the index was written to the CSV — consider index_col = 0 if that column is not wanted
df_txt = pd.read_csv(file_path)

# Show the first rows
df_txt.head()

Unnamed: 0,id,url,title,date,legislative_origin,department,rang,text_id,text
0,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk1,JUAN CARLOS I\nREY DE ESPAÑA\nA todos los que ...
1,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk2,EXPOSICION DE MOTIVOS\nSi se ha llegado a defi...
2,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk3,A partir de los distintos intentos de reforma ...
3,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk4,"El eje de dichos criterios ha sido, como es ló..."
4,1995-25444,https://www.boe.es/diario_boe/xml.php?id=BOE-A...,"Ley Orgánica 10/1995, de 23 de noviembre, del ...",1995-11-24,Estatal,Jefatura del Estado,Ley Orgánica,1995-25444_chunk5,"En segundo lugar, se ha afrontado la antinomia..."


# Parameters

In [12]:
# Load run parameters from the YAML configuration file.
# NOTE(review): config.yaml is already loaded in an earlier cell of this notebook;
# this reload is kept so the section can be run on its own.
# The encoding is given explicitly so the file is decoded as UTF-8 regardless of
# the platform's default locale encoding.
with open('config.yaml', 'r', encoding = 'utf-8') as file:
    # safe_load only builds plain Python objects (dicts, lists, scalars)
    config = yaml.safe_load(file)

# Custom model

## Quantization

In [13]:
# Quantization flag ("core_quantization" entry of config.yaml); it is passed to the
# summarization and QA model constructors as use_quantization
use_quantization = config["core_quantization"]

# Show the loaded value as the cell output
use_quantization

True

## Embedding model

In [14]:
# Embedding model ID ("embedding_model" entry of config.yaml)
embed_model_id = config["embedding_model"]

# Show the loaded value as the cell output
embed_model_id

'sentence-transformers/multi-qa-mpnet-base-cos-v1'

In [15]:
# Embedding class
class Embedding_Model(BaseEmbeddingModel):
    """Embedding backend for RAPTOR that delegates to a SentenceTransformer model."""

    def __init__(self, model_name):
        """
        Load the sentence-embedding model.

        Args:
            model_name: Model identifier handed to SentenceTransformer.
        """
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def create_embedding(self, text):
        """
        Encode `text` with the wrapped model.

        Args:
            text: Input forwarded unchanged to the model's `encode` method.

        Returns:
            Whatever `encode` returns for `text`.
        """
        embedding = self.model.encode(text)
        return embedding

## Summarization

In [16]:
# Core LLM ID ("core_model" entry of config.yaml) used by the summarization model
core_model_id = config['core_model']

# Show the loaded value as the cell output
core_model_id

'google/gemma-2b-it'

In [17]:
# Summarization class
class Summarization_Model(BaseSummarizationModel):
    """
    Summarization backend for RAPTOR built on a Hugging Face causal language model.

    The constructor loads the tokenizer and the model (optionally 4-bit quantized)
    and wraps them in a text-generation pipeline whose defaults come from the
    notebook-level `config` dictionary.
    """

    def __init__(self, model_name, use_quantization = True):
        """
        Load the tokenizer, the model and the generation pipeline.

        Args:
            model_name: Model identifier passed to `from_pretrained`.
            use_quantization: If True, load the weights in 4-bit NF4 with double
                quantization and bfloat16 compute; if False, no quantization
                config is passed.
        """

        # Tokenizer (also provides the chat template used in `summarize`)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if use_quantization:
            # Quantization
            self.bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit = True,
                bnb_4bit_quant_type = 'nf4',
                bnb_4bit_use_double_quant = True,
                bnb_4bit_compute_dtype = torch.bfloat16
            )
        else:
            self.bnb_config = None

        # Set model
        # SECURITY: trust_remote_code = True allows code shipped with the model
        # repository to run locally; only use it with trusted model IDs.
        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code = True,
            quantization_config = self.bnb_config,
            device_map = "auto"
        )

        # Create pipeline
        # NOTE(review): model_kwargs is normally consumed when the pipeline loads the
        # model itself; an already-loaded model is passed here — confirm whether this
        # entry has any effect.
        self.summarization_pipeline = transformers.pipeline(
            model = self.model,
            tokenizer = self.tokenizer,
            task = 'text-generation',
            model_kwargs = {"torch_dtype": torch.bfloat16},
            return_full_text = config["core_return_full_text"],
            max_new_tokens = config["core_max_new_tokens"],
            repetition_penalty = config["core_repetition_penalty"],
            temperature = config["core_temperature"],
            pad_token_id = self.tokenizer.eos_token_id
        )

    def summarize(self, context, max_tokens = None):
        """
        Summarize `context` with the language model.

        Args:
            context: Text to summarize.
            max_tokens: Optional cap on the number of newly generated tokens. The
                parameter mirrors the `summarize(context, max_tokens)` signature of
                RAPTOR's base class (verify against the installed raptor version);
                when omitted, config["core_max_new_tokens"] is used.

        Returns:
            The generated summary only (never the prompt), stripped of surrounding
            whitespace.
        """

        # Format the prompt for summarization
        messages = [
            {"role": "user", "content": f"Write a summary of the following, including as many key details as possible: {context}:"}
        ]

        # Render the chat template to a plain-text prompt
        prompt = self.tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)

        # Token budget: an explicit argument wins, otherwise use the config value
        max_new_tokens = config["core_max_new_tokens"] if max_tokens is None else max_tokens

        # Generate the summary using the pipeline.
        # return_full_text is forced to False here so the result never contains the
        # prompt, whatever config["core_return_full_text"] is set to.
        # NOTE(review): the sampling arguments below override the pipeline defaults,
        # so config["core_temperature"] is not the temperature actually used —
        # confirm which value is intended.
        outputs = self.summarization_pipeline(
            prompt,
            max_new_tokens = max_new_tokens,
            do_sample = True,
            temperature = 0.5,
            top_k = 50,
            top_p = 0.95,
            return_full_text = False
        )

        # Extract the generated summary
        summary = outputs[0]["generated_text"].strip()

        # Return
        return summary

## QA Model

In [18]:
# Core LLM ID for the QA model: the same "core_model" entry of config.yaml that
# the summarization section already read
core_model_id = config['core_model']

# Show the loaded value as the cell output
core_model_id

'google/gemma-2b-it'

In [19]:
# QA class
class QA_Model(BaseQAModel):
    """
    Question-answering backend for RAPTOR built on a Hugging Face causal language model.

    The constructor loads the tokenizer and the model (optionally 4-bit quantized)
    and wraps them in a text-generation pipeline whose defaults come from the
    notebook-level `config` dictionary.
    """

    def __init__(self, model_name, use_quantization = True):
        """
        Load the tokenizer, the model and the generation pipeline.

        Args:
            model_name: Model identifier passed to `from_pretrained`.
            use_quantization: If True, load the weights in 4-bit NF4 with double
                quantization and bfloat16 compute; if False, no quantization
                config is passed.
        """

        # Tokenizer (also provides the chat template used in `answer_question`)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if use_quantization:
            # Quantization
            self.bnb_config = transformers.BitsAndBytesConfig(
                load_in_4bit = True,
                bnb_4bit_quant_type = 'nf4',
                bnb_4bit_use_double_quant = True,
                bnb_4bit_compute_dtype = torch.bfloat16
            )
        else:
            self.bnb_config = None

        # Set model
        # SECURITY: trust_remote_code = True allows code shipped with the model
        # repository to run locally; only use it with trusted model IDs.
        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code = True,
            quantization_config = self.bnb_config,
            device_map = "auto"
        )

        # Create pipeline
        # NOTE(review): model_kwargs is normally consumed when the pipeline loads the
        # model itself; an already-loaded model is passed here — confirm whether this
        # entry has any effect.
        self.qa_pipeline = transformers.pipeline(
            model = self.model,
            tokenizer = self.tokenizer,
            task = 'text-generation',
            model_kwargs = {"torch_dtype": torch.bfloat16},
            return_full_text = config["core_return_full_text"],
            max_new_tokens = config["core_max_new_tokens"],
            repetition_penalty = config["core_repetition_penalty"],
            temperature = config["core_temperature"],
            pad_token_id = self.tokenizer.eos_token_id
        )

    def answer_question(self, context, question):
        """
        Answer `question` using `context` as supporting text.

        Args:
            context: Retrieved text placed in the prompt.
            question: Question to answer.

        Returns:
            The generated answer only (never the prompt).
        """

        # Apply the chat template for the context and question
        messages=[
              {"role": "user", "content": f"Dado el contexto: {context} Da la mejor respuesta completa entre las opciones a la pregunta {question}"}
        ]

        # Render the chat template to a plain-text prompt
        prompt = self.tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)

        # Generate the answer using the pipeline.
        # return_full_text is forced to False so the output holds only the newly
        # generated text. Slicing the output by len(prompt) is only correct when the
        # pipeline returns the full text, and would cut into the answer whenever
        # config["core_return_full_text"] is false.
        # NOTE(review): the sampling arguments below override the pipeline defaults,
        # so config["core_temperature"] is not the temperature actually used —
        # confirm which value is intended.
        outputs = self.qa_pipeline(
            prompt,
            max_new_tokens = config["core_max_new_tokens"],
            do_sample = True,
            temperature = 0.5,
            top_k = 50,
            top_p = 0.95,
            return_full_text = False
        )

        # Extract the generated answer
        answer = outputs[0]["generated_text"]

        # Return
        return answer

# Custom RAC

In [20]:
# Build the three custom components in the order embedding -> summarization -> QA
# NOTE(review): the summarization and QA classes each load the core model in their
# constructor, so two copies of it are created here
custom_embedder = Embedding_Model(embed_model_id)
custom_summarizer = Summarization_Model(core_model_id, use_quantization)
custom_qa = QA_Model(core_model_id, use_quantization)

# Bundle them into the custom RAPTOR configuration
RAC = RetrievalAugmentationConfig(
    embedding_model = custom_embedder,
    summarization_model = custom_summarizer,
    qa_model = custom_qa
)

# Show
RAC

2024-03-26 10:52:31,428 - Load pretrained SentenceTransformer: sentence-transformers/multi-qa-mpnet-base-cos-v1
2024-03-26 10:52:31,632 - Use pytorch device: cuda
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
2024-03-26 10:52:33,201 - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-03-26 10:52:36,173 - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<raptor.RetrievalAugmentation.RetrievalAugmentationConfig at 0x7f2683a12110>

In [21]:
# Create the retrieval-augmentation object from the custom configuration
RA = RetrievalAugmentation(config = RAC)

# Show the object as the cell output
RA

2024-03-26 10:52:38,320 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <__main__.Summarization_Model object at 0x7f2683ef9ff0>
            Embedding Models: {'EMB': <__main__.Embedding_Model object at 0x7f2683ef94e0>}
            Cluster Embedding Model: EMB
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2024-03-26 10:52:38,320 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Le

<raptor.RetrievalAugmentation.RetrievalAugmentation at 0x7f26837148b0>

In [22]:
# CUDA information (device 0, figures converted from bytes to GB)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Query each counter once so the printed and the derived figures agree
    allocated_memory = torch.cuda.memory_allocated(0) / 1024**3
    cached_memory = torch.cuda.memory_reserved(0) / 1024**3
    properties = torch.cuda.get_device_properties(0)
    total_memory = properties.total_memory / 1024**3
    # Same definition as the first and last memory reports in this notebook:
    # memory reserved by the caching allocator counts as used
    available_memory = total_memory - cached_memory
    print('Allocated:', round(allocated_memory, 1), 'GB')
    print('Cached:   ', round(cached_memory, 1), 'GB')
    print('Total:     ', round(total_memory, 1), 'GB')
    print('Available: ', round(available_memory, 1), 'GB')

NVIDIA A10G
Memory Usage:
Allocated: 3.9 GB
Cached:    3.9 GB
Total:      22.0 GB
Available:  18.1 GB


# Add documents

In [23]:
# Example run: read the whole sample document into memory.
# The encoding is given explicitly so the text is decoded as UTF-8 regardless of
# the platform's default locale encoding.
with open('raw_data/sample.txt', 'r', encoding = 'utf-8') as file:
    text = file.read()

In [None]:
# Add the sample text to the retrieval-augmentation object
# (the log printed by this cell shows the build starting with the leaf nodes)
RA.add_documents(text)

2024-03-26 10:52:38,346 - Creating Leaf Nodes


In [None]:
# CUDA information (device 0, figures converted from bytes to GB)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Query each counter once so the printed and the derived figures agree
    allocated_memory = torch.cuda.memory_allocated(0) / 1024**3
    cached_memory = torch.cuda.memory_reserved(0) / 1024**3
    properties = torch.cuda.get_device_properties(0)
    total_memory = properties.total_memory / 1024**3
    # Same definition as the first and last memory reports in this notebook:
    # memory reserved by the caching allocator counts as used
    available_memory = total_memory - cached_memory
    print('Allocated:', round(allocated_memory, 1), 'GB')
    print('Cached:   ', round(cached_memory, 1), 'GB')
    print('Total:     ', round(total_memory, 1), 'GB')
    print('Available: ', round(available_memory, 1), 'GB')

# Test model

In [None]:
# Question sent to the QA step
# (presumably about the sample text loaded from raw_data/sample.txt — confirm)
# Alternative legal-domain query, kept for reference:
# query = "Explícame el Artículo 245 del Código Penal de España"
query = "How did Cinderella reach her happy ending?"

In [None]:
# Get answer
answer = RA.answer_question(question = query)

# Show
print("Answer: ", answer)

# Save tree

In [None]:
# Path the retrieval-augmentation object is saved to
save_path = "raptor_dir/test"

# Save to that path
RA.save(save_path)

# Load tree

In [None]:
# Load tree
# The custom configuration is passed again: without `config` the object is built
# with the library's default configuration instead of the embedding, summarization
# and QA models defined in this notebook, which the saved tree was created with
RA = RetrievalAugmentation(config = RAC, tree = save_path)

# Show
RA

# Clean

In [None]:
# CUDA information (device 0, figures converted from bytes to GB)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')

    gb = 1024**3
    allocated_memory = torch.cuda.memory_allocated(0) / gb
    cached_memory = torch.cuda.memory_reserved(0) / gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / gb
    # "Available" is what is left after the memory reserved by the caching allocator
    available_memory = total_memory - cached_memory

    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

In [None]:
# Clean memory
# Run the garbage collector first so unreachable objects (and any tensors they
# hold) are destroyed, then release the cached GPU blocks that became free.
# Objects still referenced by notebook variables (e.g. RA, RAC) are not freed by this.
gc.collect()
torch.cuda.empty_cache()