# Hybrid RAG model

# Setup

In [1]:
# Python version
# Print the interpreter version string so the runtime used for this run is recorded in the output.
import sys 
print(sys.version)

In [2]:
# Environment Variables
from dotenv import load_dotenv
import yaml
import os

# Load env
# Load variables from a .env file into the process environment; the Pinecone API key
# is read from os.environ further down in this notebook.
load_dotenv()

True

In [None]:
# Torch config
from torch import cuda, bfloat16, float16
import torch

# Torch options
# Disable the memory-efficient and flash backends for scaled dot-product attention.
# NOTE(review): presumably done because the target GPU lacks support for these kernels — confirm.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Parameters

In [None]:
# Load parameters from YAML file
# The file is read explicitly as UTF-8 so that non-ASCII prompt text in the config is
# decoded the same way on every platform (the default encoding is locale dependent).
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# An empty YAML document loads as None; fail here with a clear message instead of
# failing later on the first config[...] lookup.
if not isinstance(config, dict):
    raise ValueError("config.yaml must contain a top-level mapping of parameters")

In [None]:
# Use optimum
# Read the "use_optimum" flag from the loaded configuration.
use_optimum = config["use_optimum"]

# Show
use_optimum

# Reference

- https://colab.research.google.com/drive/1rt318Ew-5dDw21YZx2zK2vnxbsuDAchH?usp=sharing#scrollTo=YFw8HWIyTCnJ
- https://www.reddit.com/r/LocalLLaMA/comments/16j624z/some_questions_of_implementing_llm_to_generate_qa
- https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1
- https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
- https://github.com/edumunozsala/question-answering-pinecone-sts
- https://medium.com/@pankaj_pandey/fine-tuning-rag-models-for-custom-content-generation-849d7ffce97f

# Directory

In [3]:
# Resolve the working directory
# os.path.abspath('') is the current working directory, so this records the directory the
# kernel was started in and re-asserts it as the working directory.
from pathlib import Path
import sys
notebook_location = Path.cwd()
os.chdir(notebook_location)

# Show the directory that relative paths (e.g. config.yaml) are resolved against
current_directory = os.getcwd()
current_directory

'/notebooks/LawGPT'

# Libraries

In [4]:
# Libraries for display and visualization
from IPython.display import Markdown, display
import gradio as gr

# Libraries for managing data and serialization
import pinecone
import yaml
import json

# General utility libraries
import gc
import os
import time
from typing import List

# Libraries related to HuggingFace
from huggingface_hub import notebook_login

# Libraries related to Transformers
import transformers
from transformers import BitsAndBytesConfig
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
import accelerate

# Libraries related to Langchain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chains import (
    SimpleSequentialChain, 
    RetrievalQA, 
    LLMChain,
    RetrievalQAWithSourcesChain,
    ConversationalRetrievalChain
)
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
from langchain import HuggingFacePipeline
from langchain import PromptTemplate

# Libraries related to Pinecone
from langchain_pinecone import PineconeVectorStore  
from pinecone import Pinecone

# Libraries related to optimization
import xformers

# Other miscellaneous libraries
from tqdm.notebook import tqdm
from nomic import atlas
import nomic

# Local custom functions
from functions import *

In [None]:
# Warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
import warnings
warnings.filterwarnings("ignore")

# Device

In [5]:
# Setting device on GPU if available, else CPU
def print_cuda_memory(device_index: int = 0) -> None:
    """Print the name and memory usage (in GB) of one CUDA device.

    Args:
        device_index: Index of the CUDA device to report on. Defaults to 0.

    Returns:
        None. Prints nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1024**3
    allocated_memory = torch.cuda.memory_allocated(device_index) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(device_index) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(device_index).total_memory / bytes_per_gb
    # Memory not reserved by this process's PyTorch allocator
    available_memory = total_memory - cached_memory

    print(torch.cuda.get_device_name(device_index))
    print('Memory Usage:')
    print('Allocated:   ', round(allocated_memory, 1), 'GB')
    print('Cached:      ', round(cached_memory, 1), 'GB')
    print('Available:  ', round(available_memory, 1), 'GB')
    print('Total:      ', round(total_memory, 1), 'GB')


# Select the device once; it is reused when loading the embedding model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# CUDA information
print_cuda_memory()

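[REDACTED: a Hugging Face access token was printed in this cell output. Revoke and rotate the token, and clear outputs before sharing the notebook.]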


In [8]:
# Clean memory
# Run the garbage collector first so GPU memory held by unreachable Python objects goes back
# to PyTorch's caching allocator, then release the cached blocks.
collected_objects = gc.collect()
torch.cuda.empty_cache()

# Show the number of unreachable objects found by the collector
collected_objects

248

# Pinecone

Let's get Pinecone vector store ready.

In [9]:
# Init pinecone
# The client gets its own name so it does not shadow the imported `pinecone` module;
# the API key comes from the environment.
pinecone_client = Pinecone(api_key = os.environ.get('PINECONE_API_KEY'))

# Connect
index_name = 'lawgpt-unstructured-db'
index = pinecone_client.Index(index_name)

# Index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.03524,
 'namespaces': {'': {'vector_count': 3524}},
 'total_vector_count': 3524}

# Embedding model

In [11]:
# Model ID
# Identifier of the embedding model, read from the configuration.
embed_model_id = config["embedding_model"]

# Show
embed_model_id

.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [12]:
# Embed model
# The model is loaded on, and encodes on, the selected device; texts are encoded 32 per batch.
embedding_options = dict(
    model_name = embed_model_id,
    model_kwargs = {'device': device},
    encode_kwargs = {'device': device, 'batch_size': 32},
)
embed_model = HuggingFaceEmbeddings(**embedding_options)

# Show
embed_model

Quadro P5000
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [None]:
# CUDA information
# Report GPU memory for device 0; values are converted from bytes to GB.
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

# Load LLM model

In [13]:
# Quantization flag
# Read from the configuration; decides whether a BitsAndBytes config is built below.
use_quantization = config["use_quantization"]

# Show
use_quantization

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [14]:
# Select model
# Identifier of the LLM, read from the configuration; it is also used to pick the prompt template.
model_id = config["model"]

# Show
model_id

Quadro P5000
Memory Usage:
Allocated: 4.3 GB
Cached:    4.5 GB


In [None]:
# Tokenizer
# Load the tokenizer published under the same model ID as the LLM.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id
)

In [None]:
# Set BNB configuration if quantization is enabled
# 4-bit NF4 weights with double quantization. The compute dtype is bfloat16 only where the
# GPU reports support for it; otherwise it falls back to float16.
if use_quantization:
    bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = 'nf4',
        bnb_4bit_use_double_quant = True,
        bnb_4bit_compute_dtype = torch.bfloat16 if bf16_supported else torch.float16
    )
else:
    # No quantization: the model loader receives quantization_config = None
    bnb_config = None

In [None]:
# Set model
# Load the causal language model; bnb_config is None when quantization is disabled.
# NOTE(security): trust_remote_code = True allows Python code shipped in the model
# repository to be executed — only use model IDs from trusted sources.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code = True,
    quantization_config = bnb_config,
    device_map = "auto"
)

In [None]:

# CUDA information
# Report GPU memory for device 0; values are converted from bytes to GB.
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

# Command

We now import the pre_prompt and the prompt_context from the yaml file.

In [15]:
# Get pre-prompt
# Leading instruction text that every prompt template below starts with.
pre_prompt = config["pre_prompt"]

# Create prompt context
# Additional instruction text placed right after the pre-prompt in every template.
prompt_context = config["prompt_context"]

In [16]:
# General template
# Plain-text layout: instructions, then the retrieved context, then the question.
# {context} and {query} stay as placeholders for the PromptTemplate.
general_template = "".join([
    pre_prompt,
    prompt_context,
    "A continuación se proporciona el contexto: {context}",
    " ",
    "pregunta: {query}",
])

In [17]:
# Mistral template
# The instructions go in the first [INST] block, the context follows it, and the
# question goes in a second [INST] block.
mistral_template = "".join([
    "<s>[INST]",
    pre_prompt,
    prompt_context,
    "A continuación se proporciona el contexto: [/INST] {context}",
    "</s>",
    "[INST] pregunta: {query} [/INST]",
])

In [18]:
# Google template
# A single user turn holding instructions, context and question, followed by the opening
# of the model turn. Line breaks and trailing spaces are written out explicitly.
google_template = (
    "\n"
    "<start_of_turn>user\n"
    f"{pre_prompt}. {prompt_context} A continuación se proporciona el contexto: \n"
    "Contexto: {context} \n"
    "Pregunta: {query}\n"
    "<end_of_turn>\n"
    "<start_of_turn>model\n"
    "Respuesta: "
)

In [None]:
# Define the final template selection logic
# Match against a lower-cased model ID so that repositories spelled with capitals
# (e.g. ".../Mistral-7B-...") select their chat format instead of silently falling
# back to the default template.
model_id_lower = model_id.lower()
if "mistral" in model_id_lower:
    template = mistral_template
    selected_template_message = "Mistral template selected."
elif "google" in model_id_lower:
    template = google_template
    selected_template_message = "Google template selected."
else:
    template = general_template
    selected_template_message = "Default template selected."

# Print out the selected template message
print(selected_template_message)

In [None]:
# Prompt Template
# Wrap the selected template; "context" and "query" are the two placeholders every template defines.
prompt = PromptTemplate(
    template = template, 
    input_variables = ["context", "query"]
)

We can now print the prompt.

In [None]:
# Show
# Display the prompt object to check which template text was selected.
prompt

# LLM Pipeline

Let's define the LLM Pipeline.

In [19]:
# Define pipeline with parameters from config file
# Generation settings that come from the YAML configuration
# NOTE(review): temperature normally only takes effect when sampling is enabled — confirm
# the intended decoding mode.
generation_settings = {
    "return_full_text": config["return_full_text"],
    "max_new_tokens": config["max_new_tokens"],
    "repetition_penalty": config["repetition_penalty"],
    "temperature": config["temperature"],
}

# Text-generation pipeline; the EOS token is used as the padding token
generate_text = transformers.pipeline(
    task = 'text-generation',
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    batch_size = 1,
    **generation_settings
)

# HF pipeline
llm = HuggingFacePipeline(pipeline = generate_text)

# Create llm chain 
llm_chain = LLMChain(prompt = prompt, llm = llm)

# Vector store

In [20]:
# Text field
# Metadata key under which the document text is stored in the index
text_field = "text"

# Vector store
# Connect the Pinecone index and the embedding model through LangChain
vectorstore = PineconeVectorStore(
    index = index,
    embedding = embed_model,
    text_key = text_field
)

# Show
vectorstore

# Test models

In [21]:
# Simple context
# NOTE(review): `context` is not referenced by any later cell shown in this notebook;
# the prompt is filled with the retrieved documents instead — confirm whether it is still needed.
context = "Eres una API con conocimientos legales. Debes responder a preguntas en Español. Si no conoces la respuesta, admítelo."

# Query
# Test question used for retrieval, re-ranking and generation below.
query = 'Explícame el Artículo 245 del Código Penal de España referente a ocupaciones ilegales de bienes inmuebles'

# Find closer docs

We can now see the documents closest to the query and their scores.

In [22]:
# Similarity output
# Retrieve the configured number of nearest documents together with their scores
similarity_output = vectorstore.similarity_search_with_score(query, k = config['top_k_docs'])

# Context preprocessed
# Keep only the text and the score of each hit
context_processed = []
for retrieved_doc, retrieved_score in similarity_output:
    context_processed.append({"context": retrieved_doc.page_content, "score": retrieved_score})

# Show
context_processed[:3]

# Re-ranking

In [23]:
# Model ID
# Identifier of the model passed to CrossEncoder for re-ranking, read from the configuration.
reranking_model = config["reranking_model"]

# Show
reranking_model 

In [24]:
# Number of documents to keep after re-ranking, read from the configuration
top_reranked_docs = config["top_reranked_docs"]

# Show
top_reranked_docs

[{'context': 'Ley Orgánica 8/1983, de 25 de junio, de Reforma Urgente y Parcial del Código Penal.: E1 párrafo 1. queda así redactado: <Los delitos prescriben a los veinte años cuando la Ley señalare al delito la pena de reclusión mayor>. Artículo 115. El apartado 1. queda así redactado: <Las de reclusión mayor a los treinta y cinco años>. Artículo 120. Queda así redactado: <El español que indujere a una potencia extranjera a declarar la guerra a España o se concertase con ella para el mismo fin, será castigado con la pena de reclusión mayor>. Artículo 137 bis. Queda redactado así: <Los que, con propósito de destruir, total o parcialmente, a un grupo nacional étnico, racial o religioso perpetraren alguno de los actos siguientes, serán castigados: 1. Con la pena de reclusión mayor si causaren la muerte castración, esterilización, mutilación o lesión grave a alguno de sus miembros. 2. Con la reclusión menor, si sometieren al grupo o a cualquiera de sus individuos a condiciones de existenc

In [None]:
# Extract the 'context' text of every retrieved document
# NOTE(review): `final_context` is not used by the later cells shown in this notebook.
final_context = [entry['context'] for entry in context_processed]

In [None]:
# Cross encoder
# Load the re-ranking model selected in the configuration.
cross_encoder = CrossEncoder(reranking_model)

# Show
cross_encoder

In [None]:
# CUDA information
# Report GPU memory for device 0; values are converted from bytes to GB.
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

In [None]:
# Re-ranking
# `text_field` is re-assigned here: it now names the key holding the text in
# `context_processed` (it was "text" when the vector store was created).
# NOTE(review): rank_documents is not defined in this notebook — presumably it comes from
# the local `functions` module; the later cells treat its result as a dict.
text_field = 'context'
ranked_context = rank_documents(cross_encoder, text_field, query, context_processed)

In [None]:
# Sort
# Order the entries by key (ascending) and keep the first `top_reranked_docs` of them.
# NOTE(review): assumes the keys of `ranked_context` are rank positions with the best first — confirm.
ranked_items_by_key = sorted(ranked_context.items())
top_ranked_items = ranked_items_by_key[:top_reranked_docs]
sorted_ranked_context = dict(top_ranked_items)

In [None]:
# Format
# Keep only the documents (the dict values) as a list. The type check makes this cell safe
# to re-run: after the first run the name already refers to a list, which has no .values().
if isinstance(sorted_ranked_context, dict):
    sorted_ranked_context = list(sorted_ranked_context.values())

In [None]:
# Show
# Preview the first three re-ranked documents.
sorted_ranked_context[0:3]

# Get max docs

In [25]:
# Token budget for the retrieved context, read from the configuration
max_model_tokens = config["max_model_tokens"]

# Show
max_model_tokens

In [26]:
# Documents that fit in the token budget, in ranked order
filtered_context = []

# Running total of context tokens seen so far
cumulative_tokens = 0

# Walk the ranked documents and stop at the first one that reaches the budget
for ranked_item in sorted_ranked_context:
    cumulative_tokens += count_tokens(ranked_item['context'])

    # The running total includes the current document; it must stay strictly below the limit
    if cumulative_tokens >= max_model_tokens:
        break

    filtered_context.append(ranked_item)

In [27]:
# Show
# Preview the first three documents that fit in the token budget.
filtered_context[0:3]

<b>Explícame el Artículo 245 del Código Penal de España, que hace referencia a la usurpación de inmuebles.</b>

<p>Context: {
"context": {
"lema": "Código Penal de España",
"articulo": 245
},
"score": 0.9
}

Respuesta: El Artículo 245 del Código Penal de España regula la usurpación de inmuebles. Según este artículo, quien, sin título o derecho, ocupa, posee, disfruta o goza de algún inmueble, o lo haga por medio de terceros, cometerá un delito. La pena prevista para esta infracción puede variar desde una multa de seis meses a dos años, o incluso, hasta cinco años en determinados casos graves. Es importante señalar que para que se aplique este artículo, no es necesario que el inmueble pertenezca al Estado o a alguna entidad pública, sino que basta que el dueño legítimo lo esté desconociendo.

Referencias:
- Ley 31/2004, de 1 de octubre, del Código Penal.
- Boletín Oficial del Estado nº 278, de 6 de noviembre de 2004.
- Artículo 245 del Código Penal de España.</p>

In [None]:
# Add up the token counts of every context kept in filtered_context
total_tokens = 0
for kept_item in filtered_context:
    total_tokens += count_tokens(kept_item["context"])

# Print total tokens
print("Total tokens for all contexts in filtered_context:", total_tokens)

In [None]:
# Number of documents that fit in the token budget
len(filtered_context)

# Enhanced model

Now, let's use the RAG model.

In [28]:
# Filter contexts, keeping only the context strings
# The scores are dropped here; only the document text is passed on to the prompt.
filtered_context_ready = [item["context"] for item in filtered_context]

OutOfMemoryError: CUDA out of memory. Tried to allocate 8.62 GiB (GPU 0; 15.89 GiB total capacity; 9.59 GiB already allocated; 5.39 GiB free; 9.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Enhanced model
# Join the retrieved passages as plain text separated by blank lines. Calling str() on the
# list would put Python list syntax (brackets, quotes, escaped newlines) into the prompt.
context_text = "\n\n".join(filtered_context_ready)

# Run the chain; the result is a dict whose 'text' entry holds the generated answer
enhanced_model = llm_chain({"context": context_text, "query": query})

In [None]:
# Output
# Generated answer with surrounding whitespace removed
enhanced_result = enhanced_model['text'].strip()

# Markdown
# Render the question in bold, followed by the answer as a paragraph
question_markdown = Markdown(f"<b>{query}</b>")
answer_markdown = Markdown(f"<p>{enhanced_result}</p>")
display(question_markdown, answer_markdown)

# Clean

In [None]:
# CUDA information
# Report GPU memory for device 0; values are converted from bytes to GB.
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

In [None]:
# Clean memory
# Run the garbage collector first so GPU memory held by unreachable Python objects goes back
# to PyTorch's caching allocator, then release the cached blocks.
collected_objects = gc.collect()
torch.cuda.empty_cache()

# Show the number of unreachable objects found by the collector
collected_objects