# Hybrid RAG model

### This notebook enhances the model and returns the result after the RAG Appication.

In [2]:
pip install gradio accelerate pinecone-client tqdm langchain-pinecone xformers nomic huggingface_hub stack urllib3==1.26.6 flash_attn langchain_pinecone python-dotenv langchain  tiktoken

Collecting gradio
  Using cached gradio-4.32.2-py3-none-any.whl.metadata (15 kB)
Collecting accelerate
  Using cached accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting langchain
  Using cached langchain-0.2.1-py3-none-any.whl.metadata (13 kB)
Collecting tqdm
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting langchain-community
  Using cached langchain_community-0.2.1-py3-none-any.whl.metadata (8.9 kB)
Collecting xformers
  Using cached xformers-0.0.26.post1.tar.gz (4.1 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting nomic
  Using cached nomic-3.0.29.tar.gz (44 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  P

ERROR: Could not find a version that satisfies the requirement stack (from versions: none)
ERROR: No matching distribution found for stack


In [3]:
pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install --upgrade transformers

In [None]:
pip install -U langchain-community

# Setup

In [4]:
# Python version
import sys 
print(sys.version)

3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]


In [5]:
# Environment Variables
from dotenv import load_dotenv
import yaml
import os
import tiktoken

# Load env
load_dotenv()

False


We need the following torch configuration:

In [6]:
# Torch config
from torch import cuda, bfloat16, float16
import torch

# Torch options
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Parameters

In [7]:
# Load parameters from YAML file
import os

# Change the current working directory to the directory containing the YAML file
os.chdir('/notebooks/TFM_LAW_LLM')

# Load parameters from YAML file
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

In [None]:
# Use optimum
use_optimum = config["use_optimum"]

# Show
use_optimum

False

# Reference

- https://colab.research.google.com/drive/1rt318Ew-5dDw21YZx2zK2vnxbsuDAchH?usp=sharing#scrollTo=YFw8HWIyTCnJ
- https://www.reddit.com/r/LocalLLaMA/comments/16j624z/some_questions_of_implementing_llm_to_generate_qa
- https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1
- https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
- https://github.com/edumunozsala/question-answering-pinecone-sts
- https://medium.com/@pankaj_pandey/fine-tuning-rag-models-for-custom-content-generation-849d7ffce97f

# Directory

In [9]:
# Set directory to file location
from pathlib import Path
notebook_location = Path(os.path.abspath(''))
os.chdir(notebook_location)

# Get the current working directory
current_directory = os.getcwd()
current_directory

'C:\\Users\\polri\\Desktop\\Git_TFM\\TFM_LAW_LLM'

# Libraries

In [10]:
# Libraries for display and visualization
from IPython.display import Markdown, display
#import gradio as gr

# Libraries for managing data and serialization
#import pinecone
import json
import numpy as np

# General utility libraries
import gc
import time

# Libraries related to HuggingFace
from huggingface_hub import notebook_login

# Libraries related to Transformers
from transformers import BitsAndBytesConfig
from sentence_transformers import CrossEncoder
from typing import List
import accelerate

# Libraries related to Langchain
import langchain
from sentence_transformers import SentenceTransformer
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chains import SimpleSequentialChain, RetrievalQA, LLMChain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
from langchain import HuggingFacePipeline
from langchain import PromptTemplate

# Libraries related to Pinecone
from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore  
from pinecone import Pinecone

# Libraries related to optimization
#import xformers

# Other miscellaneous libraries
from tqdm.notebook import tqdm
#import nomic
#from nomic import atlas

# Local custom functions
from functions import *
from utils import *

In [11]:
# Warnings
import warnings
warnings.filterwarnings("ignore")

# Device

In [12]:
# Setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / (1024**3)  # Convert bytes to GB
    cached_memory = torch.cuda.memory_reserved(0) / (1024**3)  # Convert bytes to GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert bytes to GB
    available_memory = total_memory - cached_memory
    print('Allocated:   ', round(allocated_memory, 1), 'GB')
    print('Cached:      ', round(cached_memory, 1), 'GB')
    print('Available:  ', round(available_memory, 1), 'GB')
    print('Total:      ', round(total_memory, 1), 'GB')

Using device: cpu



In [13]:
# Clean memory
torch.cuda.empty_cache()
gc.collect()

0

# Pinecone

Let's get Pinecone vector store ready.

In [14]:
# Initialize pinecone
api_key = os.getenv('PINECONE_API_KEY')
index_name = config["index_name"]
pinecone = Pinecone(api_key = "api_key")

# Connect
index = pinecone.Index(index_name)

# Index stats
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1068}},
 'total_vector_count': 1068}

# Embedding model

In [15]:
# Model ID
embed_model_id = config["embed_model_id"]

# Show
embed_model_id

'dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn'

In [16]:
# Embed model
embed_model = HuggingFaceEmbeddings(
    model_name = embed_model_id,
    model_kwargs = {'device': device},
    encode_kwargs = {'device': device, 'batch_size': 32}
) 

# Show
embed_model

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
), model_name='dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn', cache_folder=None, model_kwargs={'device': device(type='cpu')}, encode_kwargs={'device': device(type='cpu'), 'batch_size': 32}, multi_process=False, show_progress=False)

In [17]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / (1024**3)  # Convert bytes to GB
    cached_memory = torch.cuda.memory_reserved(0) / (1024**3)  # Convert bytes to GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert bytes to GB
    available_memory = total_memory - cached_memory
    print('Allocated:   ', round(allocated_memory, 1), 'GB')
    print('Cached:      ', round(cached_memory, 1), 'GB')
    print('Available:  ', round(available_memory, 1), 'GB')
    print('Total:      ', round(total_memory, 1), 'GB')

# Load LLM model

In [18]:
# Model ID
use_quantization = config["use_quantization"]

# Show
use_quantization

True

In [20]:
import transformers
from transformers import AutoTokenizer

# Your Hugging Face API token
api_token = os.getenv('PINECONE_API_KEY')

# Model identifier
model_id = config["model"]

# Load tokenizer with authentication
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=api_token
)

In [21]:
# Set BNB configuration if quantization is enabled
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.bfloat16
) if use_quantization else None

In [22]:
# Set model
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=api_token,
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / (1024**3)  # Convert bytes to GB
    cached_memory = torch.cuda.memory_reserved(0) / (1024**3)  # Convert bytes to GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert bytes to GB
    available_memory = total_memory - cached_memory
    print('Allocated:   ', round(allocated_memory, 1), 'GB')
    print('Cached:      ', round(cached_memory, 1), 'GB')
    print('Available:  ', round(available_memory, 1), 'GB')
    print('Total:      ', round(total_memory, 1), 'GB')

# Command

We now import the pre_prompt and the prompt_context from the yaml file.

In [24]:
# Get pre-prompt
pre_prompt = config["pre_prompt"]

# Create prompt context
prompt_context = config["prompt_context"]

In [25]:
# General template
general_template = pre_prompt + prompt_context + "A continuación se proporciona el contexto: {context}" + " " + "pregunta: {query}"

In [26]:
# Mistral template
mistral_template = "<s>[INST]" + pre_prompt + prompt_context +  "A continuación se proporciona el contexto: [/INST] {context}" + "</s>" + "[INST] pregunta: {query} [/INST]"

In [27]:
# Google template
google_template = f"""
<start_of_turn>user
{pre_prompt}. {prompt_context} A continuación se proporciona el contexto: 
Contexto: {{context}} 
Pregunta: {{query}}
<end_of_turn>
<start_of_turn>model
Respuesta: """

In [28]:
# Define the final template selection logic
if "mistral" in model_id:
    template = mistral_template
    selected_template_message = "Mistral template selected."
elif "google" in model_id:
    template = google_template
    selected_template_message = "Google template selected."
else:
    template = general_template
    selected_template_message = "Default template selected."

# Print out the selected template message
print(selected_template_message)

Mistral template selected.


In [29]:
# Prompt Template
prompt = PromptTemplate(
    template = template, 
    input_variables = ["context", "query"]
)

We can now print the prompt.

In [30]:
# Show
prompt

PromptTemplate(input_variables=['context', 'query'], template='<s>[INST]Eres un asistente experto en derecho y leyes espaÃ±olas y tu objetivo es proporcionar respuestas exhaustivas y precisas a las preguntas planteadas por tus clientes.\nAsegÃºrate de basar tus respuestas en el contexto proporcionado, utilizando todas las leyes y normativas relevantes para fundamentar tus argumentos.\nEs crucial que todas las respuestas estÃ©n redactadas en espaÃ±ol y presentadas de forma clara y coherente.\nConsidera ofrecer ejemplos o casos hipotÃ©ticos para ilustrar tus puntos de vista.\nA continuaciÃ³n, se presenta la informaciÃ³n relevante que debes usar para responder a las consultas de los clientes. \nEn caso de no encontrar la respuesta, debes indicarlo de forma explÃ\xadcita.\nA continuación se proporciona el contexto: [/INST] {context}</s>[INST] pregunta: {query} [/INST]')

# LLM Pipeline

Let's define the LLM Pipeline.

In [31]:
# Define pipeline with parameters from config file
generate_text = transformers.pipeline(
    model = model,
    tokenizer = tokenizer,
    task = 'text-generation',
    return_full_text = config["return_full_text"],
    max_new_tokens = config["max_new_tokens"],
    repetition_penalty = config["repetition_penalty"],
    temperature = config["temperature"],
    pad_token_id = tokenizer.eos_token_id,
    batch_size = 1
)

# HF pipeline
llm = HuggingFacePipeline(pipeline = generate_text)

# Create llm chain 
llm_chain = LLMChain(llm = llm, prompt = prompt)

# Vector store

In [32]:
# Text field
text_field = "text"  

# Vector store
vectorstore = PineconeVectorStore(index, embed_model, text_field)  

# Show
vectorstore

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x24525829f70>

# Test models

In [33]:
# Simple context
context = "Eres una API con conocimientos legales. Debes responder a preguntas en Español. Si no conoces la respuesta, admítelo."

# Query
query = '¿Qué diferencia hay entre homicidio y asesinato según el Código Penal español, y cuáles son las penas asociadas a cada uno?'

# Find closer docs

We can now see the closer docs to the query and it's scores.

In [34]:
# Similarity output
similarity_output = vectorstore.similarity_search_with_score(query, k = config['top_k_docs'])

# Context preprocessed
context_processed = [{"context": doc.page_content, "score": score} for doc, score in similarity_output]

# Show
context_processed[0:3]

[{'context': 'a) Que la industria o actividad funcione clandestinamente, sin\nhaber obtenido la preceptiva autorización o aprobación\nadministrativa de sus instalaciones.\nb) Que se hayan desobedecido las órdenes expresas de la\nautoridad administrativa de corrección o suspensión de las\nactividades tipificadas en el artículo anterior.\nc)\n\nQue se haya falseado u ocultado información sobre los\naspectos ambientales de la misma.\n\nd) Que se haya obstaculizado la actividad inspectora de la\nAdministración.\ne) Que se haya producido un riesgo de deterioro irreversible o\ncatastrófico.\nf)\n\nQue se produzca una extracción ilegal de aguas en período\nde restricciones.',
  'score': 0.0985036716},
 {'context': 'por el delito conforme al apartado 1 del artículo 84 no serán\nrestituidos. Sin embargo, el juez o tribunal abonará a la pena\nlos pagos y la prestación de trabajos que hubieran sido realizados o cumplidos conforme a las medidas 2.ª y 3.ª\n4. En todos los casos anteriores, el juez 

# Re-ranking

In [35]:
# Model ID
reranking_model = config["reranking_model"]

# Show
reranking_model 

'cross-encoder/ms-marco-MiniLM-L-6-v2'

In [36]:
# Model ID
top_reranked_docs = config["top_reranked_docs"]

# Show
top_reranked_docs

15

In [37]:
# Extracting 'title' keys
final_context = [entry['context'] for entry in context_processed]

In [38]:
# Cross encoder
cross_encoder = CrossEncoder(reranking_model)

# Show
cross_encoder

<sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder at 0x24524491400>

In [39]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / (1024**3)  # Convert bytes to GB
    cached_memory = torch.cuda.memory_reserved(0) / (1024**3)  # Convert bytes to GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert bytes to GB
    available_memory = total_memory - cached_memory
    print('Allocated:   ', round(allocated_memory, 1), 'GB')
    print('Cached:      ', round(cached_memory, 1), 'GB')
    print('Available:  ', round(available_memory, 1), 'GB')
    print('Total:      ', round(total_memory, 1), 'GB')

In [41]:
# Re-ranking
text_field = 'context'
ranked_context = rank_documents(cross_encoder, text_field, query, context_processed)

In [42]:
# Format
sorted_ranked_context = dict(sorted(ranked_context.items())[:top_reranked_docs])
sorted_ranked_context = list(sorted_ranked_context.values())

sorted_ranked_context[0:3]

# Get max docs

In [45]:
# Model ID
max_model_tokens = config["max_model_tokens"]
max_model_tokens

5120

Calculate number of tokens for 'context' value and check if cumulative tokens are still less than max_model_tokens

In [47]:
cumulative_tokens = 0
filtered_context = []

for item in sorted_ranked_context:

    token_count = count_tokens(item['context'])
    cumulative_tokens += token_count
    
    # Check if cumulative tokens are still less than max_model_tokens
    if cumulative_tokens < max_model_tokens:
        filtered_context.append(item)
    else:
        break
        
filtered_context[0:3]

In [49]:
# Sum tokens for all contents in filtered_data
total_tokens = sum(count_tokens(item["context"]) for item in filtered_context)
print("Total tokens for all contexts in filtered_context:", total_tokens)

Total tokens for all contexts in filtered_context: 3501


In [50]:
len(filtered_context)

15

# Enhanced model

Now, let's use the RAG model.

In [51]:
# Filter contexts, keeping only the context strings
filtered_context_ready = [item["context"] for item in filtered_context]

In [52]:
# Call enhanced model
enhanced_model = llm_chain({"context": str(filtered_context_ready), "query": query})

KeyboardInterrupt: 

In [None]:
# Output
enhanced_result = enhanced_model['text'].strip()

# Markdown
display(Markdown(f"<b>{query}</b>"))
display(Markdown(f"<p>{enhanced_result}</p>"))

# Clean

In [None]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / (1024**3)  # Convert bytes to GB
    cached_memory = torch.cuda.memory_reserved(0) / (1024**3)  # Convert bytes to GB
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert bytes to GB
    available_memory = total_memory - cached_memory
    print('Allocated:   ', round(allocated_memory, 1), 'GB')
    print('Cached:      ', round(cached_memory, 1), 'GB')
    print('Available:  ', round(available_memory, 1), 'GB')
    print('Total:      ', round(total_memory, 1), 'GB')

In [None]:
# Clean memory
torch.cuda.empty_cache()
gc.collect()