# Similarity search

# Setup

In [1]:
# Python version: print the interpreter version string this notebook runs on
import sys 
print(sys.version)

In [2]:
# Environment variables
from dotenv import load_dotenv
import yaml
import os

# Load variables from a .env file into the process environment
# (PINECONE_API_KEY is read from os.environ in the Pinecone section)
load_dotenv()

True

In [None]:
# Torch config
from torch import cuda, bfloat16, float16
import torch

# Torch options: turn off the memory-efficient and flash scaled-dot-product
# attention backends.
# NOTE(review): presumably required by the GPU used here — confirm.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
# Jupyter extensions: enable the ipywidgets notebook extension through a shell command
# NOTE(review): `jupyter nbextension` exists only in the classic Notebook front end — confirm the target environment.
!jupyter nbextension enable --py widgetsnbextension

# Parameters

We load the required parameters from the `config.yaml` file.

In [None]:
# Parse the notebook settings from the YAML configuration file into `config`
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Use optimum: flag read from the config; it selects between the optimum.nvidia
# and the transformers imports/model loading further down
use_optimum = config["use_optimum"]

# Show
use_optimum

# Reference

- https://colab.research.google.com/drive/1rt318Ew-5dDw21YZx2zK2vnxbsuDAchH?usp=sharing#scrollTo=YFw8HWIyTCnJ
- https://www.reddit.com/r/LocalLLaMA/comments/16j624z/some_questions_of_implementing_llm_to_generate_qa
- https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1
- https://medium.com/@saurabhgssingh/why-your-rag-is-not-working-96053b4d5305
- https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
- https://andriymulyar.com/blog/how-to-visualize-pinecone-vector-database
- https://github.com/edumunozsala/question-answering-pinecone-sts
- https://www.pinecone.io/learn/hybrid-search-intro/

# Directory

In [3]:
# Set directory to file location
from pathlib import Path
import sys

# Absolute form of the current directory, used as the notebook's working directory
notebook_location = Path(os.path.abspath(os.curdir))
os.chdir(notebook_location)

# Get the current working directory
current_directory = os.getcwd()
current_directory

'/notebooks/LawGPT'

# Libraries

In [4]:
# Standard library
import gc
import json
import os
import time
from typing import List

# General
from IPython.display import Markdown, display
import gradio as gr
import pandas as pd
import pinecone
import yaml

# HuggingFace
from huggingface_hub import notebook_login

# Transformers
from transformers import BitsAndBytesConfig
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
import accelerate

# Langchain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chains import SimpleSequentialChain, RetrievalQA, LLMChain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
from langchain import HuggingFacePipeline
from langchain import PromptTemplate

# Pinecone
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

# Optimization
import xformers

# Other
from tqdm.notebook import tqdm

# Atlas
from nomic import atlas
import nomic

# Local (kept last so its star-import keeps the same precedence as before)
from functions import *

In [None]:
# The tokenizer always comes from transformers; only the model class and the
# pipeline factory depend on whether optimum is enabled.
from transformers import AutoTokenizer

if use_optimum:
    from optimum.nvidia import AutoModelForCausalLM
    from optimum.nvidia.pipelines import pipeline
else:
    from transformers import AutoModelForCausalLM, pipeline

In [None]:
# Warnings: install an "ignore" filter so every warning category is silenced
# for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# Device

In [7]:
# Pick the GPU when one is available, otherwise fall back to the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print('Using device:', device)
print()

# CUDA information
if has_cuda:
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor converting the byte counts below to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # Memory not reserved by the caching allocator
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:  ', allocated_memory),
        ('Cached:     ', cached_memory),
        ('Available: ', available_memory),
        ('Total:     ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

Using device: cuda

Quadro P5000
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [8]:
# Clean memory: release cached CUDA memory held by torch, then run the Python
# garbage collector (the displayed value is gc.collect()'s return value)
torch.cuda.empty_cache()
gc.collect()

248

# Pinecone

In [9]:
# Init pinecone client with the API key taken from the environment
# NOTE(review): this rebinds the name `pinecone`, imported as a module in the
# Libraries cell, to a client instance.
pinecone = Pinecone(api_key = os.environ.get('PINECONE_API_KEY'))

# Connect to the index by name
index_name = 'lawgpt-unstructured-db'
index = pinecone.Index(index_name)

# Index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.03524,
 'namespaces': {'': {'vector_count': 3524}},
 'total_vector_count': 3524}

# Embedding model

In [11]:
# Embedding model ID, read from the config
embed_model_id = config["embedding_model"]

# Show
embed_model_id

.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [12]:
# Embed model: LangChain wrapper around the configured embedding model.
# The same `device` is passed both for loading the model and for encoding;
# texts are encoded in batches of 32.
embed_model = HuggingFaceEmbeddings(
    model_name = embed_model_id,
    model_kwargs = {'device': device},
    encode_kwargs = {'device': device, 'batch_size': 32}
) 

# Show
embed_model

Quadro P5000
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [None]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor converting the byte counts below to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # Memory not reserved by the caching allocator
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:  ', allocated_memory),
        ('Cached:     ', cached_memory),
        ('Available: ', available_memory),
        ('Total:     ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

# Prepared Data

In [20]:
# Load the prepared CSV and collect the distinct text IDs it contains.
# `pd` comes from the `import pandas as pd` line in the Libraries cell; no
# visible cell imported pandas before, so this cell depended on the name
# leaking in from elsewhere.
folder_path = "prepared_data/"
file_name = "splitted_input_core.csv"
file_path = f'{folder_path}{file_name}'

# Read csv
df_txt = pd.read_csv(file_path)

# Store the IDs as strings
df_txt['text_id'] = df_txt['text_id'].astype(str)

# Unique IDs, as a plain Python list
unique_ids = df_txt['text_id'].unique().tolist()

In [None]:
# Number of distinct text IDs found in the prepared data
len(unique_ids)

# Parameters

In [21]:
# Quantization flag, read from the config; it decides whether a BitsAndBytes
# configuration is built for the model
use_quantization = config["use_quantization"]

# Show
use_quantization

In [None]:
# Core model ID, read from the config; used to load the tokenizer and the model
model_id = config['core_model']

# Show
model_id

In [None]:
# Prompt used to expand the user query, read from the config
expand_prompt = config['expand_prompt']

# Show
expand_prompt

# User Query

In [22]:
# Query: the raw user question (in Spanish) that drives the rest of the notebook
initial_query = 'Explícame el Artículo 245 del Código Penal de España referente a ocupaciones ilegales de bienes inmuebles'

# Show
initial_query

# Enhance query

In [23]:
# Tokenizer for the core model.
# Uses the `AutoTokenizer` name imported in the conditional import cell. No
# visible cell imports the bare `transformers` module, so the former
# `transformers.AutoTokenizer` reference depended on that name leaking in from
# elsewhere.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [24]:
# Set BNB configuration if quantization is enabled, otherwise leave it as None.
# Uses the `BitsAndBytesConfig` name imported in the Libraries cell rather than
# going through a `transformers` module name that no visible cell imports.
if use_quantization:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
else:
    bnb_config = None

[{'context': 'Ley Orgánica 8/1983, de 25 de junio, de Reforma Urgente y Parcial del Código Penal.: E1 párrafo 1. queda así redactado: <Los delitos prescriben a los veinte años cuando la Ley señalare al delito la pena de reclusión mayor>. Artículo 115. El apartado 1. queda así redactado: <Las de reclusión mayor a los treinta y cinco años>. Artículo 120. Queda así redactado: <El español que indujere a una potencia extranjera a declarar la guerra a España o se concertase con ella para el mismo fin, será castigado con la pena de reclusión mayor>. Artículo 137 bis. Queda redactado así: <Los que, con propósito de destruir, total o parcialmente, a un grupo nacional étnico, racial o religioso perpetraren alguno de los actos siguientes, serán castigados: 1. Con la pena de reclusión mayor si causaren la muerte castración, esterilización, mutilación o lesión grave a alguno de sus miembros. 2. Con la reclusión menor, si sometieren al grupo o a cualquiera de sus individuos a condiciones de existenc

In [None]:
# Set model
# `AutoModelForCausalLM` is the class bound by the conditional import cell:
# the optimum.nvidia implementation when use_optimum is set, the transformers
# one otherwise. Both branches load `model_id`, the same checkpoint the
# tokenizer was loaded from, so tokenizer and model cannot diverge.
if use_optimum:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        quantization_config = bnb_config,
        device_map="auto",
        use_fp8=True  # only passed on the optimum path
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        quantization_config = bnb_config,
        device_map="auto"
    )

In [None]:
# Pipeline
# The result is stored under the name `pipeline` because later cells pass it on
# under that name, which replaces the imported `pipeline` factory in the
# notebook namespace. Importing the factory here under a private alias keeps
# this cell re-runnable: a second run no longer calls the pipeline object built
# by the first one.
if use_optimum:
    from optimum.nvidia.pipelines import pipeline as _build_pipeline
else:
    from transformers import pipeline as _build_pipeline

# Text-generation pipeline; the generation settings come from the config
pipeline = _build_pipeline(
    model = model,
    tokenizer = tokenizer,
    task = 'text-generation',
    model_kwargs = {"torch_dtype": torch.bfloat16},
    return_full_text = config["return_full_text"],
    max_new_tokens = config["max_new_tokens"],
    repetition_penalty = config["repetition_penalty"],
    temperature = config["temperature"],
    pad_token_id = tokenizer.eos_token_id  # pad with the end-of-sequence token
)

In [None]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor converting the byte counts below to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # Memory not reserved by the caching allocator
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

In [None]:
# Run generate_summary with the expansion prompt and the user query.
# NOTE(review): generate_summary is not defined in this notebook; presumably it
# comes from `from functions import *` — confirm its return type there.
expanded_query = generate_summary(pipeline, tokenizer, expand_prompt, initial_query)

In [None]:
# Show the expanded query produced by the model
expanded_query

# Final query

In [None]:
# Final query
# The variant that appends the expanded query is disabled below, so the search
# currently runs on the raw user query only and `expanded_query` is not used.
# final_query = initial_query + ': ' + expanded_query
final_query = initial_query

# Show
final_query

# Find closest docs

In [None]:
# Text field passed to the vector store
# NOTE(review): `text_field` is reassigned to 'context' in the Re-Ranking
# section, so re-running this cell after that one requires this line too.
text_field = "text"  

# Vector store built from the Pinecone index, the embedding model and the text field
vectorstore = PineconeVectorStore(index, embed_model, text_field)  

# Show
vectorstore

In [None]:
# Similarity output: the top-k matches for the final query, with k taken from
# the config; the next cell unpacks each result as a (document, score) pair
similarity_output = vectorstore.similarity_search_with_score(final_query, k = config['top_k_docs'])

In [None]:
# similarity_output is iterated as (document, score) pairs; each pair is
# flattened into one dictionary.
# Metadata fields copied from every document ('' when a field is missing)
metadata_fields = (
    "date",
    "department",
    "legislative_origin",
    "rang",
    "text_id",
    "title",
)

context_processed = [
    {
        **{field: doc.metadata.get(field, '') for field in metadata_fields},
        "context": doc.page_content,
        "score": score,
    }
    for doc, score in similarity_output
]

# Show
context_processed[0:3]

# Re-Ranking

In [None]:
# Re-ranking model ID, read from the config
reranking_model = config["reranking_model"]

# Show
reranking_model 

In [None]:
# Number of re-ranked documents to keep, read from the config
top_reranked_docs = config["top_reranked_docs"]

# Show
top_reranked_docs

In [None]:
# Cross encoder instantiated from the configured re-ranking model ID
cross_encoder = CrossEncoder(reranking_model)

In [None]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor converting the byte counts below to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # Memory not reserved by the caching allocator
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

In [None]:
# Re-ranking: score the retrieved contexts against the final query.
# `text_field` is rebound here to 'context', the key that holds the text in
# each context_processed dictionary.
# NOTE(review): rank_documents is not defined in this notebook; presumably it
# comes from `from functions import *` — the next cell treats its result as a dict.
text_field = 'context'
ranked_context = rank_documents(cross_encoder, text_field, final_query, context_processed)

In [None]:
# Sort the ranked items by key (ascending) and keep the first top_reranked_docs
# NOTE(review): this is only a "best first" ordering if rank_documents keys its
# result by rank position; if the keys are scores, ascending order keeps the
# lowest ones — confirm against rank_documents.
sorted_ranked_context = dict(sorted(ranked_context.items())[:top_reranked_docs])

In [None]:
# Format: drop the keys and keep only the values as a list
# (the name is rebound from a dict to a list here)
sorted_ranked_context = list(sorted_ranked_context.values())

In [None]:
# Show the first three re-ranked contexts
sorted_ranked_context[0:3]

# Get max docs

In [None]:
# Token budget for the selected contexts, read from the config
max_model_tokens = config["max_model_tokens"]

# Show
max_model_tokens

In [None]:
# Running total of tokens across the contexts examined so far
cumulative_tokens = 0

# Contexts that fit inside the token budget
filtered_context = []

# Walk the re-ranked contexts in order and keep them while the budget holds
for candidate in sorted_ranked_context:
    cumulative_tokens += count_tokens(candidate['context'])

    # Stop at the first context that makes the total reach the budget
    if cumulative_tokens >= max_model_tokens:
        break

    filtered_context.append(candidate)

# Show
filtered_context[0:3]

In [None]:
# Add up the token counts of every context kept in filtered_context
total_tokens = 0
for kept in filtered_context:
    total_tokens += count_tokens(kept['context'])

print("Total tokens for all contexts in filtered_context:", total_tokens)

In [None]:
# Number of contexts that fit inside the token budget
len(filtered_context)

# Clean

In [None]:
# CUDA information
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3  # divisor converting the byte counts below to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # Memory not reserved by the caching allocator
    available_memory = total_memory - cached_memory
    for label, amount in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(amount, 1), 'GB')

In [None]:
# Clean memory: release cached CUDA memory held by torch, then run the Python
# garbage collector
torch.cuda.empty_cache()
gc.collect()