# Deploy Model

# Setup

In [1]:
# Python version
# Print the interpreter version string so the runtime used for this run is recorded in the output
import sys 
print(sys.version)

In [2]:
# Environment Variables
from dotenv import load_dotenv
import yaml
import os

# Load env
# Calls python-dotenv's loader; later cells read PINECONE_API_KEY from os.environ.
# NOTE(review): presumably the key is supplied through a local .env file — confirm.
load_dotenv()

True

In [None]:
# Torch config
from torch import cuda, bfloat16, float16
import torch

# Torch options
# Turn off the memory-efficient and flash scaled-dot-product-attention backends.
# NOTE(review): the reason is not recorded here — presumably a compatibility
# workaround for the target GPU; confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Parameters

In [None]:
# Load parameters from YAML file
# The file is opened explicitly as UTF-8 so that non-ASCII prompt text is decoded
# the same way on every platform instead of depending on the locale default.
with open('config.yaml', 'r', encoding = 'utf-8') as file:
    config = yaml.safe_load(file)

In [None]:
# Use optimum
# Flag read from config.yaml. No other cell in the visible part of this notebook
# consumes it — NOTE(review): confirm whether it is still needed.
use_optimum = config["use_optimum"]

# Show
use_optimum

# Reference

- https://wandb.ai/tensorgirl/uncategorized/reports/Restaurant-menu-ordering-Chatbot-using-Gemma-Langchain-and-Chroma-DB--Vmlldzo2OTUxNjE3
- https://www.youtube.com/watch?v=6dyz2M_UWLw
- https://www.youtube.com/watch?v=XctooiH0moI&ab_channel=IBMTechnology
- https://www.youtube.com/watch?v=h5wLuVDr0oc&ab_channel=AssemblyAI
- https://www.youtube.com/watch?v=sBhK-2K9bUc&ab_channel=CodingIsFun
- https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks

# Directory

In [3]:
# Set directory to file location
from pathlib import Path
# NOTE(review): sys is already imported in the first cell; this import is redundant.
import sys
# NOTE(review): os.path.abspath('') is resolved against the current working directory,
# so the chdir below presumably leaves the directory unchanged — confirm the intent.
notebook_location = Path(os.path.abspath(''))
os.chdir(notebook_location)

# Get the current working directory
current_directory = os.getcwd()
# Last expression of the cell, so the path is shown as the cell output
current_directory

'/notebooks/LawGPT'

# Libraries

In [4]:
# Libraries for display and visualization
from IPython.display import Markdown, display
import gradio as gr

# Libraries for managing data and serialization
import yaml

# General utility libraries
import gc
import os

# Libraries related to HuggingFace
from huggingface_hub import notebook_login

# Libraries related to Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Libraries related to Langchain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import HumanMessage, AIMessage
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain import HuggingFacePipeline, PromptTemplate

# Libraries related to Pinecone
from langchain_pinecone import PineconeVectorStore  
from pinecone import Pinecone

# Other miscellaneous libraries
from tqdm.notebook import tqdm

# Local custom functions
from functions import *

In [None]:
# Warnings
import warnings
# Silences every warning category for the rest of the kernel session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Device

In [5]:
# Setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# CUDA information (printed only when a GPU is present; `device` is already "cuda" in that case)
if torch.cuda.is_available():
    bytes_per_gb = 1024**3  # Divisor used to convert bytes to GB
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_memory = torch.cuda.memory_allocated(0) / bytes_per_gb
    cached_memory = torch.cuda.memory_reserved(0) / bytes_per_gb
    total_memory = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    # "Available" is reported as the total capacity minus what PyTorch has reserved
    available_memory = total_memory - cached_memory
    print('Allocated:   ', round(allocated_memory, 1), 'GB')
    print('Cached:      ', round(cached_memory, 1), 'GB')
    print('Available:  ', round(available_memory, 1), 'GB')
    print('Total:      ', round(total_memory, 1), 'GB')

In [8]:
# Clean memory
# Ask PyTorch to release its cached CUDA memory, then run the Python garbage collector.
# NOTE(review): collecting garbage first would presumably let empty_cache() also
# return blocks held by just-collected objects — confirm whether the order matters here.
torch.cuda.empty_cache()
# Last expression of the cell, so the value returned by gc.collect() is displayed
gc.collect()

248

# Pinecone

In [9]:
# Init pinecone
# The API key is read from the environment; os.environ.get returns None when it is unset.
pinecone = Pinecone(api_key = os.environ.get('PINECONE_API_KEY'))

# Connect
# `index` is the handle later passed to PineconeVectorStore
index_name = 'lawgpt-unstructured-db'
index = pinecone.Index(index_name)

# Index stats
# Last expression of the cell, so the stats are displayed as the cell output
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.05653,
 'namespaces': {'': {'vector_count': 5653}},
 'total_vector_count': 5653}

# Parameters

In [10]:
# Load parameters from YAML file
# The file is opened explicitly as UTF-8 so that non-ASCII prompt text is decoded
# the same way on every platform instead of depending on the locale default.
with open('config.yaml', 'r', encoding = 'utf-8') as file:
    config = yaml.safe_load(file)

# Embedding model

In [11]:
# Identifier of the embedding model, taken from the configuration
embed_model_id = config["embedding_model"]

# Build the embedding model; loading and encoding both target the selected device
embed_model = HuggingFaceEmbeddings(
    model_name = embed_model_id,
    model_kwargs = dict(device = device),
    encode_kwargs = dict(device = device, batch_size = 32),
)

# Load LLM model

In [12]:
# Select model
model_id = config["model"]

# Show
model_id

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 15.73 GiB total capacity; 3.64 GiB already allocated; 3.12 MiB free; 3.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Quantization flag
# When truthy, a BitsAndBytes 4-bit configuration is built and passed to the model loader
use_quantization = config["use_quantization"]

# Show
use_quantization

In [None]:
# Tokenizer
# Uses the AutoTokenizer name imported in the libraries cell; the `transformers`
# module itself is never imported by this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    model_id
)

In [None]:
# Set BNB configuration if quantization is enabled
# Imported here because the libraries cell does not bring BitsAndBytesConfig
# (nor the `transformers` module itself) into scope.
from transformers import BitsAndBytesConfig

if use_quantization:
    # 4-bit NF4 quantization with double quantization; compute runs in bfloat16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
else:
    # No quantization: the model loader receives quantization_config=None
    bnb_config = None

In [None]:
# Set model
# Uses the AutoModelForCausalLM name imported in the libraries cell; the
# `transformers` module itself is never imported by this notebook.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to
# run locally — only use it with model repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code = True,
    quantization_config = bnb_config,  # None when quantization is disabled
    device_map = "auto"
)

In [None]:
# Report GPU name and memory figures (nothing is printed without CUDA)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    gib = 1024**3  # Divisor used to convert bytes to GB
    allocated_memory = torch.cuda.memory_allocated(0) / gib
    cached_memory = torch.cuda.memory_reserved(0) / gib
    total_memory = torch.cuda.get_device_properties(0).total_memory / gib
    available_memory = total_memory - cached_memory
    # One line per figure, each rounded to a single decimal
    for label, value in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(value, 1), 'GB')

# Command

We read `pre_prompt` and `prompt_context` from the YAML configuration file.

In [None]:
# Get pre-prompt
# First part of every prompt template built below
pre_prompt = config["pre_prompt"]

# Create prompt context
# Second part of every prompt template, placed right after the pre-prompt
prompt_context = config["prompt_context"]

In [None]:
# Default template: pre-prompt, prompt context, then the {context} and {query} placeholders
prompt_template = "".join([
    pre_prompt,
    prompt_context,
    "A continuación se proporciona el contexto: {context}",
    " ",
    "pregunta: {query}",
])

In [None]:
# Mistral variant: the same text as the default template wrapped in [INST] ... [/INST]
mistral_template = "".join([
    "[INST]",
    pre_prompt,
    prompt_context,
    "A continuación se proporciona el contexto: {context}",
    " ",
    "pregunta: {query}",
    "[/INST]",
])

In [None]:
# Google template
# Gemma-style chat markup: the user turn must be closed with <end_of_turn> before
# the model turn is opened, otherwise the model sees a single unterminated turn.
# {{context}} and {{query}} stay as literal placeholders for PromptTemplate.
google_template = f"""
<start_of_turn>user
{pre_prompt}. {prompt_context} A continuación se proporciona el contexto: 
Contexto: {{context}} 
Pregunta: {{query}}<end_of_turn>
<start_of_turn>model
Respuesta: """

In [None]:
# Define the final template selection logic
if "mistral" in model_id:
    template = mistral_template
    selected_template_message = "Mistral template selected."
elif "google" in model_id:
    template = google_template
    selected_template_message = "Google template selected."
else:
    template = general_template
    selected_template_message = "Default template selected."

# Print out the selected template message
print(selected_template_message)

In [None]:
# Prompt Template
# Wrap the selected template string; its placeholders are {context} and {query}
prompt = PromptTemplate(
    template = template, 
    input_variables = ["context", "query"]
)

In [None]:
# Show
# Display the PromptTemplate object as the cell output
prompt

# LLM Pipeline

In [None]:

# Define pipeline with parameters from the YAML config
# Uses the `pipeline` factory imported in the libraries cell; the `transformers`
# module itself is never imported by this notebook.
generate_text = pipeline(
    model = model,
    tokenizer = tokenizer,
    task = 'text-generation',
    return_full_text = config["return_full_text"],
    max_new_tokens = config["max_new_tokens"],
    repetition_penalty = config["repetition_penalty"],
    # NOTE(review): temperature presumably only takes effect when sampling is
    # enabled (do_sample=True) — confirm against the generation settings in use.
    temperature = config["temperature"],
    # Pad with the end-of-sequence token
    pad_token_id = tokenizer.eos_token_id
)

# HF pipeline
llm = HuggingFacePipeline(pipeline = generate_text)

# Create llm chain 
llm_chain = LLMChain(llm = llm, prompt = prompt)

# Vector store

In [None]:
# Metadata key handed to the vector store as its text key
text_field = "text"

# Wrap the Pinecone index and the embedding model in a LangChain vector store
vectorstore = PineconeVectorStore(
    index = index,
    embedding = embed_model,
    text_key = text_field,
)

# Display the store object as the cell output
vectorstore

# Prepare UI

In [None]:
# Placeholder question shown in the empty textbox
placeholder = (
    '¿Qué diferencias existen entre los dos tipos delictivos '
    'que el código penal regula en el artículo 245?'
)

# Sample questions offered as clickable examples in the chat UI
examples_list = [
    '¿En qué consiste la irretroactividad de las normas y como se aplica en el derecho penal?',
    '¿Un español que ha cometido un asesinato en otro país puede ser juzgado por un tribunal español?',
    'Quiero recurrir una sentencia dictada por la audiencia nacional, ¿hacia qué órgano debemos dirigirlo?',
]

In [None]:
# Querying
def querying(query, history):
    memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm = llm,
        retriever = vectorstore.as_retriever(search_kwargs= {"k": config['top_k_docs']}),
        memory = memory,
        condense_question_prompt = prompt,
    )
    result = qa_chain({"question": query})
    return result["answer"].strip()

In [None]:
# Interface widgets: the conversation pane and the question input box
chat_window = gr.Chatbot(height = 600)
question_box = gr.Textbox(placeholder = placeholder, container = False, scale = 7)

# Chat interface wired to the querying function, with Spanish button labels
iface = gr.ChatInterface(
    fn = querying,
    chatbot = chat_window,
    textbox = question_box,
    title = 'LawGPT',
    theme = 'soft',
    examples = examples_list,
    retry_btn = 'Repetir',
    undo_btn = 'Deshacer',
    clear_btn = 'Borrar',
    submit_btn = 'Enviar',
)

In [None]:
# Launch
# NOTE(review): share = True presumably publishes a public Gradio link to this
# app — confirm that exposing it outside the local machine is intended.
iface.launch(share = True)

# Clean

In [None]:
# Report GPU name and memory figures (nothing is printed without CUDA)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    gib = 1024**3  # Divisor used to convert bytes to GB
    allocated_memory = torch.cuda.memory_allocated(0) / gib
    cached_memory = torch.cuda.memory_reserved(0) / gib
    total_memory = torch.cuda.get_device_properties(0).total_memory / gib
    available_memory = total_memory - cached_memory
    # One line per figure, each rounded to a single decimal
    for label, value in (
        ('Allocated:   ', allocated_memory),
        ('Cached:      ', cached_memory),
        ('Available:  ', available_memory),
        ('Total:      ', total_memory),
    ):
        print(label, round(value, 1), 'GB')

In [None]:
# Clean memory
# Ask PyTorch to release its cached CUDA memory, then run the Python garbage collector.
# NOTE(review): the model and pipeline objects are still referenced by notebook
# globals at this point, so presumably little GPU memory is actually freed — confirm.
torch.cuda.empty_cache()
# Last expression of the cell, so the value returned by gc.collect() is displayed
gc.collect()