# Deploy Model

# Setup

In [1]:
# %pip install -r requirements.txt

In [2]:
# Environment Variables
import os
from dotenv import load_dotenv

# Load env before the os.environ.get(...) lookups made in later cells
# (HF_KEY, PINECONE_API_KEY, PINECONE_ENVIRONMENT).
# The bare call is the last expression, so its return value is what the cell displays.
load_dotenv()

True

# Reference

https://www.youtube.com/watch?v=XctooiH0moI&ab_channel=IBMTechnology

https://www.youtube.com/watch?v=h5wLuVDr0oc&ab_channel=AssemblyAI

https://www.youtube.com/watch?v=sBhK-2K9bUc&ab_channel=CodingIsFun

https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks

# Directory

In [3]:
# Anchor the working directory on the folder the kernel is currently running in
from pathlib import Path
import sys
notebook_location = Path.cwd()
os.chdir(notebook_location)
# Keep the resulting working directory as a string and show it as the cell output
current_directory = os.getcwd()
current_directory

'/notebooks/LawGPT'

# Libraries

In [4]:
# General
from IPython.display import Markdown, display
import gradio as gr
import pinecone
import yaml
import time
import json

import gc
import os

# HuggingFace
from huggingface_hub import notebook_login

# Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import transformers

# Langchain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chains import SimpleSequentialChain, RetrievalQA, LLMChain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
from langchain import HuggingFacePipeline
from langchain import PromptTemplate

# Torch
from torch import cuda, bfloat16, float16
import torch

# Other
from tqdm.notebook import tqdm

# Local
from functions import *

# Warnings
import warnings
warnings.filterwarnings("ignore")

# Platform login

Use credentials from HuggingFace

In [5]:
# HuggingFace key read from the HF_KEY environment variable (None when it is not set)
hf_key = os.getenv('HF_KEY')

In [6]:
# Jupyter / Colab
# notebook_login()

# VS Code
# Run huggingface-cli login in console

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device, end = '\n\n')

# On a GPU, report the name of device 0 and its allocated / reserved memory
# (bytes divided by 1024**3, rounded to one decimal)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 2**30, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 2**30, 1), 'GB')

Using device: cuda

NVIDIA RTX A4000
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [8]:
# Clean memory
# Collect garbage first: tensors that are only kept alive by reference cycles
# are not handed back to PyTorch's caching allocator until they are collected,
# so emptying the cache before collecting would leave their blocks reserved.
unreachable_objects = gc.collect()
torch.cuda.empty_cache()

# Keep showing the number of collected objects as the cell output
unreachable_objects

248

# Pinecone

In [9]:
# Initialise the Pinecone client with the key and environment taken from env vars
pinecone.init(
    environment = os.getenv('PINECONE_ENVIRONMENT'),
    api_key = os.getenv('PINECONE_API_KEY')
)

# Connect to the index by name
index_name = 'lawgpt-unstructured-db'
index = pinecone.Index(index_name)

# Show the index statistics as the cell output
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.05653,
 'namespaces': {'': {'vector_count': 5653}},
 'total_vector_count': 5653}

# Parameters

In [10]:
# Load parameters from YAML file
# The encoding is given explicitly: open() otherwise decodes with the
# platform's locale encoding, which would garble any non-ASCII text in the
# config (the prompts are presumably Spanish) on systems whose default is not UTF-8.
with open('config.yaml', 'r', encoding = 'utf-8') as file:
    config = yaml.safe_load(file)

# Embedding model

In [11]:
# Model ID: embedding model name read from the "embedding_model" key of the config
embed_model_id = config["embedding_model"]

# Embed model: the torch device selected earlier is passed in both model_kwargs
# and encode_kwargs, together with a batch size of 32 for encoding
embed_model = HuggingFaceEmbeddings(
    model_name = embed_model_id,
    model_kwargs = {'device': device},
    encode_kwargs = {'device': device, 'batch_size': 32}
) 

# Load LLM model

In [12]:
# Checkpoint to load, as named under the "model" key of the config
model_id = config["model"]

# Quantization settings: 4-bit NF4, double quantization, bfloat16 compute dtype
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype = torch.bfloat16,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = 'nf4',
    load_in_4bit = True
)

# Tokenizer belonging to the same checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Causal language model, loaded with the quantization settings defined above
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    trust_remote_code = True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 15.73 GiB total capacity; 3.64 GiB already allocated; 3.12 MiB free; 3.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# GPU report: name of device 0 plus its allocated / reserved memory
# (bytes divided by 1024**3, rounded to one decimal)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 2**30, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 2**30, 1), 'GB')

# Prompt

In [None]:
# Get pre-prompt: text from the config that opens both prompt templates built below
pre_prompt = config["pre_prompt"]

# Create prompt context: text from the config placed right after the pre-prompt in those templates
prompt_context = config["prompt_context"]

In [None]:
# General template: pre-prompt and prompt context followed by the
# {context} and {query} placeholders
prompt_template = "".join((
    pre_prompt,
    prompt_context,
    "A continuación se proporciona el contexto: {context}",
    " ",
    "pregunta: {query}"
))

In [None]:
# Mistral template: same content as the general template, wrapped in
# [INST] ... [/INST] markers
mistral_template = "".join((
    "[INST]",
    pre_prompt,
    prompt_context,
    "A continuación se proporciona el contexto: {context}",
    " ",
    "pregunta: {query}",
    "[/INST]"
))

In [None]:
# Final template
# Model ids containing "mistral" get the [INST] ... [/INST] wrapped template;
# every other model uses the general one. The general template is stored in
# `prompt_template` (no `general_template` variable is defined), so that is
# the name the fallback branch has to read to avoid a NameError.
if "mistral" in model_id.lower():
    final_template = mistral_template
else:
    final_template = prompt_template

# Prompt Template: the two placeholders present in both templates
prompt = PromptTemplate(
    template = final_template,
    input_variables = ["context", "query"]
)

# LLM Pipeline

In [None]:
# Text-generation pipeline; the generation settings are read from the YAML config
generate_text = transformers.pipeline(
    task = 'text-generation',
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
    **{
        setting: config[setting]
        for setting in ("temperature", "repetition_penalty", "return_full_text", "max_new_tokens")
    }
)

# Wrap the pipeline so LangChain can use it as an LLM
llm = HuggingFacePipeline(pipeline = generate_text)

# Chain that pairs the prompt template with the LLM
llm_chain = LLMChain(prompt = prompt, llm = llm)

# Vector store

In [None]:
# Field in metadata with text
text_field = 'text'

# Initiate langchain vectorstore from the Pinecone index opened earlier.
# Arguments are positional: the index, the embedding model's embed_query
# method, and the metadata field name defined above.
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

# Prepare UI

In [None]:
# Placeholder text for the chat textbox
placeholder = '¿Qué diferencias existen entre los dos tipos delictivos que el código penal regula en el artículo 245?'

# Sample questions passed to the chat interface as examples
examples_list = [
    '¿En qué consiste la irretroactividad de las normas y como se aplica en el derecho penal?',
    '¿Un español que ha cometido un asesinato en otro país puede ser juzgado por un tribunal español?',
    'Quiero recurrir una sentencia dictada por la audiencia nacional, ¿hacia qué órgano debemos dirigirlo?',
]

In [None]:
# Querying
def querying(query, history):
    """Answer a chat message with the retrieval-augmented LLM.

    Parameters
    ----------
    query : str
        The message the user just submitted.
    history : list
        Earlier turns supplied by ``gr.ChatInterface``; assumed to be
        ``[user_message, bot_message]`` pairs (TODO confirm against the
        installed Gradio version).

    Returns
    -------
    str
        The chain's answer with surrounding whitespace stripped.
    """
    # The chain and its memory are rebuilt on every call, so the memory must be
    # refilled from the history Gradio passes in; otherwise every question is
    # answered as if it were the first one of the conversation.
    memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
    for user_message, bot_message in history or []:
        if user_message:
            memory.chat_memory.add_user_message(user_message)
        if bot_message:
            memory.chat_memory.add_ai_message(bot_message)

    # The notebook's prompt is the answering prompt, not the question-condensing
    # one (which is formatted with "chat_history" and "question"), so it is
    # handed to the document-combining step instead. That step receives the
    # user message under "question", hence the placeholder rename.
    qa_prompt = PromptTemplate(
        template = prompt.template.replace("{query}", "{question}"),
        input_variables = ["context", "question"]
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm = llm,
        retriever = vectorstore.as_retriever(search_kwargs = {"k": config['top_k_docs']}),
        memory = memory,
        combine_docs_chain_kwargs = {"prompt": qa_prompt},
    )
    result = qa_chain({"question": query})
    return result["answer"].strip()

In [None]:
# Interface: chat UI that sends each message to `querying`, with Spanish button labels
iface = gr.ChatInterface(
    fn = querying,
    title = 'LawGPT',
    theme = 'soft',
    chatbot = gr.Chatbot(height = 600),
    textbox = gr.Textbox(placeholder = placeholder, container = False, scale = 7),
    examples = examples_list,
    submit_btn = 'Enviar',
    retry_btn = 'Repetir',
    undo_btn = 'Deshacer',
    clear_btn = 'Borrar'
)

In [None]:
# Launch the chat interface built above
# NOTE(review): share = True presumably asks Gradio for a publicly reachable link — confirm that exposing the app is intended
iface.launch(share = True)