<a href="https://colab.research.google.com/github/Squigspear/xCyberLLM/blob/main/Tutorial_2_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This tutorial showcases **Retrieval-Augmented Generation (RAG)** with an LLM, where a document (CCOP) is consulted whenever the model is queried.

It can be run using either:

  >The open-source Llama 2 7B chat model (5-bit quantized) *or*

  >Microsoft Azure OpenAI.

# Running LLAMA 2 on Colab - No UI

## Installing the required packages

Install the required Python packages, then download the quantized model from Hugging Face.

In [None]:
!pip install langchain==0.0.310 --quiet
!export LLAMA_CUBLAS=1
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --quiet
!pip install sentence-transformers==2.2.2 --quiet
!pip install chromadb==0.3.26 --quiet
!pip install ctransformers --quiet
!pip install pypdf --quiet

In [None]:
!wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf

## Loading the model

In [None]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Callback manager with a single handler that streams output to stdout.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    # --- model file and runtime ---
    model_path="/content/llama-2-7b-chat.Q5_K_M.gguf",  # location of the model file
    n_ctx=4096,
    n_gpu_layers=200,   # number of layers to offload to GPU
    n_batch=200,        # batch size used by llama.cpp (see its docs for exact semantics)
    # --- sampling ---
    temperature=0.2,
    top_p=0.9,
    top_k=30,
    max_tokens=2000,    # max. number of tokens to be generated
    # --- output streaming ---
    callback_manager=callback_manager,
    verbose=True,       # verbose is required to pass to the callback manager
)

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.indexes.vectorstore import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from chromadb.config import Settings

import pandas as pd
import os

In [None]:
# Llama 2 chat delimiters: the whole turn is wrapped in [INST] ... [/INST] and
# the system prompt inside it is wrapped in <<SYS>> ... <</SYS>>.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant."


def create_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Wrap an instruction and a system prompt in Llama 2 chat markers.

    Args:
        instruction: The user instruction (or prompt template text) to place
            after the system prompt.
        new_system_prompt: System prompt text; defaults to
            ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        The string ``[INST]<<SYS>>\\n{system}\\n<</SYS>>\\n\\n{instruction}[/INST]``.
    """
    system_prompt = B_SYS + new_system_prompt + E_SYS
    return B_INST + system_prompt + instruction + E_INST

In [None]:
# Question-answering prompt template, one tuple entry per line of the prompt.
# {context} and {question} are the template's two placeholders.
qa_template = "\n".join((
    "Use the following pieces of information to answer the user's question.",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.",
    "Context: {context}",
    "Question: {question}",
    "Only return the helpful concise answer below and nothing else.",
    "Helpful answer:",
))

In [None]:
# Pass the QA template through create_prompt, print the result for inspection,
# then build the prompt object with its two input variables.
prompt_template = create_prompt(qa_template)
print(prompt_template)
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

## Loading DocumentQA using Langchain

In [None]:
# Passed to Chroma as its persist_directory.
persist_directory = "./"

# Client settings handed to Chroma.
CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,          # refuse telemetry of usage info
    chroma_db_impl='duckdb+parquet',     # use duckdb to r/w parquet file
)

# Embedding model configuration.
model_name = "sentence-transformers/all-MiniLM-L12-v2"
encode_kwargs = {'normalize_embeddings': True}
model_kwargs = {'device': 'cuda'}        # you must have a gpu, otherwise change it to cpu

embedding_func = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
def setup_new_qa(filename):
    """Build a conversational retrieval chain over one PDF using the local LLM.

    The PDF is loaded, split into chunks, embedded with ``embedding_func`` and
    stored in a Chroma vector store persisted under ``persist_directory``.

    Args:
        filename: Path of the PDF file to index.

    Returns:
        A ``ConversationalRetrievalChain`` built from the module-level ``llm``
        and ``prompt``; it also returns the source documents with each answer.
    """
    # Load the document.
    documents = PyPDFLoader(filename).load()

    # Split the text into chunks (chunk_size=1000, chunk_overlap=100).
    # The original code performed this identical split twice and discarded the
    # first result; it is done once here.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunked_docs = text_splitter.split_documents(documents)

    # Create and persist the vector store.
    # NOTE(review): every call writes to the same persist_directory, so chunks
    # from earlier uploads may accumulate in the store; confirm this is intended.
    print("Creating new vector store...")
    db = Chroma.from_documents(
        chunked_docs,
        embedding_func,
        persist_directory=persist_directory,
        client_settings=CHROMA_SETTINGS,
    )
    db.persist()

    # Retrieve the top 4 ranked matches by similarity.
    retriever = db.as_retriever(search_type='similarity', search_kwargs={'k': 4})

    return ConversationalRetrievalChain.from_llm(
        llm,
        retriever,
        verbose=True,
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": prompt},
    )

## Gradio Installation

In [None]:
!pip install gradio==3.48.0 pydantic --quiet

# UI for Document Q&A

In [None]:
import gradio as gr
import random
import time

COUNT = 0
chat_history = []
qa = None  # QA chain for the uploaded document; stays None until a file is processed


def process_file(files):
    """Index the first uploaded file and report the uploaded paths.

    Args:
        files: Uploaded file objects, each exposing a ``.name`` path. May be
            empty or ``None``.

    Returns:
        The list of uploaded file paths, for display in the output component
        this handler is wired to, or ``None`` when nothing was uploaded.
        Only the first file is indexed.
    """
    global qa
    # Guard against an empty upload; indexing [0] would raise IndexError.
    if not files:
        return None
    file_paths = [file.name for file in files]
    qa = setup_new_qa(file_paths[0])
    # The upload event declares an output component, so return a value for it.
    return file_paths


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    file_output = gr.File()
    # File-type filters are given as extensions (".pdf") or type names.
    btn = gr.UploadButton("Click to Upload a File", file_types=[".pdf"], file_count="multiple")
    btn.upload(process_file, btn, file_output)

    # Not referenced by any handler in this cell; kept so the name stays defined.
    used_letters_var = gr.State([])

    def respond(message, chat_history, btn):
        """Answer a chat message with the document QA chain.

        Args:
            message: Text submitted in the textbox.
            chat_history: Current chatbot history; the new exchange is
                appended to it.
            btn: Value of the upload button (unused here).

        Returns:
            A pair ``("", chat_history)``: the empty string clears the
            textbox and the history refreshes the chatbot.

        Raises:
            gr.Error: If querying the chain fails.
        """
        # No document has been processed yet: clear the textbox only.
        if qa is None:
            return "", chat_history

        try:
            # The chain is always queried with an empty chat history.
            result = qa({'question': str(message), 'chat_history': []})
            answer = result['answer']
        except Exception as err:
            raise gr.Error('LLM hang!') from err

        # Both outputs must always be returned, so an empty answer is shown as
        # 'No answer' instead of returning nothing. Answers are capped at 1000
        # characters.
        bot_message = str(answer[:1000]) if answer != '' else 'No answer'
        chat_history.append((message, bot_message))
        return "", chat_history

    msg.submit(respond, [msg, chatbot, btn], [msg, chatbot])

demo.launch(debug=True)

# Using AzureOpenAI as LLM

In [None]:
!pip install openai --quiet
!pip install tiktoken --quiet

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Setting up API access
import getpass

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
os.environ["OPENAI_API_BASE"] = "https://dto-testing.openai.azure.com/"

# SECURITY: never hard-code the API key in the notebook. Use the key already
# present in the environment, otherwise prompt for it without echoing.
# Any key that was previously committed here should be considered leaked and
# rotated.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")


llm_OAI = AzureOpenAI(
    temperature=0.1,
    deployment_name="DTO_demo",  # This is the deployed GPT3 from Azure
    model_name="text-davinci-003",
)

In [None]:
# Question-answering prompt template, one tuple entry per line of the prompt.
# {context} and {question} are the template's two placeholders.
qa_template = "\n".join((
    "Use the following pieces of information to answer the user's question.",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.",
    "Context: {context}",
    "Question: {question}",
    "Only return the helpful concise answer below and nothing else.",
    "Helpful answer:",
))

# Build the prompt object directly from the QA template text.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=qa_template,
)

In [None]:
def setup_new_qa_OAI(filename):
    """Build a conversational retrieval chain over one PDF using Azure OpenAI.

    Args:
        filename: Path of the PDF file to index.

    Returns:
        A ``ConversationalRetrievalChain`` built from the module-level
        ``llm_OAI`` and ``prompt``; it also returns the source documents with
        each answer.
    """
    # Load the document, then split it into chunks.
    loaded_docs = PyPDFLoader(filename).load()
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(loaded_docs)

    # chunk_size=1 is a peculiarity of Azure OpenAI.
    azure_embeddings = OpenAIEmbeddings(
        deployment="DTO_embed",
        model='text-embedding-ada-002',
        chunk_size=1,
    )

    # Vector database; no persist directory is passed here.
    vector_db = Chroma.from_documents(chunks, azure_embeddings)

    # Retrieve the top 4 ranked matches by similarity.
    top_k_retriever = vector_db.as_retriever(
        search_type='similarity',
        search_kwargs={'k': 4},
    )

    return ConversationalRetrievalChain.from_llm(
        llm_OAI,
        top_k_retriever,
        verbose=False,
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": prompt},
    )

In [None]:
import gradio as gr
import random
import time

COUNT = 0
chat_history = []
qa = None  # QA chain for the uploaded document; stays None until a file is processed


def process_file(files):
    """Index the first uploaded file with the Azure OpenAI pipeline.

    Args:
        files: Uploaded file objects, each exposing a ``.name`` path. May be
            empty or ``None``.

    Returns:
        The list of uploaded file paths, for display in the output component
        this handler is wired to, or ``None`` when nothing was uploaded.
        Only the first file is indexed.
    """
    global qa
    # Guard against an empty upload; indexing [0] would raise IndexError.
    if not files:
        return None
    file_paths = [file.name for file in files]
    # This is the Azure OpenAI section, so build the chain with the Azure
    # pipeline rather than the local Llama one.
    qa = setup_new_qa_OAI(file_paths[0])
    # The upload event declares an output component, so return a value for it.
    return file_paths


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    file_output = gr.File()
    # File-type filters are given as extensions (".pdf") or type names.
    btn = gr.UploadButton("Click to Upload a File", file_types=[".pdf"], file_count="multiple")
    btn.upload(process_file, btn, file_output)

    # Not referenced by any handler in this cell; kept so the name stays defined.
    used_letters_var = gr.State([])

    def respond(message, chat_history, btn):
        """Answer a chat message with the document QA chain.

        Args:
            message: Text submitted in the textbox.
            chat_history: Current chatbot history; the new exchange is
                appended to it.
            btn: Value of the upload button (unused here).

        Returns:
            A pair ``("", chat_history)``: the empty string clears the
            textbox and the history refreshes the chatbot.

        Raises:
            gr.Error: If querying the chain fails.
        """
        # No document has been processed yet: clear the textbox only.
        if qa is None:
            return "", chat_history

        try:
            # The chain is always queried with an empty chat history.
            result = qa({'question': str(message), 'chat_history': []})
            answer = result['answer']
        except Exception as err:
            raise gr.Error('LLM hang!') from err

        # Both outputs must always be returned, so an empty answer is shown as
        # 'No answer' instead of returning nothing. Answers are capped at 1000
        # characters.
        bot_message = str(answer[:1000]) if answer != '' else 'No answer'
        chat_history.append((message, bot_message))
        return "", chat_history

    msg.submit(respond, [msg, chatbot, btn], [msg, chatbot])

demo.launch(debug=True)