# RAG System Using Llama2 With Hugging Face and Gradio

## ---------- Project Setup ----------

### Constants and Configuration

Constants used throughout the notebook are defined here.


In [None]:
# Name of the project folder (looked up under the Google Drive base folder)
PROJECT_NAME = "2023_2024_Project_RAG"

# Name of the virtual environment directory inside the project folder
ENV_DIRECTORY = "env"

# Packages to install into the virtual environment.
# Each package appears exactly once so that nothing is installed twice.
PACKAGES_TO_INSTALL = [
    "pypdf",
    "transformers",
    "einops",
    "accelerate",
    "langchain",
    "bitsandbytes",
    "sentence_transformers",
    "llama_index",
    "llama-index-llms-huggingface",
    "llama-index-embeddings-langchain",
    "gradio",
]

# Name of the directory (inside the project folder) that holds the documents
DATA_DIRECTORY = "data"

# Hugging Face identifier of the language model
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Hugging Face identifier of the tokenizer (same repository as the model)
TOKENIZER_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Hugging Face identifier of the embedding model
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"


### Mount Google Drive

This section defines a function that mounts Google Drive if it is not already mounted.

In [None]:
import os
from google.colab import drive

def mount_google_drive():
    """
    Mount Google Drive unless the mount point is already present.

    Looks for '/content/drive/'; when that path exists a notice is printed,
    otherwise drive.mount is called on '/content/drive'.
    """
    mount_point_present = os.path.exists('/content/drive/')
    if mount_point_present:
        print("Google Drive is already mounted.")
        return

    drive.mount('/content/drive')

# Make Google Drive available to the rest of the notebook
mount_google_drive()

### Retrieve Project Folder Path

This function retrieves the path to the project folder based on the provided project name and base folder location.

In [None]:
def get_project_folder_path(project_name: str, base_folder: str = '/content/drive/MyDrive/') -> str:
    """
    Get the path to the project folder.

    Args:
    - project_name (str): Name of the project folder.
    - base_folder (str): Base folder where the project folder is located. Defaults to '/content/drive/MyDrive/'.

    Returns:
    - str: Path to the project folder, or an empty string when no directory
      with that name exists under base_folder.
    """
    # Construct project folder path
    project_path = os.path.join(base_folder, project_name)

    # Require an actual directory: a regular file that happens to carry the
    # project name is not a usable project folder.
    if not os.path.isdir(project_path):
        print(f"Error: Project folder '{project_name}' does not exist.")
        return ""

    print(f"Retrieved project folder path: {project_path}")
    return project_path

# Resolve the project folder for PROJECT_NAME under the default base folder
project_path = get_project_folder_path(project_name=PROJECT_NAME)


### Setting Up Virtual Environment

A virtual environment is utilized to store installed packages, speeding up future sessions by avoiding the need to reinstall everything each time a new Google Colab session is launched.


In [None]:
import sys

def initialize_virtual_environment(project_path: str, env_directory: str) -> str:
    """
    Initialize a virtual environment for the project.

    Args:
    - project_path (str): Path to the project folder.
    - env_directory (str): Name of the directory for the virtual environment.

    Returns:
    - str: Path to the activate script of the virtual environment.
    """
    # Install virtualenv
    !pip install virtualenv

    # Define virtual environment path
    env_path = os.path.join(project_path, env_directory)

    # Create virtual environment
    !virtualenv {env_path}

    # Define packages directory within virtual environment
    packages_path = os.path.join(env_path, "lib", "python3.10", "site-packages")

    # Add packages directory to sys.path
    sys.path.append(packages_path)

    # Define activate script path
    env_activate_path = os.path.join(env_path, "bin", "activate")

    print(f"Path to activate script: {env_activate_path}")
    return env_activate_path

# Initialize the virtual environment
env_activate_path = initialize_virtual_environment(project_path, ENV_DIRECTORY)

### Install Packages

This section covers the installation of necessary packages into the virtual environment.

In [None]:
def install_packages(env_activate_path: str, packages_to_install: list):
    """
    Install packages into the virtual environment.

    Args:
    - env_activate_path (str): Path to the activate script of the virtual environment.
    - packages_to_install (list): List of package names to install.
    """
    for package in packages_to_install:
        print(f"---------- Installing {package} ----------")
        !source {env_activate_path}; pip install {package}
        print(f"---------- {package} installed successfully! ----------\n\n")

install_packages(env_activate_path, PACKAGES_TO_INSTALL)

### Library Imports

In this section, we import the necessary libraries and set up the environment for our RAG system.

- **llama_index**: Facilitates building and querying vector indexes.
    - **SimpleDirectoryReader**: Reads the files in a directory and loads them as documents ready for indexing.
    - **ServiceContext**: Manages the context of the indexing service, including configuration and resources.
    - **VectorStoreIndex**: Represents a vector index storing document embeddings for retrieval.
- **llama_index.llms.huggingface**: Provides utilities for integrating Hugging Face language models with the vector index.
    - **HuggingFaceLLM**: Represents a Hugging Face LLM capable of generating responses based on input prompts.
- **llama_index.core.prompts**: Offers utilities for creating input prompts.
    - **SimpleInputPrompt**: Represents a simple input prompt.
- **llama_index.embeddings.langchain**: Supports integrating Langchain embeddings with the vector index.
    - **LangchainEmbedding**: Represents Langchain embeddings suitable for document embeddings.
- **langchain.embeddings.huggingface**: Supports integrating Hugging Face embeddings with the vector index.
    - **HuggingFaceEmbeddings**: Represents Hugging Face embeddings suitable for document embeddings.
- **gradio**: A library for creating customizable UI components for machine learning models. It will be utilized to create an interactive interface for our RAG system.

In [None]:
# Import required libraries
import pypdf
import torch
from google.colab import userdata
from llama_index.core import SimpleDirectoryReader, ServiceContext, VectorStoreIndex
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from gradio import Interface


### Load Documents

This function loads documents from a directory within the project folder.


In [None]:
def load_documents(project_path: str, data_directory: str) -> list:
    """
    Read every document found in the data directory of the project.

    Args:
    - project_path (str): Path to the project folder.
    - data_directory (str): Name of the directory containing the documents within the project folder.

    Returns:
    - list: The loaded documents, or an empty list when loading fails.
    """
    # Location of the documents inside the project folder
    source_dir = os.path.join(project_path, data_directory)

    try:
        # Build the reader and load the data in one step
        loaded = SimpleDirectoryReader(source_dir).load_data()
    except FileNotFoundError:
        print(f"Error: Directory '{source_dir}' does not exist.")
        return []
    except Exception as err:
        # Any other failure is reported and turned into an empty result
        print(f"Error loading documents: {err}")
        return []
    else:
        print("Documents loaded successfully.")
        return loaded

# Read the documents stored in the project's data directory
documents = load_documents(project_path=project_path, data_directory=DATA_DIRECTORY)

# Print a heading followed by every loaded document, one per print call
print("Loaded Documents:")
for doc in documents:
    print(doc)


## ---------- Initializing the Q&A Assistant ----------

In this section, we establish the necessary prompts and initialize the Hugging Face LLM to create an interactive Q&A assistant.


### System Prompt and Query Wrapper

Here, we define the system prompt and the query wrapper prompt for the Q&A assistant.

- **System Prompt**: Sets the context and objective for the Q&A assistant. It instructs the assistant to answer questions as accurately as possible based on the instructions and context provided.

- **Query Wrapper Prompt**: Serves as a template for formatting queries to be input into the system. It follows the default format supported by LLama2, ensuring compatibility with the underlying model.

In [None]:
# Define system prompt for the Q&A assistant
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""

# Define query wrapper prompt (Default format supportable by LLama2)
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")


### Initializing the Hugging Face LLM

Initializing the HuggingFaceLLM requires the system prompt and the query wrapper prompt defined above. It is also configured with the context window size, the maximum number of new tokens to generate, the tokenizer name, the model name, and the device mapping.

In [None]:
from transformers import BitsAndBytesConfig

def initialize_huggingface_llm(system_prompt: str, query_wrapper_prompt: SimpleInputPrompt, model_name: str, tokenizer_name: str) -> HuggingFaceLLM:
    """
    Initialize HuggingFaceLLM for language generation.

    Args:
    - system_prompt (str): Prompt provided to the model to establish the context for language generation.
    - query_wrapper_prompt (SimpleInputPrompt): Prompt for wrapping user queries for interaction with the model.
    - model_name (str): Name of the pretrained model used for language generation.
    - tokenizer_name (str): Name of the tokenizer used by the model.

    Returns:
    - HuggingFaceLLM: Initialized HuggingFaceLLM model.
    """
    # Check if HF_TOKEN exists in the secrets
    try:
        # Attempt to retrieve the HF_TOKEN secret
        userdata.get('HF_TOKEN')

        # Print a message indicating that the HF Token was found
        print("HF Token was found. No need to authenticate with Hugging Face CLI.")
    except userdata.SecretNotFoundError:
        # Print a message indicating that the HF Token was not found
        print("HF Token was not found.")

        # Print a message indicating the authentication process
        print("Authenticating with Hugging Face CLI...")

        # Authenticate Hugging Face CLI if necessary (required for models that need authentication such as Llama-2-7b-chat-hf)
        !huggingface-cli login

    # Check if CUDA is available to adjust model_kwargs accordingly
    if torch.cuda.is_available():
        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
        model_kwargs = {"quantization_config": quantization_config}
    else:
        model_kwargs = {}

    # Initialize HuggingFaceLLM
    """
    Arguments:
      - context_window: The maximum length of the context window for language generation.
      - max_new_tokens: The maximum number of new tokens to generate.
      - generate_kwargs: Additional keyword arguments for generation, such as temperature and sampling.
      - device_map: Device to use for model inference (e.g., "auto" for automatic selection).
      - model_kwargs: Additional keyword arguments for the model, such as data type and loading configuration.
    """
    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        generate_kwargs={"temperature": None, "top_p": None, "do_sample": False},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        device_map="auto",
        model_kwargs=model_kwargs
    )

    return llm

# Initialize HuggingFaceLLM for language generation
llm = initialize_huggingface_llm(system_prompt, query_wrapper_prompt, MODEL_NAME, TOKENIZER_NAME)


### Example Interaction with the Assistant

To demonstrate the functionality of the assistant, we interact with it by providing a query and receiving a response:

In [None]:
def ask_question(llm, query):
    """
    Send one question to the assistant and echo the exchange.

    Args:
    - llm (HuggingFaceLLM): Initialized model; its complete() method is called.
    - query (str): Question to ask the assistant.

    Returns:
    - The value returned by llm.complete for the framed prompt.
    """
    # Frame the question as a short User/Assistant exchange
    prompt = f"User: {query}\nAssistant:"
    answer = llm.complete(prompt)

    # Echo both the question and the model's answer
    print(f"Query:\n{query}\n")
    print(f"Model's response:\n{answer}\n")

    return answer

# The sample questions are only asked when CUDA is available
if torch.cuda.is_available():
    response = ask_question(llm=llm, query="Do you see the documents in my project folder?")
    response = ask_question(llm=llm, query="Say something funny?")
    response = ask_question(llm=llm, query="Do you want to fly?")


## ---------- Setting Up the Information Retrieval System ----------
Before we can begin utilizing the information retrieval system, we need to set up and initialize several components.

### Initializing HuggingFace Embeddings

To start, we initialize the HuggingFaceEmbeddings model for embedding our documents.

In [None]:
def initialize_huggingface_embeddings(embedding_model_name: str) -> LangchainEmbedding:
    """
    Create the embedding model used for the documents.

    Args:
    - embedding_model_name (str): Name of the HuggingFace embedding model to use.

    Returns:
    - LangchainEmbedding: LangchainEmbedding wrapping a HuggingFaceEmbeddings instance.
    """
    # Build the Hugging Face embeddings first, then wrap them for llama_index
    huggingface_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
    return LangchainEmbedding(huggingface_embeddings)

# Create the embedding model from the configured model name
embed_model = initialize_huggingface_embeddings(EMBEDDING_MODEL_NAME)


### Creating Service Context

Next, we create a service context that will be used for indexing and querying.

In [None]:
def create_service_context(llm: HuggingFaceLLM, embed_model: LangchainEmbedding) -> ServiceContext:
    """
    Bundle the language model and the embedding model into a service context.

    Args:
    - llm (HuggingFaceLLM): Initialized HuggingFaceLLM model.
    - embed_model (LangchainEmbedding): Initialized embedding model.

    Returns:
    - ServiceContext: Context built by ServiceContext.from_defaults with a chunk size of 1024.
    """
    context_options = {
        "chunk_size": 1024,
        "llm": llm,
        "embed_model": embed_model,
    }
    return ServiceContext.from_defaults(**context_options)

# Build the service context shared by indexing and querying
service_context = create_service_context(llm, embed_model)


### Loading and Creating Index

Now, we load documents from a directory and create a vector store index.

In [None]:
def load_and_create_index(project_path: str, data_directory: str, service_context: ServiceContext) -> VectorStoreIndex:
    """
    Load documents from a directory and create a vector store index.

    Args:
    - project_path (str): Path to the project folder.
    - data_directory (str): Name of the directory containing the documents within the project folder.
    - service_context (ServiceContext): Service context for the index.

    Returns:
    - VectorStoreIndex: Created vector store index, or None when the data
      directory does not exist.
    """
    # Construct directory path within the project folder
    directory_path = os.path.join(project_path, data_directory)

    # Check for the directory explicitly so the missing-directory message is
    # printed no matter which exception type the reader would use for it.
    # NOTE(review): SimpleDirectoryReader appears to raise ValueError rather
    # than FileNotFoundError for a missing directory - confirm for the
    # installed llama_index version.
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist.")
        return None

    try:
        # Load documents from the directory
        documents = SimpleDirectoryReader(directory_path).load_data()
    except FileNotFoundError:
        # The directory disappeared between the check above and the read
        print(f"Error: Directory '{directory_path}' does not exist.")
        return None

    # Create vector store index from the documents. This runs outside the try
    # block so an unrelated error raised while building the index is not
    # reported as a missing data directory.
    return VectorStoreIndex.from_documents(documents, service_context=service_context)

# Load documents from the directory and create a vector store index
index = load_and_create_index(project_path, DATA_DIRECTORY, service_context)


### Converting Index to Query Engine

Lastly, we convert the index into a query engine, allowing us to perform queries.

In [None]:
def convert_index_to_query_engine(index: VectorStoreIndex):
    """
    Obtain a query engine from a vector store index.

    Args:
    - index (VectorStoreIndex): Vector store index.

    Returns:
    - QueryEngine: The object returned by index.as_query_engine().
    """
    return index.as_query_engine()

# Turn the index built above into a query engine
query_engine = convert_index_to_query_engine(index)


### Performing Queries and Printing Responses

Now that our query engine is set up, we can utilize it to perform queries and print their responses.

In [None]:
def query_model(text):
    """
    Run a user question through the RAG query engine.

    Args:
    - text (str): Input text from the user.

    Returns:
    - The response object produced by the module-level query_engine for the input text.
    """
    return query_engine.query(text)

# Run a sample query and print its response (only when CUDA is available)
if torch.cuda.is_available():
    response = query_model(text="Who are the authors of guide?")
    print(response)

## ---------- Gradio Interface ----------

With everything set up, we can deploy an interface using Gradio to interact with the RAG system in a more user-friendly manner.

In [None]:
# Build a text-in / text-out Gradio interface around query_model
interface = Interface(
    title="RAG System Interface",
    description="Ask a question to get a response from the RAG system.",
    fn=query_model,
    inputs="text",
    outputs="text",
)

# Start the interface with debug mode switched on
interface.launch(debug=True)
