<a href="https://colab.research.google.com/github/NightFore/2324_Project_AI_RAG/blob/main/2023_2024_OptionAI_Project_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RAG System Using Llama2 With Hugging Face

In [1]:
# Install necessary packages
!pip install pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install sentence_transformers
!pip install llama_index
!pip install llama-index-llms-huggingface
!pip install llama-index-embeddings-langchain


Collecting pypdf
  Downloading pypdf-4.1.0-py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.1.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m805.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.5/810.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.1/269.1 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m


In [2]:
# Import required libraries
import os
import torch
from google.colab import drive
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from gradio import Interface


In [4]:
# Mount Google Drive if not already mounted
def mount_google_drive(mount_point: str = '/content/drive'):
    """
    Mounts Google Drive unless something is already mounted at the mount point.

    Args:
    - mount_point (str): Directory where Google Drive is mounted. Defaults to '/content/drive'.

    Returns:
    - None
    """
    # ismount (not exists): a leftover, unmounted directory at the mount point
    # must not be reported as an already mounted Drive.
    if os.path.ismount(mount_point):
        print("Google Drive is already mounted.")
        return

    drive.mount(mount_point)
    print("Mounted Google Drive.")

# Mount Google Drive up front; the project folder is looked up under
# '/content/drive/MyDrive/' by default in the next cell.
mount_google_drive()


Mounted at /content/drive
Mounted Google Drive.


In [5]:
def change_to_project_folder(project_name: str, base_folder: str = '/content/drive/MyDrive/') -> str:
    """
    Returns the path to the specified project folder.

    Despite its name, this function does not change the working directory
    (it never calls os.chdir); it only builds and validates the path.

    Args:
    - project_name (str): Name of the project folder.
    - base_folder (str): Base folder where the project folder is located. Defaults to '/content/drive/MyDrive/'.

    Returns:
    - str: Path to the project folder, or an empty string if no such directory exists.
    """
    # Construct project folder path
    project_folder_path = os.path.join(base_folder, project_name)

    # isdir (not exists): a regular file that happens to carry the project
    # name is not a usable project folder and is rejected as well.
    if not os.path.isdir(project_folder_path):
        print(f"Error: Project folder '{project_name}' does not exist.")
        return ""

    print(f"Changed to project folder: {project_folder_path}")
    return project_folder_path

# Name of the project folder, resolved against the default base folder
project_name = "2023_2024_OptionAI_Project_RAG"

# Resolve the project folder path; this is an empty string when the folder
# is missing (an error message is printed in that case)
project_folder = change_to_project_folder(project_name)


Changed to project folder: /content/drive/MyDrive/2023_2024_OptionAI_Project_RAG


In [6]:
def load_documents(project_folder: str, directory_name: str) -> list:
    """
    Load documents from a directory within the project folder.

    Errors are reported with a printed message and an empty list instead of
    an exception, so the calling cell keeps running.

    Args:
    - project_folder (str): Path to the project folder.
    - directory_name (str): Name of the directory containing the documents within the project folder.

    Returns:
    - list: List of loaded documents; empty if the directory is missing or loading fails.
    """
    # Construct directory path within the project folder
    directory_path = os.path.join(project_folder, directory_name)

    # isdir (not exists): a regular file at this path is not a document directory
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist.")
        return []

    try:
        # The reader is constructed inside the try as well, so a failure while
        # setting it up is reported the same way as a failure while loading.
        # NOTE(review): the constructor presumably raises for a directory with
        # no readable files — confirm against the installed llama_index version.
        documents = SimpleDirectoryReader(directory_path).load_data()
    except Exception as e:
        print(f"Error loading documents: {e}")
        return []

    print("Documents loaded successfully.")
    return documents

# Sub-directory of the project folder that holds the source documents
directory_name = "data"

# Load documents from the directory within the project folder
# (an empty list comes back if the directory is missing or loading fails)
documents = load_documents(project_folder, directory_name)

# Display the loaded documents, one print call per document
print("Loaded Documents:")
for doc in documents:
    print(doc)


Documents loaded successfully.
Loaded Documents:
Doc ID: ab701480-6d6d-4fa8-8d6e-04dec71d721a
Text: இதயம்  என்பது  இஷ ்காவில்  உள்ள  அம்பு   इश्क़ में दिल फ़ना है,
oh-oh-oh  நீக்கவும்  அல்லது  உருவாக்கவும்   मैंने तुझको  चुना है, oh-
oh-oh  يرتدي كل ألوانك، ويرتدي أسلوبك   तेरा हुआ मैं सब को छोड़ के,
oh-oh-oh  "కొలవడం  ద్వా రా , రహస్యా లను  బహిరగతం చేయడం  ద్వా రా
ప్రేమలో  పడకండి ."  आया हूँ मैं सब को बोल के, oh-oh-oh  Ôi em đã đến
bên anh, s ứ...
Doc ID: 97f987c8-75fd-41f8-9225-ac0cb63c6734
Text: StarCraft II Guide by Ajek – Version 1, March 13th, 2011 1
StarCraft II Guide Written by Paul J Stales Ajek / kzoAjek Edited by
Axis, Golath, Ian, Rebirth, Rook
Doc ID: 2c9bfe62-88f0-475a-803a-b1d1048e4a1c
Text: StarCraft II Guide by Ajek – Version 1, March 13th, 2011 2
Version 1 March 13th, 2011
Doc ID: 89dac0fa-dc40-402e-89c1-11942bac2cfd
Text: StarCraft II Guide by Ajek – Version 1, March 13th, 2011 3 The
Staff of Ajek’s Guide Author Paul J Stales, aka Ajek / kzoAjek Author
of Original Cont

In [7]:
# System prompt: tells the model to act as a Q&A assistant
system_prompt = (
    "\n"
    "You are a Q&A assistant. Your goal is to answer questions as\n"
    "accurately as possible based on the instructions and context provided.\n"
)

# Template wrapped around every user query before it reaches the model.
# NOTE(review): the <|USER|>/<|ASSISTANT|> markers do not look like the
# [INST] ... [/INST] format used by Llama-2 chat models — confirm that this
# template is the intended one for meta-llama/Llama-2-7b-chat-hf.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")


In [8]:
def initialize_huggingface_llm(system_prompt: str, query_wrapper_prompt: SimpleInputPrompt):
    """
    Initialize HuggingFaceLLM for language generation.

    Args:
    - system_prompt (str): Prompt provided to the model to establish the context for language generation.
    - query_wrapper_prompt (SimpleInputPrompt): Prompt for wrapping user queries for interaction with the model.

    Returns:
    - HuggingFaceLLM: Initialized HuggingFaceLLM model.
    """
    # Authenticate Hugging Face CLI if necessary (required for models that need authentication such as Llama-2-7b-chat-hf)
    !huggingface-cli login

    # Initialize HuggingFaceLLM
    # Arguments:
    #   - context_window: The maximum length of the context window for language generation.
    #   - max_new_tokens: The maximum number of new tokens to generate.
    #   - generate_kwargs: Additional keyword arguments for generation, such as temperature and sampling.
    #   - system_prompt: Prompt provided to the model to establish the context for language generation.
    #   - query_wrapper_prompt: Prompt for wrapping user queries for interaction with the model.
    #   - tokenizer_name: Name of the tokenizer used by the model.
    #   - model_name: Name of the pretrained model used for language generation.
    #   - device_map: Device to use for model inference (e.g., "auto" for automatic selection).
    #   - model_kwargs: Additional keyword arguments for the model, such as data type and loading configuration.
    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=256,
        generate_kwargs={"temperature": None, "top_p": None, "do_sample": False},
        # generate_kwargs={"temperature": 0.0, "do_sample": False},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
        model_name="meta-llama/Llama-2-7b-chat-hf",
        device_map="auto",

        # Uncomment this if using CUDA to reduce memory usage
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
    )

    return llm

# Initialize HuggingFaceLLM for language generation.
# Heavy step: the saved output shows two model shards (9.98G and 3.50G) being downloaded.
llm = initialize_huggingface_llm(system_prompt, query_wrapper_prompt)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [9]:
def initialize_huggingface_embeddings(model_name: str = "sentence-transformers/all-mpnet-base-v2"):
    """
    Initialize HuggingFaceEmbeddings for embedding models.

    Args:
    - model_name (str): Name of the embedding model passed to HuggingFaceEmbeddings.
      Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
    - LangchainEmbedding: HuggingFaceEmbeddings model wrapped in a LangchainEmbedding.
    """
    # Build the LangChain embedding first, then wrap it in LangchainEmbedding
    langchain_embeddings = HuggingFaceEmbeddings(model_name=model_name)
    embed_model = LangchainEmbedding(langchain_embeddings)

    return embed_model

# Initialize the embedding model that is later handed to the service context
embed_model = initialize_huggingface_embeddings()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
def create_service_context(llm, embed_model, chunk_size: int = 1024):
    """
    Create service context for the index.

    Args:
    - llm (HuggingFaceLLM): Initialized HuggingFaceLLM model.
    - embed_model (LangchainEmbedding): Initialized HuggingFaceEmbeddings model.
    - chunk_size (int): Chunk size passed to ServiceContext.from_defaults. Defaults to 1024.

    Returns:
    - ServiceContext: Created service context.
    """
    # NOTE(review): the saved cell output shows this call echoed as a warning's
    # source line, which looks like a deprecation notice — confirm that
    # ServiceContext is still supported by the installed llama_index version.
    service_context = ServiceContext.from_defaults(
        chunk_size=chunk_size,
        llm=llm,
        embed_model=embed_model
    )

    return service_context

# Bundle the LLM and the embedding model into the service context used to build the index
service_context = create_service_context(llm, embed_model)


  service_context = ServiceContext.from_defaults(


In [11]:
def load_and_create_index(project_folder: str, directory_name: str, service_context: ServiceContext):
    """
    Load documents from a directory and create a vector store index.

    Args:
    - project_folder (str): Path to the project folder.
    - directory_name (str): Name of the directory containing the documents within the project folder.
    - service_context (ServiceContext): Service context for the index.

    Returns:
    - VectorStoreIndex: Created vector store index, or None if the directory
      is missing or its documents cannot be loaded (an error message is printed).
    """
    # Construct directory path within the project folder
    directory_path = os.path.join(project_folder, directory_name)

    # isdir (not exists): a regular file at this path is not a document directory
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist.")
        return None

    # Loading failures follow the same convention as the missing-directory case:
    # print the error and return None instead of raising.
    try:
        documents = SimpleDirectoryReader(directory_path).load_data()
    except Exception as e:
        print(f"Error loading documents: {e}")
        return None

    # Create vector store index from the documents
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)

    return index

# Load documents from the directory and create a vector store index.
# Note: this reads the documents from disk again (they were already loaded once
# into `documents` earlier); `index` is None when the directory is missing.
index = load_and_create_index(project_folder, directory_name, service_context)


In [12]:
def convert_index_to_query_engine(index: VectorStoreIndex):
    """
    Build a query engine on top of a vector store index.

    Args:
    - index (VectorStoreIndex): Index to expose through a query engine.

    Returns:
    - QueryEngine: Whatever `index.as_query_engine()` returns, created with default settings.
    """
    return index.as_query_engine()

# Convert index into a query engine; both the Gradio callback and the direct
# query cell below read this module-level `query_engine`
query_engine = convert_index_to_query_engine(index)


In [None]:
def query_model(text):
    """
    Answer a question with the notebook's query engine.

    Args:
    - text: Question text handed over by the Gradio interface.

    Returns:
    - The response object produced by `query_engine.query(text)`.
    """
    # Relies on the module-level `query_engine` created in an earlier cell
    return query_engine.query(text)

# Create the Gradio interface: a single text box in, a single text box out,
# with `query_model` as the callback
interface = Interface(
    fn=query_model,
    inputs="text",
    outputs="text",
    title="RAG System Interface",
    description="Ask a question to get a response from the RAG system."
)

# Launch the Gradio interface.
# With debug=True in Colab this cell runs indefinitely (see the saved output),
# so later cells only execute once it is stopped. The saved output also shows
# that a public share URL is created.
interface.launch(debug=True)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://af89edf9c36b415f70.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


In [None]:
# Perform queries and print responses (if not using the interface above).
# The launch cell above blocks while debug=True, so this cell only runs after
# that cell has been stopped or skipped.
response=query_engine.query("Who are the author of the StarCraft 2 Guide?")
print(response)