# Setup for RAG

* look at file `00-simple-local-rag.ipynb` for the detail

In [1]:
## checking local memoery
import torch
if torch.cuda.is_available():
    print(torch.cuda.get_device_properties(0))
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

device = "cuda" if torch.cuda.is_available() else "cpu"

_CudaDeviceProperties(name='NVIDIA GeForce RTX 4060', major=8, minor=9, total_memory=8187MB, multi_processor_count=24)
CUDA available: True
CUDA device: NVIDIA GeForce RTX 4060


In [2]:
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the embeddings from CSV file
text_chunks_embeddings_df = pd.read_csv("text_chunks_embeddings_df.csv")

#Convert the embeddings from string representation to numpy arrays
text_chunks_embeddings_df["embedding"] = text_chunks_embeddings_df["embedding"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ").astype(np.float64)
)

embeddings = torch.tensor(np.stack(text_chunks_embeddings_df["embedding"].tolist(), 
                    axis=0)).to(device)

Using device: cuda


In [3]:
from sentence_transformers import SentenceTransformer, util

try:
    if embedding_model is None:
        # Load the embedding model
        embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device=device)
except NameError:
    embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device=device)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import fitz
import matplotlib.pyplot as plt


def pdf_img_show(num_page: int):

    pdf_path = "human-nutrution-text.pdf"
    doc = fitz.open(pdf_path)
    page = doc.load_page(num_page+41)

    img = page.get_pixmap(dpi=300)

    doc.close()

    img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))
    
    plt.figure(figsize= (13,10))
    plt.imshow(img_array)
    plt.axis("off")
    plt.show()


In [5]:
def retrival_search(query : str,
             top_k : int = 5,
             embeddings : torch.tensor = embeddings,
             embedding_model : SentenceTransformer = embedding_model,
             
             device: str = device):
    
    #step 1: Turn query to embedding
    query_embedding = embedding_model.encode(query, device=device, convert_to_tensor=True).to(torch.float64)
    
    #step 2: Find top k <query_embedding, embeddings>
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    top_k = torch.topk(dot_scores, k=top_k+1)
    
    #step 3: text_chunk_df[top_k.indices]
    indices = top_k.indices
    scores = top_k.values
    
    return indices, scores
    

In [6]:
import textwrap

def print_top_results(query : str,        
                      top_k : int = 5,
                      embeddings : torch.tensor = embeddings,
                      embedding_model : SentenceTransformer = embedding_model,
                      text_chunk_df : pd.DataFrame = text_chunks_embeddings_df,
                      show_result = True):
    
    ## find top k relevant text chunk
    indices , scores = retrival_search(query, top_k, embeddings, embedding_model)
    
    top_k_relevant_text = [text_chunk_df["sentence_chunk"].iloc[int(idx)] for idx in indices]
    
    if show_result:
        print(f"Query: {query}\n")
        print("---"*40)
    
    relevant_texts = list()
    
    for i, idx in enumerate(indices):
        text = text_chunk_df["sentence_chunk"].iloc[int(idx)]
        relevant_texts.append(text)
        page_number = int(text_chunks_embeddings_df["page_number"].iloc[int(idx)])
        
        if show_result:
            pdf_img_show(page_number)
            print(f"Top {i+1} relevant text")
            print(textwrap.fill(text, 80)+"\n")
            print(f"Source page: {page_number}")
            print("---"*40)
        
    return relevant_texts

In [7]:
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = gpu_memory_bytes / (1024 ** 3)
print(f"Total GPU memory: {gpu_memory_gb:.2f} GB")

Total GPU memory: 8.00 GB


In [8]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 7.99560546875 | Recommended model: Gemma 2B in 4-bit precision.
use_quantization_config set to: True
model_id set to: google/gemma-2b-it


In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import is_flash_attn_2_available

# 1. Create a quantization 
#!pip install bitsandbytes accelerate
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit = True,
                                         bnb_4bit_compute_dtype = torch.float16)

if(is_flash_attn_2_available) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"

else:
    attn_implementation = "sdpa"   ## scaled dot-product attention 
print(f"Attention implementation set to: {attn_implementation}")

# 2. Loading LLM
# 
# model_id = "google/gemma-2b-it"
model_id = model_id  
print(  f"Loading model {model_id} ...")

print(f"Loading tokenizer for {model_id} ...")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_id)


print(f"Loading model for {model_id} ...")
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_id, 
                                          torch_dtype = torch.float16,
                                          low_cpu_mem_usage=False,
                                          #attn_implementation = attn_implementation,    ##very difficult to get flash attention work
                                          quantization_config = quantization_config if use_quantization_config else None)

if not use_quantization_config:
    llm_model.to("cuda")
    

Attention implementation set to: flash_attention_2
Loading model google/gemma-2b-it ...
Loading tokenizer for google/gemma-2b-it ...
Loading model for google/gemma-2b-it ...


Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.82s/it]


In [10]:
## get the model parameters

def get_model_params(model: torch.nn.Module):
    return sum([param.numel() for  param in model.parameters()])

print(f"number of parameters: {get_model_params(llm_model)}")

def get_model_mem_size(model: torch.nn.Module):
    mem__params = sum([param.numel() * param.element_size() for param in model.parameters()])
    mem_bufs = sum([buf.numel() * buf.element_size() for buf in model.buffers()])
    total_mem_bytes = mem__params + mem_bufs      

    total_mem_gb = total_mem_bytes / (1024 ** 3)

    return {"model_mem_bytes": round(total_mem_bytes, 2),
            "model_mem_gb": round(total_mem_gb, 2)}

print(get_model_mem_size(llm_model))

number of parameters: 1515268096
{'model_mem_bytes': 2039632384, 'model_mem_gb': 1.9}


In [11]:
def generate_answer(prompt: str, 
                    output_token_show: bool = False,
                    output_text_show: bool = True):
    #print(f"Query_text: {prompt}")

    # Tokenize the input text and send it to the GPU
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")  ## pt for pytorch

    #print("\nGenerating answer...")
    print("====="*20)
    print("====="*20)

    ## Generate outputs from Local LLM
    outputs = llm_model.generate(**input_ids,
                                max_new_tokens=256,
                                temperature=0.7,
                                do_sample=True
                                )
    
    if output_token_show:
        print(f"Model output (tokens): \n {outputs[0]}\n")
        
        print("====="*20)
        print("====="*20)
    
    ### decode the output tokens to text
    outputs_decoded = tokenizer.decode(outputs[0])
    RAG_text = outputs_decoded.replace(prompt, '').strip()
    print(f"RAG output: \n {textwrap.fill(RAG_text,80)}\n" )

    return outputs_decoded

In [12]:
## queries for testing

queries = [
    "What are the six classes of nutrients required for the body to function, and what are their basic functions?",
    "What is the primary role of carbohydrates in human nutrition, and how do they support specific cells like red blood cells and the brain?",
    "How do simple and complex carbohydrates differ in terms of digestion and absorption in the body?",
    "What are the five primary functions of carbohydrates in the human body according to the textbook?",
    "How does dietary fiber contribute to health, and what examples of fiber-rich foods are mentioned in the context of the traditional Hawaiian diet?",
    "What is the role of protein sparing in the context of carbohydrate consumption?",
    "How do carbohydrates assist in lipid metabolism, and why is this important for energy use?",
    "What are the health consequences of consuming too many or too few carbohydrates in the diet?",
    "How does the liver redistribute glucose in the body, and what percentage of ingested glucose is typically redistributed?",
    "Why is glucose the preferred energy source for certain cells, and under what conditions might the brain use alternative energy sources?",
    "What are micronutrients, and how do they differ from macronutrients like carbohydrates in terms of function and energy provision?",
    "What is the significance of water as a nutrient, and how much water does an average adult consume daily from food and drink?",
    "How do vitamins and minerals function as micronutrients, and what is their role in enzymatic processes?",
    "What are the benefits of a nutrient-dense diet, and how does it relate to maintaining a healthy weight?",
    "How do carbohydrates contribute to building macromolecules, and what are some examples of these molecules?",
    "What are the key nutritional components of the traditional Hawaiian diet, and what percentage of it was composed of carbohydrate-rich foods?",
    "How does the body use glycogen, and where is it stored?",
    "What are non-nutrients in foods, and how can they be beneficial or harmful to health?",
    "How do carbohydrates support the nutritional needs of the brain and nervous system?",
    "What factors affect an individual’s nutritional needs, and how can personal dietary choices impact health outcomes?"
]

In [13]:
"""
Aims: 

create a prompt like:

Based on the following contexts:
- <context_item_1>
- <context_item_2>
- <context_item_3>

Please answer the following query: {query}

Anwser:
"""
def prompt_formatter(query:str ,
                     contenxt_items: list[str],
                     prompt_print = False) ->str:
    
    context = "- "+"\n- ".join(contenxt_items)

    based_prompt = """Based on the following contexts: 
    
    {context}
    
    Please answer the following query: {query}
    Anwser:"""

    based_prompt = based_prompt.format(context=context, query=query)
    dialogue_template = [{"role": "user", "content": based_prompt}]

    prompt = tokenizer.apply_chat_template(conversation = dialogue_template, 
                                            tokenize=False, 
                                            add_generation_prompt=True)


    if prompt_print:
        print(prompt)
        
    return prompt


## Another prompt formatting example to improve answer quality

def prompt_formatter_2(query:str ,
                     contenxt_items: list[str],
                     prompt_print = False) ->str:
    
    context = "- "+"\n- ".join(contenxt_items)

    based_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    prompt = based_prompt.format(context=context, query=query)
    
    if prompt_print:
        print(prompt)
        
    return prompt

In [14]:
import random

def RAG_pipeline(query: str, 
                 top_k: int = 5,
                 prompt_formatter = prompt_formatter_2):

    print(f"Query: {query}")

    print("Retrieving relevant contexts...")
    retrival_texts = print_top_results(query, show_result= False)

    print("Formatting prompt with retrieved contexts...")
    augmented_prompt = prompt_formatter(query , retrival_texts)

    print( "Generating answer with augmented prompt...")
    generate_answer(augmented_prompt)
    print("Done.")
# --- IGNORE ---

# RAG_pipeline

In [39]:
## using pre-defined queries

query = random.choice(queries) 

# define query yourself
#query = " What are the six classes of nutrients required for the body to function, and what are their basic functions?"  

RAG_pipeline(query)

Query: How do carbohydrates support the nutritional needs of the brain and nervous system?
Retrieving relevant contexts...
Formatting prompt with retrieved contexts...
Generating answer with augmented prompt...
RAG output: 
 <bos> The passage does not provide any information about how carbohydrates
support the nutritional needs of the brain and nervous system, so I cannot
answer this query from the context.<eos>

Done.
