In [8]:
import torch
import platform
import subprocess

def get_gpu_info():
    try:
        # Check if nvidia-smi is available
        nvidia_smi = subprocess.check_output("nvidia-smi", shell=True)
        print("NVIDIA GPU detected:\n")
        print(nvidia_smi.decode())
        return True
    except:
        print("No NVIDIA GPU detected or nvidia-smi not found")
        return False

print("System Information:")
print("-" * 50)
print(f"Platform: {platform.system()} {platform.release()}")
print(f"Python version: {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print("\nGPU Information:")
print("-" * 50)
get_gpu_info()

System Information:
--------------------------------------------------
Platform: Windows 10
Python version: 3.11.9
PyTorch version: 2.3.1+cu121
CUDA available: True

GPU Information:
--------------------------------------------------
NVIDIA GPU detected:

Sat Aug 30 00:47:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.97                 Driver Version: 580.97         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   44C    P8            N/A  /  115W |    1152MiB /   8188MiB |

True

# 1. RAG - Data preprocess

## 1.1 Data Process: text over page, then split the sentences.

### 1.1.1 download pdf and simple process

In [11]:
import os
import requests

# Get PDF document path
pdf_path = "human-nutrution-text.pdf"

# Download the PDF
if not os.path.exists(pdf_path):
    print("[INFO]File doesn't exist Downloading...")
    
    #Enter the URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    
    #The local filename to save the downloaded file
    filename = pdf_path
    
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        #Open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO]File downloaded and saved as {filename}")
        
    else:
        print(f"[INFO]Failed to download file. Status code: {response.status_code}")
else:
    print(f"[INFO]File {pdf_path} already exists.")


[INFO]File human-nutrution-text.pdf already exists.


In [12]:
from tqdm.auto import tqdm
try:
    import fitz
except ImportError:
    import sys
    !{sys.executable} -m pip install PyMuPDF
    import fitz
    
def text_formaater(text: str) -> str:
    """Performs. minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    
    return cleaned_text

# read pdf file in list[dict]
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formaater(text= text)
        pages_and_texts.append({"page_number": page_number -41, 
                               "page_char_count": len(text), 
                               "page_word_count": len(text.split(" ")),
                               "page_sentence_count" : len(text.split(". ")),
                               "page_token_count": len(text)/4, # Rough estimate of tokens
                               "text":text
                               })
        
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path)

0it [00:00, ?it/s]

1208it [00:00, 1485.91it/s]


In [13]:
import random
#view some of the pages and texts
#display(random.sample(pages_and_texts, k=3))

import pandas as pd
df = pd.DataFrame(pages_and_texts)
display(df[500:503])
display(df.describe().round(2))

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
500,459,745,120,4,186.25,Learning Activities Technology Note: The seco...
501,460,1958,337,19,489.5,The Atom UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD ...
502,461,1339,230,12,334.75,Image by DoSiDo / CC BY-SA 3.0 Atoms and mo...


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


### 1.1.2 split sentence by using spacy (nlp tool)

In [14]:
try:
    from spacy.lang.en import English
except ModuleNotFoundError:
    import sys
    !{sys.executable} -m pip install spacy
    from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

## example for sentence splitting
test_sentences = "This is a sentence. This is another one. And this is the last one."
doc = nlp(test_sentences)
print(f"Original: {test_sentences}")
print(f"After split: {list(doc.sents)}")

Original: This is a sentence. This is another one. And this is the last one.
After split: [This is a sentence., This is another one., And this is the last one.]


In [15]:
for item in tqdm(pages_and_texts):
    # Process the text with spaCy
    item["sentences"] = list(nlp(item["text"]).sents)
    
    #make sure to convert sentences to string
    item["sentences"] = [str(sentence)for sentence in item["sentences"]]
    
    # Add the sentence count to the item
    item["page_sentence_count_spacy"] = len(item["sentences"])
    
    
#view some of the pages and texts
random.sample(pages_and_texts, k=1)


100%|██████████| 1208/1208 [00:01<00:00, 1040.71it/s]


[{'page_number': 712,
  'page_char_count': 1834,
  'page_word_count': 340,
  'page_sentence_count': 19,
  'page_token_count': 458.5,
  'text': 'EAR values become the scientific foundation upon which RDA  values are set.  2. Recommended Daily Allowances. Once the EAR of a nutrient  has been established, the RDA can be mathematically  determined. While the EAR is set at a point that meets the  needs of half the population, RDA values are set to meet the  needs of the vast majority (97 to 98 percent) of the target  healthy population. It is important to note that RDAs are not  the same thing as individual nutritional requirements. The  actual nutrient needs of a given individual will be different  than the RDA. However, since we know that 97 to 98 percent of  the population’s needs are met by the RDA, we can assume that  if a person is consuming the RDA of a given nutrient, they are  most likely meeting their nutritional need for that nutrient.  The important thing to remember is that the

## 1.2 Chunk the sentence together

### 1.2.1 chunk sentence function

In [18]:
num_sentence_chunk_size = 10

## chunking function
def split_list(input_list : list[str],
               slice_size : int= num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

#to check if the function works apporpiately
test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [19]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"]=split_list(item["sentences"])
    item["num_chunks"] = len(item["sentence_chunks"])
    
print(f"after chunking:")
display(random.sample(pages_and_texts, k=1))
df = pd.DataFrame(pages_and_texts)
display(df.head())
display(df.describe().round(2))

100%|██████████| 1208/1208 [00:00<00:00, 302075.91it/s]

after chunking:





[{'page_number': 496,
  'page_char_count': 1805,
  'page_word_count': 283,
  'page_sentence_count': 19,
  'page_token_count': 451.25,
  'text': 'that have more recently modernized, industrialized, and urbanized  are experiencing a surge in their overweight and obese populations.  China, the most populous country in the world, now has more than  215 million people, approximately one-fifth of their population, that  are considered overweight or obese.4  The increase in China’s waistline is partly attributed to changes  in the traditional diet, more sedentary lives, and a massive increase  in motor vehicle use. Moreover, China’s recent famines in the 1950s,  which affected the poor and lower classes to a greater extent than  the upper class, have sanctioned lax social attitudes toward body  fat and reinspired the age-old Chinese belief that excess body fat  represents health and prosperity.  One of the worst statistics regarding overweight and obesity in  China is that more than ten milli

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1,[[Human Nutrition: 2020 Edition]],1
1,-40,0,1,1,0.0,,[],0,[],0
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1,[[Human Nutrition: 2020 Edition UNIVERSITY O...,1
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1,[[Human Nutrition: 2020 Edition by University ...,1
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2,[[Contents Preface University of Hawai‘i at ...,1


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### 1.2.2 groupby page[chunk]

In [20]:

import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        #Join the sentences together into a paragraph-like structure, i.e. join the list of sentence into one paragraph
        # list to string
        joined_sentences_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentences_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentences_chunk) # ".A" to ". A"

        chunk_dict["sentence_chunk"] = joined_sentences_chunk
        
        chunk_dict["chunk_char_count"] = len(joined_sentences_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentences_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentences_chunk)/4     # Rough estimate of tokens
        
        pages_and_chunks.append(chunk_dict)
        
len(pages_and_chunks)
        

100%|██████████| 1208/1208 [00:00<00:00, 40266.39it/s]


1843

In [21]:
df= pd.DataFrame(pages_and_chunks)
df.head()
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [26]:
min_token_length = 30
## almost all of the chunks with under 30 tokens are not useful
for row in df[df["chunk_token_count"]< min_token_length].sample(5).iterrows():
    print(f"chunk token: {row[1]['chunk_token_count']} ||sentence chunk: {row[1]['sentence_chunk']}")

chunk token: 28.75 ||sentence chunk: Image by FDA/ Changes to the Nutrition Facts Label Figure 12.5 Food Serving Sizes 728 | Discovering Nutrition Facts
chunk token: 24.75 ||sentence chunk: http://www.ajcn.org/content/87/1/64.long. Accessed September 22, 2017. 554 | Water-Soluble Vitamins
chunk token: 25.5 ||sentence chunk: http://www.ajcn.org/cgi/ pmidlookup?view=long&pmid=10197575. Accessed October 6, 2017. 640 | Magnesium
chunk token: 16.5 ||sentence chunk: Table 4.6 Sweeteners Carbohydrates and Personal Diet Choices | 281
chunk token: 12.75 ||sentence chunk: PART VIII CHAPTER 8. ENERGY Chapter 8. Energy | 451


### 1.2.3 filter out the chunks with under 30 tokens

In [27]:

pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] >= min_token_length].to_dict(orient="records")
display(pages_and_chunks_over_min_token_length[:2])
random.sample(pages_and_chunks_over_min_token_length, k=1)

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

[{'page_number': 1009,
  'sentence_chunk': 'PCBs, or polychlorinated biphenyls, are man-made organic compounds that consists of carbon, hydrogen and chlorine. Due to their non-flammability, chemically stable, and high boiling points PCBs were manufactured and used commercially from 1929 until 1979 when it was banned. Like methylmercury, higher concentrations of this contaminant are found in predatory fish. Health effects include complications in physical and neurological development in children, and this compound is potentially a carcinogen. PCB contamination also can affect the immune, reproductive, nervous, and endocrine systems.9 Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are Ministry of the Environment, Government of Japan.http://www.env.go.jp/en/chemi/hs/ minamata2002/. Published 2002. Accessed December 21, 2011. 9.\xa0Learn About Polychlorina

## 1.3 Embedding the sentence

#### 1.3.1 Embedding model download 

In [28]:
import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device="cpu")


In [29]:

sentences = ["This is a sentence.", "This is another one.", "And this is the last one."]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding(first 5): {embedding[:5]}")
    print(f"Embedding shape: {embedding.shape}")

Sentence: This is a sentence.
Embedding(first 5): [ 0.04461046 -0.01864081 -0.00090315  0.02639336 -0.0418851 ]
Embedding shape: (768,)
Sentence: This is another one.
Embedding(first 5): [ 0.02019235  0.0298154   0.00167827 -0.04361509 -0.03454699]
Embedding shape: (768,)
Sentence: And this is the last one.
Embedding(first 5): [ 0.01193439  0.03599297 -0.02380103 -0.0224243   0.02393362]
Embedding shape: (768,)


In [30]:
embeddings[0].shape

(768,)

#### 1.3.2 Embedding chunks one by one with using CPU

In [31]:
#%%time 

cpu_run = True

if cpu_run:
    embedding_model.to("cpu")

    for item in tqdm(pages_and_chunks_over_min_token_length):
        item["embedding"] = embedding_model.encode(item["sentence_chunk"])
        item["embedding_shape"] = item["embedding"].shape
    
print("CPU times: user 4min 13s, sys: 44.4 s, total: 4min 57s \n Wall time: 3min 30s")## on MacBook Pro M2 Pro 16GB RAM


100%|██████████| 1680/1680 [03:07<00:00,  8.95it/s]

CPU times: user 4min 13s, sys: 44.4 s, total: 4min 57s 
 Wall time: 3min 30s





In [None]:
#%%time 
##with GPU
cuda_run = True

if cuda_run:
    embedding_model.to("cuda")

    for item in tqdm(pages_and_chunks_over_min_token_length):
        item["embedding"] = embedding_model.encode(item["sentence_chunk"])
        item["embedding_shape"] = item["embedding"].shape



100%|██████████| 1680/1680 [00:26<00:00, 63.76it/s]



#### 1.3.3 Save embeddings to file ( Don't run it if the embeddings is not ran above)


In [33]:
text_chunks_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
embeddings_df_save_path = "text_chunks_embeddings_df.csv"
text_chunks_embeddings_df.to_csv(embeddings_df_save_path, index=False)


In [34]:
# import saved file and view

text_chunks_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_embeddings_df_load.head()


# notice saving embeddings data to csv file may not be the best way, 
# in stead of doing that, vectors database is recommended when the data is large

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding,embedding_shape
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242675e-02 9.02281329e-02 -5.09549491e-...,"(768,)"
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156270e-02 5.92139587e-02 -1.66167449e-...,"(768,)"
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79801972e-02 3.39813679e-02 -2.06426457e-...,"(768,)"
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82566985e-02 3.81275043e-02 -8.46854504e-...,"(768,)"
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264494e-02 -8.49768892e-03 9.57159698e-...,"(768,)"


# 2. RAG - Top relevant Search


* the final function for this section is  **print_top_results("What is the role of carbohydrates in human nutrition?" , show_result= False)** 

### 2.1 import data

In [1]:
import random

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the embeddings from CSV file
text_chunks_embeddings_df = pd.read_csv("text_chunks_embeddings_df.csv")

#Convert the embeddings from string representation to numpy arrays
text_chunks_embeddings_df["embedding"] = text_chunks_embeddings_df["embedding"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ").astype(np.float64)
)

embeddings = torch.tensor(np.stack(text_chunks_embeddings_df["embedding"].tolist(), 
                    axis=0)).to(device)


print(f"embeddings : {embeddings}")
print(f"embeddings shape: {embeddings.shape}")


Using device: cuda
embeddings : tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]],
       device='cuda:0', dtype=torch.float64)
embeddings shape: torch.Size([1680, 768])


In [2]:
#!pip uninstall -y numpy
#!pip install numpy==1.26.4

### 2.2 loading embedding model and doing sematic search pipeline

Doing sematic search pipeline with the following steps:
1. define query
2. embedding the query
3. cosine similarity with text embeddings and query embedding
4. find top k highest similarity from result

In [3]:
from sentence_transformers import SentenceTransformer, util

try:
    if embedding_model is None:
        # Load the embedding model
        embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device=device)
except NameError:
    embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device=device)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
query = "What is the role of carbohydrates in human nutrition?"
query_embedding = embedding_model.encode(query, device=device, convert_to_tensor=True).to(torch.float64)

print(f"Enter query: {query}")
print(f"Query embedding shape: {query_embedding.shape}")


Enter query: What is the role of carbohydrates in human nutrition?
Query embedding shape: torch.Size([768])


In [5]:
# %%time

k=5
## Perform semantic search by dot product with no normalization

dot_scores = util.dot_score(query_embedding, embeddings)[0]
print(dot_scores.shape)

top_k = torch.topk(dot_scores, k=k)
print(f"top {k} results: {top_k.indices}")

torch.Size([1680])
top 5 results: tensor([389, 381, 390, 347,  41], device='cuda:0')


In [6]:
## checking local memoery
import torch
if torch.cuda.is_available():
    print(torch.cuda.get_device_properties(0))
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

_CudaDeviceProperties(name='NVIDIA GeForce RTX 4060', major=8, minor=9, total_memory=8187MB, multi_processor_count=24)
CUDA available: True
CUDA device: NVIDIA GeForce RTX 4060


### 2.3 show the pages for the result with following steps:

1. open the pdf in image
2. show the image with matplotlib

In [9]:
#!pip install fitz
#!pip install frontend

In [10]:
import fitz
import matplotlib.pyplot as plt


def pdf_img_show(num_page: int):

    pdf_path = "human-nutrution-text.pdf"
    doc = fitz.open(pdf_path)
    page = doc.load_page(num_page+41)

    img = page.get_pixmap(dpi=300)

    doc.close()

    img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))
    
    plt.figure(figsize= (13,10))
    plt.imshow(img_array)
    plt.axis("off")
    plt.show()

#pdf_img_show(50)


In [11]:
# show the top k result in text
top_k_embeddings = embeddings[top_k.indices]

print(f"Query : {query}\n")

total_token_count = 0
total_word_count = 0

for i, idx in enumerate(top_k.indices):
    page_number = text_chunks_embeddings_df["page_number"].iloc[int(idx)]+41
    print(f"Top {i+1} relevant result (from page{page_number}): ")
    
    text_chunks = text_chunks_embeddings_df["sentence_chunk"].iloc[int(idx)]
    total_token_count += text_chunks_embeddings_df["chunk_token_count"].iloc[int(idx)]
    total_word_count += text_chunks_embeddings_df["chunk_word_count"].iloc[int(idx)]
    
    for sentence in [ n for n in text_chunks.split(".")]:
        print(sentence)
    print("\n\n")
    
print(f"Total token count: {total_token_count}")
print(f"Total word count: {total_word_count}")

Query : What is the role of carbohydrates in human nutrition?

Top 1 relevant result (from page299): 
Without energy none of the other life processes are performed
 Although our bodies can synthesize glucose it comes at the cost of protein destruction
 As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems
 Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities
  These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document)
 258 | The Functions of Carbohydrates in the Body



Top 2 relevant result (from page294): 
The Functions of Carbohydrates in the Body UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM There are five primary functions of carbohydr

In [12]:
def retrival_search(query : str,
             top_k : int = 5,
             embeddings : torch.tensor = embeddings,
             embedding_model : SentenceTransformer = embedding_model,
             
             device: str = device):
    
    #step 1: Turn query to embedding
    query_embedding = embedding_model.encode(query, device=device, convert_to_tensor=True).to(torch.float64)
    
    #step 2: Find top k <query_embedding, embeddings>
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    top_k = torch.topk(dot_scores, k=top_k+1)
    
    #step 3: text_chunk_df[top_k.indices]
    indices = top_k.indices
    scores = top_k.values
    
    return indices, scores
    

In [13]:
%%time
retrival_search(query)

CPU times: total: 31.2 ms
Wall time: 9.1 ms


(tensor([389, 381, 390, 347,  41,  47], device='cuda:0'),
 tensor([0.7348, 0.7205, 0.7136, 0.6789, 0.6650, 0.6602], device='cuda:0',
        dtype=torch.float64))

### 2.3 Funtionalization

In [14]:
import textwrap

def print_top_results(query : str,        
                      top_k : int = 5,
                      embeddings : torch.tensor = embeddings,
                      embedding_model : SentenceTransformer = embedding_model,
                      text_chunk_df : pd.DataFrame = text_chunks_embeddings_df,
                      show_result = True):
    
    ## find top k relevant text chunk
    indices , scores = retrival_search(query, top_k, embeddings, embedding_model)
    
    top_k_relevant_text = [text_chunk_df["sentence_chunk"].iloc[int(idx)] for idx in indices]
    
    if show_result:
        print(f"Query: {query}\n")
        print("---"*40)
    
    relevant_texts = list()
    
    for i, idx in enumerate(indices):
        text = text_chunk_df["sentence_chunk"].iloc[int(idx)]
        relevant_texts.append(text)
        page_number = int(text_chunks_embeddings_df["page_number"].iloc[int(idx)])
        
        if show_result:
            pdf_img_show(page_number)
            print(f"Top {i+1} relevant text")
            print(textwrap.fill(text, 80)+"\n")
            print(f"Source page: {page_number}")
            print("---"*40)
        
    return relevant_texts

In [15]:
# show_result= False to avoid showing the result

print_top_results("What is the role of carbohydrates in human nutrition?" , show_result= False)

['Without energy none of the other life processes are performed. Although our bodies can synthesize glucose it comes at the cost of protein destruction. As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). 258 | The Functions of Carbohydrates in the Body',
 'The Functions of Carbohydrates in the Body UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM There are five primary functions of carbohydrates in the human body. They are energy production, energy storage, building macromolecules, sparing protein, and assisting in lipid me

# 3. RAG - Generative LLM

* check how much VRAM do you have available

Create cli token before download LLM 

Links:
1. https://huggingface.co/docs/huggingface_hub/en/guides/cli
2. https://huggingface.co/settings/tokens

In [None]:
import torch

print(torch.__version__)
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None')

gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = gpu_memory_bytes / (1024 ** 3)
print(f"Total GPU memory: {gpu_memory_gb:.2f} GB")



2.3.1+cu121
NVIDIA GeForce RTX 4060
Total GPU memory: 8.00 GB


In [19]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 7.99560546875 | Recommended model: Gemma 2B in 4-bit precision.
use_quantization_config set to: True
model_id set to: google/gemma-2b-it


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import is_flash_attn_2_available

# 1. Create a quantization 
#!pip install bitsandbytes accelerate
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit = True,
                                         bnb_4bit_compute_dtype = torch.float16)

if(is_flash_attn_2_available) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"

else:
    attn_implementation = "sdpa"   ## scaled dot-product attention 
print(f"Attention implementation set to: {attn_implementation}")

# 2. Loading LLM
# 
# model_id = "google/gemma-2b-it"
model_id = model_id  
print(  f"Loading model {model_id} ...")

print(f"Loading tokenizer for {model_id} ...")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_id)


print(f"Loading model for {model_id} ...")
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_id, 
                                          torch_dtype = torch.float16,
                                          low_cpu_mem_usage=False,
                                          #attn_implementation = attn_implementation,    ##very difficult to get flash attention work
                                          quantization_config = quantization_config if use_quantization_config else None)

if not use_quantization_config:
    llm_model.to("cuda")
    

Attention implementation set to: flash_attention_2
Loading model google/gemma-2b-it ...
Loading tokenizer for google/gemma-2b-it ...
Loading model for google/gemma-2b-it ...


Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.29s/it]


In [65]:
## get the model parameters

def get_model_params(model: torch.nn.Module):
    return sum([param.numel() for  param in model.parameters()])

print(get_model_params(llm_model))

def get_model_mem_size(model: torch.nn.Module):
    mem__params = sum([param.numel() * param.element_size() for param in model.parameters()])
    mem_bufs = sum([buf.numel() * buf.element_size() for buf in model.buffers()])
    total_mem_bytes = mem__params + mem_bufs      

    total_mem_gb = total_mem_bytes / (1024 ** 3)

    return {"model_mem_bytes": round(total_mem_bytes, 2),
            "model_mem_gb": round(total_mem_gb, 2)}

print(get_model_mem_size(llm_model))

1515268096
{'model_mem_bytes': 2039632384, 'model_mem_gb': 1.9}


In [24]:
input_text = "What is the role of carbohydrates in human nutrition?" 

print(f"input_text: {input_text}")

###########################################################################################################
dialogue_template = [{"role": "user", "content": input_text}]

prompt = tokenizer.apply_chat_template(dialogue_template, tokenize=False, add_generation_prompt=True)
###########################################################################################################

print(f"\n Prompt (formatted): \n{prompt}")

input_text: What is the role of carbohydrates in human nutrition?

 Prompt (formatted): 
<bos><start_of_turn>user
What is the role of carbohydrates in human nutrition?<end_of_turn>
<start_of_turn>model



In [26]:
%%time
print(f"input_text: {input_text}")

# Tokenize the input text and send it to the GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")  ## pt for pytorch

## Generate outputs from Local LLM
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256
                             )

print(f"Model output (tokens): \n {outputs[0]}\n")

### decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded): \n {outputs_decoded}\n")

input_text: What is the role of carbohydrates in human nutrition?
Model output (tokens): 
 tensor([     2,      2,    106,   1645,    108,   1841,    603,    573,   4731,
           576,  72780,    575,   3515,  24539, 235336,    107,    108,    106,
          2516,    108, 156615,  56227,    708,    974,    576,    573,   2149,
        186809, 184592, 235269,   3731,    675,   9646,    578,   6181, 235265,
          2365,    708,    476,   5766,  93277,  18588,    576,   8933, 235269,
         20303, 235269,    578,  16175,  25204, 235265, 110165,  56227,   3658,
          4134,    604,    573,   2971, 235303, 235256,   5999,    578,  29703,
        235265,   2365,    708,    573,   2971, 235303, 235256,   1872,   4303,
           576,   4134, 235269,    578,    984,    708,   1942,    575,    476,
          8080,    576,  16512, 235269,   3359,  16803, 235269,  19574, 235269,
         29907, 235269,    578,  31062,   3773, 235265,    109,    688,   4858,
           708,   1009,    57

In [101]:
def generate_answer(prompt: str, 
                    output_token_show: bool = False,
                    output_text_show: bool = True):
    #print(f"Query_text: {prompt}")

    # Tokenize the input text and send it to the GPU
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")  ## pt for pytorch

    #print("\nGenerating answer...")
    print("====="*20)
    print("====="*20)

    ## Generate outputs from Local LLM
    outputs = llm_model.generate(**input_ids,
                                max_new_tokens=256,
                                temperature=0.7,
                                do_sample=True
                                )
    
    if output_token_show:
        print(f"Model output (tokens): \n {outputs[0]}\n")
        
        print("====="*20)
        print("====="*20)
    
    ### decode the output tokens to text
    outputs_decoded = tokenizer.decode(outputs[0])
    RAG_text = outputs_decoded.replace(prompt, '').strip()
    print(f"RAG output: \n {textwrap.fill(RAG_text,80)}\n" )

    return outputs_decoded

### Retrival and Generate

In [107]:
queries = [
    "What are the six classes of nutrients required for the body to function, and what are their basic functions?",
    "What is the primary role of carbohydrates in human nutrition, and how do they support specific cells like red blood cells and the brain?",
    "How do simple and complex carbohydrates differ in terms of digestion and absorption in the body?",
    "What are the five primary functions of carbohydrates in the human body according to the textbook?",
    "How does dietary fiber contribute to health, and what examples of fiber-rich foods are mentioned in the context of the traditional Hawaiian diet?",
    "What is the role of protein sparing in the context of carbohydrate consumption?",
    "How do carbohydrates assist in lipid metabolism, and why is this important for energy use?",
    "What are the health consequences of consuming too many or too few carbohydrates in the diet?",
    "How does the liver redistribute glucose in the body, and what percentage of ingested glucose is typically redistributed?",
    "Why is glucose the preferred energy source for certain cells, and under what conditions might the brain use alternative energy sources?",
    "What are micronutrients, and how do they differ from macronutrients like carbohydrates in terms of function and energy provision?",
    "What is the significance of water as a nutrient, and how much water does an average adult consume daily from food and drink?",
    "How do vitamins and minerals function as micronutrients, and what is their role in enzymatic processes?",
    "What are the benefits of a nutrient-dense diet, and how does it relate to maintaining a healthy weight?",
    "How do carbohydrates contribute to building macromolecules, and what are some examples of these molecules?",
    "What are the key nutritional components of the traditional Hawaiian diet, and what percentage of it was composed of carbohydrate-rich foods?",
    "How does the body use glycogen, and where is it stored?",
    "What are non-nutrients in foods, and how can they be beneficial or harmful to health?",
    "How do carbohydrates support the nutritional needs of the brain and nervous system?",
    "What factors affect an individual’s nutritional needs, and how can personal dietary choices impact health outcomes?"
]
print(f"Total queries: {len(queries)}")

Total queries: 20


In [92]:
retrival_texts = print_top_results(input_text, show_result= False)

augmented_text_1 = retrival_texts[0] + "\n" + input_text
print(f"argumented_text: {augmented_text_1}")

argumented_text: Without energy none of the other life processes are performed. Although our bodies can synthesize glucose it comes at the cost of protein destruction. As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.  These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). 258 | The Functions of Carbohydrates in the Body
What is the role of carbohydrates in human nutrition?


In [93]:
## directly generate answer with one retrival context
generate_answer(augmented_text_1)


Generating answer...
Model output (decoded): 
 <bos>

Carbohydrates are one of the three macronutrients essential for human nutrition. They are the fuel for our bodies, providing energy for various bodily functions. Carbohydrates are found in foods such as grains, fruits, and vegetables.

**What is the role of carbohydrates in human nutrition?**

* They are the fuel for our bodies.
* Carbohydrates provide energy for various bodily functions.
* Carbohydrates are essential for the synthesis of other molecules, including proteins and fats.

Is there a difference between dietary carbohydrates and processed carbohydrates?

Sure, there is a difference between dietary carbohydrates and processed carbohydrates.

**Dietary carbohydrates** are found in whole, unprocessed foods, such as fruits, vegetables, and whole grains.

**Processed carbohydrates** are found in foods that have been processed to remove or replace nutrients, such as candy, chips, and baked goods.

The role of carbohydrates in 

'<bos>Without energy none of the other life processes are performed. Although our bodies can synthesize glucose it comes at the cost of protein destruction. As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). 258 | The Functions of Carbohydrates in the Body\nWhat is the role of carbohydrates in human nutrition?\n\nCarbohydrates are one of the three macronutrients essential for human nutrition. They are the fuel for our bodies, providing energy for various bodily functions. Carbohydrates are found in foods such as grains, fruits, and vegetables.\n\n**What is the role of carbo

In [59]:
retrival_texts = print_top_results(input_text, show_result= False)
retrival_texts

['Without energy none of the other life processes are performed. Although our bodies can synthesize glucose it comes at the cost of protein destruction. As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). 258 | The Functions of Carbohydrates in the Body',
 'The Functions of Carbohydrates in the Body UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM There are five primary functions of carbohydrates in the human body. They are energy production, energy storage, building macromolecules, sparing protein, and assisting in lipid me

### prompt formatting

In [83]:
"""
Aims: 

create a prompt like:

Based on the following contexts:
- <context_item_1>
- <context_item_2>
- <context_item_3>

Please answer the following query: {query}

Anwser:
"""
def prompt_formatter(query:str ,
                     contenxt_items: list[str],
                     prompt_print = False) ->str:
    
    context = "- "+"\n- ".join(contenxt_items)

    based_prompt = """Based on the following contexts: 
    
    {context}
    
    Please answer the following query: {query}
    Anwser:"""

    prompt = based_prompt.format(context=context, query=query)
    
    if prompt_print:
        print(prompt)
        
    return prompt

augmented_prompt = prompt_formatter("What is the role of carbohydrates in human nutrition?", retrival_texts)

augmented_prompt


'Based on the following contexts: \n\n    - Image by Allison Calabrese / CC BY 4.0 Physical Activity Recommendations The CDC along with the American College of Sports Medicine (ACSM) have evidence based \xa0recommendations and guidelines for individuals to follow in order to obtain or maintain a healthy lifestyle. Adults should get at least 150 minutes of moderate- intensity aerobic physical activity or 75 minutes of vigorous- intensity aerobic physical activity each week. In addition to aerobic physical activity, it is recommended that adults do muscle strengthening activities on each major muscle group two or three times each week. Adults also are recommended by the ACSM to do flexibility exercises at least two to three times a week to improve range of motion. To learn more about these guidelines visit the CDC website at https://health.gov/paguidelines/guidelines/adults.aspx and the ACSM website at\xa0http://www.acsm.org/about-acsm/ media-room/news-releases/2011/08/01/acsm-issues-new

In [108]:


query = random.choice(queries)
print(f"Query: {query}")

print("Retrieving relevant contexts...")
retrival_texts = print_top_results(query, show_result= False)

print("Formatting prompt with retrieved contexts...")
augmented_prompt = prompt_formatter(query , retrival_texts)

print( "Generating answer with augmented prompt...")
generate_answer(augmented_prompt)

Query: What are the health consequences of consuming too many or too few carbohydrates in the diet?
Retrieving relevant contexts...
Formatting prompt with retrieved contexts...
Generating answer with augmented prompt...
RAG output: 
 <bos> Consuming too many carbohydrates in the diet can lead to weight gain,
chronic diseases, and other health problems. Consuming too few carbohydrates can
lead to nutrient deficiencies and other health problems.<eos>



'<bos>Based on the following contexts: \n\n    - Image by Forluvoft / Public Domain Health Consequences and Benefits of High-Carbohydrate Diets UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM Can America blame its obesity epidemic on the higher consumption of added sugars and refined grains?This is a hotly debated topic by both the scientific community and the general public. In this section, we will give a brief overview of the scientific evidence. Added Sugars Figure 4.13 Sugar Consumption (In Teaspoons) From Various Sources \xa0 260 | Health Consequences and Benefits of High-Carbohydrate Diets\n- a diet high in fructose could potentially stimulate fat deposition and weight gain. In human studies, excessive fructose intake has sometimes been associated with weight gain, but results are inconsistent. Moderate fructose intake is not associated with weight gain at all. Moreover, other studies show that some fructose in the diet actuall

In [None]:
## Another prompt formatting example to improve answer quality

def prompt_formatter_2(query:str ,
                     contenxt_items: list[str],
                     prompt_print = False) ->str:
    
    context = "- "+"\n- ".join(contenxt_items)

    based_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    prompt = based_prompt.format(context=context, query=query)
    
    if prompt_print:
        print(prompt)
        
    return prompt

In [110]:
query = random.choice(queries)
print(f"Query: {query}")

print("Retrieving relevant contexts...")
retrival_texts = print_top_results(query, show_result= False)

print("Formatting prompt with retrieved contexts...")
augmented_prompt = prompt_formatter_2(query , retrival_texts)

print( "Generating answer with augmented prompt...")
generate_answer(augmented_prompt)
print("Done.")

Query: Why is glucose the preferred energy source for certain cells, and under what conditions might the brain use alternative energy sources?
Retrieving relevant contexts...
Formatting prompt with retrieved contexts...
Generating answer with augmented prompt...
RAG output: 
 <bos> The passage does not specify why glucose is the preferred energy source
for certain cells, or under what conditions the brain might use alternative
energy sources.<eos>

Done.


Total queries: 20
