# 1. RAG - Data preprocess

## 1.1 Data Process: text over page, then split the sentences.

### 1.1.1 download pdf and simple process

In [2]:
import os
import requests

# Get PDF document path
pdf_path = "human-nutrution-text.pdf"

# Download the PDF
if not os.path.exists(pdf_path):
    print("[INFO]File doesn't exist Downloading...")
    
    #Enter the URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    
    #The local filename to save the downloaded file
    filename = pdf_path
    
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        #Open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"[INFO]File downloaded and saved as {filename}")
        
    else:
        print(f"[INFO]Failed to download file. Status code: {response.status_code}")
else:
    print(f"[INFO]File {pdf_path} already exists.")


[INFO]File human-nutrution-text.pdf already exists.


In [3]:
from tqdm.auto import tqdm
try:
    import fitz
except ImportError:
    import sys
    !{sys.executable} -m pip install PyMuPDF
    import fitz
    
def text_formaater(text: str) -> str:
    """Performs. minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    
    return cleaned_text

# read pdf file in list[dict]
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formaater(text= text)
        pages_and_texts.append({"page_number": page_number -41, 
                               "page_char_count": len(text), 
                               "page_word_count": len(text.split(" ")),
                               "page_sentence_count" : len(text.split(". ")),
                               "page_token_count": len(text)/4, # Rough estimate of tokens
                               "text":text
                               })
        
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path)

0it [00:00, ?it/s]

In [4]:
import random
#view some of the pages and texts
#display(random.sample(pages_and_texts, k=3))

import pandas as pd
df = pd.DataFrame(pages_and_texts)
display(df[500:503])
display(df.describe().round(2))

  from pandas.core import (


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
500,459,745,120,4,186.25,Learning Activities Technology Note: The seco...
501,460,1958,337,19,489.5,The Atom UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD ...
502,461,1339,230,12,334.75,Image by DoSiDo / CC BY-SA 3.0 Atoms and mo...


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


### 1.1.2 split sentence by using spacy (nlp tool)

In [5]:
try:
    from spacy.lang.en import English
except ModuleNotFoundError:
    import sys
    !{sys.executable} -m pip install spacy
    from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

## example for sentence splitting
test_sentences = "This is a sentence. This is another one. And this is the last one."
doc = nlp(test_sentences)
print(f"Original: {test_sentences}")
print(f"After split: {list(doc.sents)}")

Original: This is a sentence. This is another one. And this is the last one.
After split: [This is a sentence., This is another one., And this is the last one.]


In [6]:
for item in tqdm(pages_and_texts):
    # Process the text with spaCy
    item["sentences"] = list(nlp(item["text"]).sents)
    
    #make sure to convert sentences to string
    item["sentences"] = [str(sentence)for sentence in item["sentences"]]
    
    # Add the sentence count to the item
    item["page_sentence_count_spacy"] = len(item["sentences"])
    
    
#view some of the pages and texts
random.sample(pages_and_texts, k=1)


  0%|          | 0/1208 [00:00<?, ?it/s]

[{'page_number': 121,
  'page_char_count': 764,
  'page_word_count': 137,
  'page_sentence_count': 7,
  'page_token_count': 191.0,
  'text': '“Axial  Skeleton” by  Openstax  College / CC  BY 3.0\xa0 providing structural support, red and white blood cells and platelets  are synthesized in bone marrow. Another vital function of bones  is that they act as a storage depot for minerals such as calcium,  phosphorous, and magnesium. Although bone tissue may look  inactive at first glance, at the microscopic level you will find that  bones are continuously breaking down and reforming. Bones also  contain a complex network of canals, blood vessels, and nerves that  allow for nutrient transport and communication with other organ  systems.  Figure 2.25 Human Skeletal Structure  The human skeleton contains 206 bones. It is divided into two main  parts, the axial and appendicular.  The Skeletal System  |  121',
  'sentences': ['“Axial  Skeleton” by  Openstax  College / CC  BY 3.0\xa0 providing stru

## 1.2 Chunk the sentence together

### 1.2.1 chunk sentence function

In [7]:
num_sentence_chunk_size = 10

## chunking function
def split_list(input_list : list[str],
               slice_size : int= num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

#to check if the function works apporpiately
test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [8]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"]=split_list(item["sentences"])
    item["num_chunks"] = len(item["sentence_chunks"])
    
print(f"after chunking:")
display(random.sample(pages_and_texts, k=1))
df = pd.DataFrame(pages_and_texts)
display(df.head())
display(df.describe().round(2))

  0%|          | 0/1208 [00:00<?, ?it/s]

after chunking:


[{'page_number': 250,
  'page_char_count': 1641,
  'page_word_count': 313,
  'page_sentence_count': 13,
  'page_token_count': 410.25,
  'text': 'Pretzels  83  White bread  70  White rice  72  Bagel  72  Rice milk  86  Cheerios  74  Raisin Bran  73  Fruit roll-up  99  Gatorade  78  For  the  Glycemic  Index  on  different  foods,  visit  http://www.mendosa.com/gilists.htm.  The type of carbohydrate within a food affects the GI along with  its fat and fiber content. Increased fat and fiber in foods increases  the time required for digestion and delays the rate of gastric  emptying into the small intestine which, ultimately reduces the GI.  Processing and cooking also affects a food’s GI by increasing their  digestibility. Advancements in the technologies of food processing  and the high consumer demand for convenient, precooked foods in  the United States has created foods that are digested and absorbed  more rapidly, independent of the fiber content. Modern breakfast  cereals, breads, p

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],1,[[Human Nutrition: 2020 Edition]],1
1,-40,0,1,1,0.0,,[],0,[],0
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,1,[[Human Nutrition: 2020 Edition UNIVERSITY O...,1
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,1,[[Human Nutrition: 2020 Edition by University ...,1
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,2,[[Contents Preface University of Hawai‘i at ...,1


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### 1.2.2 groupby page[chunk]

In [9]:

import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        #Join the sentences together into a paragraph-like structure, i.e. join the list of sentence into one paragraph
        # list to string
        joined_sentences_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentences_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentences_chunk) # ".A" to ". A"

        chunk_dict["sentence_chunk"] = joined_sentences_chunk
        
        chunk_dict["chunk_char_count"] = len(joined_sentences_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentences_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentences_chunk)/4     # Rough estimate of tokens
        
        pages_and_chunks.append(chunk_dict)
        
len(pages_and_chunks)
        

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [10]:
df= pd.DataFrame(pages_and_chunks)
df.head()
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


In [11]:
min_token_length = 30
## almost all of the chunks with under 30 tokens are not useful
for row in df[df["chunk_token_count"]< min_token_length].sample(5).iterrows():
    print(f"chunk token: {row[1]['chunk_token_count']} ||sentence chunk: {row[1]['sentence_chunk']}")

chunk token: 9.75 ||sentence chunk: Older Adulthood: The Golden Years | 925
chunk token: 16.0 ||sentence chunk: Accessed January 20, 2018. 1032 | The Effect of New Technologies
chunk token: 12.75 ||sentence chunk: PART VI CHAPTER 6. PROTEIN Chapter 6. Protein | 357
chunk token: 7.0 ||sentence chunk: Water-Soluble Vitamins | 553
chunk token: 11.25 ||sentence chunk: Accessed March 17, 2011. 212 | Water Concerns


### 1.2.3 filter out the chunks with under 30 tokens

In [12]:

pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] >= min_token_length].to_dict(orient="records")
display(pages_and_chunks_over_min_token_length[:2])
random.sample(pages_and_chunks_over_min_token_length, k=1)

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

[{'page_number': 766,
  'sentence_chunk': 'Note: There may be a few foods that do not fall in either of the 3 food groups, which the SPC regards as not essential or needed in significant amounts to achieve healthy eating. The recommendations for these miscellaneous food items are to “Eat Less” of, but not necessarily to avoid completely as they may have cultural and traditional value. Such food items will have this pin. To learn more about how to use the Pacific Food Guide visit http://manoa.hawaii.edu/ctahr/pacificfoodguide/index.php/ nutritional-guidelines/how-to-use-the-guide/ Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning activities may be used across various mobile devices, however, for the best user exp

## 1.3 Embedding the sentence

#### 1.3.1 Embedding model download 

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device="cpu")


  Referenced from: /Users/yaukahei/anaconda3/lib/python3.11/site-packages/torchvision/image.so
  warn(


README.md: 0.00B [00:00, ?B/s]

In [14]:

sentences = ["This is a sentence.", "This is another one.", "And this is the last one."]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding(first 5): {embedding[:5]}")
    print(f"Embedding shape: {embedding.shape}")

Sentence: This is a sentence.
Embedding(first 5): [ 0.04461049 -0.01864093 -0.00090317  0.02639336 -0.04188507]
Embedding shape: (768,)
Sentence: This is another one.
Embedding(first 5): [ 0.02019227  0.02981547  0.0016782  -0.04361513 -0.03454698]
Embedding shape: (768,)
Sentence: And this is the last one.
Embedding(first 5): [ 0.01193432  0.03599298 -0.02380107 -0.02242428  0.02393363]
Embedding shape: (768,)


In [23]:
embeddings[0].shape

(768,)

#### 1.3.2 Embedding chunks one by one with using CPU

In [27]:
#%%time 

cpu_run = True

if cpu_run:
    embedding_model.to("cpu")

    for item in tqdm(pages_and_chunks_over_min_token_length):
        item["embedding"] = embedding_model.encode(item["sentence_chunk"])
        item["embedding_shape"] = item["embedding"].shape
    
print("CPU times: user 4min 13s, sys: 44.4 s, total: 4min 57s \n Wall time: 3min 30s")## on MacBook Pro M2 Pro 16GB RAM


  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: user 4min 13s, sys: 44.4 s, total: 4min 57s 
 Wall time: 3min 30s



#### 1.3.3 Save embeddings to file ( Don't run it if the embeddings is not ran above)


In [28]:
text_chunks_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
embeddings_df_save_path = "text_chunks_embeddings_df.csv"
text_chunks_embeddings_df.to_csv(embeddings_df_save_path, index=False)


In [29]:
# import saved file and view

text_chunks_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_embeddings_df_load.head()


# notice saving embeddings data to csv file may not be the best way, 
# in stead of doing that, vectors database is recommended when the data is large

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding,embedding_shape
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242675e-02 9.02280584e-02 -5.09552285e-...,"(768,)"
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52155972e-02 5.92137724e-02 -1.66167505e-...,"(768,)"
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79801637e-02 3.39813568e-02 -2.06427071e-...,"(768,)"
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82567060e-02 3.81274000e-02 -8.46859720e-...,"(768,)"
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264121e-02 -8.49774759e-03 9.57154669e-...,"(768,)"


# 2. RAG - Top relevant Search

### 2.1 import data

In [31]:
text_chunks_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding,embedding_shape
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.06742427, 0.09022806, -0.005095523, -0.0317...","(768,)"
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.055215597, 0.059213772, -0.01661675, -0.020...","(768,)"
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.50,"[0.027980164, 0.033981357, -0.020642707, 0.001...","(768,)"
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,"[0.068256706, 0.0381274, -0.008468597, -0.0181...","(768,)"
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.50,"[0.033026412, -0.008497748, 0.009571547, -0.00...","(768,)"
...,...,...,...,...,...,...,...
1675,1164,Flashcard Images Note: Most images in the flas...,1305,176,326.25,"[0.018562289, -0.016427929, -0.012704643, -0.0...","(768,)"
1676,1164,Hazard Analysis Critical Control Points reused...,375,51,93.75,"[0.033472236, -0.057044152, 0.015148944, -0.01...","(768,)"
1677,1165,ShareAlike 11. Organs reused “Pancreas Organ A...,1286,173,321.50,"[0.0770514, 0.009785434, -0.0121816965, 0.0010...","(768,)"
1678,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,410,59,102.50,"[0.10304509, -0.016470246, 0.00826844, 0.03779...","(768,)"


In [32]:
import random

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the embeddings from CSV file
text_chunks_embeddings_df = pd.read_csv("text_chunks_embeddings_df.csv")

#Convert the embeddings from string representation to numpy arrays
text_chunks_embeddings_df["embedding"] = text_chunks_embeddings_df["embedding"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ").astype(np.float32)
)

embeddings = torch.tensor(np.stack(text_chunks_embeddings_df["embedding"].tolist(), 
                      axis=0))


print(f"embeddings : {embeddings}")
print(f"embeddings shape: {embeddings.shape}")


Using device: cpu
embeddings : tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]])
embeddings shape: torch.Size([1680, 768])


### 2.2 loading embedding model and doing sematic search pipeline

Doing sematic search pipeline with the following steps:
1. define query
2. embedding the query
3. cosine similarity with text embeddings and query embedding
4. find top k highest similarity from result

In [33]:
from sentence_transformers import SentenceTransformer, util

try:
    if embedding_model is None:
        # Load the embedding model
        embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device=device)
except NameError:
    embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2", device=device)


In [34]:
query = "What is the role of carbohydrates in human nutrition?"
query_embedding = embedding_model.encode(query, device=device, convert_to_tensor=True)

print(f"Enter query: {query}")
print(f"Query embedding shape: {query_embedding.shape}")


Enter query: What is the role of carbohydrates in human nutrition?
Query embedding shape: torch.Size([768])


In [35]:
%%time

k=5
## Perform semantic search by dot product with no normalization

dot_scores = util.dot_score(query_embedding, embeddings)[0]
print(dot_scores.shape)

top_k = torch.topk(dot_scores, k=k)
print(f"top {k} results: {top_k.indices}")

torch.Size([1680])
top 5 results: tensor([389, 381, 390, 347,  41])
CPU times: user 1.95 ms, sys: 2.83 ms, total: 4.78 ms
Wall time: 8.35 ms


In [36]:
## checking local memoery
import torch
if torch.cuda.is_available():
    torch.cuda.get_device_properties(0)

In [37]:
import platform
import psutil

print("Processor:", platform.processor())
print("Machine:", platform.machine())
print("Platform:", platform.platform())
print("CPU cores (physical):", psutil.cpu_count(logical=False))
print("CPU cores (total):", psutil.cpu_count(logical=True))
print("CPU frequency (MHz):", psutil)

Processor: arm
Machine: arm64
Platform: macOS-12.4-arm64-arm-64bit
CPU cores (physical): 8
CPU cores (total): 8
CPU frequency (MHz): <module 'psutil' from '/Users/yaukahei/anaconda3/lib/python3.11/site-packages/psutil/__init__.py'>


### 2.3 show the pages for the result with following steps:

1. open the pdf in image
2. show the image with matplotlib

In [38]:
import fitz
import matplotlib.pyplot as plt


def pdf_img_show(num_page: int):

    pdf_path = "human-nutrution-text.pdf"
    doc = fitz.open(pdf_path)
    page = doc.load_page(num_page+41)

    img = page.get_pixmap(dpi=300)

    doc.close()

    img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))
    
    plt.figure(figsize= (13,10))
    plt.imshow(img_array)
    plt.axis("off")
    plt.show()

#pdf_img_show(50)


In [39]:
# show the top k result in text
top_k_embeddings = embeddings[top_k.indices]

print(f"Query : {query}\n")

total_token_count = 0
total_word_count = 0

for i, idx in enumerate(top_k.indices):
    page_number = text_chunks_embeddings_df["page_number"].iloc[int(idx)]+41
    print(f"Top {i+1} relevant result (from page{page_number}): ")
    
    text_chunks = text_chunks_embeddings_df["sentence_chunk"].iloc[int(idx)]
    total_token_count += text_chunks_embeddings_df["chunk_token_count"].iloc[int(idx)]
    total_word_count += text_chunks_embeddings_df["chunk_word_count"].iloc[int(idx)]
    
    for sentence in [ n for n in text_chunks.split(".")]:
        print(sentence)
    print("\n\n")
    
print(f"Total token count: {total_token_count}")
print(f"Total word count: {total_word_count}")

Query : What is the role of carbohydrates in human nutrition?

Top 1 relevant result (from page299): 
Without energy none of the other life processes are performed
 Although our bodies can synthesize glucose it comes at the cost of protein destruction
 As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems
 Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities
  These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document)
 258 | The Functions of Carbohydrates in the Body



Top 2 relevant result (from page294): 
The Functions of Carbohydrates in the Body UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM There are five primary functions of carbohydr

In [40]:
def retrival_search(query : str,
             top_k : int = 5,
             embeddings : torch.tensor = embeddings,
             embedding_model : SentenceTransformer = embedding_model,
             
             device: str = device):
    
    #step 1: Turn query to embedding
    query_embedding = embedding_model.encode(query, device=device, convert_to_tensor=True)
    
    #step 2: Find top k <query_embedding, embeddings>
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    top_k = torch.topk(dot_scores, k=top_k+1)
    
    #step 3: text_chunk_df[top_k.indices]
    indices = top_k.indices
    scores = top_k.values
    
    return indices, scores
    

In [41]:
%%time
retrival_search(query)

CPU times: user 48.6 ms, sys: 21.1 ms, total: 69.7 ms
Wall time: 69.1 ms


(tensor([389, 381, 390, 347,  41,  47]),
 tensor([0.7348, 0.7205, 0.7136, 0.6789, 0.6650, 0.6602]))

### 2.3 Funtionalization

In [42]:
import textwrap

def print_top_results(query : str,        
                      top_k : int = 5,
                      embeddings : torch.tensor = embeddings,
                      embedding_model : SentenceTransformer = embedding_model,
                      text_chunk_df : pd.DataFrame = text_chunks_embeddings_df,
                      show_result = True):
    
    ## find top k relevant text chunk
    indices , scores = retrival_search(query, top_k, embeddings, embedding_model)
    
    top_k_relevant_text = [text_chunk_df["sentence_chunk"].iloc[int(idx)] for idx in indices]
    
    if show_result:
        print(f"Query: {query}\n")
        print("---"*40)
    
    relevant_texts = list()
    
    for i, idx in enumerate(indices):
        text = text_chunk_df["sentence_chunk"].iloc[int(idx)]
        relevant_texts.append(text)
        page_number = int(text_chunks_embeddings_df["page_number"].iloc[int(idx)])
        
        if show_result:
            pdf_img_show(page_number)
            print(f"Top {i+1} relevant text")
            print(textwrap.fill(text, 80)+"\n")
            print(f"Source page: {page_number}")
            print("---"*40)
        
    return relevant_texts

In [43]:
print_top_results("What is the role of carbohydrates in human nutrition?", show_result = False)

['Without energy none of the other life processes are performed. Although our bodies can synthesize glucose it comes at the cost of protein destruction. As with all nutrients though, carbohydrates are to be consumed in moderation as having too much or too little in the diet may lead to health problems. Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). 258 | The Functions of Carbohydrates in the Body',
 'The Functions of Carbohydrates in the Body UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM There are five primary functions of carbohydrates in the human body. They are energy production, energy storage, building macromolecules, sparing protein, and assisting in lipid me