In [None]:
import os
import requests

def download_pdf(url: str = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf",
                 file_path: str = "human-nutrition.pdf") -> str:
  """Download a PDF to ``file_path`` unless a file already exists there.

  Args:
    url: Address fetched with an HTTP GET request.
    file_path: Destination path; an existing file at this path skips the download.

  Returns:
    A human-readable status message. Network errors, timeouts and non-200
    responses are reported in the returned message instead of being raised.
  """
  if os.path.exists(file_path):
    return ("File already exists.")

  print("-> File doesn't exist, downloading...")
  try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=60)
  except requests.exceptions.RequestException as error:
    return (f"-> Unable to download the file: {error}")

  if response.status_code != 200:
    return (f"-> Unable to download the file: {response.status_code}")

  with open(file_path, "wb") as file:
    file.write(response.content)
  return (f"-> File saved successfully with filename: {file_path}")

def upload_pdf(file_path: str = "human-nutrition.pdf") -> str:
  """Report whether a file is present at ``file_path``.

  Despite the name, nothing is uploaded: the function only checks the path.

  Args:
    file_path: Path to look for.

  Returns:
    "File exists." when the path exists, otherwise "File doesn't exist.".
  """
  found = os.path.exists(file_path)
  return "File exists." if found else "File doesn't exist."

if __name__ == "__main__":
  # Small interactive menu: option 1 downloads the PDF, option 2 checks that it is present.
  # Blank answers fall back to the default URL / file name.
  print("Menu:", "1. Download PDF", "2. Upload PDF", sep="\n")
  choice = input("Enter your choice: ")

  if choice not in ("1", "2"):
    print("Wrong Input")
  elif choice == "1":
    url = input("Enter the url (leave blank for default): ").strip()
    file_path = input("Enter the file path (leave blank for default): ").strip()
    url = url or "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    file_path = file_path or "human-nutrition.pdf"
    print(download_pdf(url, file_path))
  else:
    file_path = input("Enter the file path (leave blank for default): ").strip() or "human-nutrition.pdf"
    print(upload_pdf(file_path))

Menu:
1. Download PDF
2. Upload PDF
Enter your choice: 1
Enter the url (leave blank for default): 
Enter the file path (leave blank for default): 
-> File doesn't exist, downloading...
-> File saved successfully with filename: human-nutrition.pdf


In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [None]:
!pip install tqdm



In [None]:
import fitz
from tqdm.auto import tqdm

def clean_text(text: str) -> str:
  """Flatten ``text`` onto one line.

  Every newline becomes a single space, then leading and trailing
  whitespace is stripped.
  """
  return " ".join(text.split("\n")).strip()

def open_read_pdf(path: str) -> list[dict]:
  """Extract the cleaned text of every page of a PDF.

  Args:
    path: Location of the PDF file to open.

  Returns:
    One dict per page, in document order, with keys ``"page_number"``
    (zero-based index of the page) and ``"text"`` (the page text after
    ``clean_text``).
  """
  pages_texts = []
  # Opening the document as a context manager closes the file even if extraction fails part-way.
  with fitz.open(path) as pdf:
    # enumerate() hides the length from tqdm, so pass the page count explicitly to get a real progress bar.
    for page_number, page in tqdm(enumerate(pdf), total=len(pdf)):
      pages_texts.append({
          "page_number": page_number,
          "text": clean_text(page.get_text()),
      })
  return pages_texts

# Parse the PDF chosen in the menu cell; `file_path` only exists once that cell has run with choice 1 or 2.
pages_texts = open_read_pdf(file_path)

# Preview the first two page records.
pages_texts[:2]

0it [00:00, ?it/s]

[{'page_number': 0, 'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': 1, 'text': ''}]

In [None]:
import random

random.sample(pages_texts, k=3)

[{'page_number': 527,
  'text': 'Hypothalam us by  Methoxyroxy ~commonswi ki / Public  Domain  contains distinct centers of neural circuits that regulate hunger and  satiety (Figure 8.7).  Figure 8.7 Sagittal View of the Brain  This is a scan of a brain. The hypothalamus contains distinct centers  of neural circuits that regulate hunger and satiety.  Hunger pangs are real and so is a “growling” stomach. When the  stomach is empty it contracts, producing the characteristic pang  and “growl.” The stomach’s mechanical movements relay neural  signals to the hypothalamus, which relays other neural signals to  parts of the brain. This results in the conscious feeling of the need  to eat. Alternatively, after you eat a meal the stomach stretches and  sends a neural signal to the brain stimulating the sensation of satiety  and relaying the message to stop eating. The stomach also sends out  certain hormones when it is full and others when it is empty. These  hormones communicate to the hypotha

In [None]:
import pandas as pd

df = pd.DataFrame(pages_texts)
df.head()

Unnamed: 0,page_number,text
0,0,Human Nutrition: 2020 Edition
1,1,
2,2,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,3,Human Nutrition: 2020 Edition by University of...
4,4,Contents Preface University of Hawai‘i at Mā...


In [None]:
from spacy.lang.en import English

# The sentence-splitting cell below calls `nlp`, so the pipeline has to be created here
# (with this cell commented out, a fresh kernel fails with NameError).
nlp = English()

# The sentencizer component is what makes `doc.sents` available on a blank pipeline.
nlp.add_pipe("sentencizer")

# Example usage:
# doc = nlp("I am a human being. I like machine learning. I would also like to work for an org. where i will be at a good position.")
# print(list(doc.sents))

In [None]:
# Attach to every page record the list of its sentences as plain strings.
for item in tqdm(pages_texts):
  # `.sents` yields spaCy objects rather than strings, hence the str() conversion.
  item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

  0%|          | 0/1208 [00:00<?, ?it/s]

In [None]:
df = pd.DataFrame(pages_texts)
df.head()

Unnamed: 0,page_number,text,sentences
0,0,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition]
1,1,,[]
2,2,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...
3,3,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...
4,4,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...


In [None]:
df.iloc[600]

Unnamed: 0,600
page_number,600
text,Image by Allison Calabrese / CC BY 4.0 Kor...
sentences,[Image by Allison Calabrese / CC BY 4.0 Ko...


In [None]:
# Number of sentences grouped into one chunk; read by the chunking cell below.
chunk_size = 10

def create_chunks(input_list: list, chunk_size: int) -> list[list[str]]:
  """Split ``input_list`` into consecutive slices of at most ``chunk_size`` items.

  Args:
    input_list: Items to group, in order.
    chunk_size: Maximum number of items per chunk.

  Returns:
    A list of lists; every chunk has ``chunk_size`` items except possibly the
    last one. An empty input gives an empty list.
  """
  return [input_list[i : i+chunk_size] for i in range(0, len(input_list), chunk_size)] # [["a", "b"....10 values],[10 values],....], 10 = chunk_size

# Example:
# test = list(range(25))
# create_chunks(test, chunk_size)

In [None]:
# Group each page's sentences into lists of at most `chunk_size` sentences.
for item in tqdm(pages_texts):
  item["sentences_chunks"] = create_chunks(item["sentences"], chunk_size)
  # Optional statistic, currently disabled:
  # item["sentences_chunks_count"] = len(item["sentences_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [None]:
pages_texts[1115]

{'page_number': 1115,
 'text': 'connection to body cues of hunger and fullness and the selection  of foods based on both pleasure and nutrient density.24\xa0 When an  individual is able to disconnect from diet culture and a focus on  weight loss, they are able to eat in ways that support having energy  throughout the day and feel competent around all foods.\xa0 Instead  of an outward focus on counting calories, dietary restriction, and  measuring food portions, intuitive eating teaches a mindfulness  practice of going inward and learning to respond to the cues and  rhythms of the body.\xa0 A review of 22 intervention studies that  compared traditional weight loss to the intuitive eating approach  concluded that the participants in the non-diet groups were able  to stop unhealthy weight controlling behaviors, improve metabolic  fitness and reduce risk factors, increase body satisfaction, and  improve psychological distress.25 \xa0While the non-diet approach has  not been shown to promot

In [None]:
df = pd.DataFrame(pages_texts)
df.head()

Unnamed: 0,page_number,text,sentences,sentences_chunks
0,0,Human Nutrition: 2020 Edition,[Human Nutrition: 2020 Edition],[[Human Nutrition: 2020 Edition]]
1,1,,[],[]
2,2,Human Nutrition: 2020 Edition UNIVERSITY OF ...,[Human Nutrition: 2020 Edition UNIVERSITY OF...,[[Human Nutrition: 2020 Edition UNIVERSITY O...
3,3,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,[[Human Nutrition: 2020 Edition by University ...
4,4,Contents Preface University of Hawai‘i at Mā...,[Contents Preface University of Hawai‘i at M...,[[Contents Preface University of Hawai‘i at ...


In [None]:
df.shape

(1208, 4)

In [None]:
import re

# Flatten the per-page sentence chunks into one list of chunk records.
pages_chunks = []
for item in tqdm(pages_texts): # One record per page
  for sentence_chunk in item["sentences_chunks"]: # Each chunk is a list of sentence strings
    # The sentences were stored via str(span), which is expected to drop the whitespace that
    # followed each sentence (see spaCy Span.text), so join with a space instead of gluing
    # neighbouring sentences together.
    joined_sentence_chunk = " ".join(sentence_chunk)
    # Collapse every run of spaces; a single replace("  ", " ") pass leaves longer runs behind.
    joined_sentence_chunk = re.sub(r" {2,}", " ", joined_sentence_chunk).strip()
    joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk) # .A -> . A

    pages_chunks.append({
        "page_number": item["page_number"],
        "sentence_chunk": joined_sentence_chunk,
        "chunk_token_count": len(joined_sentence_chunk)/4, # rough estimate: 1 token ~= 4 characters
    }) # [{page_number: 41, sentence_chunk: "ab...."}, {page_number: 41, sentence_chunk: "xy...."}]

len(pages_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [None]:
# Preview a few records instead of dumping the whole list into the notebook output.
pages_chunks[:3]

[{'page_number': 0,
  'sentence_chunk': 'Human Nutrition: 2020 Edition',
  'chunk_token_count': 7.25},
 {'page_number': 2,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_token_count': 77.0},
 {'page_number': 3,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_token_count': 52.5},
 {'page_number': 4,
  'sentence_chunk': 'Contents Preface University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program xxv About the Contributors University of Hawai‘i at Mānoa Food Science an

In [None]:
df = pd.DataFrame(pages_chunks)
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_token_count
0,0,Human Nutrition: 2020 Edition,7.25
1,2,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,77.0
2,3,Human Nutrition: 2020 Edition by University of...,52.5
3,4,Contents Preface University of Hawai‘i at Māno...,191.5
4,5,Lifestyles and Nutrition University of Hawai‘i...,235.25


In [None]:
# Threshold on the estimated token count; the filtering cell below keeps only chunks above it.
min_token_length = 30

# Uncomment to inspect a few of the chunks at or below the threshold:
# for row in df[df["chunk_token_count"]<=min_token_length].sample(5).iterrows():
#   print(row[1]["chunk_token_count"])

In [None]:
# df[df["chunk_token_count"]>min_token_length].to_dict(orient="records")

In [None]:
# Keep only the chunks whose estimated token count (characters / 4) is greater than min_token_length.
pages_chunks_over_min_token = df[df["chunk_token_count"]>min_token_length].to_dict(orient="records")
pages_chunks_over_min_token[:2]

[{'page_number': 2,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_token_count': 77.0},
 {'page_number': 3,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_token_count': 52.5}]

In [None]:
random.sample(pages_chunks_over_min_token, k=1)

[{'page_number': 1154,
  'sentence_chunk': 'The New England Journal of Medicine, 346(6), 393–403. http://www.nejm.org/doi/full/10.1056/NEJMoa012512. Accessed April 15, 2018. 12.\xa0Diabetes Overview. National Institute of Diabetes and Digestive and Kidney Disease. https://www.niddk.nih.gov/health-information/ diabetes/overview. Accessed April 15, 2018.\xa0 Threats to Health | 1113',
  'chunk_token_count': 83.25}]

In [None]:
from sentence_transformers import SentenceTransformer

# The encoding cell below uses `embedding_model`, so it has to be created here
# (with this cell commented out, a fresh kernel fails with NameError).
# It is created on the CPU; the encoding cell moves it to its target device.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")

# Example usage:
# sentences = ["The Sentences Transformers library provides an easy and open-source way to create embeddings.",
#              "Sentences can be embedded one by one or as a list of strings.",
#              "I like dogs!"]

# embeddings = embedding_model.encode(sentences)
# all_embeddings = dict(zip(sentences, embeddings))

# # for sentence, embeddings in all_embeddings.items():
# #   print(f"Sentence: {sentence}")
# #   print(f"Embeddings: {embeddings}")
# #   print(" ")

# all_embeddings

In [None]:
# %%time

# embedding_model.to("cpu")

# for item in tqdm(pages_chunks_over_min_token):
#   item["embedding"] = embedding_model.encode(item["sentence_chunk"])

%%time

import torch

# Fall back to the CPU when no GPU is present instead of failing on a hard-coded "cuda".
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model.to(embedding_device)

# Encode all chunks in batches rather than issuing one encode() call per chunk.
chunk_texts = [item["sentence_chunk"] for item in pages_chunks_over_min_token]
chunk_embeddings = embedding_model.encode(chunk_texts, batch_size=32, show_progress_bar=True)

# encode() returns the embeddings in input order, so zip pairs each record with its own vector.
for item, embedding in zip(pages_chunks_over_min_token, chunk_embeddings):
  item["embedding"] = embedding

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: user 31.3 s, sys: 205 ms, total: 31.5 s
Wall time: 31.8 s


In [None]:
# text_chunks = [item["sentence_chunk"] for item in pages_chunks_over_min_token]
# text_chunks

In [None]:
# len(text_chunks)

In [None]:
# %%time

# embedding_model.to("cuda")

# text_chunks_embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)

# text_chunks_embeddings

In [None]:
# Persist the chunk records together with their embeddings so they can be reloaded from disk.
text_chunks_embeddings_df = pd.DataFrame(pages_chunks_over_min_token)
text_chunks_embeddings_path = "text_chunks_embeddings_df.csv"
# NOTE: in the CSV each embedding ends up as text ("[v1 v2 ...]"), so it has to be parsed back into numbers after reading.
text_chunks_embeddings_df.to_csv(text_chunks_embeddings_path, index=False)
text_chunks_embeddings_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_token_count,embedding
0,2,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,77.0,"[0.06742427, 0.09022814, -0.005095489, -0.0317..."
1,3,Human Nutrition: 2020 Edition by University of...,52.5,"[0.05521564, 0.059213977, -0.016616724, -0.020..."
2,4,Contents Preface University of Hawai‘i at Māno...,191.5,"[0.027980184, 0.033981375, -0.020642668, 0.001..."
3,5,Lifestyles and Nutrition University of Hawai‘i...,235.25,"[0.06825669, 0.0381275, -0.008468541, -0.01813..."
4,6,The Cardiovascular System University of Hawai‘...,249.5,"[0.03302645, -0.008497635, 0.009571596, -0.004..."


In [None]:
text_chunks_embeddings_df_csv = pd.read_csv(text_chunks_embeddings_path)
text_chunks_embeddings_df_csv.head()

Unnamed: 0,page_number,sentence_chunk,chunk_token_count,embedding
0,2,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,77.0,[ 6.74242675e-02 9.02281404e-02 -5.09548886e-...
1,3,Human Nutrition: 2020 Edition by University of...,52.5,[ 5.52156419e-02 5.92139773e-02 -1.66167244e-...
2,4,Contents Preface University of Hawai‘i at Māno...,191.5,[ 2.79801842e-02 3.39813754e-02 -2.06426680e-...
3,5,Lifestyles and Nutrition University of Hawai‘i...,235.25,[ 6.82566911e-02 3.81275006e-02 -8.46854132e-...
4,6,The Cardiovascular System University of Hawai‘...,249.5,[ 3.30264494e-02 -8.49763490e-03 9.57159605e-...


## Loading the saved embeddings

In [None]:
import random
import numpy as np
import pandas as pd
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

text_chunks_embeddings_path = "text_chunks_embeddings_df.csv"
text_chunks_embeddings_df_csv = pd.read_csv(text_chunks_embeddings_path)
# The embedding column is read back as text ("[v1 v2 ...]"): strip the brackets and parse the space-separated numbers.
text_chunks_embeddings_df_csv["embedding"] = text_chunks_embeddings_df_csv["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))
# Stack the per-chunk vectors into a single float32 tensor (one row per chunk) on the selected device.
embeddings = torch.tensor(np.stack(text_chunks_embeddings_df_csv["embedding"].to_list(), axis=0), dtype=torch.float32).to(device)

# Row dicts in the same order as the rows of `embeddings`, so a row index into one is valid for the other.
pages_chunks = text_chunks_embeddings_df_csv.to_dict(orient="records")

text_chunks_embeddings_df_csv.head()

Unnamed: 0,page_number,sentence_chunk,chunk_token_count,embedding
0,2,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,77.0,"[0.0674242675, 0.0902281404, -0.00509548886, -..."
1,3,Human Nutrition: 2020 Edition by University of...,52.5,"[0.0552156419, 0.0592139773, -0.0166167244, -0..."
2,4,Contents Preface University of Hawai‘i at Māno...,191.5,"[0.0279801842, 0.0339813754, -0.020642668, 0.0..."
3,5,Lifestyles and Nutrition University of Hawai‘i...,235.25,"[0.0682566911, 0.0381275006, -0.00846854132, -..."
4,6,The Cardiovascular System University of Hawai‘...,249.5,"[0.0330264494, -0.0084976349, 0.00957159605, -..."


In [None]:
embeddings

tensor([[ 0.0674,  0.0902, -0.0051,  ..., -0.0221, -0.0232,  0.0126],
        [ 0.0552,  0.0592, -0.0166,  ..., -0.0120, -0.0103,  0.0227],
        [ 0.0280,  0.0340, -0.0206,  ..., -0.0054,  0.0213,  0.0313],
        ...,
        [ 0.0771,  0.0098, -0.0122,  ..., -0.0409, -0.0752, -0.0241],
        [ 0.1030, -0.0165,  0.0083,  ..., -0.0574, -0.0283, -0.0295],
        [ 0.0864, -0.0125, -0.0113,  ..., -0.0522, -0.0337, -0.0299]])

In [None]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# query = "breast feeding timeline"

# print(f"Query: {query}")

# query_embeddings = embedding_model.encode(query, convert_to_tensor=True).to(device)

# dot_scores = util.dot_score(a=query_embeddings, b=embeddings)[0]

# top_k = 5

# top_results = torch.topk(dot_scores, k=top_k)
# top_results

In [None]:
# pages_chunks[1169]

In [None]:
# import textwrap

# def print_wrapped(text, wrap_length=80):
#   wrapped_text = textwrap.fill(text, wrap_length)
#   print(wrapped_text)

In [None]:
# query = "breast feeding timeline"

# print(f"Query: {query}")

# print("Result:")
# for score, idx in zip(top_results[0], top_results[1]):
#   print(f"Score: {score}")
#   print("Text: ")
#   print_wrapped(pages_chunks[idx]["sentence_chunk"])
#   print(f"Page Number: {pages_chunks[idx]['page_number']}")
#   print("\n")

In [None]:
# from sentence_transformers import CrossEncoder

# reranker = CrossEncoder(model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1")

# query = "breast feeding timeline"

# print(f"Query: {query}")

# documents = {}

# print("Result:")
# for score, idx in zip(top_results[0], top_results[1]):
#   documents[pages_chunks[idx]['page_number']] = pages_chunks[idx]["sentence_chunk"]

# pairs = [(query, doc) for doc in documents.values()]

# scores = reranker.predict(pairs)

# results = zip(documents.keys(), documents.values(), scores)
# results = sorted(results, key=lambda x: x[2], reverse=True)

# for page, text, score in results:
#   print(f"Page: {page}")
#   print(f"Sentence:")
#   print_wrapped(text)
#   print(f"Score: {score}")
#   print("\n")

In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [None]:
# # Download and show the top matched page image

# import fitz

# pdf_path = "human-nutrition.pdf"
# doc = fitz.open(pdf_path)
# page = doc.load_page(816+41)

# img = page.get_pixmap(dpi=300)
# img.save("page_816.png")

# doc.close()

# img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))

# import matplotlib.pyplot as plt

# plt.figure(figsize=(13, 10))
# plt.imshow(img_array)
# plt.title(f"Query: {query} | Most relevant page: ")
# plt.axis("off")
# plt.show()

In [None]:
# import torch

# def dot_product(vector1, vector2):
#   return torch.dot(vector1, vector2)

# def cosine_similarity(vector1, vector2):
#   dot_product = torch.dot(vector1, vector2)
#   norm_vector1 = torch.norm(vector1)
#   norm_vector2 = torch.norm(vector2)
#   return dot_product / (norm_vector1 * norm_vector2)

# vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
# vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
# vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
# vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# print(f"Dot Product: {dot_product(vector1, vector2)}")
# print(f"Dot Product: {dot_product(vector1, vector3)}")
# print(f"Dot Product: {dot_product(vector1, vector4)}")

# print(f"Cosine Similarity: {cosine_similarity(vector1, vector2)}")
# print(f"Cosine Similarity: {cosine_similarity(vector1, vector3)}")
# print(f"Cosine Similarity: {cosine_similarity(vector1, vector4)}")

In [None]:
from sentence_transformers import util, SentenceTransformer, CrossEncoder
from timeit import default_timer as timer
import textwrap

def get_relevant_resources(query: str,
                           embeddings: torch.tensor,
                           model: SentenceTransformer=embedding_model,
                           n_resources_to_return: int=5,
                           print_time: bool=True):
  """Find the stored embeddings with the highest dot-product score for ``query``.

  Args:
    query: Text to search for.
    embeddings: Tensor of stored embeddings, one per row.
    model: Model used to embed the query.
    n_resources_to_return: Maximum number of results; capped at the number of
      stored embeddings.
    print_time: When True, print how long the scoring step took.

  Returns:
    ``(values, indices)`` from ``torch.topk``: the best scores in descending
    order and the row indices in ``embeddings`` they belong to.
  """
  # Encode with the model that was passed in; the global embedding_model used to be
  # called unconditionally, which silently ignored the `model` argument.
  query_embeddings = model.encode(query, convert_to_tensor=True)
  if isinstance(embeddings, torch.Tensor):
    # Both operands of the dot product must live on the same device.
    query_embeddings = query_embeddings.to(embeddings.device)

  start_time = timer()
  dot_scores = util.dot_score(a=query_embeddings, b=embeddings)[0]
  end_time = timer()

  if print_time:
    print(f"-> Time taken to get scores for ({len(embeddings)} embeddings): {end_time-start_time}")

  # torch.topk raises when k exceeds the number of candidates, so cap it.
  top_k = min(n_resources_to_return, len(embeddings))
  values, indices = torch.topk(dot_scores, k=top_k)

  return values, indices

def print_wrapped(text, wrap_length=80):
  """Print ``text`` re-flowed so that no line is longer than ``wrap_length`` characters."""
  print("\n".join(textwrap.wrap(text, wrap_length)))


from functools import lru_cache


@lru_cache(maxsize=1)
def _load_reranker(model_name: str = "mixedbread-ai/mxbai-rerank-xsmall-v1") -> CrossEncoder:
  """Create the cross-encoder once and reuse it for every later query."""
  return CrossEncoder(model_name_or_path=model_name)


def get_relevant_resources_reranked(query: str,
                             embeddings: torch.tensor,
                             n_resources_to_return: int=5):
  """Retrieve candidates by embedding similarity, then reorder them with a cross-encoder.

  Args:
    query: Text to search for.
    embeddings: Tensor of stored embeddings, aligned row-for-row with ``pages_chunks``.
    n_resources_to_return: Number of candidates to retrieve and rerank.

  Returns:
    ``(reranked_scores, reranked_indices)``: float32 cross-encoder scores in
    descending order and the matching int64 indices into ``pages_chunks``.
  """
  _, indices = get_relevant_resources(query=query, embeddings=embeddings, n_resources_to_return=n_resources_to_return, print_time=False)
  candidate_indices = indices.tolist()

  if not candidate_indices:
    return torch.empty(0, dtype=torch.float32), torch.empty(0, dtype=torch.int64)

  # One (query, text) pair per candidate chunk. Keying the texts by page number would merge
  # two chunks from the same page and misalign the scores with their indices.
  pairs = [(query, pages_chunks[idx]["sentence_chunk"]) for idx in candidate_indices]

  rerank_scores = _load_reranker().predict(pairs)

  # Sort candidates by cross-encoder score, best first.
  ranked = sorted(zip(candidate_indices, rerank_scores), key=lambda pair: pair[1], reverse=True)

  reranked_indices = torch.tensor([idx for idx, _ in ranked], dtype=torch.int64)
  reranked_scores = torch.tensor([float(score) for _, score in ranked], dtype=torch.float32)

  return reranked_scores, reranked_indices

In [None]:
# query = "foods high in fiber"

# # print(get_relevant_resources(query="food high in fiber", embeddings=embeddings))
# print(get_relevant_resources_reranked(query=query, embeddings=embeddings))

### Loading LLM

In [None]:
# import torch

# gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
# gpu_memory_gb = round(gpu_memory_bytes / (2**30))
# print(f"Available GPU memory: {gpu_memory_gb} GB")

In [None]:
!pip install bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


In [None]:
# # Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
# if gpu_memory_gb < 5.1:
#     print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
# elif gpu_memory_gb < 8.1:
#     print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
#     use_quantization_config = True
#     model_id = "google/gemma-2b-it"
# elif gpu_memory_gb < 19.0:
#     print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
#     use_quantization_config = False
#     model_id = "google/gemma-2b-it"
# elif gpu_memory_gb > 19.0:
#     print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
#     use_quantization_config = False
#     model_id = "google/gemma-7b-it"

# print(f"use_quantization_config set to: {use_quantization_config}")
# print(f"model_id set to: {model_id}")

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


In [None]:
use_quantization_config = False

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

from transformers import BitsAndBytesConfig
# 4-bit quantization settings; only passed to the model when use_quantization_config is True.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Use Flash Attention 2 only when it is available and the GPU's major compute capability is at least 8;
# otherwise fall back to "sdpa".
if(is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2" # the identifier transformers accepts; "flash_attn_2" is rejected
else:
  attn_implementation = "sdpa"

model_id = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_id)

# To remove the warning: Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_id,
                                                 torch_dtype=torch.float16,
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False,
                                                 attn_implementation=attn_implementation)

# Use the tokenizer's pad id in both configs so they agree; config.eos_token_id may hold
# several ids for some checkpoints, which is not a usable single pad id.
llm_model.config.pad_token_id = tokenizer.pad_token_id
llm_model.generation_config.pad_token_id = tokenizer.pad_token_id

# The model is only moved explicitly when it was loaded without quantization.
if not use_quantization_config:
  device = "cuda" if torch.cuda.is_available() else "cpu"
  llm_model.to(device)

In [None]:
# import torch
# torch.cuda.get_device_capability(0)

In [None]:
llm_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (ro

In [None]:
# def get_model_params(model: torch.nn.Module):
#   return sum([p.numel() for p in model.parameters()])

# get_model_params(llm_model)

In [None]:
# def get_model_mem_size(model: torch.nn.Module):
#     # Get model parameters and buffer sizes
#     mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
#     mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

#     # Calculate various model sizes
#     model_mem_bytes = mem_params + mem_buffers # in bytes
#     model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
#     model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

#     return {"model_mem_bytes": model_mem_bytes,
#             "model_mem_mb": round(model_mem_mb, 2),
#             "model_mem_gb": round(model_mem_gb, 2)}

# get_model_mem_size(llm_model)

In [None]:
# input_text = "How long should infants be breastfed for?"
# print(f"Query: {input_text}")

# template = [
#     {
#         "role": "user",
#         "content": input_text
#     }
# ]

# prompt = tokenizer.apply_chat_template(conversation=template, tokenize=False, add_generation_prompt=True)
# print(f"Prompt: {prompt}")

In [None]:
# tokenizer

In [None]:
# %%time

# input_ids = tokenizer(prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

# output = llm_model.generate(**input_ids, max_new_tokens=256)

# print(f"Model output: {output[0]}\n")

In [None]:
# decoded_output = tokenizer.decode(output[0])

# print(decoded_output)

In [None]:
# # Nutrition-style questions generated with GPT4
# gpt4_questions = [
#     "What are the macronutrients, and what roles do they play in the human body?",
#     "How do vitamins and minerals differ in their roles and importance for health?",
#     "Describe the process of digestion and absorption of nutrients in the human body.",
#     "What role does fibre play in digestion? Name five fibre containing foods.",
#     "Explain the concept of energy balance and its importance in weight management."
# ]

# # Manually created question list
# manual_questions = [
#     "How often should infants be breastfed?",
#     "What are symptoms of pellagra?",
#     "How does saliva help with digestion?",
#     "What is the RDI for protein per day?",
#     "water soluble vitamins"
# ]

# query_list = gpt4_questions + manual_questions

In [None]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
  """Build the chat-formatted RAG prompt for `query`.

  Each item's "sentence_chunk" value becomes one "- " bullet of context. The
  bullets and the query are substituted into a few-shot instruction template,
  which is then wrapped as a single user turn and rendered to text with the
  notebook-level `tokenizer`'s chat template.

  Args:
    query: The user question to answer.
    context_items: Dicts that each provide a "sentence_chunk" string.

  Returns:
    The prompt string produced by `tokenizer.apply_chat_template` (not
    tokenized, with the generation prompt appended).
  """
  # One bullet per retrieved chunk.
  chunks = [entry["sentence_chunk"] for entry in context_items]
  context = "- " + "\n- ".join(chunks)

  # Few-shot instruction template; {context} and {query} are filled in below.
  template = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

  user_message = {"role": "user",
                  "content": template.format(context=context, query=query)}

  # Render the single-turn conversation as text, ending with the assistant header.
  return tokenizer.apply_chat_template(conversation=[user_message],
                                       tokenize=False,
                                       add_generation_prompt=True)

# Demo: build and print the RAG prompt for one randomly chosen query.
# NOTE(review): this relies on `random`, `query_list`, `get_relevant_resources`,
# `embeddings` and `pages_chunks` already existing in the kernel. The only
# `query_list` assignment visible in this part of the notebook is commented
# out — confirm it is defined elsewhere so the cell survives Restart & Run All.
query = random.choice(query_list)
print(f"Query: {query}")

# `indices` is used below to index into `pages_chunks`; `scores` is not used in this cell.
scores, indices = get_relevant_resources(query=query, embeddings=embeddings)

# Collect the chunk entries selected by the retrieval step.
context_items = [pages_chunks[idx] for idx in indices]

# Prints the full chat-formatted prompt, few-shot examples and context included.
prompt = prompt_formatter(query=query, context_items=context_items)
print(f"Prompt: {prompt}")

Query: How does saliva help with digestion?
-> Time taken to get scores for (1680 embeddings): 0.003909009000381047
Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Oct 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a cr

In [None]:
# %%time

# input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# outputs = llm_model.generate(**input_ids, temperature=0.7, do_sample=True, max_new_tokens=256)

# output_text = tokenizer.decode(outputs[0])

# print(f"Query: {query}")
# print(f"RAG Output: {output_text.replace(prompt, '')}")

In [None]:
def ask(query: str,
        temperature: float=0.7,
        max_new_tokens: int=256,
        format_answer_text: bool=True,
        return_answer_only: bool=True):
  """Answer `query` with retrieval-augmented generation.

  Retrieves context with `get_relevant_resources_reranked`, builds a prompt
  with `prompt_formatter`, samples a completion from `llm_model` and decodes
  it with `tokenizer`.

  Args:
    query: The user question.
    temperature: Sampling temperature forwarded to `llm_model.generate`
      (sampling is always enabled).
    max_new_tokens: Generation length limit forwarded to `llm_model.generate`.
    format_answer_text: If True, return only the newly generated text, with
      every "<|begin_of_text|>" and "<|eot_id|>" marker removed and every
      "Based on the context, the" rewritten to "The". If False, return the
      decoded full output sequence unchanged.
    return_answer_only: If True, return just the text; otherwise return
      `(text, context_items)`.

  Returns:
    The answer string, or a `(answer string, context_items)` tuple where each
    context item is a copy of the retrieved chunk with an added "score" entry.

  NOTE(review): prompt removal assumes `llm_model.generate` returns the prompt
  tokens followed by the new tokens (decoder-only behaviour); the previous
  string-based removal relied on the same assumption.
  """
  scores, indices = get_relevant_resources_reranked(query=query, embeddings=embeddings)

  # Shallow-copy each chunk so that attaching "score" does not write into the
  # shared `pages_chunks` entries and leave a "score" key behind on them.
  # Assumes the entries are dicts, as `prompt_formatter` annotates.
  context_items = [dict(pages_chunks[i]) for i in indices]

  for i, item in enumerate(context_items):
    item["score"] = scores[i].cpu()

  prompt = prompt_formatter(query=query, context_items=context_items)

  # Prefer the device the model reports so the inputs and the weights agree;
  # fall back to the CUDA-if-available guess when it exposes no `device`.
  device = getattr(llm_model, "device", None)
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"
  inputs = tokenizer(prompt, return_tensors="pt").to(device)

  outputs = llm_model.generate(**inputs,
                               temperature=temperature,
                               do_sample=True,
                               max_new_tokens=max_new_tokens)

  if format_answer_text:
    # Drop the prompt by token count instead of `str.replace(prompt, "")`:
    # decoding is not guaranteed to reproduce the prompt text verbatim, and a
    # failed string match would silently return the whole prompt as the answer.
    prompt_token_count = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(outputs[0][prompt_token_count:])
    output_text = output_text.replace("<|begin_of_text|>", "").replace("<|eot_id|>", "").replace("Based on the context, the", "The")
  else:
    output_text = tokenizer.decode(outputs[0])

  if return_answer_only:
    return output_text

  return output_text, context_items

In [None]:
import gc
import torch

# Release unreferenced Python objects and cached CUDA memory before generating.
gc.collect()
torch.cuda.empty_cache()

# query = random.choice(query_list)
query = "What role does fibre play in digestion? Name five fibre containing foods."
print(f"Query: {query}")

# With return_answer_only=False, ask() returns (answer_text, context_items).
# Unpack it so the answer prints as plain text rather than as a tuple repr,
# and show the supporting context separately.
rag_answer, rag_context_items = ask(query, temperature=0.2, return_answer_only=False)
print(f"RAG Answer: {rag_answer}")
print(f"Context items: {rag_context_items}")

Query: What role does fibre play in digestion? Name five fibre containing foods.


KeyboardInterrupt: 