In [1]:
# Perform Google Colab installs (if running in Google Colab)
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    #!pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference


[INFO] Running in Google Colab, installing requirements.
Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.6
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2
Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m74.8 MB/s[

In [2]:
!pip uninstall -y torch torchvision torchaudio transformers sentence-transformers

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

!pip install -U transformers sentence-transformers


Found existing installation: torch 2.9.0+cu126
Uninstalling torch-2.9.0+cu126:
  Successfully uninstalled torch-2.9.0+cu126
Found existing installation: torchvision 0.24.0+cu126
Uninstalling torchvision-0.24.0+cu126:
  Successfully uninstalled torchvision-0.24.0+cu126
Found existing installation: torchaudio 2.9.0+cu126
Uninstalling torchaudio-2.9.0+cu126:
  Successfully uninstalled torchaudio-2.9.0+cu126
Found existing installation: transformers 4.57.2
Uninstalling transformers-4.57.2:
  Successfully uninstalled transformers-4.57.2
Found existing installation: sentence-transformers 5.1.2
Uninstalling sentence-transformers-5.1.2:
  Successfully uninstalled sentence-transformers-5.1.2
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0

In [3]:
# Download PDF file
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
    print("File doesn't exist, downloading...")

    # The URL of the PDF you want to download
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # The local filename to save the downloaded file
    filename = pdf_path

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Open a file in binary write mode and save the content to it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {filename}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists.")


File doesn't exist, downloading...
The file has been downloaded and saved as human-nutrition-text.pdf


In [4]:
import fitz # This is the PyMuPdf
from tqdm.auto import tqdm # This Helps in faster iteration
def text_formatter(text: str) ->str:
  #Performs Minor operation on the text
  cleaned_text = text.replace("\n"," ").strip()
  return cleaned_text

# Now we will open the pdf
def open_and_read_pdf(pdf_path: str) ->list:
  doc = fitz.open(pdf_path) # opening a document
  pages_texts = [] # this is where we store the text contained in the certain number of page
  for page_number, page in tqdm(enumerate(doc)):
    text = page.get_text()
    text = text_formatter(text)
    pages_texts.append({"page_number": page_number + 1,
                        "page_char_count": len(text),
                        "page_word_count": len(text.split(" ")),
                        "page_sentence_count": len(text.split(". ")),
                        "Page_token_count": len(text)/4,
                        "text": text
                        })
  return pages_texts # Moved return statement outside the loop

pages_texts = open_and_read_pdf(pdf_path = pdf_path)
pages_texts[:3]

0it [00:00, ?it/s]

[{'page_number': 1,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 1,
  'Page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': 2,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'Page_token_count': 0.0,
  'text': ''},
 {'page_number': 3,
  'page_char_count': 320,
  'page_word_count': 54,
  'page_sentence_count': 1,
  'Page_token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'}]

In [5]:
import pandas as pd
df = pd.DataFrame(pages_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,Page_token_count,text
0,1,29,4,1,7.25,Human Nutrition: 2020 Edition
1,2,0,1,1,0.0,
2,3,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,4,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,5,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [6]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,Page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,604.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,1.0,0.0,1.0,1.0,0.0
25%,302.75,762.0,134.0,4.0,190.5
50%,604.5,1231.5,214.5,10.0,307.88
75%,906.25,1603.5,271.0,14.0,400.88
max,1208.0,2308.0,429.0,32.0,577.0


In [7]:
from spacy.lang.en import English
nlp = English()
nlp.add_pipe("sentencizer") # It breaks Sentences in a paragraph into Single Sentences
doc = nlp("This is Vihith. I am a chess Player")
assert len(list(doc.sents))==2
list(doc.sents)

[This is Vihith., I am a chess Player]

In [8]:
for item in tqdm(pages_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])


  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
import random
random.sample(pages_texts,k=1)

[{'page_number': 934,
  'page_char_count': 1567,
  'page_word_count': 266,
  'page_sentence_count': 14,
  'Page_token_count': 391.75,
  'text': 'Micronutrients  Micronutrient recommendations for adolescents are mostly the  same as for adults, though children this age need more of certain  minerals to promote bone growth (e.g., calcium and phosphorus,  along with iron and zinc for girls). Again, vitamins and minerals  should be obtained from food first, with supplementation for  certain micronutrients only (such as iron).  The most important micronutrients for adolescents are calcium,  vitamin D, vitamin A, and iron. Adequate calcium and vitamin D are  essential for building bone mass. The recommendation for calcium  is 1,300 milligrams for both boys and girls. Low-fat milk and cheeses  are excellent sources of calcium and help young people avoid  saturated fat and cholesterol. It can also be helpful for adolescents  to consume products fortified with calcium, such as breakfast  cereals

In [10]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])


  0%|          | 0/1208 [00:00<?, ?it/s]

In [11]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)  # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) // 4  # 1 token = ~4 characters

        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)



  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [12]:
import random
random.sample(pages_and_chunks, k =1)

[{'page_number': 520,
  'sentence_chunk': 'Table 8.3 Physical Activity (PA) Categories and Values3 Activity Level Men PA Value Women PA Value Description Sedentary 1.00 1.00 No physical activity beyond that required for independent living Low 1.11 1.12 Equivalent to walking 1.5 to 3 miles per day Moderate 1.25 1.27 Equivalent to walking 3 to 10 miles per day High 1.48 1.45 Equivalent to walking 10 or more miles per day These values only apply to normal weight adults and not to children or pregnant or lactating women. These values only apply to normal weight adults and not to children or pregnant or lactating women. The numbers within the equations for the EER were derived from measurements taken from a group of people of the same sex and age with similar body size and physical activity level. These standardized formulas are then applied to individuals whose measurements have not been taken, but who have similar characteristics, in order to estimate their energy requirements. Thus, a pe

In [13]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,625.38,735.14,113.03,183.41
std,347.79,447.64,71.27,111.91
min,1.0,12.0,3.0,3.0
25%,322.5,315.0,45.0,78.0
50%,628.0,747.0,114.0,186.0
75%,932.0,1119.0,174.0,279.0
max,1208.0,1832.0,298.0,458.0


In [14]:
min_chunk_char_count = 30
pages_and_chunks_over_min_length = [
    chunk for chunk in pages_and_chunks if chunk["chunk_char_count"] >= min_chunk_char_count
]

print(f"Number of chunks before filtering: {len(pages_and_chunks)}")
print(f"Number of chunks after filtering (min char count >= {min_chunk_char_count}): {len(pages_and_chunks_over_min_length)}")


Number of chunks before filtering: 1843
Number of chunks after filtering (min char count >= 30): 1822


In [15]:
pages_and_chunks_over_min_length[:2]

[{'page_number': 3,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77},
 {'page_number': 4,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52}]

In [16]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu") # Choose the device to load the model to (note: GPU will often be *
# Create a list of sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07981113e-02  3.03165037e-02 -2.01217793e-02  6.86483830e-02
 -2.55255643e-02 -8.47689249e-03 -2.07147663e-04 -6.32377118e-02
  2.81606149e-02 -3.33353840e-02  3.02634649e-02  5.30720763e-02
 -5.03526479e-02  2.62288153e-02  3.33314016e-02 -4.51578461e-02
  3.63043845e-02 -1.37116178e-03 -1.20171569e-02  1.14946812e-02
  5.04510887e-02  4.70857173e-02  2.11912915e-02  5.14607765e-02
 -2.03746371e-02 -3.58889103e-02 -6.67888962e-04 -2.94393133e-02
  4.95858490e-02 -1.05639603e-02 -1.52013749e-02 -1.31754903e-03
  4.48196754e-02  1.56023391e-02  8.60379942e-07 -1.21397164e-03
 -2.37978995e-02 -9.09372000e-04  7.34479493e-03 -2.53932923e-03
  5.23370057e-02 -4.68043573e-02  1.66214500e-02  4.71579209e-02
 -4.15599570e-02  9.01942665e-04  3.60278897e-02  3.42214778e-02
  9.68227163e-02  5.94828576e-02 -1.64984670e-02 -3.51250097e-02
  5.92516316e-03 -7.07960629e-04 -2.4103

In [17]:
embedding_model.to("cuda")
for item in tqdm(pages_and_chunks_over_min_length):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])
#

  0%|          | 0/1822 [00:00<?, ?it/s]

In [18]:
text_chunk_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_length)
embeddings_df_save_path = "text_chunk_and_embeddings_df.csv"

text_chunk_and_embeddings_df.to_csv(embeddings_df_save_path, index = False)

In [19]:
text_chunk_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunk_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,3,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77,[ 6.74242675e-02 9.02281404e-02 -5.09548886e-...
1,4,Human Nutrition: 2020 Edition by University of...,210,30,52,[ 5.52156419e-02 5.92139773e-02 -1.66167244e-...
2,5,Contents Preface University of Hawai‘i at Māno...,766,114,191,[ 2.79801842e-02 3.39813754e-02 -2.06426680e-...
3,6,Lifestyles and Nutrition University of Hawai‘i...,942,143,235,[ 6.82566911e-02 3.81275006e-02 -8.46854132e-...
4,7,The Cardiovascular System University of Hawai‘...,998,152,249,[ 3.30264494e-02 -8.49763490e-03 9.57159605e-...


In [24]:
import random

import torch
import numpy as np
import pandas as pd
# Removed json import as it's no longer used for parsing this string format

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunk_and_embeddings_df.csv")

# Function to parse the string representation of a numpy array back into a numpy array
def parse_embedding_string_to_array(embedding_str):
    # Remove leading/trailing brackets, any ellipsis, and split by space.
    # Example string in CSV: "[ 0.0674242675  0.0902281404 -0.00509548886 ...]"
    # First, handle potential truncation by '...' if it's present, otherwise it might cause ValueError.
    if '...' in embedding_str:
        # Attempt to remove '...' and ensure proper closing bracket, if it was truncated.
        # This is a heuristic and assumes '...' is at the end of meaningful numbers.
        embedding_str = embedding_str.split('...')[0].strip() + ']'

    # Remove outer brackets and any extra spaces before splitting
    cleaned_str = embedding_str.strip('[] ')

    # Split by space and convert each numeric part to float. Filter out any empty strings.
    try:
        float_list = [float(num_str) for num_str in cleaned_str.split() if num_str]
        return np.array(float_list, dtype=np.float32)
    except ValueError:
        print(f"Warning: Could not fully parse embedding string: '{embedding_str}'. Returning empty array.")
        return np.array([], dtype=np.float32)

# Apply the custom parsing function to the 'embedding' column
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(parse_embedding_string_to_array)

# Filter out any rows where parsing might have failed or resulted in an incorrect embedding dimension
# The embedding model 'all-mpnet-base-v2' produces 768-dimensional embeddings.
expected_embedding_dim = 768
text_chunks_and_embedding_df = text_chunks_and_embedding_df[
    text_chunks_and_embedding_df["embedding"].apply(lambda x: len(x) == expected_embedding_dim)
].reset_index(drop=True)

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([1822, 768])

In [25]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,3,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77,"[0.06742427, 0.09022814, -0.005095489, -0.0317..."
1,4,Human Nutrition: 2020 Edition by University of...,210,30,52,"[0.05521564, 0.059213977, -0.016616724, -0.020..."
2,5,Contents Preface University of Hawai‘i at Māno...,766,114,191,"[0.027980184, 0.033981375, -0.020642668, 0.001..."
3,6,Lifestyles and Nutrition University of Hawai‘i...,942,143,235,"[0.06825669, 0.0381275, -0.008468541, -0.01813..."
4,7,The Cardiovascular System University of Hawai‘...,998,152,249,"[0.03302645, -0.008497635, 0.009571596, -0.004..."


In [26]:
embeddings[0]

tensor([ 6.7424e-02,  9.0228e-02, -5.0955e-03, -3.1755e-02,  7.3908e-02,
         3.5198e-02, -1.9799e-02,  4.6769e-02,  5.3573e-02,  5.0123e-03,
         3.3393e-02, -1.6222e-03,  1.7608e-02,  3.6265e-02, -3.1668e-04,
        -1.0712e-02,  1.5426e-02,  2.6218e-02,  2.7765e-03,  3.6494e-02,
        -4.4411e-02,  1.8936e-02,  4.9012e-02,  1.6402e-02, -4.8578e-02,
         3.1829e-03,  2.7299e-02, -2.0476e-03, -1.2283e-02, -7.2805e-02,
         1.2045e-02,  1.0730e-02,  2.1000e-03, -8.1777e-02,  2.6783e-06,
        -1.8143e-02, -1.2080e-02,  2.4717e-02, -6.2747e-02,  7.3544e-02,
         2.2162e-02, -3.2877e-02, -1.8010e-02,  2.2295e-02,  5.6137e-02,
         1.7951e-03,  5.2593e-02, -3.3174e-03, -8.3387e-03, -1.0628e-02,
         2.3192e-03, -2.2393e-02, -1.5301e-02, -9.9306e-03,  4.6532e-02,
         3.5747e-02, -2.5476e-02,  2.6369e-02,  3.7491e-03, -3.8268e-02,
         2.5833e-02,  4.1287e-02,  2.5818e-02,  3.3297e-02, -2.5178e-02,
         4.5152e-02,  4.4907e-04, -9.9662e-02,  4.9

In [28]:
from sentence_transformers import util, SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

In [33]:
query = "macronutrients functions"
query_embedding = embedding_model.encode(query, convert_to_tensor = True)
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
top_results_of_dot_scores = torch.topk(dot_scores, k=5)
top_results_of_dot_scores

torch.return_types.topk(
values=tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473], device='cuda:0'),
indices=tensor([43, 48, 42, 52, 47], device='cuda:0'))

In [34]:
import textwrap

def print_wrapped(text,wrap_length=80):
  wrapped_text = textwrap.wrap(text, wrap_length)
  print(wrapped_text)

In [41]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indici​es from torch.topk
for score, idx in zip(top_results_of_dot_scores[0], top_results_of_dot_scores[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")


Query: 'macronutrients functions'

Results:
Score: 0.6926
Text:
['Macronutrients Nutrients that are needed in large amounts are called', 'macronutrients. There are three classes of macronutrients: carbohydrates,', 'lipids, and proteins. These can be metabolically processed into cellular energy.', 'The energy from macronutrients comes from their chemical bonds. This chemical', 'energy is converted into cellular energy that is then utilized to perform work,', 'allowing our bodies to conduct their basic functions. A unit of measurement of', 'food energy is the calorie. On nutrition food labels the amount given for', '“calories” is actually equivalent to each calorie multiplied by one thousand. A', 'kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with', 'the “Calorie” (with a capital “C”) on nutrition food labels. Water is also a', 'macronutrient in the sense that you require a large amount of it, but unlike the', 'other macronutrients, it does not yield calorie

In [48]:
from sentence_transformers import util, SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

def retrieve_element(query: str,
                     embeddings: torch.tensor,
                     model: SentenceTransformer=embedding_model,
                     n_of_resources_to_return: int = 5,
                     ):
  query_embedding = embedding_model.encode(query, convert_to_tensor = True)
  dot_scores = util.dot_score(query_embedding,embeddings)[0] # Corrected from dot_scores to dot_score
  scores, indices = torch.topk(input=dot_scores, k = n_of_resources_to_return)
  return scores, indices


def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):

    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """

    scores, indices = retrieve_element(query=query,
                                                  embeddings=embeddings,
                                                  n_of_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indices
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order,
        # the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [52]:
query = "symptoms of pellagra"
scores, indices = retrieve_element(query=query,
                                   embeddings=embeddings,
                                   n_of_resources_to_return=5)
scores, indices

(tensor([0.5000, 0.3741, 0.3194, 0.2959, 0.2793], device='cuda:0'),
 tensor([ 885,  917,  886, 1671, 1691], device='cuda:0'))

In [51]:
query = "symptoms of pellagra"
print_top_results_and_scores(query=query,
                                   embeddings=embeddings,
                                   pages_and_chunks=pages_and_chunks)

Query: symptoms of pellagra

Results:
Score: 0.5000
['Niacin deficiency is commonly known as pellagra and the symptoms include', 'fatigue, decreased appetite, and indigestion. \xa0These symptoms are then commonly', 'followed by the four D’s: diarrhea, dermatitis, dementia, and sometimes death.', 'Figure 9.12 \xa0Conversion of Tryptophan to Niacin Water-Soluble Vitamins | 565']
Page number: 607


Score: 0.3741
['car. Does it drive faster with a half-tank of gas or a full one? It does not', 'matter; the car drives just as fast as long as it has gas. Similarly, depletion', 'of B vitamins will cause problems in energy metabolism, but having more than is', 'required to run metabolism does not speed it up. Buyers of B-vitamin supplements', 'beware; B vitamins are not stored in the body and all excess will be flushed', 'down the toilet along with the extra money spent. B vitamins are naturally', 'present in numerous foods, and many other foods are enriched with them. In the', 'United States, 

In [54]:
from huggingface_hub import login
login(token="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")


In [55]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
# 1. Creating Quantization for config Model
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype = torch.float16)
attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# Define use_quantization_config (assuming it should be True given the setup)
use_quantization_config = True

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
model_id = "google/gemma-7b-it" # Uncommented and assigned a value
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,  # datatype to use, we want float16
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=False,  # use full memory
    attn_implementation=attn_implementation  # which attention version to use
)

if not use_quantization_config:  # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")


[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-7b-it


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [56]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
      )
    )
    

In [57]:
def get_model_param(model: torch.nn.Module):
  return sum([param.numel() for param in model.parameters()])

get_model_param(llm_model)

4662144000

In [58]:
input_text = "What are the macronutrients, and what roles do they play in the human body?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                        tokenize=False,  # Keep as raw text (not tokenized)
                                        add_generation_prompt=True)

print(f"\nPrompt (formatted):\n{prompt}")

Input text:
What are the macronutrients, and what roles do they play in the human body?

Prompt (formatted):
<bos><start_of_turn>user
What are the macronutrients, and what roles do they play in the human body?<end_of_turn>
<start_of_turn>model



In [59]:
# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs passed on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256)  # define the maximum number of new tokens to create

print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,   1841,    708,    573, 186809,
         184592, 235269,    578,   1212,  16065,    749,    984,   1554,    575,
            573,   3515,   2971, 235336,    107,    108,    106,   2516,    108]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]], device='cuda:0')}

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   1841,    708,    573, 186809,
        184592, 235269,    578,   1212,  16065,    749,    984,   1554,    575,
           573,   3515,   2971, 235336,    107,    108,    106,   2516,    108,
         21404, 235269,   1517,    708,    573, 186809, 184592,    578,   1024,
         16065,    575,    573,   3515,   2971, 235292,    109,    688,  12298,
          1695, 184592,  66058,    109, 235290,   5231, 156615,  56227,  66058,
         34428,   4134,    604,    573,   2971, 23

In [60]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
What are the macronutrients, and what roles do they play in the human body?<end_of_turn>
<start_of_turn>model
Sure, here are the macronutrients and their roles in the human body:

**Macronutrients:**

- **Carbohydrates:** Provide energy for the body. They are broken down into glucose, which is then used for energy. Carbohydrates are found in foods such as bread, pasta, rice, potatoes, and sugary drinks.
- **Proteins:** Build and repair tissues, produce enzymes and hormones, and help to regulate blood sugar levels. Proteins are found in foods such as meat, fish, eggs, dairy products, beans, lentils, and nuts.
- **Fats:** Provide energy storage, insulation, and hormone production. Fats are found in foods such as butter, cheese, oil, nuts, seeds, and some processed foods.

**Roles of Macronutrients:**

- **Carbohydrates:** Provide energy for the body. They are broken down into glucose, which is then used for energy. Glucose is the body