In [1]:
import os
import requests
import fitz #for opening document
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Get PDF document path
pdf_path = "Human-Nutrition-2020.pdf"
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Opens a PDF file, reads its text content page by page, and collects statistics."""
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number - 41, # adjusted page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(", ")),
                                "page_token_count": len(text) / 4, #1 token has approx 4 characters
                                "text": text})
    return pages_and_texts
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[2:4]

1208it [00:01, 1206.24it/s]


[{'page_number': -39,
  'page_char_count': 320,
  'page_word_count': 54,
  'page_sentence_count_raw': 11,
  'page_token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'},
 {'page_number': -38,
  'page_char_count': 212,
  'page_word_count': 32,
  'page_sentence_count_raw': 2,
  'page_token_count': 53.0,
  'text': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Human Nutrition Program is licensed under a Creative Commons Attribution 4.0  International License, except where otherwise noted.'}]

In [3]:
import random

random.sample(pages_and_texts, k=2)

[{'page_number': 717,
  'page_char_count': 127,
  'page_word_count': 16,
  'page_sentence_count_raw': 1,
  'page_token_count': 31.75,
  'text': 'view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=412  Understanding Dietary Reference Intakes  |  717'},
 {'page_number': 775,
  'page_char_count': 537,
  'page_word_count': 88,
  'page_sentence_count_raw': 6,
  'page_token_count': 134.25,
  'text': 'downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=434  \xa0 Understanding the Bigger Picture of Dietary Guidelines  |  775'}]

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head(10)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,11,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,2,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,1,199.25,Contents Preface University of Hawai‘i at Mā...
5,-36,976,177,3,244.0,Lifestyles and Nutrition University of Hawai‘...
6,-35,1037,191,1,259.25,The Cardiovascular System University of Hawai...
7,-34,1047,186,3,261.75,"Indicators of Health: Body Mass Index, Body Fa..."
8,-33,947,168,1,236.75,Chloride University of Hawai‘i at Mānoa Food ...
9,-32,1024,187,1,256.0,The Functions of Carbohydrates in the Body Un...


In [5]:
df.shape

(1208, 6)

In [6]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,11.05,287.0
std,348.86,560.38,95.76,8.95,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,5.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,15.0,400.88
max,1166.0,2308.0,429.0,106.0,577.0


### text processing (splitting pages into sentences)
We will to follow the workflow of:

Ingest text -> split it into groups/chunks -> embed the groups/chunks -> use the embeddings

Why split into sentences?

Easier to handle than larger pages of text (especially if pages are densely filled with text).
Can get specific and find out which group of sentences were used to help within a RAG pipeline.
We will use spaCy to break our text into sentences since it's likely a bit more robust than just using text.split(". ","?","!").

In [7]:
from spacy.lang.en import English

nlp = English()
total_sentence_count = 0 

nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    item["page_sentence_count_spacy"] = len(item["sentences"])
    total_sentence_count += len(item["sentences"])
print("Total sentences:", total_sentence_count)    

100%|██████████| 1208/1208 [00:01<00:00, 1056.38it/s]

Total sentences: 12466





In [8]:
random.sample(pages_and_texts, k=1)

[{'page_number': 364,
  'page_char_count': 846,
  'page_word_count': 165,
  'page_sentence_count_raw': 9,
  'page_token_count': 211.5,
  'text': 'Image by  Allison  Calabrese /  CC BY 4.0  Amino acids differ from each other by which specific side chain is  bonded to the carbon center.  Figure 6.1 Amino Acid Structure  Amino acids contain four elements. The arrangement of elements  around the carbon center is the same for all amino acids. Only the  side chain (R) differs.  It’s All in the Side Chain  The side chain of an amino acid, sometimes called the “R” group, can  be as simple as one hydrogen bonded to the carbon center, or as  complex as a six-carbon ring bonded to the carbon center. Although  each side chain of the twenty amino acids is unique, there are some  chemical likenesses among them. Therefore, they can be classified  into four different groups. These are nonpolar, polar, acidic, and  basic.  Figure 6.2 The Different Groups of Amino Acids  364  |  Defining Protein',
  'se

In [9]:
df = pd.DataFrame(pages_and_texts)

In [10]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,11.05,287.0,10.32
std,348.86,560.38,95.76,8.95,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,15.0,400.88,15.0
max,1166.0,2308.0,429.0,106.0,577.0,28.0


### Chunking our sentences together¶
Why do we do this?

Easier to manage similar sized chunks of text.
Don't overload the embedding models capacity for tokens (e.g. if an embedding model has a capacity of 384 tokens, there could be information loss if you try to embed a sequence of 400+ tokens).
Our LLM context window (the amount of tokens an LLM can take in) may be limited and requires compute power so we want to make sure we're using it as well as possible.

In [11]:
chunk_size = 10
def split_list(input_list: list[str], 
               slice_size: int=chunk_size) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 148837.30it/s]


In [12]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,11.05,287.0,10.32,1.53
std,348.86,560.38,95.76,8.95,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,5.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,15.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,106.0,577.0,28.0,3.0


### Splitting each chunk into its own item

In [13]:
import re

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.(A-Z)', r'. \1', joined_sentence_chunk) # convert ".A"to ". A"(only for capital letter)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        
        pages_and_chunks.append(chunk_dict)
    
len(pages_and_chunks)   

100%|██████████| 1208/1208 [00:00<00:00, 55601.25it/s]


1843

In [14]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,731.11,109.0,182.78
std,347.79,445.65,69.34,111.41
min,-41.0,12.0,3.0,3.0
25%,280.5,313.5,43.0,78.38
50%,586.0,745.0,111.0,186.25
75%,890.0,1112.0,168.0,278.0
max,1166.0,1824.0,290.0,456.0


In [15]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count : {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count : 29.25 | Text: Abagovomab (monoclonal antibody) by Blake C / CC BY-SA 3.0 Figure 6.13 Antigens Protein’s Functions in the Body | 389
Chunk token count : 9.75 | Text: 1002 | The Causes of Food Contamination
Chunk token count : 8.0 | Text: For example, 856 | Toddler Years
Chunk token count : 28.75 | Text: American Journal of Clinical Dietary, Behavioral, and Physical Activity Recommendations for Weight Management | 509
Chunk token count : 23.0 | Text: view it online here: http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=301 The Atom | 471


In [16]:
#filtering rows with token under 30
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

### Embedding our text chunks¶
Embeddings of text will mean that similar meaning texts have similar numerical representation.

Our goal is to turn each of our chunks into a numerical representation (an embedding vector, where a vector is a sequence of numbers arranged in order).

We'll use our computers to find patterns in the embeddings and then we can use their text mappings to further our understanding.

We'll use the sentence-transformers library which contains many pre-trained embedding models.

In [17]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")


for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 1680/1680 [00:20<00:00, 80.04it/s]


In [18]:
pages_and_chunks_over_min_token_len[0]["embedding"].shape

(768,)

Our embedding has a shape of (768,) meaning it's a vector of 768 numbers which represent our text in high-dimensional space.

In [None]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunks[9]

'Defining Protein University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 363 The Role of Proteins in Foods: Cooking and Denaturation University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 374 Protein Digestion and Absorption University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 378 Protein’s Functions in the Body University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 383 Diseases Involving Proteins University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 395 Proteins in a Nutshell University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 405 Proteins, Diet, and Personal Choices University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 409'

In [20]:
len(text_chunks)

1680

In [21]:
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=16, # Embed all texts in batches
                                               convert_to_tensor=True)
text_chunk_embeddings[0]

tensor([ 6.7424e-02,  9.0228e-02, -5.0955e-03, -3.1755e-02,  7.3908e-02,
         3.5198e-02, -1.9799e-02,  4.6769e-02,  5.3573e-02,  5.0123e-03,
         3.3393e-02, -1.6221e-03,  1.7608e-02,  3.6265e-02, -3.1669e-04,
        -1.0712e-02,  1.5426e-02,  2.6218e-02,  2.7765e-03,  3.6494e-02,
        -4.4411e-02,  1.8936e-02,  4.9012e-02,  1.6402e-02, -4.8578e-02,
         3.1829e-03,  2.7299e-02, -2.0476e-03, -1.2283e-02, -7.2805e-02,
         1.2045e-02,  1.0730e-02,  2.1000e-03, -8.1777e-02,  2.6783e-06,
        -1.8143e-02, -1.2080e-02,  2.4718e-02, -6.2747e-02,  7.3544e-02,
         2.2162e-02, -3.2877e-02, -1.8010e-02,  2.2295e-02,  5.6137e-02,
         1.7951e-03,  5.2593e-02, -3.3174e-03, -8.3387e-03, -1.0628e-02,
         2.3192e-03, -2.2393e-02, -1.5301e-02, -9.9306e-03,  4.6532e-02,
         3.5747e-02, -2.5476e-02,  2.6369e-02,  3.7491e-03, -3.8268e-02,
         2.5833e-02,  4.1287e-02,  2.5818e-02,  3.3297e-02, -2.5178e-02,
         4.5152e-02,  4.4903e-04, -9.9662e-02,  4.9

In [22]:
len(text_chunk_embeddings[0])

768

In [23]:
text_chunk_embeddings.shape

torch.Size([1680, 768])

In [24]:
pip install faiss-gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [25]:
#Saving embedding to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(save_path, index=False)

In [26]:
# Import saved file and view
text_chunks_and_embeddings_df_load = pd.read_csv(save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242526e-02 9.02281031e-02 -5.09549724e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156046e-02 5.92139363e-02 -1.66167300e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,765,113,191.25,[ 2.79801898e-02 3.39813866e-02 -2.06426680e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,940,141,235.0,[ 6.82566911e-02 3.81274968e-02 -8.46854970e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30264494e-02 -8.49766005e-03 9.57159698e-...


### RAG - Search and Answer
Similarity search
Similarity search or semantic search or vector search is the idea of searching on semantic.

With keyword search, you are trying to match the string "apple" with the string "apple".

Whereas with similarity/semantic search, you may want to search "macronutrients functions". And get back results that don't necessarily contain the words "macronutrients functions" but get back pieces of text that match that meaning.

In [27]:
import torch
import numpy as np
device = "cuda" if torch.cuda.is_available() else "cpu"

text_chunks_and_embedding_df = pd.read_csv(save_path)
#convert embedding to array (it got converted to string when it saved)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

#converting embedding into torch tensor
embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device)
# Convert texts and embedding df to list of dicts
pages_and_chunks = text = text_chunks_and_embedding_df.to_dict(orient="records")

text_chunks_and_embeddings_df

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.00,"[0.06742425, 0.0902281, -0.0050954972, -0.0317..."
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.50,"[0.055215605, 0.059213936, -0.01661673, -0.020..."
2,-37,Contents Preface University of Hawai‘i at Māno...,765,113,191.25,"[0.02798019, 0.033981387, -0.020642668, 0.0019..."
3,-36,Lifestyles and Nutrition University of Hawai‘i...,940,141,235.00,"[0.06825669, 0.038127497, -0.00846855, -0.0181..."
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.50,"[0.03302645, -0.00849766, 0.009571597, -0.0049..."
...,...,...,...,...,...,...
1675,1164,Flashcard Images Note: Most images in the flas...,1298,169,324.50,"[0.018562242, -0.016427761, -0.012704563, -0.0..."
1676,1164,Hazard Analysis Critical Control Points reused...,373,49,93.25,"[0.03347209, -0.057044044, 0.015148953, -0.010..."
1677,1165,ShareAlike 11.Organs reused “Pancreas Organ An...,1277,164,319.25,"[0.077051535, 0.009785596, -0.012181752, 0.001..."
1678,1165,Sucrose reused “Figure 03 02 05” by OpenStax B...,408,57,102.00,"[0.103045136, -0.016470214, 0.008268454, 0.037..."


In [28]:
embeddings.shape

torch.Size([1680, 768])

### Retrival is done by following steps:

Define a query string.
Turn the query string in an embedding with same model we used to embed our text chunks.
Perform a dot product or cosine similarity function between the text embeddings and the query embedding to get similarity scores.
Sort the results from step 3 in descending order (a higher score means more similarity in the eyes of the model) and use these values to inspect the texts.

In [29]:
from sentence_transformers import util

query = "macronutrients functions"
print(f"Query : {query}")

query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")

dot_scores = util.dot_score(query_embedding, embeddings)[0]

top_results = torch.topk(dot_scores, k=5)
top_results

Query : macronutrients functions


torch.return_types.topk(
values=tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473], device='cuda:0'),
indices=tensor([42, 47, 41, 51, 46], device='cuda:0'))

In [30]:
for score, idx in zip(top_results[0], top_results[1]):
    print(f"Score: {score:.4f}")
    print("Text")
    print(pages_and_chunks[idx]["sentence_chunk"])
    print("\n\n")

Score: 0.6926
Text
Macronutrients Nutrients that are needed in large amounts are called macronutrients.There are three classes of macronutrients: carbohydrates, lipids, and proteins.These can be metabolically processed into cellular energy.The energy from macronutrients comes from their chemical bonds.This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions.A unit of measurement of food energy is the calorie.On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand.A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels.Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carbon, hydrogen, and oxygen.



In [31]:
import time
import torch
from sentence_transformers import util

def brute_force_search_with_timing(query: str, k: int = 5):
    t0 = time.perf_counter()

    # 1) Query embedding time
    t_embed_start = time.perf_counter()
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    query_embedding = query_embedding.to(embeddings.device)  # same device as embeddings
    t_embed_end = time.perf_counter()

    # 2) Similarity computation time (dot over all chunks)
    t_score_start = time.perf_counter()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]  # shape: (N,)
    t_score_end = time.perf_counter()

    # 3) Top-k time
    t_topk_start = time.perf_counter()
    top_results = torch.topk(dot_scores, k=k)
    t_topk_end = time.perf_counter()

    # Total
    t_total = time.perf_counter() - t0

    return {
        "top_results": top_results,
        "timing": {
            "embedding_time_ms": (t_embed_end - t_embed_start) * 1000,
            "scoring_time_ms": (t_score_end - t_score_start) * 1000,
            "topk_time_ms": (t_topk_end - t_topk_start) * 1000,
            "total_time_ms": t_total * 1000,
            "num_vectors": int(embeddings.shape[0]),
            "dim": int(embeddings.shape[1]),
            "device": str(embeddings.device),
        }
    }

# Example
query = "macronutrients functions"
out = brute_force_search_with_timing(query, k=5)

print("Timing (ms):")
for k, v in out["timing"].items():
    if isinstance(v, float):
        print(f"{k}: {v:.2f}")
    else:
        print(f"{k}: {v}")

print("\nTop-k indices:", out["top_results"].indices.tolist())
print("Top-k scores:", out["top_results"].values.tolist())

Timing (ms):
embedding_time_ms: 10.58
scoring_time_ms: 0.07
topk_time_ms: 0.04
total_time_ms: 10.69
num_vectors: 1680
dim: 768
device: cuda:0

Top-k indices: [42, 47, 41, 51, 46]
Top-k scores: [0.6925809383392334, 0.6738272905349731, 0.6646263003349304, 0.6536346673965454, 0.6472818851470947]


In [32]:
def retrieve_relevant_resources(query: str, n_resources_to_return: int=5):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")

    dot_scores = util.dot_score(query_embedding, embeddings)[0]

    scores, indices = torch.topk(dot_scores, k=n_resources_to_return)
    
    return scores, indices

In [33]:
retrieve_relevant_resources(query)

(tensor([0.6926, 0.6738, 0.6646, 0.6536, 0.6473], device='cuda:0'),
 tensor([42, 47, 41, 51, 46], device='cuda:0'))

In [34]:
def print_top_results_and_scores(query: str, n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.
    """
    scores, indices = retrieve_relevant_resources(query, n_resources_to_return=n_resources_to_return)
    for score, idx in zip(scores, indices):
        print(f"Score: {score:.4f}")
        print("Text")
        print(pages_and_chunks[idx]["sentence_chunk"])
        print("\n\n")
print_top_results_and_scores(query)        

Score: 0.6926
Text
Macronutrients Nutrients that are needed in large amounts are called macronutrients.There are three classes of macronutrients: carbohydrates, lipids, and proteins.These can be metabolically processed into cellular energy.The energy from macronutrients comes from their chemical bonds.This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions.A unit of measurement of food energy is the calorie.On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand.A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels.Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carbon, hydrogen, and oxygen.



## Using FAISS Vector Database

### Load CSV + parse embeddings + build FAISS index

In [35]:
import pandas as pd
import numpy as np
import faiss

save_path = "text_chunks_and_embeddings_df.csv"
df = pd.read_csv(save_path)

# ✅ Convert embedding string -> np.array
# Your CSV embedding looks like: "[0.1 0.2 ...]" (space-separated)
df["embedding"] = df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# ✅ Stack into (N, D)
embeddings = np.vstack(df["embedding"].values).astype("float32")
N, D = embeddings.shape
print("Embeddings shape:", embeddings.shape)

# ✅ Normalize for cosine similarity
# (cosine similarity == inner product when vectors are L2-normalized)
faiss.normalize_L2(embeddings)

# ✅ Build FAISS index (exact search)
index = faiss.IndexFlatIP(D)   # IP on normalized vectors => cosine similarity
index.add(embeddings)

print("FAISS index size:", index.ntotal)

Embeddings shape: (1680, 768)
FAISS index size: 1680


### Save FAISS index + metadata (so you can reload later)

FAISS saves only vectors/index. Your text+page metadata should be saved separately.

In [36]:
# Save FAISS index
faiss.write_index(index, "chunks.index")

# Save metadata (everything except embedding is enough)
# Keep embedding too if you want; but it's optional once index is saved.
df.to_parquet("chunks_metadata.parquet", index=False)

print("Saved: chunks.index + chunks_metadata.parquet")


Saved: chunks.index + chunks_metadata.parquet


#### Load FAISS index later (without rebuilding)

In [37]:
# import faiss
# import pandas as pd

# index = faiss.read_index("chunks.index")
# df = pd.read_parquet("chunks_metadata.parquet")

# print("Loaded index size:", index.ntotal)

In [38]:
import time
from sentence_transformers import SentenceTransformer
import faiss

embedding_model = SentenceTransformer("all-mpnet-base-v2")

def search_with_timing(query: str, top_k: int = 5):
    t0 = time.perf_counter()

    # 1️⃣ Query embedding time
    t_embed_start = time.perf_counter()
    q_emb = embedding_model.encode([query]).astype("float32")
    faiss.normalize_L2(q_emb)
    t_embed_end = time.perf_counter()

    # 2️⃣ FAISS search time
    t_search_start = time.perf_counter()
    scores, ids = index.search(q_emb, top_k)
    t_search_end = time.perf_counter()

    # 3️⃣ Mapping time (negligible but measured)
    t_map_start = time.perf_counter()
    results = []
    for score, idx in zip(scores[0], ids[0]):
        row = df.iloc[int(idx)]
        results.append({
            "score": float(score),
            "page_number": int(row.get("page_number", -1)),
            "text": row["sentence_chunk"]
        })
    t_map_end = time.perf_counter()

    t_total = time.perf_counter() - t0

    return {
        "results": results,
        "timing": {
            "embedding_time_ms": (t_embed_end - t_embed_start) * 1000,
            "faiss_search_time_ms": (t_search_end - t_search_start) * 1000,
            "mapping_time_ms": (t_map_end - t_map_start) * 1000,
            "total_time_ms": t_total * 1000
        }
    }


In [39]:
query = "macronutrients functions"

out = search_with_timing(query, top_k=5)

print("Timing (ms):")
for k, v in out["timing"].items():
    print(f"{k}: {v:.2f}")

for r in out["results"]:
    print("\nScore:", r["score"], "| Page:", r["page_number"])
    print(r["text"][:200])

Timing (ms):
embedding_time_ms: 7.19
faiss_search_time_ms: 0.60
mapping_time_ms: 0.32
total_time_ms: 8.11

Score: 0.6925809383392334 | Page: 5
Macronutrients Nutrients that are needed in large amounts are called macronutrients.There are three classes of macronutrients: carbohydrates, lipids, and proteins.These can be metabolically processed 

Score: 0.6738272905349731 | Page: 8
Water There is one other nutrient that we must have in large quantities: water.Water does not contain carbon, but is composed of two hydrogens and one oxygen per molecule of water.More than 60 percent

Score: 0.6646261811256409 | Page: 4
Learning Objectives By the end of this chapter, you will be able to: • Describe basic concepts in nutrition • Describe factors that affect your nutritional needs • Describe the importance of research 

Score: 0.6536346673965454 | Page: 11
Vitamins Major Functions Water-soluble Thiamin (B1) Coenzyme, energy metabolism assistance Riboflavin (B2 ) Coenzyme, energy metabolism assist

#### Time only FAISS search (exclude embedding + mapping)

In [40]:
import time, numpy as np, faiss

q = embedding_model.encode([query]).astype("float32")
faiss.normalize_L2(q)

t0 = time.perf_counter()
scores, ids = index.search(q, 5)
t1 = time.perf_counter()

print("FAISS search only ms:", (t1 - t0) * 1000)

FAISS search only ms: 0.512272999912966


#### Time only GPU dot score (exclude embedding)

In [41]:
import time
q_t = embedding_model.encode(query, convert_to_tensor=True).to(embeddings.device)

t0 = time.perf_counter()
dot_scores = util.dot_score(q_t, embeddings)[0]
topk = torch.topk(dot_scores, k=5)
t1 = time.perf_counter()

print("GPU score+topk only ms:", (t1 - t0) * 1000)

GPU score+topk only ms: 4.455233000044245


Key difference in one line

FAISS → directly searches top-k efficiently,FAISS is just extremely optimized C++ (vectorized CPU instructions, good memory layout, multithreading), so it’s fast even though it’s still brute-force.

GPU dot score → computes all N scores, then filters top-k

### Build FAISS directly (no CSV)
1) Prepare text + embeddings

In [43]:
import numpy as np
import faiss

# texts
texts = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

# embeddings matrix: (N, D)
embeddings = embedding_model.encode(
    texts,
    batch_size=16,
    show_progress_bar=True
).astype("float32")

# normalize for cosine similarity
faiss.normalize_L2(embeddings)

N, D = embeddings.shape
print("Embeddings shape:", embeddings.shape)

Batches: 100%|██████████| 105/105 [00:19<00:00,  5.39it/s]

Embeddings shape: (1680, 768)





2) Create FAISS index + add vectors

In [44]:
index = faiss.IndexFlatIP(D)  # cosine (because normalized) using inner product
index.add(embeddings)

print("Total vectors in index:", index.ntotal)

Total vectors in index: 1680


3) Search

In [46]:
def faiss_search(query: str, top_k: int = 5):
    q = embedding_model.encode([query]).astype("float32")
    faiss.normalize_L2(q)

    scores, ids = index.search(q, top_k)

    results = []
    for score, idx in zip(scores[0], ids[0]):
        results.append({
            "score": float(score),
            "text": texts[int(idx)]
        })
    return results

print(faiss_search("macronutrients functions", top_k=5))


[{'score': 0.6925808191299438, 'text': 'Macronutrients Nutrients that are needed in large amounts are called macronutrients.There are three classes of macronutrients: carbohydrates, lipids, and proteins.These can be metabolically processed into cellular energy.The energy from macronutrients comes from their chemical bonds.This chemical energy is converted into cellular energy that is then utilized to perform work, allowing our bodies to conduct their basic functions.A unit of measurement of food energy is the calorie.On nutrition food labels the amount given for “calories” is actually equivalent to each calorie multiplied by one thousand.A kilocalorie (one thousand calories, denoted with a small “c”) is synonymous with the “Calorie” (with a capital “C”) on nutrition food labels.Water is also a macronutrient in the sense that you require a large amount of it, but unlike the other macronutrients, it does not yield calories. Carbohydrates Carbohydrates are molecules composed of carbon, hy

In production, people often save:

FAISS index (faiss.write_index)

metadata (Parquet/JSON)

In [None]:
# faiss.write_index(index, "chunks.index")
# # save texts/metadata separately (parquet/json)
