 **Create and Run a local RAG pipeline from scratch**

## 1. Document/text preprocessing and embedding creation

In [3]:
import os
import requests

# Get PDF document path
pdf_path = "human-nutrition-text.pdf"

# Download
if not os.path.exists(pdf_path):
    print(f"[INFO] File doesn't exist, downloading...")

    # Enter the URL of the PDF
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    # THe Local filename to save the downloaded file
    filename = pdf_path

    # Send a GET request to the URL
    responce = requests.get(url)

    # Check if the request was successfull
    if responce.status_code == 200:
        # Open the file and save it
        with open(filename , 'wb') as file:
            file.write(responce.content)
        print(f"[INFO] The file has been download and saved as {filename}")
    else:
        print(f"[INFO] Falied to download the file . Status code {responce.status_code}")

else:
    print(f"File {pdf_path} exists.")


File human-nutrition-text.pdf exists.


In [1]:
pip install grandalf

Collecting grandalf
  Using cached grandalf-0.8-py3-none-any.whl.metadata (1.7 kB)
Using cached grandalf-0.8-py3-none-any.whl (41 kB)
Installing collected packages: grandalf
Successfully installed grandalf-0.8
Note: you may need to restart the kernel to use updated packages.


In [4]:
import fitz
from tqdm import tqdm


def text_formaatter(text: str) -> str:
    """ Performs minor formatting on text """
    cleaned_text = text.replace("\n" , " ").strip()

    # Pottentially more text formating functions can go here
    return cleaned_text

def open_and_read_pdf(pdf_path:str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for page_number , page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formaatter(text=text)
        pages_and_text.append({"page_number" : page_number - 41 , 
                               "page_char_count": len(text) , 
                               "page_word_count": len(text.split(" ")) , 
                               "page_sentence_count_raw": len(text.split(". ")) , 
                               "page_token_count": len(text) / 4  , # 1 token = 4 character.
                               "text": text})
    
    return pages_and_text

pages_and_text = open_and_read_pdf(pdf_path=pdf_path)
pages_and_text[:2]


1208it [00:01, 901.72it/s]


[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [5]:
import random
random.sample(pages_and_text , k = 4)

[{'page_number': 990,
  'page_char_count': 1027,
  'page_word_count': 178,
  'page_sentence_count_raw': 6,
  'page_token_count': 256.75,
  'text': 'for diabetics, cancer patients, people who have liver disease, and  people who have stomach problems as a result of low stomach  acid or previous stomach surgery. People in all of these groups  should handle food carefully, make sure that what they eat has been  cooked thoroughly, and avoid taking any chances that could lead to  exposure.  Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop

In [6]:
import pandas as pd 

df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [7]:
df.describe()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.004139,198.299669,9.972682,287.001035
std,348.86387,560.382275,95.759336,6.187226,140.095569
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.875
75%,864.25,1603.5,271.0,14.0,400.875
max,1166.0,2308.0,429.0,32.0,577.0


**Further text processing (splitting pages into sentences)**

In [26]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This is another sentenece. I like elephants.")
assert len(list(doc.sents)) == 3

# Print out our sentence split
list(doc.sents)

[This is a sentence., This is another sentenece., I like elephants.]

In [27]:
for item in tqdm(pages_and_text):
    item['sentences'] = list(nlp(item["text"]).sents)

    # Make sure all sentences are string (the default type is a spacyy datatype)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the senteneces
    item["page_sentence_count_spacy"] = len(item["sentences"])


100%|██████████| 1208/1208 [00:01<00:00, 964.41it/s] 


In [28]:
random.sample(pages_and_text , k = 1)

[{'page_number': 470,
  'page_char_count': 832,
  'page_word_count': 139,
  'page_sentence_count_raw': 5,
  'page_token_count': 208.0,
  'text': 'Learning Activities  Technology Note: The second edition of the Human  Nutrition Open Educational Resource (OER) textbook  features interactive learning activities.\xa0 These activities are  available in the web-based textbook and not available in the  downloadable versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities may be used across various mobile  devices, however, for the best user experience it is strongly  recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=301  \xa0 An interactive or media element has been  excluded from this version of the text. You can  470  |  The Atom',
  'sentenc

In [29]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Chucking our sentence together

In [30]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function to split a list of texts recursively into chunk size

def split_list(input_list: list[str] , slice_size : int = num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i : i + slice_size] for i in range(0 , len(input_list) , slice_size)]

test_list = list(range(25))
split_list(test_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [31]:
# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"] , slice_size=num_sentence_chunk_size)

    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1208/1208 [00:00<00:00, 677097.32it/s]


In [32]:
random.sample(pages_and_text , k = 1)

[{'page_number': 593,
  'page_char_count': 1359,
  'page_word_count': 224,
  'page_sentence_count_raw': 9,
  'page_token_count': 339.75,
  'text': 'Antioxidants  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  The market is flooded with advertisements for “super antioxidant”  supplements teeming with molecules that block free radical  production, stimulate the immune system, prevent cancer, and  reduce the signs of aging. Based on the antioxidant-supplement  industry’s success, the general public appears to believe these  health claims. However, these claims are not backed by scientific  evidence; rather, there is some evidence suggesting supplements  can actually cause harm. While scientists have found evidence  supporting the consumption of antioxidant-rich foods as a method  of reducing the risk of chronic disease, there is no “miracle cure”;  no pill or supplement alone can provide the same benefits as a  healthy diet. Remember,

In [33]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


### Splitting each chunk into its own item

In [34]:
import re

# Split each chunk into own item
pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  " , " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])' , r'. \1' , joined_sentence_chunk) # ".A" => ". A" (will work for any capital letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

100%|██████████| 1208/1208 [00:00<00:00, 18856.14it/s]


1843

In [35]:
random.sample(pages_and_chunks , k = 1)

[{'page_number': 595,
  'sentence_chunk': 'Antioxidant Antioxidant Source Antioxidant Function Vitamin A Karat banana, beef liver, chicken liver Protects cellular membranes, prevents glutathione depletion, maintains free radical detoxifying enzyme systems, reduces inflammation Vitamin E Sunflower seeds, almonds, sunflower oil Protects cellular membranes,\xa0 prevents glutathione depletion Vitamin C Oranges, grapefruit Protects DNA, RNA, proteins, and lipids, aids in regenerating vitamin E Vitamin D Swordfish, salmon, tuna fish canned in water and drained Regulates blood calcium levels in concert with parathyroid hormone Carotenoids Pumpkin, carrots Free radical scavenger Learning Activities Technology Note: The second edition of the Human Nutrition Open Educational Resource (OER) textbook features interactive learning activities.\xa0 These activities are available in the web-based textbook and not available in the downloadable versions (EPUB, Digital PDF, Print_PDF, or Open Document). 

In [36]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


### Filter chunks of text for short chunks

In [37]:
min_token_length = 30

In [38]:
# Filter our DataFrame for rows with under 30 tokens
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient = "records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

In [39]:
random.sample(pages_and_chunks_over_min_token_len , k = 1)

[{'page_number': 343,
  'sentence_chunk': 'Check the ingredient list, especially the first three to four ingredients, for telltale signs of hydrogenated fat such as partially or fractionated hydrogenated oil. The higher up the words “partially hydrogenated oil” are on the list of ingredients, the more trans fat the product contains. Measure out one serving and eat one serving only. An even better choice would be to eat a fruit or vegetable. There are no trans fats and the serving size is more reasonable for similar calories. Fruits and vegetables are packed with water, fiber, and many vitamins, Lipids and the Food Industry | 343',
  'chunk_char_count': 593,
  'chunk_word_count': 99,
  'chunk_token_count': 148.25}]

## Embedding our text chunks

In [42]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path = "all-mpnet-base-v2" , device = "mps")

# Create a list of sentences
sentences = ["The Sentence Transformer library provides an easy way to create embeddings.",
"Sentences can be embedded one by one or in a list.",
"I like horses!"]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences , embeddings))

# See the embeddings
for sentence , embedding in embeddings_dict.items():
    print(f"Sentences: {sentences}")
    print(f"Embedding: {embedding}")
    print("")

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [43]:
embeddings[0].shape

NameError: name 'embeddings' is not defined

In [44]:
%%time

embedding_model.to("mps")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

NameError: name 'embedding_model' is not defined

In [45]:
%%time

text_chunks = [item['sentence_chunk'] for item in pages_and_chunks_over_min_token_len]
text_chunks[523]

CPU times: user 316 μs, sys: 1.26 ms, total: 1.57 ms
Wall time: 1.63 ms


'through food. Genetic factors may also influence the way a person’s body modifies cholesterol. The 2015-2020 US Dietary Guidelines suggest limiting saturated fats, thereby indirectly limiting dietary cholesterol since foods that are high in cholesterol tend to be high in saturated fats also. A Prelude to Disease If left unchecked, improper dietary fat consumption can lead down a path to severe health problems. An increased level of lipids, triglycerides, and cholesterol in the blood is called hyperlipidemia. Hyperlipidemia is inclusive of several conditions but more commonly refers to high cholesterol and triglyceride levels. When blood lipid levels are high, any number of adverse health problems may ensue. Consider the following: • Cardiovascular disease. According to the AHA, cardiovascular disease encompasses a variety of problems, many of which are related to the process of atherosclerosis. Over time the arteries thicken and harden with plaque buildup, causing restricted or at tim

In [46]:
len(text_chunks)

1680

In [47]:
%%time

# Embed all texts in batches
text_chunks_embeddings = embedding_model.encode(text_chunks , batch_size=32 ,convert_to_tensor=True)

text_chunks_embeddings


NameError: name 'embedding_model' is not defined

# Save embeddings to file


In [48]:
pages_and_chunks_over_min_token_len[412]

{'page_number': 273,
 'sentence_chunk': 'Foods Total Carbohydrates Sugars Fiber Added Sugars Banana 27 (1 medium) 14.40 3.1 0 Lentils 40 (1 c.) 3.50 16.0 0 Snap beans 8.7 (1 c.) 1.60 4.0 0 Green pepper 5.5 (1 medium) 2.90 2.0 0 Corn tortilla 10.7 (1) 0.20 1.5 0 Bread, wheat bran 17.2 (1 slice) 3.50 1.4 3.4 Bread, rye 15.5 (1 slice) 1.20 1.9 1.0 Bagel (plain) 53 (1 medium) 5.30 2.3 4.8 Brownie 36 (1 square) 20.50 1.2 20.0 Oatmeal cookie 22.3 (1 oz.) 12.00 2.0 7.7 Cornflakes 23 (1 c.) 1.50 0.3 1.5 Pretzels 47 (10 twists) 1.30 1.7 0 Popcorn (homemade) 58 (100 g) 0.50 10.0 0 Skim milk 12 (1 c.) 12.00 0 0 Cream (half and half) 0.65 (1 Tbs.) 0.02 0 0 Cream substitute 1.0 (1 tsp.) 1.00 0 1.0 Cheddar cheese 1.3 (1 slice) 0.50 0 0 Yogurt (with fruit) 32.3 (6 oz.) 32.30 0 19.4 Caesar dressing 2.8 (1 Tbs.) 2.80 0 2.4 Sources: • National Nutrient Database for Standard Reference. US Department of Agriculture.http://www.nal.usda.gov/fnic/ foodcomp/search/. Updated December 7, 2011. Accessed Septembe

In [49]:
# Save embeddings to file 
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path , index=False)

In [50]:
# Import saved file and view
embeddings_df_save_path = "/Users/pandhari/Desktop/NLP/RAG/text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5


# RAG - Search and Answer

### Similarity search

In [51]:
import random
import torch
import numpy as np
import pandas as pd

device = "mps"

# Import texts and embedding df
text_chunks_and_embeddings_df = pd.read_csv("/Users/pandhari/Desktop/NLP/RAG/text_chunks_and_embeddings_df.csv")

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient= "records")

# Convert emedding column back to np.array (it got converted to string when it saved to CSV)
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]") , sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: Numpy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df["embedding"].tolist()) , dtype=torch.float32).to(device=device)
embeddings.shape


KeyError: 'embedding'

In [None]:
# Create model
from sentence_transformers import util , SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2" , device=device)

In [None]:
# 1. Define the query
query = "good foods for protein"
print(f"Query: {query}")

# 2. Embed the query
# Note: it's important to embed you query with the same model you embedding your passages
query_embedding = embedding_model.encode(query , convert_to_tensor=True)
query_embedding = query_embedding.to(device=device)

# 3. Get Similaity socre with the dot product (use cosine similarity if outputs of model aren't normalize)
from time import perf_counter as timer

start_time = timer()
dot_score = util.dot_score(a = query_embedding , b = embeddings)[0]
end_time = timer()

print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds")

# 4. Get the top-k results (we'll keep top 5)
top_results = torch.topk(dot_score , k = 5)
top_results

In [None]:
pages_and_chunks[42]

In [27]:
import textwrap

def print_wrapped(text , wrap_length = 80):
    wrapped_text = textwrap.fill(text , wrap_length)
    print(wrapped_text)


In [None]:
query = "good food for protein"
print(f"Query: {query}\n")
print("Results:")
# Loop through zipped together scores and indices from torch.topk
for score , idx in zip(top_results[0] , top_results[1]):
    print(f"Score: {score:.4f}")
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print(f"Page Number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

In [None]:
import fitz # pymupdf Library

# open PDF and Load target
pdf_path = pdf_path
doc = fitz.open(pdf_path)
page = doc.load_page(411 + 41) # note: page numbers of our PDF start 41+

# Get the image of the page
img = page.get_pixmap(dpi = 300)

# Save image (optional)
# img.save("output_filename.png")
doc.close()

# Convert the pixmap to a numpy array
img_array = np.frombuffer(img.samples_mv , dtype=np.uint8).reshape(img.h , img.w , img.n)


# Display the image using MatPlotlib
import matplotlib.pyplot as plt
plt.figure(figsize=(13 , 10))
plt.imshow(img_array)
plt.title(f"Query: {query} | Most relevant page:")
plt.axis("off")
plt.show()

### Similarity measures: Dot Product and Cosine similarity

In [None]:
import torch

def dot_products(vector1 , vector2):
    return torch.dot(vector1 , vector2)

def cosine_similarity(vector1 , vector2):
    dot_product = torch.dot(vector1 , vector2)

    # Get Eucludian/L2 norm
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))


    return dot_product / (norm_vector1 * norm_vector2)

# Examples vectors/tensors
vector1 = torch.tensor([1 , 2 , 3] , dtype=torch.float32)
vector2 = torch.tensor([1 , 2 , 3] , dtype=torch.float32)
vector3 = torch.tensor([4 ,5 , 6] , dtype=torch.float32)
vector4 = torch.tensor([-1 , -2 , -3] , dtype=torch.float32)

# Calculate dot product
print("Dot Product between vector1 and vector2 " , dot_products(vector1 , vector2))
print("Dot Product between vector1 and vector3 " , dot_products(vector1 , vector3))
print("Dot Product between vector1 and vector4 " , dot_products(vector1 , vector4))

# Calculate cosine similarity
print("Cosine Similarity between vector1 and vector2 " , cosine_similarity(vector1 , vector2))
print("Cosine Similarity  between vector1 and vector3 " , cosine_similarity(vector1 , vector3))
print("Cosine Similarity  between vector1 and vector4 " , cosine_similarity(vector1 , vector4))

### Functionizing our semantic search pipeline

In [31]:
def retrieve_relevant_resources(query: str , embeddings: torch.tensor , model: SentenceTransformer = embedding_model , n_resources_to_return : int = 5 , print_time: bool = True):
    """Embeds a query with model and returns top k scores and indices from embeddings. """

    # Embed the query
    query_embedding = model.encode(query , convert_to_tensor=True)
    query_embedding = query_embedding.to("mps")

    embeddings = embeddings.to("mps")

    # Get dot products scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding , embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on ({len(embeddings)}) embeddings: {end_time - start_time:.5f} seconds.")
    
    scores , indices = torch.topk(input=dot_scores , k = n_resources_to_return)

    return scores , indices

def print_top_results_and_scores(query: str , embeddings: torch.tensor , pages_and_chunks: list[dict] = pages_and_chunks , n_resources_to_return: int = 5):
    """
    Finds relevant passages given a query and prints them out along with their scores. 
    """
    scores , indices = retrieve_relevant_resources(query=query , embeddings=embeddings , n_resources_to_return=n_resources_to_return)

    # Loop through zipped together scores and indices from torch.topk
    for score , idx in zip(scores ,indices):
        print(f"Score: {score:.4f}")
        print("Text:")
        print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
        print(f"Page Number: {pages_and_chunks[idx]['page_number']}")
        print("\n")


In [None]:
print_top_results_and_scores(query="foods high in fiber" , embeddings=embeddings)

### Getting an LLM for Local generations

In [33]:
# Get GPU available memory
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb}GB")

In [34]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")


### Loading an LLM locally

In [None]:
import torch
from transformers import AutoTokenizer , AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# 1. Create a quantization config
# Note: requires !pip install bitsandbytes accelerate
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True , bnb_4bit_compute_dtype=torch.float16)


# Bonus: flash attention 2 = faster attention mechanism
# Flash Attention 2 requires a GPU with a compute capability score of 8.0+
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability()[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa" # Scaled dot product attention

# 2. Pick a model we'd like to use
model_id = "google/gemma-7b-it"
model_id = model_id

# 3. Instantiate tokenizer (tokenizer turns text into tokens)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id , torch_dtype = torch.float16 , 
                                                 quantization_config = quantization_config if use_quantization_config else None , 
                                                 low_cpu_mem_usage = False , 
                                                 attn_implementation = attn_implementation)



if not use_quantization_config:
    llm_model.to("cuda")

In [None]:
# pip install bitsandbytes accelerate

In [None]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate model sizes
    model_mem_bytes = mem_params + mem_buffers
    model_mem_mb = model_mem_bytes / (1024**2)
    model_mem_gb = model_mem_bytes / (1024**3)

    return {"model_mem_bytes": model_mem_bytes , 
            "model_mem_mb": round(model_mem_mb , 2) , 
            "model_mem_gb": round(model_mem_gb , 2)}


get_model_mem_size(llm_model)

### Generating text with our LLM

In [None]:
input_text = "What are the macronutrients, and what roles do they play in the human body?"
print(f"Input text : \n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role":"user" , 
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template , 
                                       tokenize=False , 
                                       add_generation_prompt=True)

print(f"\n Prompt (formated): \n {prompt} ")

In [None]:
%%time

# Tokenize the input text (turn it into numbers) and send it to the GPU
input_ids = tokenizer(prompt , 
    return_tensors = "pt").to("cuda")

# Generate outputs from local LLM
outputs = llm_model.generate(**input_ids , max_new_tokens = 256)

print(f"Model output (toknes): \n {outputs[0]}\n")

In [None]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded): \n {outputs_decoded}\n")

### Augmenting our prompt with context items

In [None]:
def prompt_formatter(query:str , 
                     context_items: list[dict]) -> str:
    context = "_ " + 