In [2]:
#let's begin
import fitz
import re
from tqdm.auto import tqdm
import random
import pandas as pd
import numpy as np


In [None]:
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download the PDF only when there is no local copy yet, so re-running this
# cell does not hit the network again.
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL. requests applies no timeout by default, so
  # without one an unresponsive server would block this cell indefinitely.
  response = requests.get(url, timeout=60)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

In [5]:
def text_format(text)->str:
  """Normalise raw page text extracted from the PDF.

  Newlines become spaces and the ends are stripped, each ``+ - * / =``
  character (with any whitespace around it) is rewritten as that character
  surrounded by single spaces, and inline ``$...$`` spans are rewritten as
  ``[MATH:...]``.
  """
  flattened = text.replace("\n", " ").strip()

  # Re-space arithmetic operators, swallowing whatever whitespace was around them.
  operator_pattern = re.compile(r"(\s*)([+\-*/=])(\s*)")
  spaced = operator_pattern.sub(r" \2 ", flattened)

  # Handle inline LaTeX math (if applicable, e.g., $x^2 + y^2$)
  math_pattern = re.compile(r"\$(.*?)\$")
  return math_pattern.sub(r"[MATH:\1]", spaced)


In [7]:
def open_and_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
    """Read every page of a PDF and collect its cleaned text plus rough statistics.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        page_offset: Value subtracted from the zero-based page index to obtain
            the stored "page_number". Defaults to 41, the value that used to be
            hard-coded here.

    Returns:
        One dict per page with the keys "page_number", "page_char_count",
        "page_word_count", "page_sentence_count_raw", "page_token_count" and
        "text" (the page text after ``text_format``).
    """
    pages_and_texts = []
    doc = fitz.open(pdf_path)
    try:
        # Passing the page count lets tqdm show a real progress bar; enumerate()
        # alone has no length.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_format(page.get_text())
            pages_and_texts.append({"page_number": page_number - page_offset,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    finally:
        # Release the file handle even if reading a page fails.
        doc.close()
    return pages_and_texts



In [None]:
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

In [None]:

random.sample(pages_and_texts, k=3)

In [None]:
df=pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe()

In [None]:
df.info()

So with a rough estimate of the tokens per page we can choose an embedding model. A small one should suffice here

In [14]:
from spacy.lang.en import English

In [None]:
nlp = English()

nlp.add_pipe("sentencizer")


In [None]:
#Let's play around a bit?
doc=nlp("Sentence 1 is this one right here. What about another one? And another!")

print(list(doc.sents))

In [None]:
for item in tqdm(pages_and_texts):
    # Run the sentencizer over the page text and keep each sentence as a plain
    # string (rather than a spaCy object) to avoid any errors later on.
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Sentence count according to spaCy, stored next to the raw ". " based count.
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
random.sample(pages_and_texts, k=5)

In [19]:
df = pd.DataFrame(pages_and_texts)

In [None]:
df.head()

In [None]:
df.describe()

With about 10 sentences (10.5 on average) and about 287 tokens per page on average, grouping 10 or 11 sentences per chunk should keep us well within the input limits of quite a few lightweight but efficient models.

In [22]:
num_sentence_chunk_size = 10
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """Cut ``input_list`` into consecutive slices of at most ``slice_size`` items.

    Every slice holds ``slice_size`` items except possibly the last one, which
    holds whatever is left over.
    """
    slices = []
    for start in range(0, len(input_list), slice_size):
        slices.append(input_list[start:start + slice_size])
    return slices



In [None]:
# Group each page's sentences into chunks of at most num_sentence_chunk_size
# sentences and record how many chunks the page produced.
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

Take a look to see how it did

In [None]:
random.sample(pages_and_texts, k=1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe()

Now, instead of having multiple chunks per page, we split each chunk into its own item and list its page number along with it for identification.

In [None]:
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Join the sentences with a single space. Each sentence was stored via
        # str() of a spaCy span, which does not keep the whitespace that
        # followed it, so joining with "" fused neighbouring sentences
        # ("...the end?Next one ..."); the regex below only repairs the
        # ".<Capital>" case.
        joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
        # Still re-space ".A" -> ". A" for periods glued to a capital letter
        # inside the extracted text itself.
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        # One record per chunk: the page it came from, the text, and rough stats.
        chunk_dict = {
            "page_number": item["page_number"],
            "sentence_chunk": joined_sentence_chunk,
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            "chunk_token_count": len(joined_sentence_chunk) / 4,  # same ~4 chars per token estimate as for pages
        }

        pages_and_chunks.append(chunk_dict)


# Total number of chunks across all pages
len(pages_and_chunks)

In [None]:
random.sample(pages_and_chunks,1)

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe()

In [None]:
df.head()

In [32]:
import seaborn as sns

In [None]:
sns.scatterplot(data=df, x='page_number', y='chunk_token_count' )


A lot of chunks have quite a large number of tokens, especially towards the end. We might want to take a look at the ones below a certain threshold, say 50 (chosen by eyeballing the graph above).


In [None]:
min_token_length = 50
# Chunks at or below the threshold. Sample at most 20 of them: DataFrame.sample
# raises when asked for more rows than exist, so a fixed 20 would break this
# cell whenever fewer than 20 chunks qualify.
short_chunks = df[df["chunk_token_count"] <= min_token_length]
for row in short_chunks.sample(min(20, len(short_chunks))).iterrows():
    # iterrows() yields (index, row) pairs, hence row[1]
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

OK, so a lot of this seems to be pretty irrelevant for when we use this to chat with our PDF, but some of it is part of the actual text we need, so maybe we can set a tighter threshold and then get rid of most of this useless jargon.

In [None]:
min_token_length = 25
# Chunks at or below the threshold. Sample at most 20 of them: DataFrame.sample
# raises when asked for more rows than exist, so a fixed 20 would break this
# cell whenever fewer than 20 chunks qualify.
short_chunks = df[df["chunk_token_count"] <= min_token_length]
for row in short_chunks.sample(min(20, len(short_chunks))).iterrows():
    # iterrows() yields (index, row) pairs, hence row[1]
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

looks to be more of irrelevant text with this threshold


In [None]:
min_token_length = 30
# Chunks at or below the threshold. Sample at most 20 of them: DataFrame.sample
# raises when asked for more rows than exist, so a fixed 20 would break this
# cell whenever fewer than 20 chunks qualify.
short_chunks = df[df["chunk_token_count"] <= min_token_length]
for row in short_chunks.sample(min(20, len(short_chunks))).iterrows():
    # iterrows() yields (index, row) pairs, hence row[1]
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Even 30 looks to be a good threshold and helps remove more jargon, but it comes with a tiny bit of information loss.

In [37]:
# Keep only the chunks whose estimated token count is above whatever
# min_token_length currently holds (30 if the cells above were run in order)
# and turn the filtered frame back into a list of dicts.
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")


In [None]:
pages_and_chunks_over_min_token_len[:2]


Now we shall move on to embedding these chunks so that they can be passed onward

We'll use one from Hugging Face.

In [None]:
from sentence_transformers import SentenceTransformer

Choose this for a good mix of efficiency and accuracy

In [None]:

model_name = "all-distilroberta-v1"
embedding_model = SentenceTransformer(model_name_or_path=model_name, device="cpu")



let's test how this works

In [None]:
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
    print(embedding.shape)


Might as well see how long it takes

In [None]:
%%time



# Make sure the model sits on the CPU for this timing run.
embedding_model.to("cpu")

# Embed the chunks one at a time, storing each vector on its own record.
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

too slow!


In [None]:
# Same per-chunk embedding loop as the CPU run, this time with the model
# moved to the GPU; it overwrites the "embedding" entry on every record.
embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

This right here is why I got a better device

In [44]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

Look at batching to make this faster

In [None]:
text_chunk_embeddings = embedding_model.encode(text_chunks,batch_size=16,convert_to_tensor=True) 

text_chunk_embeddings

slight upgrade even in this small sample size

Now to save these so we don't waste time generating them every time we boot this up.

In [49]:
# Persist the chunk records (including each chunk's "embedding" entry) as a
# CSV file, without the DataFrame index.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

To check if it saves and loads well..

In [None]:
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

We can save these in arrays or tensors or such for small datasets like we have here but for scaling using a vector db would be the optimal approach

In [2]:
import torch
import pandas as pd
import numpy as np

In [3]:
# Prefer the GPU when one is available and fall back to the CPU otherwise, so
# the retrieval cells below also run on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Reload the chunks and embeddings that were saved to CSV earlier.
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# The CSV holds each embedding as text: strip the surrounding brackets and
# parse the space-separated numbers back into a numpy array.
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Back to a list of dicts, one per chunk.
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Stack all embeddings into a single float32 tensor and move it to `device`.
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

We loaded the data, converted the embedding column back into numpy arrays, and turned the DataFrame into a list of dicts (which is what we started with); then we converted the embeddings to a torch tensor and sent it to the device.

In [None]:
text_chunks_and_embedding_df.head()

Now let's define our embedding model again, as the earlier definition is way up and we want to be able to start working from a few cells back.

In [6]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-distilroberta-v1", device=device)

We want to perform a semantic search on the file.

To do this we 
1. Define a query
2. Turn the query string into an embedding with the same model we used to embed our text chunks
3. Perform a dot product or cosine similarity function between the text embeddings and the query embedding to get similarity scores
4. Sort the results from step 3 in descending order

In [None]:
# 1. Define the query
query = "macronutrients functions"
print(f"Query: {query}")


# 2. Embed the query with the same model used for the text chunks
query_embedding = embedding_model.encode(query, convert_to_tensor=True)


from time import perf_counter as timer

# 3. Dot-product scores between the query and every chunk embedding, timed
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")


# 4. Keep the five highest scores together with their positions
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

In [None]:
# Simulate a corpus 100x the size of ours with random vectors. The vector width
# is read from the real embeddings instead of being hard-coded, so the dot
# product still lines up if a different embedding model is used.
larger_embeddings = torch.randn(100*embeddings.shape[0], embeddings.shape[1]).to(device)
print(f"Embeddings shape: {larger_embeddings.shape}")


# Time the same dot-product scoring on the larger matrix.
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

Pretty quick. Might need to look into vector DB later 

In [9]:
import textwrap

Use this function to simulate word wrapping to make it more readable

In [10]:


def print_wrapped(text, wrap_length=80):
    """Print ``text`` re-flowed so that no line is longer than ``wrap_length`` characters."""
    print(textwrap.fill(text, width=wrap_length))

Now we can finally print out the relevant text portions to see the results of the similarity score matching

In [None]:
print(f"Query: '{query}'\n")
print("Results:")
# torch.topk returned (values, indices); walk the two in parallel and look up
# the chunk record each index points at.
for score, idx in zip(top_results_dot_product.values, top_results_dot_product.indices):
    print(f"Score: {score:.4f}")
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Cool so now we can see it works quite well. We can also inspect the page numbers to see more

In [14]:
import fitz

In [None]:
# Render one page of the PDF as an image to inspect a retrieved result in context.
pdf_path = "human-nutrition-text.pdf" 
doc = fitz.open(pdf_path)
# Stored page numbers are the PDF page index minus 41 (see open_and_read_pdf),
# so 41 is added back here. The 7 is hard-coded -- presumably the page number
# of a top result from the query above; adjust as needed.
page = doc.load_page(7 + 41) 
img = page.get_pixmap(dpi=300)


doc.close()

# View the pixmap's sample buffer as a uint8 array shaped (img.h, img.w, img.n).
img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))

import matplotlib.pyplot as plt
plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis('off') 
plt.show()

Now ideally we should be using a cosine similarity search but our embedding model gives normalized outputs as is so a dot product suffices and is also more efficient 

In [19]:
def retrieve_relevant_resources(query: str,embeddings: torch.Tensor,model: SentenceTransformer=embedding_model,n_resources_to_return: int=5,print_time: bool=True):
    """Embed ``query`` and return the best-matching rows of ``embeddings``.

    Args:
        query: Text to search for.
        embeddings: Tensor with one embedding per row, scored against the query
            with a dot product.
        model: Model used to embed the query. The default is the
            ``embedding_model`` object that existed when this cell was run.
        n_resources_to_return: Maximum number of results to return.
        print_time: Whether to print how long the scoring took.

    Returns:
        ``(scores, indices)`` as returned by ``torch.topk``: the highest dot
        product scores and the row positions they belong to.
    """
    # Imported locally so the function does not depend on the `timer` alias
    # that is only created inside an earlier, unrelated cell.
    from time import perf_counter

    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Get dot product scores on embeddings
    start_time = perf_counter()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = perf_counter()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    # torch.topk raises when k is larger than the number of scores, so cap it.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores, k=k)

    return scores, indices

In [20]:
def print_top_results_and_scores(query: str,embeddings: torch.tensor,pages_and_chunks: list[dict]=pages_and_chunks,n_resources_to_return: int=5):
    """Retrieve the top matches for ``query`` and print each one.

    For every result the score, the wrapped chunk text and the chunk's page
    number are printed. ``pages_and_chunks`` is indexed with the positions
    returned by ``retrieve_relevant_resources``.
    """
    top_scores, top_indices = retrieve_relevant_resources(query=query,embeddings=embeddings,n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    for rank in range(len(top_scores)):
        hit = pages_and_chunks[top_indices[rank]]
        print(f"Score: {top_scores[rank]:.4f}")
        print_wrapped(hit["sentence_chunk"])
        print(f"Page number: {hit['page_number']}")
        print("\n")

Now lets test these helper functions to see if we get the expected output

In [None]:
query = "symptoms of pellagra"

scores, indices = retrieve_relevant_resources(query=query,embeddings=embeddings)
scores, indices

In [None]:
print_top_results_and_scores(query=query,embeddings=embeddings)

In [28]:
model_id='google/gemma-2b-it'
use_quantization_config = True

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 


from transformers import BitsAndBytesConfig
# 4-bit weights with float16 compute; only handed to the model below when
# use_quantization_config is True.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)


# Use Flash Attention 2 only when the package is available and the GPU reports
# compute capability >= 8. CUDA availability is checked first because
# get_device_capability() fails when there is no CUDA device.
if is_flash_attn_2_available() and torch.cuda.is_available() and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")


print(f"[INFO] Using model_id: {model_id}")

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16, 
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=True, 
                                                 attn_implementation=attn_implementation) 

# Without quantization nothing above places the model on the GPU, while the
# generation cells send their inputs to "cuda" -- move it explicitly.
# NOTE(review): the quantized path is expected to be placed on the GPU by the
# loader itself; confirm for the installed transformers/bitsandbytes versions.
if not use_quantization_config:
    llm_model.to("cuda")



In [None]:
llm_model


In [None]:
def get_model_num_params(model: torch.nn.Module):
    """Return the total number of elements across all of ``model``'s parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

get_model_num_params(llm_model)

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Report the memory taken by a PyTorch model's parameters and buffers.

    Returns a dict holding the size in bytes ("model_mem_bytes") plus the same
    size in megabytes ("model_mem_mb") and gigabytes ("model_mem_gb"), both
    rounded to two decimals.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    def _total_bytes(tensors):
        # element count * bytes per element, summed over every tensor
        return sum(t.nelement() * t.element_size() for t in tensors)

    total_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())

    bytes_per_mb = 1024**2
    bytes_per_gb = 1024**3

    return {"model_mem_bytes": total_bytes,
            "model_mem_mb": round(total_bytes / bytes_per_mb, 2),
            "model_mem_gb": round(total_bytes / bytes_per_gb, 2)}

get_model_mem_size(llm_model)

Now to generate text using our llm

In [None]:
# A plain question, asked without any retrieved context.
input_text = "What are the macronutrients, and what roles do they play in the human body?"
print(f"Input text:\n{input_text}")

# Wrap the question as a single user turn.
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Let the tokenizer render the conversation with its chat template; tokenize=False
# returns the formatted text rather than token ids.
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,tokenize=False, add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

In [None]:
%%time

# Tokenize the formatted prompt into PyTorch tensors and move them to the GPU.
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate at most 256 new tokens for the prompt.
outputs = llm_model.generate(**input_ids,max_new_tokens=256) 
print(f"Model output (tokens):\n{outputs[0]}\n")

In [None]:
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Might wanna do a little formatting to make it look better

In [None]:
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

In [37]:
chatgpt_questions = [
    "What are the macronutrients, and what roles do they play in the human body?",
    "How do vitamins and minerals differ in their roles and importance for health?",
    "Describe the process of digestion and absorption of nutrients in the human body.",
    "What role does fibre play in digestion? Name five fibre containing foods.",
    "Explain the concept of energy balance and its importance in weight management."
]

manual_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the RDI for protein per day?",
    "water soluble vitamins"
]

query_list = chatgpt_questions + manual_questions

In [38]:
import random

In [None]:
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

Now to augment our prompt to the llm with this data


In [None]:
def prompt_formatter(query: str, context_items: list[dict]) -> str:
    """Build the chat-formatted prompt that combines retrieved context with the query.

    Args:
        query: The user's question.
        context_items: Chunk records; only each record's "sentence_chunk" text is used.

    Returns:
        The instruction template filled with the context and the query, wrapped
        as a single user message and rendered to text by the module-level
        ``tokenizer``'s chat template (with the generation prompt appended).
    """

    # One "- " bullet per context chunk
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

 
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Fill the {context} and {query} placeholders of the template
    base_prompt = base_prompt.format(context=context, query=query)

    # Wrap the filled template as a single user turn
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Render the conversation to text (no tokenization) with the generation prompt appended
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [None]:
# Pick a random query from the test list
query = random.choice(query_list)
print(f"Query: {query}")

# Scores and indices of the most similar chunks
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
    
# Look up the chunk records the indices point at
context_items = [pages_and_chunks[i] for i in indices]

# Build the augmented prompt from the query and the retrieved chunks
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

In [None]:
%%time

# Tokenize the augmented prompt and move the tensors to the GPU.
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Sample up to 256 new tokens from the model.
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True,
                             max_new_tokens=256) 

# Decode the full sequence back to text.
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
# Strip the prompt text from the decoded output before printing.
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

In [None]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """Answer ``query`` with the LLM, using retrieved chunks as context.

    Args:
        query: The user's question.
        temperature: Sampling temperature passed to ``generate``.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: When True, strip the prompt, the "<bos>"/"<eos>"
            markers and a stock introductory sentence from the decoded output.
        return_answer_only: When True return only the answer text; otherwise
            return ``(answer_text, context_items)``.

    Returns:
        The answer text, or a tuple of the answer text and the list of context
        records (each carrying a "score" entry) that were put in the prompt.
    """
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Copy each record before attaching its score, so the shared
    # pages_and_chunks entries are not modified and do not keep stale scores
    # from earlier queries.
    context_items = []
    for score, index in zip(scores, indices):
        context_item = dict(pages_and_chunks[index])
        context_item["score"] = score.cpu()
        context_items.append(context_item)

    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Send the inputs to whichever device the model is on instead of assuming "cuda".
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    if return_answer_only:
        return output_text

    return output_text, context_items

Time to test it out

In [None]:
# Pick a random query from the test list
query = random.choice(query_list)
print(f"Query: {query}")

# Answer query with context and return context 
answer, context_items = ask(query=query, 
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
# Display the retrieved records that were used as context
context_items