<a href="https://colab.research.google.com/github/Saurav15/RAG-pipeline/blob/main/00-rag-custom-implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

In [2]:
# Step 1: Data processing and embedding creation
# 1. Import a pdf doc.
# 2. Process text for embedding, i.e. split the PDF into chunks of sentences.
# 3. Embed text chunk with embedding model.
# 4. Save embedding to file for later use.

In [3]:
# Perform Google Colab installs (if running in Google Colab).
# The presence of the COLAB_GPU environment variable is the signal used to
# decide whether we are in Colab; when it is absent, nothing is installed.
# NOTE(review): torch is already imported in an earlier cell, so the upgraded
# torch presumably only takes effect after a runtime restart — confirm.
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference


[INFO] Running in Google Colab, installing requirements.
Collecting torch
  Downloading torch-2.5.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.11
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64

In [11]:
# Download the source PDF once and keep it next to the notebook, so that
# re-running this cell does not hit the network again.
import requests

pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
  print("[INFO]: File does not exists. Downloading...")
  file_url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
  filename = pdf_path

  # Get the file from the internet. The timeout bounds the connection attempt
  # and every wait for data, so a stalled server raises an error instead of
  # hanging the cell forever.
  response = requests.get(file_url, timeout=60)

  # Check if the response was successful
  if response.status_code == 200:
    # Open the file in binary write mode and save the content in it.
    with open(filename, 'wb') as file:
      file.write(response.content)
    # Report success only after the file has been written and closed.
    print(f"[INFO]: File has been downloaded and saved as {filename}")
  else:
    print(f"[ERROR]: Failed to download file, got status code: {response.status_code}")
else:
  print("[INFO]: Filed already exists.")

[INFO]: Filed already exists.


In [19]:
import fitz # Used to read the PDF.
from tqdm.auto import tqdm # This will help us show the progress bar of how much data is analyzed.

# Light-weight clean-up applied to the raw text of every page.
def text_formater(text: str) -> str:
  """Return `text` with newlines turned into spaces and outer spaces removed.

  Every "\n" becomes a single space, then space characters (only " ", not
  tabs or other whitespace) are trimmed from both ends of the result.
  """
  # Joining the newline-separated pieces with a space is the same as
  # replacing each newline by a space.
  single_line = " ".join(text.split("\n"))
  # Further formatting steps can be added here if needed.
  return single_line.strip(" ")


# This function goes through the PDF and extracts the text and some rough
# size statistics for each page.
def open_and_read_pdf(pdf_file_path: str, page_offset: int = 41) -> list[dict]:
  """Read a PDF and return one dictionary of text and statistics per page.

  Args:
    pdf_file_path: Path of the PDF file to open.
    page_offset: Value subtracted from the zero-based page index to produce
      "page_number". Defaults to 41, the offset used for this book.

  Returns:
    A list with one dict per page, in document order, with the keys
    "page_number", "page_char_count", "page_word_count",
    "page_sentance_count_raw", "page_token_count" and "text".
  """
  pages_and_text = []

  # The context manager closes the document once all text has been extracted.
  with fitz.open(pdf_file_path) as doc:
    # Passing the page count as `total` lets tqdm show real progress; wrapping
    # a bare enumerate() hides the length from it.
    for page_number, page in tqdm(enumerate(doc), total=len(doc)):
      # Get the text of the current page and normalise it.
      text = text_formater(page.get_text())
      # The counts below are rough: splitting on " " and ". " means an empty
      # page still reports 1 word and 1 sentence.
      pages_and_text.append({
          "page_number": page_number - page_offset,
          "page_char_count": len(text),
          "page_word_count": len(text.split(" ")),
          "page_sentance_count_raw": len(text.split(". ")),
          "page_token_count": len(text) / 4, # Rule of thumb: 1 token ~ 4 characters.
          "text": text,
      })

  return pages_and_text


# Parse the downloaded PDF into one record (text + statistics) per page.
pages_and_text = open_and_read_pdf(pdf_path)

# Preview the first two records to sanity-check the extraction.
pages_and_text[:2]


0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentance_count_raw': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentance_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''}]

In [20]:
# Number of page records extracted from the PDF (one per page).
print(len(pages_and_text))

1208
