In [1]:
# Simple Local RAG Pipeline (Cleaned)
# 1. Install dependencies
!pip install -q -U torch torchvision torchaudio pymupdf sentence-transformers accelerate bitsandbytes flash-attn

# 2. Import libraries and setup device
import os
import torch
import requests
import fitz
import numpy as np
import pandas as pd
from spacy.lang.en import English
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# 3. Download and Process PDF Document
pdf_path = "human-nutrition-text.pdf"
if not os.path.exists(pdf_path):
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    response = requests.get(url)
    with open(pdf_path, "wb") as f: f.write(response.content)

def text_formatter(text: str) -> str:
    return text.replace("\n", " ").strip()

nlp = English()
nlp.add_pipe("sentencizer")

pages_and_chunks = []
doc = fitz.open(pdf_path)

for page_num, page in enumerate(doc):
    text = text_formatter(page.get_text())
    sentences = [str(sent) for sent in list(nlp(text).sents)]
    # Split into chunks of 10 sentences
    chunks = [sentences[i:i + 10] for i in range(0, len(sentences), 10)]

    for chunk in chunks:
        joined_chunk = "".join(chunk).replace("  ", " ").strip()
        if len(joined_chunk) > 30: # Filter short chunks
            pages_and_chunks.append({
                "page_number": page_num - 41,
                "sentence_chunk": joined_chunk,
                "chunk_char_count": len(joined_chunk)
            })
print(f"Processed {len(pages_and_chunks)} text chunks.")

embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]
embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)
print(f"Embeddings created with shape: {embeddings.shape}")

def retrieve_relevant_resources(query: str, embeddings: torch.Tensor, model: SentenceTransformer, top_k: int=5):
    query_embedding = model.encode(query, convert_to_tensor=True)
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    scores, indices = torch.topk(dot_scores, k=top_k)
    return scores, indices

login(token="token_goes_here")

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4_bit_compute_dtype=torch.float16)

# Automatically choose model size based on VRAM
gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
model_id = "google/gemma-3-1b-it"
print(f"Loading model: {model_id}")

tokenizer = AutoTokenizer.from_pretrained(model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=False
)

Processed 1822 text chunks.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embeddings created with shape: torch.Size([1822, 768])
Loading model: google/gemma-3-1b-it


`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [3]:
# 7. Define RAG Pipeline functions
def prompt_formatter(query: str, context_items: list[dict]) -> str:
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])
    base_prompt = f"""Based on the following context items, please answer the query.\nContext:\n{context}\nQuery: {query}\nAnswer:"""

    messages = [{"role": "user", "content": base_prompt}]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

def ask(query: str, temperature: float=0.7, max_new_tokens: int=256):
    # Retrieve
    scores, indices = retrieve_relevant_resources(query, embeddings, embedding_model)
    context_items = [pages_and_chunks[i] for i in indices]

    # Augment
    prompt = prompt_formatter(query, context_items)

    # Generate
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = llm_model.generate(**input_ids, temperature=temperature, do_sample=True, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "")

In [6]:
query = "How does saliva help with digestion?"
answer = ask(query)

print(f"Query: {query}\n")
print(f"Answer:\n{answer}")

Query: How does saliva help with digestion?

Answer:
user
Based on the following context items, please answer the query.
Context:
- Digestion and Absorption of Carbohydrates UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM From the Mouth to the Stomach The mechanical and chemical digestion of carbohydrates begins in the mouth.Chewing, also known as mastication, crumbles the carbohydrate foods into smaller and smaller pieces.The salivary glands in the oral cavity secrete saliva that coats the food particles. Saliva contains the enzyme, salivary amylase.This enzyme breaks the bonds between the monomeric sugar units of disaccharides, oligosaccharides, and starches.The salivary amylase breaks down amylose and amylopectin into smaller chains of glucose, called dextrins and maltose.The increased concentration of maltose in the mouth that results from the mechanical and chemical breakdown of starches in whole grains is what enhances their swe