PDF RAG Implementation


In [1]:
# Cell 1: Imports and Setup
import os
import faiss
import numpy as np
import PyPDF2
import ollama
from IPython.display import display, Markdown
import ipywidgets as widgets


In [2]:
%pip install ipywidgets faiss-cpu PyPDF2


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Cell 2: Text Extraction from PDF
def extract_text_from_pdfs(uploaded_files):
    """Concatenate the extractable text of every page of every given PDF.

    Args:
        uploaded_files: Iterable of items that are each handed to
            ``PyPDF2.PdfReader`` (the notebook passes file paths).

    Returns:
        str: The page texts in iteration order, each followed by a newline.
        Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    """
    page_texts = []
    for pdf_source in uploaded_files:
        for pdf_page in PyPDF2.PdfReader(pdf_source).pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                # Nothing extractable on this page; contribute no text at all.
                continue
            page_texts.append(extracted + "\n")
    return "".join(page_texts)

In [4]:
# Cell 3: Chunking the Text
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping, fixed-size character windows.

    Consecutive windows start ``chunk_size - chunk_overlap`` characters apart,
    so each window repeats the last ``chunk_overlap`` characters of the
    previous one. The final window may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        list[str]: The chunks in order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            # The text is fully covered. Advancing again would only emit a
            # chunk that is a pure suffix of the one just appended.
            break
        start += step
    return chunks


In [5]:
# Cell 4: Embedding Text using Ollama (mxbai-embed-large)
def get_embedding(
    text,
    model="mxbai-embed-large",
    query_prefix="Represent this sentence for searching relevant passages: ",
):
    """Embed ``text`` with an Ollama embedding model.

    Args:
        text: The string to embed.
        model: Name of the Ollama embedding model to call.
        query_prefix: Text placed directly in front of ``text`` to form the
            prompt. Pass ``""`` to embed ``text`` unchanged.
            NOTE(review): the mxbai-embed-large model card presents this
            prefix as a query-side retrieval prompt; confirm whether document
            chunks should be embedded without it.

    Returns:
        numpy.ndarray: ``response["embedding"]`` converted to ``float32``.
    """
    response = ollama.embeddings(model=model, prompt=f"{query_prefix}{text}")
    return np.array(response["embedding"], dtype="float32")

In [6]:
# Cell 5: Build FAISS Index
def build_faiss_index(chunks):
    """Embed every chunk and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks; each one is embedded with
            ``get_embedding``.

    Returns:
        tuple: ``(index, chunks)`` where ``index`` is a ``faiss.IndexFlatL2``
        holding one vector per chunk, added in the order of ``chunks``, and
        ``chunks`` is the list that was passed in.

    Raises:
        ValueError: If ``chunks`` is empty, because the index dimension is
            taken from the first embedding.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    vectors = [get_embedding(chunk) for chunk in chunks]
    # The index dimension comes from the first embedding.
    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors))
    return index, chunks

In [7]:
# Cell 6: Retrieve Context from Query
def retrieve_context(index, chunks, query, k=1):
    """Return the text of the chunks whose vectors are closest to ``query``.

    Args:
        index: Index object whose ``search(vectors, k)`` returns a
            ``(distances, labels)`` pair, as FAISS indexes do.
        chunks: List of chunk texts, positioned to match the index labels.
        query: The question text; it is embedded with ``get_embedding``.
        k: Number of neighbours to request from the index.

    Returns:
        str: The retrieved chunks joined with newlines, in the order the
        index returned them. Empty if the index returned no valid label.
    """
    query_embedding = get_embedding(query).reshape(1, -1)
    _, indices = index.search(query_embedding, k)
    # FAISS fills the label row with -1 when fewer than k vectors are found.
    # A -1 must be dropped, otherwise chunks[-1] silently returns the last chunk.
    hits = [chunks[i] for i in indices[0] if i >= 0]
    return "\n".join(hits)

In [8]:
# Cell 7: Ask Mistral (via Ollama)
def ask_mistral(context, question):
    """Ask the ``mistral`` chat model to answer ``question`` given ``context``.

    Args:
        context: Retrieved text placed after ``Context:`` in the prompt.
        question: The user's question, placed after ``Question:``.

    Returns:
        The ``content`` field of the message in the chat response.
    """
    user_message = {
        "role": "user",
        "content": f"Context: {context}\nQuestion: {question}\nAnswer:",
    }
    reply = ollama.chat(model="mistral", messages=[user_message])
    return reply["message"]["content"]

In [10]:
# Cell 8: Run RAG
# List the PDFs to index here; each name is resolved against the current working directory.
pdf_files = ["test file.pdf"]  # 📝 Replace with your file names
pdf_paths = [os.path.join(os.getcwd(), pdf_name) for pdf_name in pdf_files]

# Pipeline: extract the text, split it into chunks, embed them, and build the index.
# build_faiss_index returns the chunk list alongside the index, so `chunks` is bound here.
raw_text = extract_text_from_pdfs(pdf_paths)
index, chunks = build_faiss_index(chunk_text(raw_text))
print(f"✅ Processed {len(pdf_files)} PDF(s) and created vector store with {len(chunks)} chunks.")

✅ Processed 1 PDF(s) and created vector store with 3 chunks.


In [11]:
# Cell 9: Hardcoded Question and Answer
from IPython.display import display, Markdown

# 🔽 Replace this with your custom question
question = "who did Emma call?"

# Look the vector store up by name. If the processing cell has not run, the
# names are undefined, and reading them directly would raise NameError before
# the hint below could be printed.
if not globals().get("chunks") or globals().get("index") is None:
    print("Please process your PDFs first.")
else:
    # Retrieve the closest chunk(s) for the question, then let the model answer from them.
    context = retrieve_context(index, chunks, question)
    answer = ask_mistral(context, question)

    display(Markdown(f"**Question:** {question}"))
    display(Markdown(f"**Answer:** {answer}"))


**Question:** who did Emma call?

**Answer:**  Emma called her best friend Jake.