# PDF Q&A Bot

# STEP 1: Install the required libraries and upload your PDF.

In [4]:
!pip install pymupdf faiss-cpu google-genai nltk



In [5]:
import fitz
import faiss
import numpy as np
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
from google import genai


# Load the Gemini API key from Colab's secret store (never hardcode it in the notebook)


from google.colab import userdata
# `client` is the shared Gemini client; the embedding and chat code further
# down both call the API through it.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
client = genai.Client(api_key=GOOGLE_API_KEY)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
from google.colab import files
import fitz  # PyMuPDF

# Open the Colab upload widget and use the first uploaded file as the PDF to query.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was cancelled): fail with a clear
    # message instead of a bare StopIteration from next().
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF.")
pdf_path = next(iter(uploaded))

Saving PDF Q&A Bot - Colab.pdf to PDF Q&A Bot - Colab.pdf


# Extract text from the PDF and split it into chunks

In [7]:
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, concatenated in page order.
    """
    # The context manager closes the document when extraction finishes or
    # fails, so the underlying file handle is not leaked.
    with fitz.open(path) as doc:
        # join() avoids re-copying the growing string on every page.
        return "".join(page.get_text() for page in doc)

raw_text = extract_text_from_pdf(pdf_path)
if not raw_text.strip():
    # Without any text there is nothing to chunk, embed or index, so stop here
    # with an explicit message rather than failing later in the pipeline.
    raise ValueError(
        "No extractable text found in the PDF (it may contain only scanned images)."
    )
print("PDF loaded!")

PDF loaded!


In [8]:

from nltk.tokenize import sent_tokenize
import nltk
# Fetch the NLTK tokenizer data packages. Presumably these back sent_tokenize;
# which of the two is actually required depends on the installed NLTK
# version (TODO confirm).
nltk.download('punkt')
nltk.download('punkt_tab')

def split_into_chunks(text, max_len=250, sentence_splitter=None):
    """Group the sentences of ``text`` into chunks of about ``max_len`` characters.

    Sentences are appended to the current chunk (separated by a space) while
    the chunk length plus the next sentence stays within ``max_len``; otherwise
    the current chunk is emitted and a new one is started. A single sentence
    longer than ``max_len`` is kept whole as its own chunk.

    Args:
        text: The text to split.
        max_len: Character budget per chunk.
        sentence_splitter: Optional callable mapping ``text`` to an iterable of
            sentence strings. Defaults to NLTK's ``sent_tokenize``.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.
    """
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    chunks, chunk = [], ""

    def flush(buffer):
        # Never emit an empty chunk: when the very first sentence is already
        # longer than max_len the buffer is still "", and an empty string
        # would otherwise be embedded and indexed downstream.
        cleaned = buffer.strip()
        if cleaned:
            chunks.append(cleaned)

    for sentence in sentence_splitter(text):
        if len(chunk) + len(sentence) <= max_len:
            chunk += sentence + " "
        else:
            flush(chunk)
            chunk = sentence + " "
    flush(chunk)
    return chunks

# Chunk the extracted text with the default max_len; `chunks` is the list the
# embedding and retrieval steps index into.
chunks = split_into_chunks(raw_text)
print(f"Split into {len(chunks)} chunks")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Split into 10 chunks


# Embed chunks

In [13]:
def embed_texts(text_list, batch_size=100):
    """Embed each string in ``text_list`` with the Gemini embedding model.

    Args:
        text_list: Iterable of strings to embed.
        batch_size: Maximum number of strings sent per API request. The API
            caps how many inputs one embedding request may carry (100 at the
            time of writing -- TODO confirm for the configured backend), so
            larger inputs are split into several requests.

    Returns:
        A list with one embedding (``values`` of each returned embedding) per
        input string, in input order. Empty input returns ``[]`` without
        calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(text_list)
    embeddings = []
    # Slice the input so every request stays within the batch limit; an empty
    # input simply skips the loop.
    for start in range(0, len(texts), batch_size):
        result = client.models.embed_content(
            model="models/text-embedding-004",
            contents=texts[start:start + batch_size],
        )
        embeddings.extend(e.values for e in result.embeddings)
    return embeddings

In [14]:
# Embed every chunk and pack the vectors into a float32 matrix, one row per chunk.
chunk_embeddings = np.asarray(embed_texts(chunks), dtype='float32')

# Build FAISS index

In [16]:
# Create a flat L2-distance index sized to the embedding dimension (the column
# count of chunk_embeddings) and add every chunk vector, so that index row i
# corresponds to chunks[i].
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(chunk_embeddings)

In [17]:
def retrieve_top_k(question, k=5):
    """Return the chunks whose embeddings are nearest to ``question``.

    Args:
        question: The query string; it is embedded with ``embed_texts``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` strings from ``chunks``, in the order the index reports
        them. Fewer are returned when the index holds fewer than ``k`` vectors;
        an empty list when the index is empty or ``k`` is not positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with the label -1, and chunks[-1] would silently return the last
    # chunk instead of signalling "no result".
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    # FAISS works on float32 matrices; build the (1, dim) query explicitly
    # instead of relying on an implicit conversion from float64.
    query = np.asarray([embed_texts([question])[0]], dtype='float32')
    _, neighbour_ids = index.search(query, k)
    # Keep only labels that point at a real chunk.
    return [chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)]

# Chat with the PDF

In [24]:
# Gemini model used for the chat answers.
# NOTE(review): confirm this model ID is still served by the API before running.
MODEL_ID = "gemini-1.5-pro"
from google.genai import types
def interactive_rag_chat():
    """Run a console question-answering loop over the indexed PDF.

    For every new question the most relevant chunks are retrieved and placed
    in the system instruction of a fresh chat session; follow-up messages are
    sent to that same session (and therefore reuse the same context) until the
    user types 'new'. Typing 'exit' at either prompt ends the loop.

    Input is stripped before it is interpreted, so commands with surrounding
    whitespace are recognised, and blank input just re-prompts instead of
    being sent to the embedding or chat API.

    Returns:
        None.
    """
    print("Ask questions about your PDF. Type 'new' to reset context or 'exit' to quit.")

    while True:
        question = input("\nNew Question: ").strip()
        command = question.lower()
        if command == 'exit':
            print(" Exiting.")
            break
        if not question or command == 'new':
            # Nothing to retrieve for: there is no context to reset yet, and an
            # empty question must not be sent to the API.
            continue

        # Retrieve relevant chunks for the question
        top_chunks = retrieve_top_k(question, k=5)
        context = "\n\n".join(top_chunks)

        # Start a chat session with system context
        chat = client.chats.create(
            model=MODEL_ID,
            config=types.GenerateContentConfig(
                system_instruction=f"""
You are a helpful assistant answering based only on the following document context.

If the answer isn't present, say "I don't know".

Context:
{context}
"""
            )
        )

        # Ask first question
        response = chat.send_message(question)
        print("\n Answer:", response.text)

        # Loop for follow-up questions within the same chat session
        while True:
            followup = input("\n Follow-up (or 'new' / 'exit'): ").strip()
            command = followup.lower()
            if command == 'exit':
                print(" Exiting.")
                return
            if command == 'new':
                break  # back to the outer loop for a new question and context
            if not followup:
                continue  # ignore blank input instead of sending it to the model
            followup_response = chat.send_message(followup)
            print("\n Answer:", followup_response.text)

In [None]:
# Start the interactive session; it keeps prompting via input() until 'exit' is typed.
interactive_rag_chat()