In [1]:
!pip install -q transformers langchain sentence-transformers faiss-cpu pymupdf langchain-huggingface

import fitz  # PyMuPDF
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.schema import Document
import torch

# Step 1: Load PDF using PyMuPDF
def load_pdf(path):
    print(f"📄 Loading PDF from: {path}")
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text()
    return [Document(page_content=text)]

# Step 2: Split PDF into smaller chunks
def split_text(docs):
    print("✂️ Splitting PDF text into chunks...")
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(docs)

# Step 3: Embed the chunks using SentenceTransformer
def embed_chunks(chunks):
    print("🔍 Creating embeddings and vector store...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(chunks, embeddings)
    return vector_store

# Step 4: Load lightweight LLM (Flan-T5 Base)
def load_llm():
    print("🧠 Loading Flan-T5 Base model...")
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    return HuggingFacePipeline(pipeline=pipe)

# Step 5: Build the Retrieval QA chain
def create_pdf_qa_bot(pdf_path):
    docs = load_pdf(pdf_path)
    chunks = split_text(docs)
    vectorstore = embed_chunks(chunks)
    llm = load_llm()

    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True
    )
    print("✅ PDF QA Bot ready!")
    return qa_chain

# Step 6: Ask questions interactively
def chat_with_pdf():
    from google.colab import files
    uploaded = files.upload()
    pdf_path = next(iter(uploaded))

    try:
        bot = create_pdf_qa_bot(pdf_path)
    except Exception as e:
        print(f"❌ Error setting up the bot: {e}")
        return

    print("\n💬 Ask questions about your PDF (type 'exit' to quit):")
    while True:
        question = input("You: ")
        if question.lower() == "exit":
            print("👋 Goodbye!")
            break
        try:
            result = bot.invoke({"query": question})
            print(f"\n🤖 Answer: {result['result']}\n")
            for i, doc in enumerate(result["source_documents"]):
                print(f"📄 Source {i+1}: {doc.page_content[:150]}...\n")
        except Exception as e:
            print(f"⚠️ Error during answer generation: {e}")

# Run it
chat_with_pdf()


Saving Hariragavendra_M_G_Resume.pdf to Hariragavendra_M_G_Resume (1).pdf
📄 Loading PDF from: Hariragavendra_M_G_Resume (1).pdf
✂️ Splitting PDF text into chunks...
🔍 Creating embeddings and vector store...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


🧠 Loading Flan-T5 Base model...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


✅ PDF QA Bot ready!

💬 Ask questions about your PDF (type 'exit' to quit):
You: Name

🤖 Answer: ragavendra

📄 Source 1: HARIRAGAVENDRA M G
Chennai, Tamil Nadu | 8754930396 | ragavendra06042@gmail.com
GitHub: https://github.com/Ragavendra0604 | LinkedIn: linkedin.com/in/...

You: exit
👋 Goodbye!
