<a href="https://colab.research.google.com/github/Panos997/Borderless-Table-Location-in-Scanned-Legal-Documents-using-Object-Detection-Models/blob/main/AI_Project_AskMyPDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install dependencies
# NOTE: the lines starting with `!` are notebook shell escapes (IPython/Colab),
# not Python; they only work when this file is executed as a notebook cell.
# System packages: Poppler utilities and the Tesseract OCR engine, which the
# pdf2image and pytesseract Python packages installed below call into.
!apt-get -y install poppler-utils tesseract-ocr
# Python packages imported by the cells that follow.
!pip install pytesseract pdf2image sentence-transformers transformers chromadb gradio

# Cell 2: PDF QA Interface with Gradio
import pytesseract
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from transformers import pipeline
import gradio as gr

# Initialize models and database client
# Everything below is module-level state shared by process_pdf() and
# answer_question().

# Sentence-embedding model used for both the stored chunks and the queries.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Chroma client with anonymized telemetry switched off.
client = chromadb.Client(Settings(anonymized_telemetry=False))
# Handle to the "pdf_agent" collection; stays None until process_pdf() has run.
collection = None
# Extractive question-answering pipeline applied to each retrieved chunk.
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

def _chunk_pages(pages, size=500, stride=450):
    """Split each page's text into overlapping, non-blank character slices.

    Args:
        pages: Iterable of page texts (one string per page).
        size: Maximum number of characters per chunk.
        stride: Distance between consecutive chunk starts; ``size - stride``
            characters are shared between neighbouring chunks.

    Returns:
        A flat list of chunk strings in page order.
    """
    overlap = size - stride
    chunks = []
    for page in pages:
        for start in range(0, len(page), stride):
            # A trailing slice no longer than the overlap is already fully
            # contained in the previous chunk, so it would only add a
            # tiny duplicate to the index.
            if start and len(page) - start <= overlap:
                break
            piece = page[start:start + size]
            # Blank pages / OCR noise produce whitespace-only text that is
            # useless as retrieval or QA context.
            if piece.strip():
                chunks.append(piece)
    return chunks


def process_pdf(file):
    """OCR an uploaded PDF and index its text in the ``pdf_agent`` collection.

    The PDF is rasterised, each page is OCR'd, the text is cut into
    overlapping 500-character chunks, and the chunks are embedded and stored
    in a freshly created Chroma collection that replaces any previous one.
    The previous collection is only dropped after OCR and embedding have
    succeeded, so a broken upload does not destroy the existing index.

    Args:
        file: The value delivered by the Gradio file input: either a path
            string or an object whose ``.name`` attribute is the path.
            ``None`` means nothing was uploaded.

    Returns:
        A human-readable status string. Errors are never raised to the
        caller; they are reported in the returned text together with the
        traceback.
    """
    global collection

    if file is None:
        return "⚠️ Please upload a PDF first."

    try:
        # Depending on the Gradio version/configuration the upload arrives as
        # a plain path or as a temp-file wrapper exposing `.name`.
        pdf_path = file if isinstance(file, str) else file.name

        # 1) OCR each page
        images = convert_from_path(pdf_path)
        pages = [pytesseract.image_to_string(img) for img in images]

        # 2) Chunk into overlapping 500-char slices
        chunks = _chunk_pages(pages)

        # 3) Embed all chunks in one call (the model batches internally)
        embeddings = model.encode(chunks).tolist() if chunks else []

        # 4) Replace the old collection. The global is cleared first so that
        #    a failure below cannot leave it pointing at a deleted collection.
        collection = None
        try:
            client.delete_collection("pdf_agent")
        except Exception:
            # Best effort: on the first run there is nothing to delete.
            pass
        fresh = client.create_collection("pdf_agent")

        # 5) Store in moderate batches to stay clear of backend batch limits
        batch_size = 256
        for start in range(0, len(chunks), batch_size):
            stop = min(start + batch_size, len(chunks))
            fresh.add(
                documents=chunks[start:stop],
                embeddings=embeddings[start:stop],
                ids=[str(i) for i in range(start, stop)],
            )

        collection = fresh

        if not chunks:
            return f"⚠️ Processed {len(pages)} pages but OCR found no text to index."
        return f"✅ Processed {len(pages)} pages into {len(chunks)} chunks."
    except Exception:
        import traceback
        return "⚠️ Error during processing:\n" + traceback.format_exc()




def answer_question(question, top_k):
    """Answer a question from the indexed PDF chunks.

    The question is embedded, the closest chunks are retrieved from the
    collection, and the extractive QA pipeline is run on each retrieved chunk.

    Args:
        question: The user's question text.
        top_k: How many chunks to retrieve. It is coerced to ``int`` because
            a UI slider may deliver a float, and it is clamped to the number
            of stored chunks.

    Returns:
        One line per retrieved chunk in the form
        ``"<answer> (confidence: NN.NN%)"``, separated by blank lines, or a
        short instruction/notice string when there is nothing to answer.
    """
    # Validate the input before touching any shared state.
    if not question or not question.strip():
        return "Please enter a question."
    if collection is None:
        return "Please upload and process a PDF first."

    available = collection.count()
    if available == 0:
        return "No text was extracted from the PDF, so there is nothing to search."

    # The vector store expects an integer no larger than the collection size.
    n_results = max(1, min(int(top_k), available))

    query_emb = model.encode(question).tolist()
    results = collection.query(query_embeddings=[query_emb], n_results=n_results)
    answers = []
    # results['documents'] holds one list per query; only one query was sent.
    for chunk in results['documents'][0]:
        res = qa_pipeline(question=question, context=chunk)
        answers.append(f"{res['answer']} (confidence: {res['score']*100:.2f}%)")
    return "\n\n".join(answers)

# Build the Gradio UI. Components are declared in the order they are created
# below: upload -> process -> status, then question -> slider -> ask -> answer.
with gr.Blocks() as demo:
    gr.Markdown("## PDF Question‑Answering Interface")
    # Upload widget restricted to .pdf files; its value is passed to process_pdf().
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    process_btn = gr.Button("Process PDF")
    # Read-only box that shows the status string returned by process_pdf().
    status = gr.Textbox(label="Status", interactive=False)
    question = gr.Textbox(label="Question")
    # Number of chunks to retrieve per question (1-10, default 3).
    top_k = gr.Slider(1, 10, value=3, step=1, label="Top K Chunks")
    ask_btn = gr.Button("Ask")
    # Read-only box that shows the text returned by answer_question().
    answer = gr.Textbox(label="Answer", interactive=False)

    # Wire the buttons to the module-level handlers.
    process_btn.click(fn=process_pdf, inputs=pdf_input, outputs=status)
    ask_btn.click(fn=answer_question, inputs=[question, top_k], outputs=answer)

# Start the app; share=True asks Gradio for a shareable link in addition to
# the local server (see Gradio's launch() documentation).
demo.launch(share=True)