In [None]:
!pip install -q pymupdf pytesseract faiss-cpu sentence-transformers transformers nltk google-generativeai


In [None]:
# Required imports
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import faiss
import numpy as np
import nltk
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from typing import List, Dict
import os

In [None]:
# Download the NLTK sentence-tokenizer data needed by sent_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested; resources that are already present
# and up to date are not downloaded again.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

In [None]:
# Function to load and chunk PDF

def load_pdf(file_path: str, chunk_size: int = 1000, lang: str = 'eng+ben') -> List[Dict]:
    """
    Load a PDF (digital or scanned) and split it into sentence-aligned text chunks.

    Each page is read through its text layer; pages with (almost) no text layer
    are rasterized and passed through Tesseract OCR instead. The page text is
    split into sentences, and consecutive sentences are packed into chunks.
    Chunks never span a page boundary, and because sentences are kept whole a
    chunk can exceed ``chunk_size`` by up to one sentence.

    Args:
        file_path (str): Path to the PDF.
        chunk_size (int): Approx. number of words per chunk.
        lang (str): OCR language(s) passed to pytesseract (e.g. ``'eng+ben'``).

    Returns:
        List[Dict]: One dict per chunk with the keys ``'id'`` (0-based running
        index across the whole document), ``'text'`` (sentences joined by a
        single space) and ``'page'`` (1-based page number).
    """
    chunks = []

    # Context manager guarantees the document handle is released even if
    # text extraction or OCR raises part-way through the file.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            sentences = sent_tokenize(_extract_page_text(page, lang))

            for chunk_text in _pack_sentences(sentences, chunk_size):
                chunks.append({
                    'id': len(chunks),  # ids are sequential across pages
                    'text': chunk_text,
                    'page': page_num
                })

    return chunks


def _extract_page_text(page, lang: str) -> str:
    """Return the page's text layer, or OCR output when the layer is (nearly) empty."""
    page_text = page.get_text()

    # Under 20 characters after stripping is treated as "no usable text layer":
    # render the page at 300 dpi and OCR the resulting image.
    if len(page_text.strip()) < 20:
        pix = page.get_pixmap(dpi=300)
        img = Image.open(io.BytesIO(pix.tobytes()))
        page_text = pytesseract.image_to_string(img, lang=lang)

    return page_text


def _pack_sentences(sentences: List[str], chunk_size: int) -> List[str]:
    """Greedily join consecutive sentences into strings of at least ``chunk_size`` words."""
    packed = []
    current_chunk = []
    word_count = 0

    for sentence in sentences:
        current_chunk.append(sentence)
        word_count += len(sentence.split())

        # Close the chunk as soon as the word budget is reached.
        if word_count >= chunk_size:
            packed.append(' '.join(current_chunk))
            current_chunk = []
            word_count = 0

    # Trailing sentences that did not fill a whole chunk still form one.
    if current_chunk:
        packed.append(' '.join(current_chunk))

    return packed


In [None]:
# Load and chunk a PDF
pdf_path = '/kaggle/input/ssc-math-pdf/Higher Math 9-10 Com Opt.pdf'
chunks = load_pdf(pdf_path)
# Every later cell (embedding, indexing, retrieval) needs at least one chunk,
# so stop here with a clear message if nothing could be extracted.
assert chunks, f"No text could be extracted from {pdf_path!r}"
print(f"Loaded {len(chunks)} chunks.")


In [None]:
# Multilingual sentence-embedding model; the same instance embeds the chunks
# here and the user query at retrieval time.
embedder = SentenceTransformer(
    'sentence-transformers/distiluse-base-multilingual-cased-v2'
)

# Embed every chunk's text, keeping chunk order (one embedding per chunk).
texts = [entry['text'] for entry in chunks]
embeddings = embedder.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)

In [None]:
# Build FAISS index
# The index is created for vectors whose length equals the embedding width
# (second axis of `embeddings`).
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# Vectors are added in the same order as `chunks`, which is what lets
# retrieve() map a FAISS result id i straight back to chunks[i].
index.add(embeddings)
print(f"FAISS index loaded with {index.ntotal} vectors.")


In [None]:
# Function to retrieve top-k chunks

def retrieve(query: str, top_k: int = 25) -> List[Dict]:
    """
    Return the chunks most similar to ``query``, nearest first.

    Uses the notebook-level ``embedder`` to embed the query, the FAISS
    ``index`` to search, and ``chunks`` to map result ids back to chunk dicts.

    Args:
        query (str): Natural-language question or search text.
        top_k (int): Maximum number of chunks to return.

    Returns:
        List[Dict]: Up to ``top_k`` chunk dicts. Fewer are returned when the
        index holds fewer vectors; an empty list is returned when ``top_k``
        is not positive or the index is empty.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with id -1, which Python indexing would silently resolve to the
    # LAST chunk and return as a bogus (duplicated) hit.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, k)

    # indices has one row per query; drop any remaining -1 placeholders.
    return [chunks[i] for i in indices[0] if i >= 0]


# Configure Gemini API Key
# The key is read from the environment only, so it never ends up in the
# notebook file or its version history. A key that was ever committed in
# plain text must be considered leaked and should be revoked and rotated.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment (or load it "
        "from your platform's secret store) before running this cell."
    )
genai.configure(api_key=GEMINI_API_KEY)


In [None]:
# Function to generate answer with Gemini

def generate_answer(context_chunks: List[Dict], query: str, model_name="gemini-1.5-pro-latest") -> str:
    """
    Answer ``query`` with Gemini, grounded only in the retrieved chunks.

    Args:
        context_chunks (List[Dict]): Retrieved chunks; only each dict's
            ``'text'`` value is used, joined by blank lines.
        query (str): The user's question.
        model_name (str): Gemini model identifier passed to
            ``genai.GenerativeModel``.

    Returns:
        str: The model's answer with surrounding whitespace stripped, or the
        fixed sentence "Not enough information in the book." when there is no
        context text at all (no API call is made in that case).

    Raises:
        ValueError: If Gemini returns a response without any text (for
            example when the request was blocked); the message includes the
            response's prompt feedback when available.
    """
    context = "\n\n".join(chunk['text'] for chunk in context_chunks)

    # With no context there is nothing to ground an answer on; return the same
    # reply the prompt mandates instead of spending an API call on it.
    if not context.strip():
        return "Not enough information in the book."

    prompt = f"""
    You are a professional educational assistant.
    use the given context to answer the user's question.

    VERY IMPORTANT:
    - Do not hallucinate.
    - If the topic is NOT found in the context, reply: "Not enough information in the book."
    - Do NOT use LaTeX.
    - Answer clearly and in normal text format.

    Context:
    {context}

    Question:
    {query}

    Answer:
    """

    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)

    # `response.text` raises a bare ValueError when the response carries no
    # text part; re-raise the same exception type with actionable detail.
    try:
        answer = response.text
    except ValueError as exc:
        feedback = getattr(response, "prompt_feedback", None)
        raise ValueError(
            f"Gemini returned no text for this query (prompt feedback: {feedback})"
        ) from exc

    return answer.strip()


In [None]:
# Demo: run one sample question through retrieval and answer generation,
# then show the generated text under a heading.
query = "explain trigonometric identity with a solved problem"
retrieved_context = retrieve(query)
long_answer = generate_answer(retrieved_context, query)
print("\nGenerated Answer:\n", long_answer, sep="\n")
