In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Step 0: Install necessary libraries (run once)
!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-ben
!pip install pytesseract pdf2image sentence-transformers faiss-cpu transformers --quiet

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # suppress warnings

import pytesseract
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
import unicodedata

nltk.download('punkt')

# Step 1: OCR extraction of Bengali text from PDF pages using Tesseract
pdf_path = "/kaggle/input/10mins/HSC26-Bangla1st-Paper.pdf"  # your PDF

print("Starting OCR extraction using Tesseract...")
pages = convert_from_path(pdf_path, dpi=300)

full_text = ""
for i, page in enumerate(pages):
    text_page = pytesseract.image_to_string(page, lang='ben')
    print(f"Extracted page {i + 1} text: {len(text_page)} chars")
    full_text += text_page + "\n"

# Normalize unicode
full_text = unicodedata.normalize('NFC', full_text)

# Step 2: Sentence tokenization and chunking for retrieval
sentences = sent_tokenize(full_text)
chunk_size = 5
chunks = [' '.join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
print(f"Total chunks created: {len(chunks)}")

# Step 3: Embedding chunks with multilingual model & build FAISS index
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
print("Encoding chunks...")
embeddings = embedding_model.encode(chunks, show_progress_bar=True)

dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(np.array(embeddings))
print(f"FAISS index built with {faiss_index.ntotal} chunks")

# Step 4: Load pretrained QA model supporting Bengali
# Example: multilingual XLM-Roberta fine-tuned on SQuAD2 (flexible QA)
qa_model_name = "deepset/xlm-roberta-large-squad2"  
qa_pipeline = pipeline("question-answering", model=qa_model_name, tokenizer=qa_model_name, device=-1)

# Step 5: Query function performing retrieval + QA inference
def answer_query_with_qa_model(query, top_k=5):
    # 5.1 Embed and retrieve top chunks
    query_embedding = embedding_model.encode([query])
    distances, indices = faiss_index.search(np.array(query_embedding), top_k)
    retrieved_chunks = [chunks[idx] for idx in indices[0]]

    # 5.2 Combine retrieved chunks as context for QA
    context = " ".join(retrieved_chunks)

    # Debug: Print retrieved chunk indices and snippet (optional)
    print(f"Retrieved chunk indices for query: {indices[0].tolist()}")

    # 5.3 Run QA model on query and retrieved context
    result = qa_pipeline(question=query, context=context)

    print(f"Answer found: {result['answer']}")
    return result['answer']

# Step 6: Interactive querying
if __name__ == "__main__":
    print("Enter Bengali questions (type 'exit' to quit):")
    while True:
        user_q = input("Please enter your question: ").strip()
        if user_q.lower() == "exit":
            print("Exiting...")
            break
        answer_query_with_qa_model(user_q)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-ben is already the newest version (1:4.00~git30-7274cfa-1.1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.8).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
Starting OCR extraction using Tesseract...


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Extracted page 1 text: 117 chars
Extracted page 2 text: 1400 chars
Extracted page 3 text: 1399 chars
Extracted page 4 text: 1101 chars
Extracted page 5 text: 325 chars
Extracted page 6 text: 2242 chars
Extracted page 7 text: 2187 chars
Extracted page 8 text: 2489 chars
Extracted page 9 text: 2194 chars
Extracted page 10 text: 1738 chars
Extracted page 11 text: 2067 chars
Extracted page 12 text: 2459 chars
Extracted page 13 text: 2211 chars
Extracted page 14 text: 2418 chars
Extracted page 15 text: 2410 chars
Extracted page 16 text: 1955 chars
Extracted page 17 text: 1293 chars
Extracted page 18 text: 1955 chars
Extracted page 19 text: 1912 chars
Extracted page 20 text: 1223 chars
Extracted page 21 text: 2162 chars
Extracted page 22 text: 1536 chars
Extracted page 23 text: 1950 chars
Extracted page 24 text: 1946 chars
Extracted page 25 text: 1578 chars
Extracted page 26 text: 1776 chars
Extracted page 27 text: 1573 chars
Extracted page 28 text: 1624 chars
Extracted page 29 text: 1110 ch

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

FAISS index built with 90 chunks


Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Enter Bengali questions (type 'exit' to quit):


Please enter your question:  কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk indices for query: [65, 22, 38, 13, 31]
Answer found:  মামাকে


Please enter your question:  অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk indices for query: [13, 22, 85, 15, 87]
Answer found:  লোকলজ্জা (খ) পিতৃ আদেশ


Please enter your question:  বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk indices for query: [82, 2, 22, 87, 89]
Answer found:  পনেরো,


Please enter your question:  exit


Exiting...


In [None]:
if __name__ == "__main__":
    print("Enter Bengali questions (type 'exit' to quit):")
    while True:
        user_q = input("Please enter your question: ").strip()
        if user_q.lower() == "exit":
            print("Exiting...")
            break
        answer_query_with_qa_model(user_q)

Enter Bengali questions (type 'exit' to quit):


Please enter your question:  অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk indices for query: [13, 22, 85, 15, 87]
Answer found:  লোকলজ্জা (খ) পিতৃ আদেশ


Please enter your question:  সুপুরুষ কাকে বলা হয়েছে?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk indices for query: [8, 4, 21, 1, 57]
Answer found:  বর্ণনার


Please enter your question:  শরৎচন্দ্র চট্টোপাধ্যায়ের উপন্যাসের নাম কী কী?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved chunk indices for query: [21, 1, 22, 56, 15]
Answer found:  'অপরিচিতা'
