In [None]:
!pip install -q pymupdf langchain sentence-transformers faiss-cpu
!pip install -q transformers torch

# Add import checks after installation
try:
    import fitz
    print("fitz imported successfully")
except ImportError:
    print("fitz not found")

try:
    import faiss
    print("faiss imported successfully")
except ImportError:
    print("faiss not found")

try:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    print("langchain.text_splitter imported successfully")
except ImportError:
    print("langchain.text_splitter not found")

try:
    from sentence_transformers import SentenceTransformer
    print("sentence_transformers imported successfully")
except ImportError:
    print("sentence_transformers not found")

try:
    import torch
    print("torch imported successfully")
except ImportError:
    print("torch not found")

try:
    from transformers import pipeline
    print("transformers.pipeline imported successfully")
except ImportError:
    print("transformers.pipeline not found")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import files
uploaded = files.upload()
pdf_file = list(uploaded.keys())[0]  # Get the uploaded PDF filename

Saving HSC26_Bangla_1st_Paper.pdf to HSC26_Bangla_1st_Paper.pdf


In [None]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def clean_bengali_text(text):
    # Normalize Unicode
    text = text.replace('\u09cd\u09af', '\u09ce')  # Fix compound characters
    # Remove special chars but preserve Bengali punctuation
    text = re.sub(r'[^\u0980-\u09FF\u0020-\u007E\u0964\u0965\.\?\!\,\;\(\)\[\]\{\}]', '', text)
    # Normalize whitespace
    text = ' '.join(text.split())
    return text.strip()

raw_text = extract_text_from_pdf(pdf_file)
cleaned_text = clean_bengali_text(raw_text) # Define cleaned_text here

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Chunking
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", "।", "?", "!"]
)

# Debug print for cleaned_text length
print(f"Length of cleaned_text: {len(cleaned_text)}")

chunks = text_splitter.split_text(cleaned_text)

# Debug print for number of chunks
print(f"Number of chunks: {len(chunks)}")


# Embedding
# Use the updated embedding model from cell 2OmYS0qLCSgh (assuming it's run before this)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Only encode if there are chunks
if chunks:
    embeddings = embedding_model.encode(chunks)

    # Ensure embeddings are a numpy array and float32
    embeddings = np.array(embeddings).astype('float32')

    # Create FAISS index
    # Get the dimension from the embeddings
    dimension = embeddings.shape[1] if embeddings.ndim == 2 else embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(dimension)

    # Debug prints
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Embeddings dtype: {embeddings.dtype}")
    print(f"FAISS index dimension: {index.d}")

    # Add to index
    index.add(embeddings)
else:
    print("No chunks were generated. Cannot create embeddings or FAISS index.")
    # Initialize index with a dummy dimension or handle this case downstream
    # For now, we'll just print a message and the rest of the code might fail if it depends on `index`
    dimension = embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(dimension) # Initialize with expected dimension even if empty

Length of cleaned_text: 0
Number of chunks: 0
No chunks were generated. Cannot create embeddings or FAISS index.


In [None]:
from transformers import pipeline

# Load a multilingual QA model
qa_pipeline = pipeline(
    "question-answering",
    model="bert-large-uncased-whole-word-masking-finetuned-squad",
    tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad"
)

def get_answer(question):
    # Embed the question
    question_embedding = embedding_model.encode([question])

    # Search the index
    D, I = index.search(question_embedding, k=3)

    # Get relevant context
    context = " ".join([chunks[i] for i in I[0]])

    # Generate answer
    result = qa_pipeline(question=question, context=context)
    return result['answer']

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
from IPython.display import display
import ipywidgets as widgets

# Create widgets
question_input = widgets.Textarea(
    value='',
    placeholder='Type your question in English or Bengali...',
    description='Question:',
    layout={'width': '80%'}
)

output_area = widgets.Output()

def on_submit(b):
    with output_area:
        output_area.clear_output()
        question = question_input.value
        if question:
            print("Processing...")
            answer = get_answer(question)
            print(f"\nQuestion: {question}")
            print(f"Answer: {answer}")

submit_button = widgets.Button(description="Submit")
submit_button.on_click(on_submit)


In [None]:
# Example batch questions
questions = [
    "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?",
    "কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?",
    "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
]

# Process all questions
for question in questions:
    answer = get_answer(question)
    print(f"Q: {question}")
    print(f"A: {answer}\n")

IndexError: list index out of range

In [None]:
# Save the index
faiss.write_index(index, "bangla_rag.index")

# To load in future sessions:
# index = faiss.read_index("bangla_rag.index")

In [None]:
# Check first 500 characters of extracted text
print(cleaned_text[:500])

In [None]:
# This cell is no longer needed as cleaning is done in the previous step.
# Keeping it here for now, but it can be removed once confirmed.
def clean_bengali_text(text):
    # Normalize Unicode
    text = text.replace('\u09cd\u09af', '\u09ce')  # Fix compound characters
    # Remove special chars but preserve Bengali punctuation
    text = re.sub(r'[^\u0980-\u09FF\u0020-\u007E\u0964\u0965\.\?\!\,\;\(\)\[\]\{\}]', '', text)
    # Normalize whitespace
    text = ' '.join(text.split())
    return text.strip()

# Apply the cleaning function after it's defined
# cleaned_text = clean_bengali_text(raw_text) # This line is now in the previous cell

In [None]:
# More robust Bengali model
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [None]:
def get_answer(question):
    question_embedding = embedding_model.encode([question])

    # Increase number of retrieved chunks
    D, I = index.search(question_embedding, k=5)

    # Add similarity threshold
    context = ""
    for score, idx in zip(D[0], I[0]):
        if score < 0.8:  # Only use reasonably similar chunks
            context += chunks[idx] + "\n\n"

    if not context:
        return "No relevant information found"

    # Try different QA model for Bengali
    from transformers import AutoModelForQuestionAnswering, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("neuralspace-reverie/indic-transformers-bn-qa")
    model = AutoModelForQuestionAnswering.from_pretrained("neuralspace-reverie/indic-transformers-bn-qa")

    inputs = tokenizer(question, context, return_tensors="pt")
    outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

    return answer

In [None]:
# Verify some known text exists
assert "শুম্ভুনাথ" in cleaned_text or "Shumbhunath" in cleaned_text

NameError: name 'cleaned_text' is not defined

In [None]:
# Check first 500 characters of extracted text
print(cleaned_text[:500])

In [None]:
# Check similarity between known terms
emb1 = embedding_model.encode(["শুম্ভুনাথ"])
emb2 = embedding_model.encode(["অনুপম"])
print(cosine_similarity(emb1, emb2))  # Should be low

In [None]:
# Check if the specific word exists in the cleaned text
target_word_bengali = "শুম্ভুনাথ"
target_word_english = "Shumbhunath"

if target_word_bengali in cleaned_text:
    print(f"'{target_word_bengali}' was found in the cleaned text.")
elif target_word_english in cleaned_text:
    print(f"'{target_word_english}' was found in the cleaned text.")
else:
    print(f"Neither '{target_word_bengali}' nor '{target_word_english}' was found in the cleaned text.")

In [None]:
# Manually check what chunks are being retrieved
test_question = "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?"
test_embedding = embedding_model.encode([test_question])
D, I = index.search(test_embedding, k=3)
for i in I[0]:
    print(chunks[i][:200] + "...")  # Print first 200 chars of each chunk

In [None]:
# Fallback to keyword search for Bengali
from collections import Counter

def bengali_keyword_search(question, chunks):
    question_words = set(question.split())
    best_chunk = None
    best_score = 0

    for chunk in chunks:
        chunk_words = set(chunk.split())
        common = question_words & chunk_words
        score = len(common)
        if score > best_score:
            best_score = score
            best_chunk = chunk

    return best_chunk if best_score > 1 else None

def get_answer(question):
    # Try semantic search first
    question_embedding = embedding_model.encode([question])
    D, I = index.search(question_embedding, k=3)
    context = " ".join([chunks[i] for i in I[0]])

    # Verify if context looks reasonable
    if not any(char in '\u0980-\u09FF' for char in context):  # Check for Bengali chars
        # Fallback to keyword search
        context = bengali_keyword_search(question, chunks)

    if not context:
        return "Could not find relevant information"

    # Simple answer extraction (fallback)
    question_words = set(question.split())
    answer_words = [w for w in context.split() if w in question_words]
    if not answer_words:
        return context.split('.')[0]  # Return first sentence as fallback
    return " ".join(answer_words)

In [None]:
# Verify some known text exists
assert "শুম্ভুনাথ" in cleaned_text or "Shumbhunath" in cleaned_text

NameError: name 'cleaned_text' is not defined

In [None]:
# Check similarity between known terms
emb1 = embedding_model.encode(["শুম্ভুনাথ"])
emb2 = embedding_model.encode(["অনুপম"])
print(cosine_similarity(emb1, emb2))  # Should be low

In [None]:
# Manually check what chunks are being retrieved
test_question = "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?"
test_embedding = embedding_model.encode([test_question])
D, I = index.search(test_embedding, k=3)
for i in I[0]:
    print(chunks[i][:200] + "...")  # Print first 200 chars of each chunk

In [None]:
print(f"Embedding model dimension: {embedding_model.get_sentence_embedding_dimension()}")
print(f"FAISS index dimension: {index.d}")

In [None]:
# Get correct dimension from the model
dimension = embedding_model.get_sentence_embedding_dimension()

# Create new index with proper dimensions
index = faiss.IndexFlatL2(dimension)

# Ensure embeddings are float32 numpy array
embeddings = np.array(embeddings).astype('float32')

# Add to index
index.add(embeddings)

In [None]:
# Check first 500 characters of extracted text
print(raw_text[:500])




### 1. Install necessary libraries

In [None]:
!pip install -q pymupdf langchain sentence-transformers faiss-cpu transformers torch

### 2. Upload your PDF file

In [None]:
from google.colab import files
uploaded = files.upload()

if not uploaded:
    print("No file uploaded. Please upload a PDF file.")
else:
    pdf_file_name = list(uploaded.keys())[0]
    print(f"Uploaded file: {pdf_file_name}")

Saving HSC26-Bangla1st-Paper.pdf to HSC26-Bangla1st-Paper.pdf
Uploaded file: HSC26-Bangla1st-Paper.pdf


### 3. Extract and Clean Text from PDF

In [None]:
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file using PyMuPDF."""
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def clean_bengali_text(text):
    """Cleans Bengali text by normalizing Unicode and removing unwanted characters."""
    # Normalize Unicode
    text = text.replace('\u09cd\u09af', '\u09ce') # Example: handle specific compound characters
    # Remove special chars but preserve Bengali punctuation and standard English punctuation
    text = re.sub(r'[^\u0980-\u09FF\u0020-\u007E\u0964\u0965\.\?\!\,\;\(\)\[\]\{\}]', '', text)
    # Normalize whitespace
    text = ' '.join(text.split())
    return text.strip()

# Ensure pdf_file_name is defined from the upload step
if 'pdf_file_name' in globals() and pdf_file_name:
    raw_text = extract_text_from_pdf(pdf_file_name)
    cleaned_text = clean_bengali_text(raw_text)

    print(f"Length of raw text: {len(raw_text)}")
    print(f"Length of cleaned text: {len(cleaned_text)}")

    # Check if text was extracted
    if not cleaned_text:
        print("No text was extracted or cleaned from the PDF. Cannot proceed with chunking and embedding.")
else:
    print("PDF file name not found. Please run the upload cell first.")
    cleaned_text = "" # Ensure cleaned_text is defined even if upload failed

Length of raw text: 82287
Length of cleaned text: 77803


### 4. Chunk Text and Create Embeddings

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import torch # Import torch here as it's used by SentenceTransformer implicitly

# Check if cleaned_text is available and not empty
if 'cleaned_text' in globals() and cleaned_text:
    # Chunking
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        separators=["\n\n", "\n", "।", "?", "!"]
    )
    chunks = text_splitter.split_text(cleaned_text)

    print(f"Number of chunks: {len(chunks)}")

    # Check if chunks were created
    if chunks:
        # Embedding
        print("Loading embedding model...")
        embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
        print("Encoding chunks...")
        embeddings = embedding_model.encode(chunks, show_progress_bar=True)

        # Ensure embeddings are a numpy array and float32
        embeddings = np.array(embeddings).astype('float32')

        # Create FAISS index
        print("Creating FAISS index...")
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)

        # Add to index
        print("Adding embeddings to index...")
        index.add(embeddings)

        print(f"Embeddings shape: {embeddings.shape}")
        print(f"FAISS index dimension: {index.d}")
        print("FAISS index created successfully.")

    else:
        print("No chunks were generated. Cannot create embeddings or FAISS index.")
        # Initialize an empty index to avoid NameError in later cells
        dimension = 768 # Default dimension for the embedding model
        index = faiss.IndexFlatL2(dimension)


else:
    print("Cleaned text is not available or is empty. Please check the previous step.")
    # Initialize an empty index to avoid NameError in later cells
    dimension = 768 # Default dimension for the embedding model
    index = faiss.IndexFlatL2(dimension)
    chunks = [] # Ensure chunks is defined as an empty list

Number of chunks: 200
Loading embedding model...
Encoding chunks...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Creating FAISS index...
Adding embeddings to index...
Embeddings shape: (200, 768)
FAISS index dimension: 768
FAISS index created successfully.


### 5. Set up Question Answering Pipeline

In [33]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import torch

# Load a multilingual QA model
# Using a model known to support Bengali
qa_tokenizer = AutoTokenizer.from_pretrained("neuralspace-reverie/indic-transformers-bn-qa")
qa_model = AutoModelForQuestionAnswering.from_pretrained("neuralspace-reverie/indic-transformers-bn-qa")

# Using a Hugging Face pipeline might be simpler
# qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)

def get_answer(question, k=3):
    """
    Retrieves relevant context using FAISS and generates an answer using a QA model.
    """
    if not chunks or not 'index' in globals() or index.ntotal == 0:
        return "Index is not ready. Please ensure text was extracted, chunked, and indexed."

    try:
        # Embed the question
        question_embedding = embedding_model.encode([question]).astype('float32')

        # Search the index
        D, I = index.search(question_embedding, k=k)

        # Get relevant context (only use valid indices)
        context = ""
        # Filter out invalid indices and potentially use a similarity threshold
        valid_indices = [i for i in I[0] if i < len(chunks)]

        # Optional: Add a similarity threshold check if needed, but for now, just get top k valid chunks
        context_chunks = [chunks[i] for i in valid_indices]

        if not context_chunks:
             return "No relevant text chunks found for this question."

        context = " ".join(context_chunks)

        # Generate answer using the loaded QA model
        inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = qa_model(**inputs)

        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits

        # Get the most likely beginning and end of the answer
        answer_start = torch.argmax(answer_start_scores)
        answer_end = torch.argmax(answer_end_scores) + 1

        # Convert the tokens to the answer string
        answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
        answer = qa_tokenizer.decode(answer_tokens)

        # Post-processing: Handle cases where the answer is just special tokens
        if answer.startswith('[CLS]') or answer.endswith('[SEP]'):
             # Fallback or indicate no specific answer found in context
             return "Could not find a specific answer in the relevant text. Here is the most relevant context:\n\n" + context[:500] + "..." # Return part of context

        return answer

    except Exception as e:
        print(f"Error generating answer: {e}")
        return "An error occurred while trying to generate the answer."

OSError: neuralspace-reverie/indic-transformers-bn-qa is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [32]:
from huggingface_hub import notebook_login
notebook_login()  # Follow prompts to authenticate

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

# Using a known working Bengali model
model_name = "sagorsarker/bangla-bert-base-qa"

try:
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Falling back to English QA model...")
    model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)

def get_answer(question, k=3):
    try:
        # Embed the question
        question_embedding = embedding_model.encode([question]).astype('float32')

        # Search the index
        D, I = index.search(question_embedding, k=k)

        # Get context from chunks
        context = " ".join([chunks[i] for i in I[0] if i < len(chunks)])

        if not context:
            return "No relevant context found"

        # Generate answer
        inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = qa_model(**inputs)

        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits) + 1
        answer = qa_tokenizer.convert_tokens_to_string(
            qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

        return answer if answer else "Answer not found in context"

    except Exception as e:
        return f"Error generating answer: {str(e)}"

Error loading model: sagorsarker/bangla-bert-base-qa is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
Falling back to English QA model...


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### 6. Test with Example Questions

In [None]:
# Example batch questions in Bengali
questions = [
    "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?",
    "কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?",
    "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
]

# Process all questions
if 'get_answer' in globals():
    print("Processing example questions...")
    for question in questions:
        print(f"\nQuestion: {question}")
        answer = get_answer(question)
        print(f"Answer: {answer}")
else:
    print("Question answering function not defined. Please run the previous cells.")

Processing example questions...

Question: অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?
Answer: ািকািণর্িলনাযকানটি

Question: কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?
Answer: ভ িূনয

Question: বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?
Answer: ব্াহকএতযগলযেতাহািআদম


### 7. Create Interactive Widget for Questions

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Create widgets
question_input = widgets.Textarea(
    value='',
    placeholder='Type your question in English or Bengali...',
    description='Question:',
    layout={'width': '80%', 'height': '100px'}
)

output_area = widgets.Output()

def on_submit(b):
    with output_area:
        output_area.clear_output()
        question = question_input.value
        if question:
            print("Processing...")
            # Use the get_answer function defined in the previous cell
            if 'get_answer' in globals():
                 answer = get_answer(question)
                 print(f"\nQuestion: {question}")
                 print(f"Answer: {answer}")
            else:
                 print("Question answering function not loaded. Please run cell 5.")
        else:
            print("Please enter a question.")


submit_button = widgets.Button(description="Submit")
submit_button.on_click(on_submit)

# Display widgets
print("Interactive Q&A Widget:")
display(question_input, submit_button, output_area)

Interactive Q&A Widget:


Textarea(value='', description='Question:', layout=Layout(height='100px', width='80%'), placeholder='Type your…

Button(description='Submit', style=ButtonStyle())

Output()

In [35]:
!pip install -q transformers sentence-transformers faiss-cpu torch

In [36]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import torch

# Option 1: Try a different Bengali model
try:
    qa_model_name = "csebuetnlp/banglabert"
    qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
    print("Loaded BanglaBERT successfully!")
except:
    # Fallback to XLM-RoBERTa (multilingual including Bengali)
    qa_model_name = "deepset/xlm-roberta-large-squad2"
    qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
    print("Falling back to XLM-RoBERTa multilingual model")

# Create QA pipeline
qa_pipeline = pipeline(
    "question-answering",
    model=qa_model,
    tokenizer=qa_tokenizer
)

tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Loaded BanglaBERT successfully!


In [37]:
def get_answer(question, k=3, similarity_threshold=0.7):
    try:
        # Embed the question
        question_embedding = embedding_model.encode([question]).astype('float32')

        # Search the index
        distances, indices = index.search(question_embedding, k=k)

        # Get relevant context with similarity check
        context = ""
        for dist, idx in zip(distances[0], indices[0]):
            if dist < similarity_threshold and idx < len(chunks):
                context += chunks[idx] + "\n\n"

        if not context:
            return "No relevant information found in documents"

        # Generate answer
        result = qa_pipeline(question=question, context=context)
        answer = result['answer']

        # Post-process Bengali answer
        if any(char in '\u0980-\u09FF' for char in question):  # If question was in Bengali
            answer = answer.replace("##", "")  # Clean up tokenizer artifacts
            answer = answer.split("[SEP]")[0].split("[CLS]")[0].strip()

        return answer if answer else "Answer not found in context"

    except Exception as e:
        return f"Error: {str(e)}"

In [38]:
def get_answer_semantic_search(question, k=3):
    """Fallback using just semantic similarity"""
    question_embedding = embedding_model.encode([question]).astype('float32')
    distances, indices = index.search(question_embedding, k=k)

    best_answer = ""
    best_score = float('-inf')

    for dist, idx in zip(distances[0], indices[0]):
        if idx < len(chunks):
            chunk = chunks[idx]
            # Find the most relevant sentence
            sentences = [s for s in chunk.split('.') if s.strip()]
            if sentences:
                # Get embedding for each sentence
                sent_embeddings = embedding_model.encode(sentences)
                # Compare with question
                sim_scores = cosine_similarity(
                    question_embedding,
                    sent_embeddings
                )[0]
                best_sent_idx = np.argmax(sim_scores)
                if sim_scores[best_sent_idx] > best_score:
                    best_score = sim_scores[best_sent_idx]
                    best_answer = sentences[best_sent_idx].strip()

    return best_answer if best_answer else "No relevant answer found"

In [39]:
# Test with Bengali questions
questions = [
    "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?",
    "কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?",
    "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
]

for question in questions:
    print(f"Q: {question}")
    answer = get_answer(question)
    print(f"A: {answer}\n")

Q: অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?
A: No relevant information found in documents

Q: কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?
A: No relevant information found in documents

Q: বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?
A: No relevant information found in documents



In [40]:
# Check first 1000 characters of extracted text
print("Extracted text sample:")
print(cleaned_text[:1000])

Extracted text sample:
অনলাইন বৎাচ সম্পর্কিত যেককাকনা জিজ্ঞাাসা ,অপরিরিতাআল ািয রিষয়িাাং া১ম পত্র১। অনুপলেি িািা কী কলি জীরিকা রনিবাহ কিলতন?ক) ডাক্তার্িখ) ওকালর্তগ) মাস্টার্িঘ) বৎব্সা২। োোলক ভাগৎ দেিতাি প্রধান এলজন্ট ি াি কািণ, তাি-ক) প্রর্তপজিখ) প্রভাব্ গ) র্ব্চক্ষণতাঘ) কূট ব্ুর্ির্নকচি অনুকেদটি পক়ে ৩ ও ৪ সংখযক প্রকেি উিি দাও।র্পতৃহীন দীপুি চাচাই র্িকলন পর্িব্াকিি কতিা। দীপু র্িজক্ষত হকলও তাি র্সিান্ত যনও াি ক্ষমতা র্িল না। চাচা তাি র্ব্ক ি উকদযাগ র্নকলও যেৌতুক র্নক ব্া়োব্ার়্ে কিাি কািকণ কনযাি র্পতা অপমার্নত যব্াধ ককি র্ব্ক ি আকলাচনা যভকে যদন। দীপু যমক টিি ির্ব্ যদকখ মুগ্ধ হকলও তাি চাচাকক র্কিুই ব্লকত পাকিনর্ন।৩। েীপুি িািাি সলে অপরিরিতা' গ্লেি দকান িরিলেি রে আলে?ক) হর্িকিিখ) মামািগ) র্িক্ষককিঘ) র্ব্নুি৪। উক্ত িরিলে প্রাধানয দপলয়লে -i) যদৌিাত্মii) হীনম্মনযতা iii) যলাভর্নকচি যকানটি ঠিক?ক। i ও ii খ। ii ও iii গ। i ও iii ঘ। i, ii ও iii৫. অনুপলেি িয়স কত িেি?ক) পঁর্চি খ) িাব্বিি গ) সাতাি ঘ) আটািপ্রাক-মূলযা নকতগুকলা প্রকেি সঠিক উিি র্দকত পািকল?SLAnsSLAnsSLAnsSLAnsSLAns১খ২গ৩খ৪ক৫গর্নম্নর

In [41]:
def clean_bengali_text(text):
    # Normalize compound characters
    text = text.replace('\u09cd\u09af', '\u09ce')  # Fix "YA-phala"
    # Remove unwanted characters but preserve Bengali punctuation
    text = re.sub(r'[^\u0980-\u09FF\u0020-\u007E\u0964\u0965\.\?\!\,\;\-\–\—\(\)\[\]\{\}]', '', text)
    # Normalize whitespace
    text = ' '.join(text.split())
    return text

In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separators=["\n\n", "\n", "।", "?", "!", "\।", "\\."]  # Bengali and English sentence endings
)

chunks = text_splitter.split_text(cleaned_text)

In [43]:
from sentence_transformers import SentenceTransformer

# More robust multilingual model
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

In [44]:
def get_answer(question, k=5, similarity_threshold=0.6):
    try:
        # Embed the question
        question_embedding = embedding_model.encode([question], show_progress_bar=False)
        question_embedding = question_embedding.astype('float32')

        # Search with higher k and similarity threshold
        distances, indices = index.search(question_embedding, k=k)

        # Build context from relevant chunks
        context = ""
        for dist, idx in zip(distances[0], indices[0]):
            if dist < (1 - similarity_threshold) and idx < len(chunks):
                context += chunks[idx] + "\n\n"

        if not context:
            return "No relevant information found in documents"

        # Simple keyword matching fallback for Bengali
        if any(char in '\u0980-\u09FF' for char in question):
            question_keywords = set(question.split())
            for chunk in chunks:
                chunk_words = set(chunk.split())
                if len(question_keywords & chunk_words) >= 2:  # At least 2 matching words
                    return chunk.split('.')[0]  # Return first sentence

        return context[:500] + "..." if len(context) > 500 else context

    except Exception as e:
        return f"Error processing question: {str(e)}"

In [45]:
def search_text_directly(keyword):
    """Search for keyword in raw text"""
    return [line for line in cleaned_text.split('\n') if keyword in line]

# Test with expected answers
print(search_text_directly("শুম্ভুনাথ"))
print(search_text_directly("মামাকে"))
print(search_text_directly("১৫ বছর"))

[]
[]
[]


In [46]:
!pip install -q pytesseract pillow pdf2image

from pdf2image import convert_from_path
import pytesseract

def ocr_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image, lang='ben+eng')
    return text

if not any("শুম্ভুনাথ" in t for t in cleaned_text.split('\n')):
    print("Falling back to OCR...")
    cleaned_text = ocr_pdf("hsc26_bangla.pdf")
    cleaned_text = clean_bengali_text(cleaned_text)

Falling back to OCR...


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?