In [1]:
# @title 1. Install Dependencies and System Tools (MODIFIED)

# Install Python libraries
!pip install PyPDF2 sentence-transformers langchain chromadb python-dotenv google-generativeai
!pip install pytesseract pdf2image # For OCR capabilities
!pip install rank_bm25 # NEW: For lexical search (BM25)

# Install Poppler Utilities (required by pdf2image)
!sudo apt-get install poppler-utils

# Install Tesseract OCR Engine and Bengali language pack
!sudo apt install tesseract-ocr tesseract-ocr-ben

print("All necessary libraries and tools installed!")

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_pro

In [2]:
# @title 2. Enter Gemini API Key and Create .env File

from IPython.display import display, Markdown
import os

# Prompt for Gemini API Key
gemini_api_key = input("Enter your Google Gemini API Key: ").strip()

# Create .env file in Colab's /content/ directory
env_content = f"GEMINI_API_KEY=\"{gemini_api_key}\""
with open("/content/.env", "w") as f:
    f.write(env_content)

# Load environment variables from the created .env file
%load_ext dotenv
%dotenv /content/.env

# Verify if variable is loaded (optional, for debugging)
loaded_gemini_api_key = os.getenv("GEMINI_API_KEY")

if loaded_gemini_api_key:
    display(Markdown("✅ `.env` file created and GEMINI_API_KEY loaded successfully!"))
else:
    display(Markdown("❌ Failed to load GEMINI_API_KEY. Please check your input."))

print("\nReady to run the RAG system!")

Enter your Google Gemini API Key: [REDACTED]


✅ `.env` file created and GEMINI_API_KEY loaded successfully!


Ready to run the RAG system!


In [3]:
# @title 3. RAG Pipeline Code

import PyPDF2
import re
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# NEW: Import for BM25
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt') # Download the punkt tokenizer for word_tokenize
nltk.download('punkt_tab')

# --- Configuration ---
# The PDF path is not configured here; it is supplied at run time (upload or typed path).
# Directory handed to chromadb.PersistentClient by ChromaDBVectorDBManager.
CHROMA_DB_PATH = "/content/chroma_db"
# Name of the ChromaDB collection that stores the chunk embeddings.
CHROMA_COLLECTION_NAME = "rag_chunks_collection"
# Model name passed to SentenceTransformer for both chunk and query embeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/LaBSE" # Or whichever model you are using
# Chunk size and overlap are measured in characters (chunk_text uses length_function=len).
# With these values the configured overlap is two-thirds of the chunk size.
CHUNK_SIZE = 150
CHUNK_OVERLAP = 100
# Number of chunks RAGSystem.ask_question requests from the retriever per query.
TOP_K_RETRIEVAL = 10 # Retrieve more for potential re-ranking

# --- LLM Setup: Google Gemini API ---
# Read the key BEFORE the guarded import so that GEMINI_API_KEY is always defined,
# even when the Gemini client library is missing. Later cells test
# `if not GEMINI_API_KEY` and would otherwise fail with NameError.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") # Loaded from .env

# None means "Gemini is not usable"; the placeholder fallback below relies on it.
LLM_INSTANCE = None

try:
    import google.generativeai as genai
except ImportError:
    print("Google Generative AI library not installed. Gemini LLM not configured. Falling back to Placeholder LLM.")
else:
    if GEMINI_API_KEY:
        genai.configure(api_key=GEMINI_API_KEY)
        # Using a stable and capable Gemini model for RAG
        # 'models/gemini-1.5-pro-latest' is a strong general-purpose choice.
        # 'models/gemini-2.5-pro' or 'models/gemini-2.5-flash' are newer stable options if available.
        LLM_INSTANCE = genai.GenerativeModel('models/gemini-2.5-pro')
        print("Using Google Gemini as LLM.")
    else:
        print("GEMINI_API_KEY not found. Gemini LLM not configured. Falling back to Placeholder LLM.")

# Placeholder LLM (stand-in used when Gemini is not configured or fails)
class PlaceholderLLM:
    """Offline stub that exposes the same ``generate_content`` call as the Gemini model."""

    def generate_content(self, prompt_text):
        """Return a canned Bengali reply wrapped in an object that has a ``.text`` attribute.

        The apology message is chosen when the lower-cased prompt contains both
        "answer" and "not in context"; every other prompt gets the demo notice.
        """
        lowered_prompt = prompt_text.lower()
        wants_not_found_reply = "answer" in lowered_prompt and "not in context" in lowered_prompt

        if wants_not_found_reply:
            reply_text = "আমি দুঃখিত, আপনার প্রশ্নের উত্তর এই মুহূর্তে দিতে পারছি না। প্রদত্ত তথ্যের উপর ভিত্তি করে উত্তর খুঁজে পাওয়া যায়নি।"
        else:
            reply_text = "আমি একটি ডেমো মডেল। আমার কাছে নির্দিষ্ট প্রশ্নের জন্য কোন উত্তর সংজ্ঞায়িত নেই। দয়া করে আপনার Gemini API কী সেট আপ করুন।"

        # Build a throwaway class instance so callers can read `.text`.
        response_cls = type('obj', (object,), {'text': reply_text})
        return response_cls()

# Guarantee a usable LLM object: swap in the canned-response stub when the
# Gemini setup above left LLM_INSTANCE unset.
if LLM_INSTANCE is None:
    print("Warning: Using Placeholder LLM because Gemini could not be set up.")
    LLM_INSTANCE = PlaceholderLLM()

# --- 1. PDF Pre-processing and Chunking (with OCR) ---
def extract_text_from_pdf(pdf_path, lang='eng+ben'):
    """
    OCR a PDF and return its text as one string.

    Every page is rendered to an image with pdf2image and then read with
    Tesseract via pytesseract; the per-page texts are joined with newlines.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Tesseract language specification (default: English + Bengali).

    Returns:
        The concatenated OCR text, or "" if rendering or OCR failed for any
        reason (the error is printed, not raised).
    """
    try:
        # One PIL image per PDF page.
        page_images = convert_from_path(pdf_path)

        print(f"[DEBUG] Converted {len(page_images)} pages to images for OCR.")

        page_texts = [
            pytesseract.image_to_string(page_image, lang=lang)
            for page_image in page_images
        ]

    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'. Please check the path and filename.")
        return ""
    except pytesseract.TesseractNotFoundError:
        print("Error: Tesseract is not installed or not in your PATH.")
        print("Please ensure you run '!sudo apt install tesseract-ocr' and '!pip install pytesseract'.")
        return ""
    except Exception as e:
        print(f"Error during OCR extraction from PDF '{pdf_path}': {e}")
        return ""

    ocr_text = "\n".join(page_texts)

    # --- DEBUGGING PRINT ---
    print(f"\n[DEBUG] Total raw text extracted by OCR: {len(ocr_text)} characters.")
    if ocr_text:
        print(f"[DEBUG] OCR Raw text preview (first 500 chars):\n'{ocr_text[:500]}'")

    return ocr_text

def clean_text(text):
    """Aggressively strip known PDF/OCR noise and return single-line, space-normalized text.

    Processing order matters:
      1. Line-anchored rules (page-number lines, leading "1." / "১।" numbering) run
         while the text still contains newlines; they use ``re.MULTILINE`` and can
         only see individual lines at this stage.
      2. Newlines and whitespace runs are collapsed to single spaces.
      3. Banners, exam tags, answer/option markers and section headers are removed.
      4. Remaining symbols are replaced by spaces. Bengali/English letters, digits,
         whitespace, ``. , ? ! ; :`` and the danda/double danda (U+0964/U+0965)
         are kept so sentence boundaries survive for the chunker.

    Args:
        text: Raw text (typically OCR output).

    Returns:
        The cleaned text with no newlines and no leading/trailing whitespace.
    """
    # NOTE: inside the character classes below, '|' is a literal pipe, not an
    # alternation. It is left in place: it makes the rules also accept '|' where
    # a danda would be expected.

    # 1. Line-anchored cleanup -- must happen BEFORE newlines are flattened,
    #    otherwise '^' and '$' only match the start/end of the whole document.
    text = re.sub(r'^\s*[\d\u09e7-\u09f1]+\s*[\u0964|\.]?\s*$', '', text, flags=re.MULTILINE) # Lines with just numbers or number.
    text = re.sub(r'^\s*(\d+|\u09e7|\u09ee|\u09ef|\u09ea|\u09eb|\u09ec|\u09ed|\u09f0|\u09f1)\s*[\u0964|\.]\s*', '', text, flags=re.MULTILINE) # Numbers like 1।, ২। at line start

    # 2. Normalize line endings and whitespace
    text = re.sub(r'\n+', ' ', text) # Replace newlines with a single space
    text = re.sub(r'\s+', ' ', text).strip() # Collapse whitespace runs and strip leading/trailing

    # 3. Remove common PDF/OCR layout artifacts and headers/footers
    # The closing bracket is optional: the OCR output contains '[নন ৯ >MI eae 1G' without it.
    text = re.sub(r'\[নন\s*\d+\s*>MI\s*eae\s*\d+G\]?', '', text)
    text = re.sub(r'[\d]+\s*MINUT\s*SCHOOL|[\d]+\s*MINUTE\s*SCHOOL', '', text, flags=re.IGNORECASE) # Remove "10 MINUTE SCHOOL"
    text = re.sub(r'HSC\s*\d+\s*অনলাইন\s*ব্যাচ|বাংলা\-ইংরেজি\s*আইসিটি|বাংলা\s*ইংরেজি\s*আইসিটি|HSC\s*\d+\s*অনলাইন\s*ব্যাচ', '', text) # Remove common banners like "HSC 26 অনলাইন ব্যাচ বাংলা-ইংরেজি আইসিটি"
    text = re.sub(r'Ceara অনলাইন ব্যাট MINUTE TG Shock', '', text) # Remove specific common sequence seen

    # 4. Remove Board Exam/University Exam specific tags
    text = re.sub(r'\s*\[\s*(?:ঢা|য|রা|কু|চ|ব|দি|ম|সি|জা\.বি|ইসলামী|শাহজালাল|বঙ্গবন্ধু|গার্হস্থ)\.?\s*(?:বো|বি)?\.?\s*\'?\d{2,4}(?:-\d{2})?\s*(?:ইউনিট)?\s*[A-Z]?\s*\d{0,2}\s*\]\s*', '', text)
    text = re.sub(r'\s*\[\s*সকল\s*বোর্ড\s*\d{4}\s*\]\s*', '', text) # [সকল বোর্ড 2018]

    # 5. Remove numeric scan artifacts
    text = re.sub(r'\s*[\d]+\s*[\u0964|\.]\s*$', '', text) # Trailing "number + full stop" at the very end of the text
    # NOTE(review): this also removes legitimate values such as decimals ("3.5"); confirm that is acceptable.
    text = re.sub(r'[\d]{1,4}\s*[\u0964|\.]+\s*[\d]{1,4}\s*', '', text) # Patterns like "10। 3"

    # 6. Remove question/answer prefixes and suffixes
    text = re.sub(r'\(\s*[\u0984-\u09fa]\s*\)', '', text) # (ক) (খ) options
    text = re.sub(r'\(\s*[a-zA-Z]\s*\)', '', text) # (a) (b) options
    text = re.sub(r'উত্তর:\s*\S*', '', text) # Remove "উত্তর: খ" or "উত্তর: গ" etc.
    text = re.sub(r'ব্যাখ্যা:\s*', '', text) # Remove "ব্যাখ্যা:"
    text = re.sub(r'প্রশ্ন(?:-\s*\d+)?\s*[:：]\s*', '', text) # Remove "প্রশ্ন- ১:", "প্রশ্ন ১:"
    text = re.sub(r'উদ্দীপকটি\s*পড়ে\s*(\d+)\s*ও\s*(\d+)\s*সংখ্যক\s*প্রশ্নের\s*উত্তর\s*দাও', '', text) # Remove "উদ্দীপকটি পড়ে ৩ ও ৪ সংখ্যক প্রশ্নের উত্তর দাও।"
    text = re.sub(r'সৃজনশীল\s*প্রশ্ন|বহুনির্বাচনী|পাঠ্যপুস্তকের\s*প্রশ্ন|বিগত\s*বছরের\s*প্রশ্ন|পাঠ\s*পরিচিতি|লেখক\s*পরিচিতি', '', text) # Section headers

    # 7. Replace remaining symbols with spaces.
    # U+0964 (danda) and U+0965 (double danda) are NOT part of the Bengali block
    # U+0980-U+09FF, so they must be whitelisted explicitly; otherwise every Bengali
    # sentence terminator is lost and the "।" separator in chunk_text never matches.
    # This step also removes backslashes and hyphens, so no separate handling of
    # either is needed afterwards.
    text = re.sub(r'[^a-zA-Z\u0980-\u09FF\u0964\u09650-9\s.,?!;:]+', ' ', text)

    # Final normalization
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def chunk_text(text, chunk_size, chunk_overlap):
    """
    Split ``text`` into overlapping chunks with Langchain's recursive splitter.

    The splitter tries the separators in the order given (paragraph break, line
    break, Bengali danda, English sentence punctuation, space, then any
    character) and measures chunk length with ``len``.

    Args:
        text: Text to split.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    # Bengali and English sentence endings, from coarsest to finest boundary.
    boundary_priority = ["\n\n", "\n", "।", ".", "?", "!", " ", ""]

    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": boundary_priority,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    pieces = splitter.split_text(text)
    return pieces

class ChromaDBVectorDBManager:
    """Hybrid (dense + BM25) chunk store on top of a persistent ChromaDB collection.

    Dense vectors and chunk texts live in ChromaDB. The BM25 index is an
    in-memory structure built from ``corpus_texts``, which is always re-read
    from the collection so the two never drift apart (after a failed insert,
    after re-ingesting the same file, or after a kernel restart).

    Attributes:
        embedding_model: SentenceTransformer used for chunks and queries.
        client: ChromaDB persistent client.
        collection: ChromaDB collection holding the chunks.
        bm25_model: BM25Okapi index over ``corpus_texts`` or None when empty.
        corpus_texts: Chunk texts currently stored in the collection.
    """

    # Multiplier applied to a chunk's BM25 score when it is added to the vector score.
    BM25_BOOST = 1.5
    # Number of records sent to ChromaDB per write call.
    INSERT_BATCH_SIZE = 500
    # Question/header-looking chunks shorter than this many characters are not indexed.
    QUESTION_CHUNK_MAX_LEN = 300

    # Patterns that mark a chunk as a probable question or section header.
    _QUESTION_PATTERNS = (
        # Leading item number such as "1." or "২।"
        re.compile(r'^\s*(\d+|\u09e7|\u09ee|\u09ef|\u09ea|\u09eb|\u09ec|\u09ed|\u09f0|\u09f1)\s*[\u0964|\.]'),
        # Multiple-choice option markers such as "(ক)" or "(a)"
        re.compile(r'\(\s*[\u0985-\u09fa]\s*\)'),
        re.compile(r'\([a-zA-Z]\)'),
        # "Read the stimulus and answer questions N and M"
        re.compile(r'উদ্দীপকটি\s*পড়ে\s*(\d+|\u09e7|\u09ee|\u09ef|\u09ea|\u09eb|\u09ec|\u09ed|\u09f0|\u09f1)\s*ও\s*(\d+|\u09e7|\u09ee|\u09ef|\u09ea|\u09eb|\u09ec|\u09ed|\u09f0|\u09f1)\s*সংখ্যক\s*প্রশ্নের\s*উত্তর\s*দাও'),
        # Section headers
        re.compile(r'^বহুনির্বাচনী\s*|\s*সৃজনশীল\s*প্রশ্ন\s*|\s*পাঠ্যপুস্তকের\s*প্রশ্ন\s*|\s*বিগত\s*বছরের\s*প্রশ্ন\s*'),
    )

    def __init__(self, db_path, collection_name, embedding_model_name):
        """Load the embedding model, open the collection and restore the BM25 index.

        Raises:
            Exception: whatever ChromaDB raises while opening the client or collection.
        """
        self.embedding_model = SentenceTransformer(embedding_model_name)

        try:
            self.client = chromadb.PersistentClient(path=db_path)
            print(f"ChromaDB client initialized at {db_path}.")
            self.collection = self.client.get_or_create_collection(name=collection_name)
            print(f"ChromaDB collection '{collection_name}' ready.")
        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

        self.bm25_model = None # Will store the BM25 model
        self.corpus_texts = [] # Chunk texts backing the BM25 model

        # The collection is persistent but the BM25 index is not: rebuild it from
        # whatever a previous session stored so lexical search covers all chunks.
        if self.count_documents() > 0:
            try:
                self._sync_corpus_from_collection()
            except Exception as e:
                print(f"Warning: could not restore BM25 index from existing collection: {e}")

    def count_documents(self):
        """Return the number of records in the collection, or 0 if counting fails."""
        try:
            return self.collection.count()
        except Exception as e:
            print(f"Error counting documents in ChromaDB: {e}")
            return 0

    def clear_collection(self):
        """Deletes all documents in the collection and recreates it."""
        try:
            self.client.delete_collection(name=self.collection.name)
            self.collection = self.client.get_or_create_collection(name=self.collection.name)
            self.bm25_model = None # Reset BM25 model
            self.corpus_texts = [] # Reset corpus texts
            print(f"Cleared existing documents and re-created collection '{self.collection.name}' in ChromaDB.")
        except Exception as e:
            print(f"Error clearing ChromaDB collection: {e}")
            raise

    def _tokenize(self, text):
        """Lower-case and word-tokenize text for BM25 (used for corpus and queries alike)."""
        return word_tokenize(text.lower())

    def _looks_like_question(self, stripped_chunk):
        """Return True when the chunk ends with '?' or matches a question/header pattern."""
        if stripped_chunk.endswith('?'):
            return True
        return any(pattern.search(stripped_chunk) for pattern in self._QUESTION_PATTERNS)

    def _sync_corpus_from_collection(self):
        """Reload ``corpus_texts`` from the collection and rebuild the BM25 index."""
        stored = self.collection.get(include=['documents'])
        self.corpus_texts = [doc for doc in (stored.get('documents') or []) if doc]
        self.rebuild_bm25_model()

    def add_documents(self, document_chunks_list, source_pdf_path="unknown_source.pdf"):
        """Embed the chunks, store them in ChromaDB and refresh the BM25 index.

        Non-string and empty chunks are skipped, as are short chunks that look
        like questions or section headers. Record ids are
        ``"<pdf basename>_<chunk index>"`` and are written with ``upsert``, so
        ingesting the same file again replaces its records instead of
        duplicating them.

        Args:
            document_chunks_list: Chunk strings to index.
            source_pdf_path: Path recorded as the ``source`` metadata of each chunk.

        Raises:
            Exception: whatever ChromaDB (or the BM25 rebuild) raises during the write.
        """
        print(f"Adding {len(document_chunks_list)} chunks to ChromaDB from '{source_pdf_path}'...")

        ids = []
        embeddings = []
        documents_text_for_chroma = []
        metadatas = []
        source_name = os.path.basename(source_pdf_path)

        for i, chunk_content in enumerate(document_chunks_list):
            # Validate the type first: calling .strip() on a non-string would raise
            # before the chunk could be skipped.
            if not isinstance(chunk_content, str):
                print(f"  [ERROR_EMBED] Expected chunk to be a string, but found type: {type(chunk_content)}. Skipping.")
                continue

            cleaned_chunk = chunk_content.strip()
            if not cleaned_chunk:
                continue # Nothing to embed

            if len(cleaned_chunk) < self.QUESTION_CHUNK_MAX_LEN and self._looks_like_question(cleaned_chunk):
                print(f"  [DEBUG_SKIP] Skipping potential question/header chunk ({len(cleaned_chunk)} chars): '{cleaned_chunk[:100]}...'")
                continue

            try:
                chunk_embedding = self.embedding_model.encode(chunk_content).tolist()
            except Exception as e:
                print(f"Error encoding chunk {i}: {e}. Skipping this chunk.")
                continue

            ids.append(f"{source_name}_{i}")
            embeddings.append(chunk_embedding)
            documents_text_for_chroma.append(chunk_content)
            metadatas.append({"source": source_pdf_path, "chunk_idx": i})

        if not documents_text_for_chroma:
            print("No valid documents to insert into ChromaDB.")
            return

        try:
            batch_size = self.INSERT_BATCH_SIZE
            total_batches = (len(documents_text_for_chroma) - 1) // batch_size + 1
            for start in range(0, len(documents_text_for_chroma), batch_size):
                stop = start + batch_size
                self.collection.upsert(
                    embeddings=embeddings[start:stop],
                    documents=documents_text_for_chroma[start:stop],
                    metadatas=metadatas[start:stop],
                    ids=ids[start:stop]
                )
                print(f"  Added batch {start//batch_size + 1}/{total_batches}")
            print(f"Finished adding documents to ChromaDB from '{source_pdf_path}'.")

            # Rebuild BM25 from what is actually stored, not from what we tried to store.
            self._sync_corpus_from_collection()

        except Exception as e:
            print(f"Error inserting documents into ChromaDB: {e}")
            raise

    def rebuild_bm25_model(self):
        """Builds or rebuilds the BM25 model from the current corpus_texts."""
        if self.corpus_texts:
            tokenized_corpus = [self._tokenize(doc) for doc in self.corpus_texts]
            self.bm25_model = BM25Okapi(tokenized_corpus)
            print(f"BM25 model rebuilt with {len(self.corpus_texts)} documents.")
        else:
            self.bm25_model = None
            print("BM25 model not built: Corpus is empty.")

    def retrieve_chunks(self, query, top_k):
        """Retrieve the top-k chunks with hybrid search (vector + BM25).

        Up to ``2 * top_k`` candidates are taken from each retriever. A
        candidate's score is its vector score (``1 - distance``, 0 when it was
        only found lexically) plus ``BM25_BOOST`` times its BM25 score. Chunks
        with no lexical match (BM25 score <= 0) are never added as BM25
        candidates, so they cannot outrank genuine vector hits.

        Args:
            query: User query text.
            top_k: Maximum number of chunks to return.

        Returns:
            A tuple ``(texts, None)``; the second element is kept for caller compatibility.
        """
        query_embedding = self.embedding_model.encode(query).tolist()

        print(f"\n[DEBUG] Query: '{query}'")
        print(f"[DEBUG] Query embedding dimension: {len(query_embedding)}")

        candidate_pool_size = top_k * 2 # Retrieve more candidates for blending
        stored_count = self.count_documents()

        combined_candidates = {} # chunk text -> {'vector_score', 'bm25_score'}

        # --- 1. Vector Search ---
        if stored_count > 0 and candidate_pool_size > 0:
            vector_results = self.collection.query(
                query_embeddings=[query_embedding],
                # Never ask for more neighbours than the collection holds.
                n_results=min(candidate_pool_size, stored_count),
                include=['documents', 'distances']
            )
            for text, distance in zip(vector_results['documents'][0], vector_results['distances'][0]):
                # Results arrive best-first; keep the best score for duplicated texts.
                if text not in combined_candidates:
                    combined_candidates[text] = {'vector_score': 1 - distance, 'bm25_score': 0}

        # --- 2. Lexical Search (BM25) ---
        if self.bm25_model and self.corpus_texts and candidate_pool_size > 0:
            bm25_scores = self.bm25_model.get_scores(self._tokenize(query))
            best_lexical = sorted(
                zip(bm25_scores, self.corpus_texts),
                key=lambda pair: pair[0], reverse=True
            )[:candidate_pool_size]

            for score, text in best_lexical:
                if score <= 0:
                    break # Sorted descending: everything after this has no lexical match
                entry = combined_candidates.setdefault(text, {'vector_score': 0, 'bm25_score': 0})
                if entry['bm25_score'] == 0:
                    entry['bm25_score'] = score

        # --- 3. Combine and Re-rank Results ---
        ranked_results = [
            (scores['vector_score'] + scores['bm25_score'] * self.BM25_BOOST, text)
            for text, scores in combined_candidates.items()
        ]
        ranked_results.sort(key=lambda item: item[0], reverse=True)

        top_ranked = ranked_results[:max(top_k, 0)]
        final_retrieved_texts = [text for _, text in top_ranked]

        print(f"[DEBUG] Retrieved {len(final_retrieved_texts)} chunks via Hybrid Search:")
        if final_retrieved_texts:
            for position, (score, text) in enumerate(top_ranked, start=1):
                print(f"  Chunk {position} (Score: {score:.4f}): {text[:150]}...")
        else:
            print("[DEBUG] No chunks retrieved by Hybrid Search.")

        return final_retrieved_texts, None

# ... (rest of your RAGSystem class and main execution logic, unchanged) ...

# --- 3. RAG System Core Logic ---
class RAGSystem:
    """Answers questions by retrieving context chunks and prompting an LLM.

    Attributes:
        vector_db_manager: Object with ``retrieve_chunks(query, top_k)`` returning
            ``(list_of_texts, _)``.
        llm: Object with ``generate_content(prompt)`` returning something with ``.text``.
        top_k: Chunks to retrieve per question; None means "use TOP_K_RETRIEVAL".
        history_turns: Number of most recent chat turns included in the prompt.
        chat_history: Short-term memory; list of ``{"role", "content"}`` dicts.
    """

    def __init__(self, vector_db_manager, llm_instance, top_k=None, history_turns=4):
        """
        Args:
            vector_db_manager: Retriever used for context lookup.
            llm_instance: LLM used to generate the answer.
            top_k: Optional override for the number of retrieved chunks.
                Defaults to the module-level TOP_K_RETRIEVAL.
            history_turns: How many recent turns (user and assistant messages
                counted separately) are shown to the LLM. The default of 4 keeps
                the prompt short and the conversation focused.
        """
        self.vector_db_manager = vector_db_manager
        self.llm = llm_instance
        self.top_k = top_k
        self.history_turns = history_turns
        self.chat_history = [] # Short-term memory: stores recent user queries and bot responses

    def ask_question(self, query: str):
        """
        Processes a user query, retrieves relevant context, and generates an answer
        using the LLM, incorporating short-term conversational memory.

        Returns:
            The LLM's answer text, or a fixed apology string if the LLM call fails.
            Both the query and the returned answer are appended to ``chat_history``.
        """
        # 1. Add current user query to short-term memory
        self.chat_history.append({"role": "user", "content": query})

        # 2. Retrieve relevant chunks from the long-term memory (ChromaDB)
        top_k = self.top_k if self.top_k is not None else TOP_K_RETRIEVAL
        retrieved_chunks, _ = self.vector_db_manager.retrieve_chunks(query, top_k)
        context = "\n\n".join(retrieved_chunks)

        # 3. Construct the prompt for the LLM, including conversational history.
        # Guard the zero case explicitly: list[-0:] would return the WHOLE history.
        recent_turns = self.chat_history[-self.history_turns:] if self.history_turns > 0 else []
        conversation_context = "".join(
            f"{turn['role'].capitalize()}: {turn['content']}\n" for turn in recent_turns
        )

        prompt = f"""You are a helpful assistant. Your task is to answer the user's question based ONLY on the provided context. If the exact numerical answer or specific factual detail is not explicitly available in the context, clearly state that you don't know or cannot find the answer based on the provided information. Maintain the conversation flow and answer in the same language as the user's question (English or Bengali).

        ---
        Context:
        {context}
        ---

        ---
        Conversation History (for current question's context):
        {conversation_context}
        ---

        Question: {query}

        Answer:
        """

        # --- DEBUGGING PRINT ---
        print("\n[DEBUG] Full Prompt sent to LLM:")
        print("-----------------------------------")
        print(prompt)
        print("-----------------------------------\n")

        try:
            llm_response = self.llm.generate_content(prompt)
            answer = llm_response.text
        except Exception as e:
            print(f"Error calling LLM: {e}")
            answer = "Sorry, I encountered an error while generating the response. Please try again later."

        self.chat_history.append({"role": "assistant", "content": answer})

        return answer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Using Google Gemini as LLM.


In [4]:
# After the PDF has been processed and stored (Section 4 has completed its initial loop)

# Re-extract and clean text for inspection (as it was done internally)
# You might need to adjust 'pdf_input_path' if you manually typed it and didn't save it.
# For simplicity, let's assume it's the last processed PDF
# import os
# _last_pdf_path_processed = "/content/HSC26-Bangla1st-Paper.pdf" # Make sure this matches the PDF you uploaded/used

# _raw_text_for_check = extract_text_from_pdf(_last_pdf_path_processed)
# _cleaned_text_for_check = clean_text(_raw_text_for_check)

# # Search for the key phrase in the cleaned text
# search_phrase = "পনেরো" # The number fifteen in Bengali

# found_index = _cleaned_text_for_check.find(search_phrase)

# if found_index != -1:
#     print(f"'{search_phrase}' found in cleaned text at index {found_index}.")
#     # Print the surrounding context of the phrase
#     start_index = max(0, found_index - 100) # 100 chars before
#     end_index = min(len(_cleaned_text_for_check), found_index + 100) # 100 chars after
#     print("\n--- Context around 'পনেরো' in cleaned_text ---")
#     print(_cleaned_text_for_check[start_index:end_index])
#     print("-------------------------------------------------\n")
# else:
#     print(f"'{search_phrase}' (পনেরো) NOT found in the cleaned text. This is a critical problem.")
#     print("Please check your OCR output and cleaning regexes again, or the original PDF's quality.")

# You can also manually scroll through your cleaned_text_for_check if it's not too long
# print(_cleaned_text_for_check)

In [None]:
# @title 4. Run the RAG System

from google.colab import files # For PDF upload in Colab

# Initial checks for environment variables
if not GEMINI_API_KEY:
    print("Warning: GEMINI_API_KEY environment variable not set. Using Placeholder LLM for answers.")

# Initialize ChromaDB Vector DB Manager
try:
    vector_db = ChromaDBVectorDBManager(
        db_path=CHROMA_DB_PATH,
        collection_name=CHROMA_COLLECTION_NAME,
        embedding_model_name=EMBEDDING_MODEL_NAME
    )
except Exception as e:
    print(f"Application cannot start due to database initialization issue: {e}")
    # Re-raise instead of calling exit(1): this stops the cell right here with the
    # full traceback, so nothing below runs with `vector_db` undefined.
    raise

# --- Logic for dynamic PDF loading and knowledge base management ---
# Ask for a PDF through Colab's file picker. On success `initial_pdf_path`
# points at the saved copy; otherwise it stays None and the ingestion loop
# asks for a path manually.
initial_pdf_path = None
try:
    print("\n--- Upload your PDF document ---")
    print("A file picker will appear. Please select your PDF file.")
    uploaded = files.upload() # Opens the file picker; maps filename -> file bytes

    if not uploaded:
        print("No file uploaded. You will be prompted to enter a PDF path manually if the DB is empty.")
    else:
        # Only the first uploaded file is used.
        colab_pdf_filename = next(iter(uploaded))
        # Save the uploaded file to Colab's /content/ directory
        initial_pdf_path = os.path.join("/content/", colab_pdf_filename)
        with open(initial_pdf_path, 'wb') as f:
            f.write(uploaded[colab_pdf_filename])
        print(f"Uploaded '{colab_pdf_filename}' to {initial_pdf_path}")
except Exception as e:
    print(f"Error during file upload: {e}. You may need to manually provide the PDF path.")
    initial_pdf_path = None


# Ingestion loop: repeat until one PDF has been OCR'd, cleaned, chunked and
# stored successfully (the `break` below). Every failure path clears
# `initial_pdf_path` and restarts the loop, so the next pass asks for a path.
while True:
    current_db_docs = vector_db.count_documents()

    # --- DEBUGGING PRINT ---
    print(f"\n[DEBUG] Current documents in ChromaDB collection: {current_db_docs}")

    if current_db_docs > 0:
        print(f"\nKnowledge base currently contains {current_db_docs} documents.")
        choice = input("Do you want to (1) Add a new PDF to existing data, or (2) Clear existing data and load a new PDF? (1/2): ").strip()
        if choice == '2':
            # NOTE: the collection is cleared here, before the new PDF has been
            # validated or processed.
            vector_db.clear_collection()
            current_db_docs = 0 # Reset count after clearing
        elif choice != '1':
            # Any answer other than '1' or '2' is treated as '1' (add).
            print("Invalid choice. Defaulting to 'Add to existing data'.")
    else:
        print("\nKnowledge base is empty. Please provide a PDF to populate it.")

    # Prefer the uploaded file; fall back to a manually typed path.
    pdf_input_path = initial_pdf_path if initial_pdf_path else input("Enter the path to the PDF document (e.g., my_document.pdf): ").strip()

    if not os.path.exists(pdf_input_path):
        print(f"Error: PDF file not found at '{pdf_input_path}'. Please enter a valid path.")
        initial_pdf_path = None # Clear it so next loop prompts manually
        continue # Loop back to ask for PDF path again

    print(f"Processing '{pdf_input_path}'...")
    raw_text = extract_text_from_pdf(pdf_input_path)

    # --- DEBUGGING PRINT ---
    print(f"[DEBUG] Length of raw text extracted: {len(raw_text)} characters")
    if len(raw_text) < 200: # Print raw text if it's very short
        print(f"[DEBUG] Raw text (first 200 chars): '{raw_text[:200]}'")

    # extract_text_from_pdf returns "" on any extraction error.
    if not raw_text:
        print("Failed to extract text from PDF. Please try a different PDF or check its content.")
        initial_pdf_path = None # Clear it so next loop prompts manually
        continue # Loop back

    cleaned_text = clean_text(raw_text)
    # --- DEBUGGING PRINT ---
    print(f"[DEBUG] Length of cleaned text: {len(cleaned_text)} characters")
    if len(cleaned_text) < 200: # Print cleaned text if it's very short
        print(f"[DEBUG] Cleaned text (first 200 chars): '{cleaned_text[:200]}'")

    document_chunks = chunk_text(cleaned_text, CHUNK_SIZE, CHUNK_OVERLAP)

    # --- DEBUGGING PRINT ---
    print(f"[DEBUG] Number of chunks generated: {len(document_chunks)}")
    if document_chunks:
        print(f"[DEBUG] First chunk (first 150 chars): '{document_chunks[0][:150]}'")
        if len(document_chunks) > 1:
            print(f"[DEBUG] Second chunk (first 150 chars): '{document_chunks[1][:150]}'")
    else:
        print("[DEBUG] No chunks were generated after chunking.")

    if not document_chunks:
        print("No chunks generated from the PDF. Please check the PDF content or chunking parameters.")
        initial_pdf_path = None # Clear it so next loop prompts manually
        continue

    try:
        vector_db.add_documents(document_chunks, source_pdf_path=pdf_input_path)
        # --- DEBUGGING PRINT ---
        print(f"[DEBUG] Documents in DB after adding: {vector_db.count_documents()}")

        print(f"Knowledge base updated with content from '{pdf_input_path}'.")
        break # Exit the PDF loading loop if successful
    except Exception as e:
        print(f"Failed to populate knowledge base with '{pdf_input_path}': {e}")
        initial_pdf_path = None # Clear it so next loop prompts manually
        continue # Loop back to ask for PDF path again

# Initialize the RAG System
rag_system = RAGSystem(vector_db, LLM_INSTANCE)

print("\n--- Multilingual RAG System CLI (ChromaDB & Gemini) ---")
print("Knowledge Base is ready. You can now ask questions based on the loaded PDF(s).")
print("Type your questions in English or Bengali. Type 'exit' to quit.")
print("Example: 'What is the main topic of this document?' or 'এই নথির মূল বিষয় কি?'")

# Start the interactive loop: read a question, answer it, repeat until 'exit'.
while True:
    try:
        # Strip once so that ' exit ' is recognised and blank input is detected.
        user_input = input("\nYour Question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or closing the input stream ends the session
        # cleanly instead of leaving a traceback.
        print("\nExiting RAG system. Goodbye!")
        break

    if not user_input:
        print("Please enter a question.")
        continue

    if user_input.lower() == 'exit':
        print("Exiting RAG system. Goodbye!")
        break

    try:
        answer = rag_system.ask_question(user_input)
        print(f"RAG System: {answer}")
    except Exception as e:
        print(f"An unexpected error occurred during question processing: {e}")
        print("Please try again or check the system logs.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

ChromaDB client initialized at /content/chroma_db.
ChromaDB collection 'rag_chunks_collection' ready.

--- Upload your PDF document ---
A file picker will appear. Please select your PDF file.


Saving HSC26-Bangla1st-Paper.pdf to HSC26-Bangla1st-Paper.pdf
Uploaded 'HSC26-Bangla1st-Paper.pdf' to /content/HSC26-Bangla1st-Paper.pdf

[DEBUG] Current documents in ChromaDB collection: 0

Knowledge base is empty. Please provide a PDF to populate it.
Processing '/content/HSC26-Bangla1st-Paper.pdf'...
[DEBUG] Converted 49 pages to images for OCR.

[DEBUG] Total raw text extracted by OCR: 87265 characters.
[DEBUG] OCR Raw text preview (first 500 chars):
'অপরিচিতা

অনলাইন ব্যাচ সম্পর্কিত যেকোনো জিজ্ঞাসায়,

কলকরো ৬ 86919

[নন ৯ >MI
eae 1G

৮ নিন্নবিত্ত ব্যক্তির হঠাৎ বিত্তশালী হয়ে ওঠার ফলে সমাজে পরিচয় সংকট সম্পর্কে ধারণা লাভ করবে।

৮ তৎকালীন সমাজ-সভ্যতা ও মানবতার অবমাননা সম্পর্কে জানতে পারবে।

৮ তৎকালীন সমাজের পণপ্রথার কুপ্রভাব সম্পর্কে জানতে পারবে।

৮ তৎকালে সমাজে ভদ্রলোকের স্বভাববৈশিষ্ট্য সম্পর্কে জ্ঞানলাভ করবে।

৮ নারী কোমল ঠিক, কিন্তু দুর্বল নয়- কল্যাণীর জীবনচরিত দ্বারা প্রতিষ্ঠিত এই সত্য অনুধাবন করতে
পারবে।

৮ মানুষ আশা নিয়ে বেঁচে থাকে- '
[DEBUG] Length of raw text extracted: 8