<a href="https://colab.research.google.com/github/Sajidcodecrack/Multilingual-RAG-System/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install All Necessary Packages
print("Installing Tesseract OCR engine, language pack, and Python libraries...")
!sudo apt-get update -qq
!sudo apt-get install -y tesseract-ocr tesseract-ocr-ben -qq
!pip install -q pytesseract pypdf langchain sentence-transformers chromadb google-generativeai python-dotenv PyMuPDF
print(" All necessary packages are installed.")

In [7]:
import os
import re
import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
import fitz  # PyMuPDF librarie
from google.colab import files
import pytesseract
from PIL import Image
import io   #Optimizing the Input and Out Operation smoothly

print("All libraries imported successfully")

All libraries imported successfully


In [8]:
# Configure the Gemini client from the Colab secret named GOOGLE_API_KEY.
GOOGLE_API_KEY = None
api_key_error = None
try:
    from google.colab import userdata
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
except Exception as exc:
    # A missing or inaccessible secret is reported by Colab with its own
    # exception types rather than KeyError, so catch broadly here and report
    # the cause below instead of letting the cell crash.
    api_key_error = exc

if GOOGLE_API_KEY:
    # Only export/configure a real, non-empty key: os.environ rejects None.
    os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
    genai.configure(api_key=GOOGLE_API_KEY)
    print(" Google API Key configured successfully!")
else:
    print(" Google API Key not found in Colab secrets")
    if api_key_error is not None:
        print(f"   Reason: {type(api_key_error).__name__}: {api_key_error}")



 Google API Key configured successfully!


In [9]:
# Ask the user for a PDF through the Colab upload widget.
# file_name ends up as the first uploaded file's name, or None when the upload
# fails or nothing was selected.
print("\n---- Upload the file -----")
try:
    uploaded = files.upload()
    # An empty upload makes next() raise, which lands in the handler below.
    file_name = next(iter(uploaded.keys()))
    print(f"Successfully uploaded: {file_name}")
except Exception:
    print(" File upload failed or was cancelled.")
    file_name = None


---- Upload the file -----


Saving HSC26-Bangla1st-Paper.pdf to HSC26-Bangla1st-Paper.pdf
Successfully uploaded: HSC26-Bangla1st-Paper.pdf


In [10]:
#  TEXT EXTRACTION WITH OCR (with Flexible Markers and Progress Indicator)
def extract_story_with_ocr(
    pdf_path: str,
    start_page: int = 5,
    end_page: int = 49,
    start_marker: str = "আজ আমার বয়স সাতাশ",
    end_marker: str = "জায়গা পাইয়াছি",
) -> str:
    """OCR a page range of a PDF and return the story text between two markers.

    Each page in ``[start_page, end_page)`` (0-based, clipped to the document
    length) is rendered at 300 dpi and passed to Tesseract with the Bengali
    language pack. The per-page texts are concatenated and the span from the
    first ``start_marker`` to the end of the line holding the last
    ``end_marker`` is returned with whitespace around line breaks collapsed.

    Args:
        pdf_path: Path of the PDF to read. A falsy value returns "" immediately.
        start_page: 0-based index of the first page to OCR (default: page 6).
        end_page: 0-based index one past the last page to OCR.
        start_marker: Short phrase that opens the story; a short phrase
            tolerates OCR noise better than a full sentence.
        end_marker: Short phrase on the story's final line.

    Returns:
        The isolated story text; the full OCR text when the markers cannot be
        located in order; "" when there is no path or OCR produced no text.

    Raises:
        Whatever PyMuPDF/Pillow raise while opening or rendering the document.
        Failures of the OCR call itself are logged per page and skipped.
    """
    if not pdf_path:
        return ""

    print("\n Starting OCR-based text extraction...")
    page_texts = []
    doc = fitz.open(pdf_path)
    try:
        # Clip the range to the document so short PDFs are not iterated past their end.
        first_page = max(0, start_page)
        last_page = min(end_page, len(doc))
        for page_num in range(first_page, last_page):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(dpi=300)
            image = Image.open(io.BytesIO(pix.tobytes()))

            # This line shows which page is currently being processed
            print(f"  > Processing Page {page_num + 1} with OCR...")

            try:
                page_texts.append(pytesseract.image_to_string(image, lang='ben'))
            except Exception as e:
                # Best effort: one unreadable page should not abort the whole run.
                print(f" OCR failed on page {page_num + 1}: {e}")
    finally:
        # Release the file handle even when rendering a page raises.
        doc.close()

    full_ocr_text = "".join(text + "\n" for text in page_texts)

    if not full_ocr_text.strip():
        print("\n FATAL ERROR: OCR produced no text.")
        return ""

    return _isolate_story(full_ocr_text, start_marker, end_marker)


def _isolate_story(full_text: str, start_marker: str, end_marker: str) -> str:
    """Slice full_text from start_marker to the end of the last end_marker's line."""
    start_index = full_text.find(start_marker)
    end_index = full_text.rfind(end_marker)

    # Markers that are missing, or found in the wrong order, cannot delimit a
    # story; fall back to the complete OCR text so the caller still gets content.
    if start_index == -1 or end_index == -1 or end_index < start_index:
        print(" FATAL ERROR: Story markers not found in OCR text.")
        return full_text

    # Extend to the end of the marker's line to keep the full closing sentence.
    line_end = full_text.find('\n', end_index)
    if line_end == -1:
        # No trailing newline: take everything up to the end of the text
        # instead of letting -1 silently drop the final character.
        line_end = len(full_text)

    story_text = full_text[start_index:line_end]
    story_text = re.sub(r'\s*\n\s*', '\n', story_text).strip()
    print(" OCR extraction and story isolation complete.")
    return story_text

# Run the OCR extraction on the uploaded PDF. story_text is "" when no file was
# uploaded (file_name is None) or when OCR produced no text.
story_text = extract_story_with_ocr(file_name)


 Starting OCR-based text extraction...
  > Processing Page 6 with OCR...
  > Processing Page 7 with OCR...
  > Processing Page 8 with OCR...
  > Processing Page 9 with OCR...
  > Processing Page 10 with OCR...
  > Processing Page 11 with OCR...
  > Processing Page 12 with OCR...
  > Processing Page 13 with OCR...
  > Processing Page 14 with OCR...
  > Processing Page 15 with OCR...
  > Processing Page 16 with OCR...
  > Processing Page 17 with OCR...
  > Processing Page 18 with OCR...
  > Processing Page 19 with OCR...
  > Processing Page 20 with OCR...
  > Processing Page 21 with OCR...
  > Processing Page 22 with OCR...
  > Processing Page 23 with OCR...
  > Processing Page 24 with OCR...
  > Processing Page 25 with OCR...
  > Processing Page 26 with OCR...
  > Processing Page 27 with OCR...
  > Processing Page 28 with OCR...
  > Processing Page 29 with OCR...
  > Processing Page 30 with OCR...
  > Processing Page 31 with OCR...
  > Processing Page 32 with OCR...
  > Processing Page

In [11]:
#  Verification of OCR Extraction
MIN_STORY_CHARS = 200   # anything shorter is treated as a failed extraction
PREVIEW_CHARS = 1500    # size of the sample printed for eyeballing; the header below reports the same number

if story_text and len(story_text) > MIN_STORY_CHARS:  # Check if the text is not empty and has substantial content
    print(" OCR Extraction Verified. The text appears to be clean and is ready for chunking.")
    print(f"\n--- Sample of Verified OCR Text (first {PREVIEW_CHARS} characters) ---")
    print(story_text[:PREVIEW_CHARS])
    print("\n" + "="*60)
else:
    print(" Verification Failed: The text extracted via OCR is empty or too short.")
    print("    Troubleshooting steps:")
    print("    1. Ensure the correct PDF was uploaded.")
    print("    2. Check the 'Raw OCR Output' in the previous cell for any errors.")
    print("    3. Consider a 'Factory reset runtime' from the Colab menu and run all cells again.")

 OCR Extraction Verified. The text appears to be clean and is ready for chunking.

--- Sample of Verified OCR Text (first 500 characters) ---
আজ আমার বয়স সাতাশ মাত্র। এ জীবনটা না দৈর্ঘ্যের হিসাবে বড়, না গুনের হিসাবে। তবু ইহার একটু বিশেষ
মূল্য আছে।ইহা সেই ফুলের মতো যাহার বুকের উপরে ভ্রমর আসিয়া বসিয়াছিল, এবং সেই পদক্ষেপের ইতিহাস
তাহার জীবনের মাঝখানেফলের মতো গুটি ধরিয়া উঠিয়াছে।
সেই ইতিহাসটুকু আকারে ছোটো, তাহাকে ছোটো করিয়াই লিখিব। ছোটোকে যাহারা সামান্য বলিয়া ভুল করেন
না তাহারা ইহার রস বুঝিবেন। কলেজে যতগুলো পরীক্ষা পাশ করিবার সব আমি ঢুকাইয়াছি। ছেলেবেলায়
আমাকে শিমুল ফুল ও মাকাল ফলের সহিত
তুলনা করিয়া, বিদ্রপ করিবার সুযোগ
পাইয়াছিলেন। ইহাতে তখন বড় লজ্জা পাইতাম;
কিন্তু বয়স হইয়া এ কথা ভাবিয়াছি, যদি
জন্মান্তর থাকে তবে আমার মুখে সুরূপ এবং
পণ্ডিতমশায়দের মুখে বিদ্রপ আবার যেন অমনি করিইয়াই প্রকাশ পায়। আমার পিতা এক কালে গরিব ছিলেন।
ওকালতি করিয়া তিনি প্রচুর টাকা রোজগার করিয়াছেন, ভোগ করিবার সময় নিমেষমাত্র পান নাই। মৃত্যুতে
তিনি যে হাফ ছাড়িলেন সেই তার প্রথম অবকাশ।
আমার তখন বয়স অল্প।

In [12]:
#  Chunking the Story Text
# Always bind `chunks` so later cells see an empty list (not a NameError)
# when extraction produced nothing.
chunks = []
if story_text:
    # Split on paragraph breaks first, then line breaks, then the Bengali
    # full stop "।", with a 100-character overlap between neighbouring chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=750, chunk_overlap=100, separators=["\n\n", "\n", "।"]
    )
    chunks = text_splitter.split_text(story_text)
    print(f"\n Story successfully split into {len(chunks)} high-quality chunks.")
else:
    print("\n No story text available, so nothing was chunked.")



 Story successfully split into 41 high-quality chunks.


In [None]:

    #  Embed the chunks, index them in an in-memory Chroma collection, and create the Gemini model.
    print("\n Initializing RAG components with upgraded model...")
    embed_model = SentenceTransformer("intfloat/multilingual-e5-large")
    # Stored text is embedded with a "passage: " prefix; queries use "query: "
    # elsewhere in this notebook (prefix convention of the E5 models -- see the model card).
    prefixed_chunks = [f"passage: {chunk}" for chunk in chunks]
    chroma_client = chromadb.Client()
    collection_name = "oporichita_e5_final_pass"
    # Depending on the chromadb release, list_collections() yields Collection
    # objects or plain name strings; normalise both to names, with one call.
    existing_names = {getattr(c, "name", c) for c in chroma_client.list_collections()}
    if collection_name in existing_names:
        # Start from a clean collection so re-running the cell does not collide on ids.
        chroma_client.delete_collection(name=collection_name)
    collection = chroma_client.create_collection(name=collection_name)
    collection.add(
        embeddings=embed_model.encode(prefixed_chunks).tolist(),
        documents=chunks,  # store the un-prefixed text so prompts stay clean
        ids=[f"chunk_{i}" for i in range(len(chunks))]
    )
    model = genai.GenerativeModel('gemini-2.5-flash-lite')
    print(" RAG pipeline fully initialized.")


In [14]:
#  Generate Answer Function
def generate_answer(query: str, n_results: int = 5) -> str:
    """Answer a question from the story with a single word or name.

    The query is embedded with the "query: " prefix, the closest chunks are
    fetched from the module-level ``collection``, and the module-level Gemini
    ``model`` is asked to answer from that context only.

    Args:
        query: The user's question.
        n_results: How many chunks to retrieve as context (default 5).

    Returns:
        The model's answer with surrounding whitespace removed, or the
        "not found" phrase from the prompt when Gemini returns no usable text.
    """
    not_found = "উত্তর পাওয়া যায়নি"

    prefixed_query = f"query: {query}"
    results = collection.query(
        query_embeddings=[embed_model.encode(prefixed_query).tolist()],
        n_results=n_results,
    )
    # results['documents'] holds one list per query; only one query was sent.
    context = "\n\n---\n\n".join(results['documents'][0])
    prompt = (
        "Based ONLY on the context below, answer the question with a single word or name. "
        f"If not found, say '{not_found}'.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n\n"
        "Answer:"
    )
    response = model.generate_content(prompt)
    try:
        return response.text.strip()
    except ValueError as exc:
        # The .text accessor raises ValueError when the response carries no
        # text part (for example a blocked reply); report it and fall back.
        print(f" Gemini returned no usable text: {exc}")
        return not_found

In [15]:
# Checking Test Cases given
if story_text:
    print("\n\n--- Running Final Assessment Test Cases ---")
    # Each case pairs a question ("q") with the expected answer ("e").
    test_cases = [
        {"q": "অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?", "e": "শস্তুনাথবাবু"},
        {"q": "কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?", "e": "মামা"},
        {"q": "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?", "e": "পনেরো"},

    ]
    for case in test_cases:
        actual_answer = generate_answer(case["q"])
        print(f"\n Question: {case['q']}")
        print(f" Expected: {case['e']}")
        print(f" RAG Answer: {actual_answer}")
        # Matching is containment in either direction to tolerate inflected
        # forms. An empty answer is a substring of every string, so require
        # non-empty text first or it would always be scored as correct.
        is_correct = bool(actual_answer) and (
            case['e'] in actual_answer or actual_answer in case['e']
        )
        if is_correct:
            print(" Correct")
        else:
            print("Incorrect")
        print("="*50)
else:
    print("\nCannot run test cases because story extraction failed.")



--- Running Final Assessment Test Cases ---

 Question: অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?
 Expected: শস্তুনাথবাবু
 RAG Answer: শস্তুনাথবাবু
 Correct

 Question: কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে?
 Expected: মামা
 RAG Answer: মামা
 Correct

 Question: বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?
 Expected: পনেরো
 RAG Answer: পনেরো
 Correct


In [16]:
# --- Running a New Random Test Case ---
# Look the collection up by name: if the index was never built, the name does
# not exist, and a bare `if collection:` would raise NameError instead of
# reaching the else branch.
if globals().get("collection") is not None:
    print("\n--- Testing with a new, random question ---")

    # Define the new question and expected answer
    question = "হরিশ কোথায় কাজ করে?"
    expected_answer = "কানপুরে"

    # Generate the answer using your RAG pipeline
    actual_answer = generate_answer(question)

    # Print the results for verification
    print(f"\n Question: {question}")
    print(f" Expected: {expected_answer}")
    print(f" RAG Answer: {actual_answer}")

    # Containment in either direction tolerates inflected forms; an empty
    # answer must not count, because "" is a substring of every string.
    is_correct = bool(actual_answer) and (
        expected_answer in actual_answer or actual_answer in expected_answer
    )
    if is_correct:
        print(" Correct")
    else:
        print(" Incorrect")
    print("="*50)
else:
    print("\nCannot run test case because the RAG pipeline is not ready.")


--- Testing with a new, random question ---

 Question: হরিশ কোথায় কাজ করে?
 Expected: কানপুরে
 RAG Answer: কানপুর
 Correct


In [17]:
# Cell 9: MODIFIED Generate Answer Function (Now with Memory)
def generate_answer_with_memory(query: str, history: list) -> str:
    """
    Generates an answer by considering both long-term (document) and short-term (chat) memory.
    """
    # 1. Retrieve context from long-term memory (Vector DB)
    prefixed_query = f"query: {query}"
    results = collection.query(
        query_embeddings=[embed_model.encode(prefixed_query).tolist()],
        n_results=4
    )
    context = "\\n\\n---\\n\\n".join(results['documents'][0])

    # 2. Format the short-term memory (chat history)
    formatted_history = "\\n".join([f"Human: {q}\\nAI: {a}" for q, a in history])

    # 3. Create the prompt with both memories
    prompt = f"""You are a helpful assistant for the story 'Oporichita'.
Answer the user's 'Human' question based on the 'Chat History' and the 'Retrieved Context'.
Be concise and answer in Bengali.

Chat History:
{formatted_history}

Retrieved Context:
{context}

Human: {query}
AI:"""

    # 4. Generate the response
    response = model.generate_content(prompt)
    clean_response = response.text.strip()

    # 5. Update the history with the new interaction
    history.append((query, clean_response))

    return clean_response

In [26]:
def start_chat():
    """
    Initializes a chat session that uses the generate_answer_with_memory function.

    Reads questions from standard input until the user types 'exit' (any case,
    surrounding whitespace ignored) or input ends or is interrupted. Blank
    lines are ignored rather than being sent to the model. Returns None.
    """
    # Each new chat session starts with a fresh memory
    chat_history = []
    print("--- Chat with the AnupomaAI ---")
    print("Type 'exit' to end the conversation.")

    while True:
        try:
            user_query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input or an interrupt at the prompt ends the session
            # cleanly instead of surfacing a traceback.
            print("\nAnupomaAI: Goodbye!")
            break

        if not user_query:
            # Nothing to ask; prompt again without spending a model call.
            continue
        if user_query.lower() == 'exit':
            print("AnupomaAI: Goodbye!")
            break

        # Pass the current session's history to the function
        answer = generate_answer_with_memory(user_query, chat_history)
        print(f"AnupomaAI: {answer}")
        print("-" * 50)

In [28]:

# Start the interactive chat session.
# The loop keeps prompting for input until the user types 'exit'.
start_chat()

--- Chat with the AnupomaAI ---
Type 'exit' to end the conversation.
You: tell me a theme about the story in 2 sentence
AnupomaAI: এই গল্পের মূল বিষয় হলো একজন পুরুষের এক রহস্যময়ী অচেনা নারীর প্রতি গভীর আকর্ষণ ও মুগ্ধতা। এই আকর্ষণ শুধু নারীর বাহ্যিক রূপেই সীমাবদ্ধ নয়, বরং তার সজীবতা, প্রাণবন্ততা এবং প্রকৃতির সাথে একাত্মতার অনুভূতিও তাকে বিশেষভাবে আকর্ষণ করে।
--------------------------------------------------
You: হরিশ কোথায় কাজ করে?
AnupomaAI: হরিশ কানপুরে কাজ করে।
--------------------------------------------------
You:   Question: কাকে অনুপমের ভাগ্য দেবতা বলে উল্লেখ করা হয়েছে in 1 word 
AnupomaAI: মামা
--------------------------------------------------
You: exit 
AnupomaAI: Goodbye!
