In [None]:
!pip install -q -U langchain langchain-google-genai langchain-community chromadb pymupdf sentence-transformers google-ai-generativelanguage==0.6.15


import os
from getpass import getpass


# Ask for the Google API key only when the environment does not already
# provide one; getpass reads it without echoing the typed characters.
if "GOOGLE_API_KEY" in os.environ:
    print("Google API Key already set.")
else:
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API Key: ")
    print("Google API Key set.")

import fitz
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
import asyncio
from typing import List, Dict

# --- 1. PDF Text Extraction  ---
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF document using PyMuPDF's 'text' mode.

    Extraction is best effort: any failure is reported on stdout instead of
    being raised. Text from pages read before the failure is still returned,
    and a file that cannot be opened yields an empty string.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The concatenated text from all pages of the PDF.
    """
    page_texts = []
    try:
        # The context manager closes the document on every exit path,
        # including when a page fails to load or extract.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text("text"))
        print(f"Successfully extracted text from {pdf_path} using 'text' mode.")
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    # Join once at the end rather than growing a string page by page.
    return "".join(page_texts)

# --- 2. Text Pre-processing & Cleaning  ---
def clean_text(text: str) -> str:
    """
    Cleans raw PDF text into one whitespace-normalized string.

    - Re-joins words hyphenated across a line break (a line ending in
      "exam-" followed by "ple" becomes "example").
    - Deletes form feed characters.
    - Collapses every run of whitespace (spaces, tabs, newlines, unicode
      spaces) into a single space, so the result contains no line or
      paragraph breaks.
    - Replaces any run of characters outside the allowed set with a space.
      Allowed: ASCII letters and digits, the Bengali block (U+0980-U+09FF),
      the danda and double danda (U+0964, U+0965) and common punctuation.
    - Strips leading/trailing whitespace.

    Prints a completion message as a side effect.

    Args:
        text (str): The raw text extracted from the PDF.

    Returns:
        str: The cleaned text.
    """
    # 1. Re-join words split by a hyphen at a line break ("word-\nword" -> "wordword").
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

    # 2. Delete form feed characters outright (no space is left in their place).
    text = text.replace('\x0c', '')

    # 3. Normalize all types of whitespace, newlines included, to a single space.
    text = re.sub(r'\s+', ' ', text)

    # 4. Replace anything that is not an English/Bengali letter, digit or common
    #    punctuation. The danda marks live in the Devanagari block (U+0964/U+0965),
    #    outside the Bengali range, so they are listed explicitly; otherwise every
    #    Bengali sentence terminator would be erased.
    text = re.sub(r'[^a-zA-Z0-9\u0980-\u09FF\u0964\u0965.,!?;:()\[\]{}\'"\-_ ]+', ' ', text)

    # 5. Consolidate multiple spaces that might have resulted from step 4.
    text = re.sub(r' +', ' ', text)

    print("Text cleaning complete.")
    # 6. Strip leading/trailing whitespace from the entire text.
    return text.strip()

# --- 3. Document Chunking & Vectorization with ChromaDB ---
def create_and_vectorize_knowledge_base(text: str, db_path="chroma_db_hsc_bangla") -> Chroma:
    """
    Splits the corpus into overlapping chunks, embeds them, and writes them to ChromaDB.

    Progress is reported on stdout at each stage.

    Args:
        text (str): The cleaned text corpus.
        db_path (str): Directory for the persistent ChromaDB collection; it is
            created when it does not exist yet.

    Returns:
        Chroma: The vector store built from the chunks.
    """
    # --- Chunking: 700-unit chunks (measured with len) overlapping by 100 ---
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=700,
        chunk_overlap=100,
        length_function=len,
    )
    pieces = splitter.split_text(text)
    print(f"Created {len(pieces)} chunks.")

    # --- Embedding model ---
    print("Loading embedding model: paraphrase-multilingual-MiniLM-L12-v2...")
    embedder = SentenceTransformerEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
    print("Embedding model loaded.")

    # --- Vector store ---
    os.makedirs(db_path, exist_ok=True)
    print(f"Creating/Loading ChromaDB at {db_path}...")
    store = Chroma.from_texts(texts=pieces, embedding=embedder, persist_directory=db_path)
    # NOTE(review): recent Chroma releases appear to persist automatically and
    # mark persist() as deprecated - confirm against the installed version.
    store.persist()
    print("ChromaDB vector store created/loaded and persisted.")
    return store

# --- 4. RAG Chain Setup with Google Gemini ---
def setup_rag_chain(vectorstore: Chroma):
    """
    Sets up the conversational RAG chain using LangChain, Google Gemini, and ChromaDB.

    The chain works in two stages:
    1. A history-aware retriever rewrites the latest question into a
       standalone question (using the chat history) and retrieves documents
       for it, so follow-ups such as "and who wrote it?" still find context.
    2. The retrieved documents are stuffed into the QA prompt and answered
       by the LLM.

    Args:
        vectorstore (Chroma): The initialized ChromaDB vector store.

    Returns:
        A LangChain runnable. It is invoked with a dict containing "input"
        (the question) and "chat_history" (a list of messages); the result
        dict carries the generated "answer" and the retrieved "context".
    """
    # Local import: the function brings in the one extra name it needs from a
    # package the file already depends on.
    from langchain.chains import create_history_aware_retriever

    # Initialize the LLM (Google Gemini 1.5 Flash); low temperature keeps answers factual.
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.2)

    # Prompt that turns a history-dependent question into a standalone one
    # (this is what gives the retriever short-term memory).
    contextualize_q_system_prompt = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood without "
        "the chat history. Do NOT answer the question, just reformulate it "
        "if necessary and otherwise return it as is."
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    # Main QA prompt; "{context}" receives the retrieved documents.
    qa_system_prompt = (
        "You are an assistant for question-answering tasks. "
        "Use the following retrieved context to answer the question. "
        "If you don't know the answer, just say that you don't know. "
        "Do not make up an answer. "
        "Keep the answer concise and to the point, usually one or two sentences. "
        "Answer in the same language as the question.\n\n"
        "{context}"
    )
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    # Retrieve with the reformulated question instead of the raw input, so the
    # contextualization prompt above is actually applied.
    history_aware_retriever = create_history_aware_retriever(
        llm, vectorstore.as_retriever(), contextualize_q_prompt
    )

    # Chain that combines the retrieved documents and answers the question.
    document_chain = create_stuff_documents_chain(llm, qa_prompt)

    # Full pipeline: history-aware retrieval followed by answer generation.
    retrieval_chain = create_retrieval_chain(history_aware_retriever, document_chain)

    print("RAG chain setup complete.")
    return retrieval_chain

# --- 5. Function to get RAG response with chat history ---
async def get_rag_response_with_history(rag_chain, question: str, chat_history: List[Dict]):
    """
    Queries the RAG chain with the question and the earlier conversation turns.

    Args:
        rag_chain: Object whose awaitable ``ainvoke`` returns a mapping with
            the keys "answer" and "context".
        question (str): The user's current question.
        chat_history (List[Dict]): Earlier turns as dicts with a "role" key
            ("user" or "ai") and a "content" key. Entries with any other
            role are ignored.

    Returns:
        tuple: The generated answer and the retrieved documents.
    """
    def to_message(turn):
        # Map one history dict onto the matching LangChain message type;
        # unknown roles yield None and are filtered out below.
        role = turn["role"]
        if role == "user":
            return HumanMessage(content=turn["content"])
        if role == "ai":
            return AIMessage(content=turn["content"])
        return None

    history_messages = [msg for msg in map(to_message, chat_history) if msg is not None]

    payload = {"input": question, "chat_history": history_messages}
    result = await rag_chain.ainvoke(payload)
    return result["answer"], result["context"]
# --- Main Execution Block ---
# Builds (or reloads) the vector store and the RAG chain. After this section
# `rag_chain` is either a ready-to-use chain or None when setup failed.
pdf_file_path = "/content/HSC26-Bangla1st-Paper.pdf"
chroma_db_path = "./chroma_db_hsc_bangla"

# Check the PDF once; the result is reused when the knowledge base must be built.
pdf_available = os.path.exists(pdf_file_path)
if not pdf_available:
    print(f"Error: PDF file not found at '{pdf_file_path}'")
    # Name the file that pdf_file_path actually points to, so the hint
    # cannot drift away from the configured path.
    print(f"Please upload '{os.path.basename(pdf_file_path)}' to your Colab environment.")
else:
    print(f"PDF file '{pdf_file_path}' found.")

# Initialize RAG components
rag_chain = None
vector_db = None

# Reuse a previously persisted database when its directory exists and is
# non-empty; otherwise build it from the PDF.
if os.path.isdir(chroma_db_path) and os.listdir(chroma_db_path):
    print(f"Loading existing ChromaDB from {chroma_db_path}...")
    # Must be the same embedding model that was used when the store was created.
    embeddings_model = SentenceTransformerEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
    vector_db = Chroma(persist_directory=chroma_db_path, embedding_function=embeddings_model)
    print("ChromaDB loaded.")
else:
    print("ChromaDB not found or empty. Creating new knowledge base...")
    if pdf_available:
        corpus_text = extract_text_from_pdf(pdf_file_path)
        cleaned_corpus_text = clean_text(corpus_text)
        if cleaned_corpus_text.strip():
            vector_db = create_and_vectorize_knowledge_base(cleaned_corpus_text, db_path=chroma_db_path)
            print("Knowledge base created and vectorized.")
        else:
            print("Extracted and cleaned text is empty. Cannot create knowledge base.")
    else:
        print("Cannot create knowledge base: PDF file not found.")

if vector_db:
    rag_chain = setup_rag_chain(vector_db)
    print("RAG system initialized successfully. You can now ask questions!")
else:
    print("RAG system could not be initialized. Please check the PDF file and API key.")

# --- Interactive Query Loop ---
if rag_chain:
    chat_history = []
    print("\n--- Start Chatting ---")
    print("Type 'exit' or 'quit' to end the conversation.")

    while True:
        user_question = input("\nYour Question (English/Bengali): ")
        if user_question.lower() in ["exit", "quit"]:
            print("Exiting chat. Goodbye!")
            break

        try:
            # Get response from RAG chain
            answer, retrieved_docs = await get_rag_response_with_history(rag_chain, user_question, chat_history)

            print(f"\nAI Answer: {answer}")



            # Update chat history (short-term memory)
            chat_history.append({"role": "user", "content": user_question})
            chat_history.append({"role": "ai", "content": answer})
            # Keep only the last few turns (e.g., 4 messages: 2 user, 2 AI)
            chat_history = chat_history[-4:]

        except Exception as e:
            print(f"An error occurred during query: {e}")
            print("Please ensure your Google API Key is valid and the PDF content is suitable.")
