In [None]:
%pip install langchain-community sentence-transformers gradio python-dotenv google-generativeai langchain-google-genai chromadb firebase-admin langchain



In [None]:
# Import statements
import os
from dotenv import load_dotenv
import firebase_admin
from firebase_admin import credentials, firestore
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_core.messages import HumanMessage, AIMessage
import gradio as gr
import shutil
import traceback
import datetime
import re
from typing import List, Dict, Any, Optional, Tuple

# --- Configuration ---
# Module-level setup. These statements run once when the cell/module executes
# and log every resolved setting so start-up problems can be read from the output.
print("[INFO] --- Configuration ---")
print("[INFO] Loading environment variables...")
try:
    # python-dotenv entry point; presumably reads a local .env file into os.environ.
    load_dotenv()
    print("[INFO] Environment variables loaded.")
except Exception as e:
    # Best effort: the failure is logged and execution continues, so the
    # os.getenv() defaults below still apply.
    print(f"[ERROR] Failed to load environment variables: {e}")
    traceback.print_exc()

# --- Google Drive Configuration for Persistence ---
# Directory handed to Chroma as persist_directory by setup_vector_store().
# NOTE(review): the path is under /content/drive, which presumably requires
# Google Drive to be mounted in Colab before use -- confirm for local runs.
DRIVE_CHROMA_DIR = "/content/drive/MyDrive/chroma_dbs/alumni_chatbot_gemini_firestore"
VECTOR_DB_PATH = DRIVE_CHROMA_DIR
print(f"[INFO] Configured Chroma DB persistence path: {VECTOR_DB_PATH}")

# --- Cloud Database Configuration ---
# Service-account key file; initialize_database() refuses to connect when it is missing.
FIRESTORE_KEY_PATH = '/content/firestore_key.json' # Ensure this path is correct in Colab or local setup
print(f"[INFO] Configured Firestore key path (needed for rebuild): {FIRESTORE_KEY_PATH}")

# Collection streamed by load_data_from_firestore(); overridable via environment.
FIRESTORE_COLLECTION_NAME = os.getenv("FIRESTORE_COLLECTION_NAME", "alumni_profiles")
print(f"[INFO] Configured Firestore collection name (needed for rebuild): {FIRESTORE_COLLECTION_NAME}")

# --- Model Configuration ---
# Model identifiers passed to GoogleGenerativeAIEmbeddings / ChatGoogleGenerativeAI.
EMBEDDING_MODEL = "models/embedding-001"
LLM_MODEL = os.getenv("LLM_MODEL", "gemini-1.5-flash") # Allow overriding LLM model via env var
print(f"[INFO] Configured Embedding Model: {EMBEDDING_MODEL}")
print(f"[INFO] Configured LLM Model: {LLM_MODEL}")
print("[INFO] --- End Configuration ---")

# --- Global Variables ---
# Shared state, all None until the matching initializer succeeds:
#   vector_store   <- setup_vector_store()
#   rag_chain      <- setup_rag_chain()
#   embeddings     <- initialize_embeddings()
#   llm            <- initialize_llm()
#   firestore_db   <- initialize_database()
#   google_api_key <- load_google_api_key()
vector_store = None
rag_chain = None
embeddings = None
llm = None
firestore_db = None
google_api_key = None

# --- Helper Functions ---
def load_google_api_key():
    """Load the Google API key from Colab Secrets, falling back to the environment.

    Lookup order:
      1. ``google.colab.userdata.get('GOOGLE_API_KEY')`` when running in Colab.
      2. The ``GOOGLE_API_KEY`` environment variable.

    The module-level ``google_api_key`` is updated to the value returned.

    Returns:
        The API key string, or ``None`` when no key could be found or an
        unexpected error occurred (the error is logged, never raised).
    """
    global google_api_key
    print("[INFO] Attempting to load Google API Key...")
    try:
        # Try Colab Secrets first
        try:
            from google.colab import userdata
        except ImportError:
            print("⚠️ Not running in Google Colab or `userdata` not available.")
        else:
            try:
                api_key = userdata.get('GOOGLE_API_KEY')
            except Exception as secret_error:
                # userdata.get() raises (rather than returning None) when the
                # secret does not exist or notebook access was not granted.
                # Treat that as "not available here" so the environment
                # variable fallback below still gets its chance.
                print(f"⚠️ Could not read 'GOOGLE_API_KEY' from Colab Secrets: {secret_error}")
            else:
                if api_key:
                    print("✅ Google API Key loaded from Colab Secrets.")
                    google_api_key = api_key
                    return api_key
                print("⚠️ Colab Secrets available, but 'GOOGLE_API_KEY' not found.")

        # Fallback to environment variable
        api_key = os.getenv("GOOGLE_API_KEY")
        if api_key:
            print("✅ Google API Key loaded from environment variable.")
            google_api_key = api_key
            return api_key

        print("❌ Google API Key not found in environment variables or Colab Secrets. Please set GOOGLE_API_KEY.")
        google_api_key = None
        return None

    except Exception as e:
        print(f"❌ Error loading API key: {e}")
        traceback.print_exc()
        google_api_key = None
        return None

def initialize_database():
    """Connect to Firestore and publish the client in the global ``firestore_db``.

    The Firebase Admin app is initialized from ``FIRESTORE_KEY_PATH`` unless an
    app is already registered. Every failure path logs the problem and leaves
    ``firestore_db`` set to ``None``; nothing is raised to the caller.
    """
    global firestore_db
    print("[INFO] Attempting to initialize Firebase Admin and connect to Firestore...")
    try:
        key_file_present = os.path.exists(FIRESTORE_KEY_PATH)
        if not key_file_present:
            print(f"❌ Firestore key file not found at {FIRESTORE_KEY_PATH}. Cannot initialize database.")
            firestore_db = None
            return

        if firebase_admin._apps:
            # An app registered earlier (e.g. a re-run cell) is reused as is.
            print("✅ Firebase app already initialized.")
        else:
            print("[INFO] Firebase app not initialized. Proceeding with initialization.")
            try:
                service_account = credentials.Certificate(FIRESTORE_KEY_PATH)
                firebase_admin.initialize_app(service_account)
                print("✅ Firebase Admin initialized.")
            except Exception as app_error:
                print(f"❌ Failed to initialize Firebase Admin app: {app_error}")
                traceback.print_exc()
                firestore_db = None
                return

        firestore_db = firestore.client()
        print("✅ Firestore client obtained.")

    except Exception as error:
        print(f"❌ Failed to initialize Firebase Admin or Firestore client: {error}")
        traceback.print_exc()
        firestore_db = None

def initialize_embeddings():
    """Build the embedding client and store it in the global ``embeddings``.

    The Google API key is loaded on demand through ``load_google_api_key()``
    when it has not been loaded yet. If no key is available, or the client
    cannot be constructed, ``embeddings`` is set to ``None`` and the reason
    is logged.
    """
    global embeddings, google_api_key
    print(f"[INFO] Attempting to initialize embeddings model: {EMBEDDING_MODEL}")
    try:
        if google_api_key is None:
            print("[INFO] API key not loaded yet, attempting to load...")
            google_api_key = load_google_api_key()

        # Still None here means the on-demand load above came back empty.
        if google_api_key is None:
            print("❌ Cannot initialize embeddings: Google API Key not available.")
            embeddings = None
            return

        client_settings = {
            "model": EMBEDDING_MODEL,
            "google_api_key": google_api_key,
        }
        embeddings = GoogleGenerativeAIEmbeddings(**client_settings)
        print(f"✅ Embeddings model ({EMBEDDING_MODEL}) initialized successfully.")
    except Exception as error:
        print(f"❌ Failed to initialize embeddings model: {error}")
        traceback.print_exc()
        embeddings = None

def initialize_llm():
    """Build the chat model, smoke-test it, and store it in the global ``llm``.

    The Google API key is loaded on demand when missing. After construction a
    single ``invoke("Hello")`` call verifies that the key, billing and quota
    actually work; if that call fails the model is discarded. On every failure
    path ``llm`` ends up ``None`` and the reason is logged.
    """
    global llm, google_api_key
    print(f"[INFO] Attempting to initialize LLM: {LLM_MODEL}")
    try:
        if google_api_key is None:
            print("[INFO] API key not loaded yet, attempting to load...")
            google_api_key = load_google_api_key()

        # Still None here means the on-demand load above came back empty.
        if google_api_key is None:
            print("❌ Cannot initialize LLM: Google API Key not available.")
            llm = None
            return

        # convert_system_message_to_human is intentionally not passed: it
        # triggers a deprecation warning in the client library.
        model_settings = {
            "model": LLM_MODEL,
            "google_api_key": google_api_key,
            "temperature": 0.2,
        }
        llm = ChatGoogleGenerativeAI(**model_settings)

        # Smoke test: one cheap round trip to surface credential/quota problems now.
        try:
            print("[INFO] Testing LLM connection with a simple invoke...")
            llm.invoke("Hello")
            print("✅ LLM test invoke successful.")
            print(f"✅ LLM ({LLM_MODEL}) initialized successfully.")
        except Exception as probe_error:
            print(f"❌ Failed LLM test invoke. This might indicate an API key issue, billing issue, or quota limit: {probe_error}")
            traceback.print_exc()
            llm = None

    except Exception as error:
        print(f"❌ Error during LLM initialization: {error}")
        traceback.print_exc()
        llm = None

# --- Enhanced Text Processing ---
def chunk_text(text: str) -> "List[Document]":
    """Split *text* into overlapping chunks wrapped as ``Document`` objects.

    Args:
        text: Raw profile text. Anything that is not a non-blank string is
            rejected.

    Returns:
        The list of chunk documents, or an empty list when the input is
        empty, blank, not a string, or when splitting fails (the error is
        logged, never raised).
    """
    # Validate first: logging len(text) before this check raised TypeError for
    # None / non-sized input instead of taking the intended "return []" path.
    if not isinstance(text, str) or not text.strip():
        print("[WARN] Cannot chunk empty or non-string text.")
        return []

    print(f"[INFO] Chunking text (length: {len(text)})...")
    try:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,  # Smaller chunks for better precision
            chunk_overlap=200,  # More overlap for context
            separators=["\n\n", "\n", ". ", " ", ""]  # Coarse-to-fine split points
        )
        # The splitter operates on Document objects, so wrap the raw text.
        chunks = splitter.split_documents([Document(page_content=text)])
        print(f"✅ Created {len(chunks)} chunks.")
        return chunks
    except Exception as e:
        print(f"❌ Error during text chunking: {e}")
        traceback.print_exc()
        return []

def extract_education_info(education_text: str) -> Tuple[Optional[int], List[str]]:
    """Parse degrees and the graduation year out of a free-text education field.

    Dated entries are expected in the form
    ``<degree text> (YYYY-MM-DD - YYYY-MM-DD)``; the second date is taken as
    the completion date. When no dated entry is present, the largest
    stand-alone four-digit number in the text is used as the graduation year.

    Args:
        education_text: Raw education field. Non-string input is rejected.

    Returns:
        ``(graduation_year, degrees)`` where ``graduation_year`` is the latest
        completion year (or ``None`` if none was found) and ``degrees`` is a
        list of ``"<degree> (completed <year>)"`` strings, one per dated entry.
    """
    if not isinstance(education_text, str):
        print("[WARN] Education text is not a string.")
        return None, []

    # group 1: text before the parenthesis, group 2: start year, group 3: end year
    degree_pattern = r"([^()]+)\s*\((?:[^()]*?)?\b(\d{4})-\d{2}-\d{2}\b\s*-\s*\b(\d{4})-\d{2}-\d{2}\b(?:[^()]*?)?\)"

    degrees: List[str] = []
    end_years: List[int] = []
    for raw_name, _start_year, end_year_str in re.findall(degree_pattern, education_text):
        # The group is exactly four digits, so int() cannot fail here.
        end_year = int(end_year_str)
        end_years.append(end_year)

        # For every entry after the first, group 1 starts right after the
        # previous ")" and therefore drags the list separator along
        # (", MSc ..."). Strip separators as well as whitespace.
        degree_name = raw_name.strip(" \t\r\n,;|")
        if degree_name:
            degrees.append(f"{degree_name} (completed {end_year})")

    if end_years:
        # Most recent completion year among the dated entries.
        return max(end_years), degrees

    # No dated entries: fall back to the latest bare four-digit number.
    year_matches = re.findall(r"\b(\d{4})\b", education_text)
    if not year_matches:
        return None, degrees

    graduation_year = max(int(y) for y in year_matches)
    print(f"[INFO] Found years via simple pattern, assuming latest ({graduation_year}) is grad year.")
    return graduation_year, degrees

# --- Data Loading with Enhanced Processing ---
def _build_alumni_document(alumni_id: str, doc_data: Dict[str, Any]) -> Optional[Document]:
    """Turn one Firestore alumni record into a Document, or None if it has no text."""
    profile_text_parts = []
    metadata = {
        'alumni_id': alumni_id,
        'name': doc_data.get('Name', 'Unknown Alumni'),
        'Location': doc_data.get('Location', ''),
        'Email': doc_data.get('Email', ''),
        'Phone': doc_data.get('Phone', ''),
        'major': doc_data.get('major', ''),
        'Skills': doc_data.get('Skills', []),
        'WorkExperience': doc_data.get('WorkExperience', ''),
        'Education': doc_data.get('Education', '')  # Original education value, unparsed
    }

    # Education: prefer parsed degrees, otherwise fall back to the raw value.
    education_text = doc_data.get('Education', '')
    grad_year, degrees = extract_education_info(education_text)
    if grad_year:
        metadata['graduation_year_int'] = grad_year  # Stored as int for range filtering
    if degrees:
        metadata['degrees'] = degrees
        profile_text_parts.append("Education: " + "; ".join(degrees))
    elif education_text:
        # Formatting (not "+") so a non-string value such as a list or map is
        # rendered instead of raising TypeError.
        profile_text_parts.append(f"Education: {education_text}")

    # Skills: accepted as a list or as a comma-separated string; the metadata
    # entry always ends up as a list.
    skills_value = doc_data.get('Skills', '')
    if isinstance(skills_value, list):
        metadata['Skills'] = skills_value
        if skills_value:
            profile_text_parts.append(f"Skills: {', '.join(map(str, skills_value))}")
    elif isinstance(skills_value, str) and skills_value.strip():
        metadata['Skills'] = [s.strip() for s in skills_value.split(',') if s.strip()]
        profile_text_parts.append(f"Skills: {skills_value}")
    else:
        metadata['Skills'] = []

    # Remaining scalar fields ('Skills' and 'Education' are handled above).
    for field in ['Name', 'Email', 'Location', 'Phone', 'WorkExperience', 'major']:
        value = doc_data.get(field)
        if not value:
            continue
        if not isinstance(value, (str, int, float)):
            print(f"[WARN] Skipping field '{field}' for ID {alumni_id} due to unexpected type: {type(value)}")
            continue
        if isinstance(value, str) and not value.strip():
            continue  # Whitespace-only strings carry no content
        profile_text_parts.append(f"{field}: {value}")

    profile_text = "\n".join(profile_text_parts)
    if not profile_text.strip():
        return None
    return Document(page_content=profile_text, metadata=metadata)


def load_data_from_firestore() -> List[Document]:
    """Stream the configured Firestore collection and build one Document per alumnus.

    Each record is converted independently: a record that cannot be processed
    is logged and skipped instead of aborting the whole load.

    Returns:
        The list of profile documents. Empty when the Firestore client is not
        initialized, when no record yields any text, or when reading the
        collection itself fails (the error is logged, never raised).
    """
    print(f"[INFO] Loading data from Firestore collection: {FIRESTORE_COLLECTION_NAME}")
    if firestore_db is None:
        print("❌ Cannot load data from Firestore: Database connection not initialized.")
        return []

    documents = []
    try:
        collection_ref = firestore_db.collection(FIRESTORE_COLLECTION_NAME)
        docs = collection_ref.stream()
        doc_count = 0

        print("[INFO] Streaming documents from Firestore...")
        for doc in docs:
            doc_count += 1
            alumni_id = doc.id
            try:
                # "or {}" guards against a snapshot without data.
                document = _build_alumni_document(alumni_id, doc.to_dict() or {})
            except Exception as doc_error:
                # One malformed record must not discard every other profile.
                print(f"[WARN] Skipping document with ID {alumni_id}: failed to process record: {doc_error}")
                traceback.print_exc()
                continue

            if document is None:
                print(f"[WARN] Skipping document with ID {alumni_id}: No relevant text fields found after processing.")
                continue
            documents.append(document)

        print(f"[INFO] Finished streaming documents. Total documents read from Firestore: {doc_count}")
        print(f"✅ Loaded {len(documents)} documents for processing (Processed count: {len(documents)}).")
        return documents

    except Exception as e:
        print(f"❌ Error loading data from Firestore: {e}")
        traceback.print_exc()
        return []

# --- Enhanced Vector Store Setup ---
def _to_chroma_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *metadata* reduced to scalar values (str/int/float/bool)."""
    cleaned: Dict[str, Any] = {}
    for key, value in metadata.items():
        if value is None:
            continue  # No scalar representation worth storing
        if isinstance(value, (str, int, float, bool)):
            cleaned[key] = value
        elif isinstance(value, (list, tuple)):
            # e.g. Skills / degrees: keep the information as a readable string.
            cleaned[key] = ", ".join(str(item) for item in value)
        else:
            cleaned[key] = str(value)
    return cleaned


def setup_vector_store(force_rebuild: bool = False):
    """Load the persisted Chroma store, or (re)build it from Firestore.

    The result is published in the global ``vector_store``; it is ``None``
    whenever a build was attempted and failed.

    Args:
        force_rebuild: Skip the attempt to load an existing store and always
            rebuild from Firestore data.

    Notes:
        * Each chunk carries its parent document's metadata plus
          ``chunk_index`` (position of the chunk within that document) and
          ``content_length``.
        * Metadata is flattened to scalar values before insertion; list
          values such as ``Skills`` are stored as comma-separated strings.
          NOTE(review): metadata filters written against list values need to
          account for this.
    """
    global vector_store
    print("[INFO] Setting up vector store...")
    if embeddings is None:
        print("❌ Cannot setup vector store: Embeddings not initialized.")
        return

    os.makedirs(VECTOR_DB_PATH, exist_ok=True)
    print(f"[INFO] Ensuring persistence directory exists: {VECTOR_DB_PATH}")

    # Try loading existing store
    if not force_rebuild:
        # The directory exists at this point; only its emptiness matters.
        # The context manager closes the scandir handle deterministically.
        with os.scandir(VECTOR_DB_PATH) as entries:
            has_persisted_files = any(entries)

        if has_persisted_files:
            print(f"[INFO] Attempting to load existing vector store from {VECTOR_DB_PATH}...")
            try:
                vector_store = Chroma(
                    persist_directory=VECTOR_DB_PATH,
                    embedding_function=embeddings
                )
                # Health check: a real query proves the collection is readable
                # and embeddings work; the count proves it is not empty.
                try:
                    vector_store.similarity_search("test query", k=1)
                    count = vector_store._collection.count() if hasattr(vector_store, '_collection') else -1
                    if count > 0:
                        print(f"✅ Vector store loaded successfully with {count} documents.")
                        return
                    print("⚠️ Existing vector store found, but it appears empty or unhealthy. Rebuilding...")
                except Exception as test_e:
                    print(f"❌ Existing vector store failed health check: {test_e}. Rebuilding...")

            except Exception as e:
                print(f"❌ Error loading vector store from {VECTOR_DB_PATH}: {e}")
                traceback.print_exc()
                print("⚠️ Loading failed. Attempting to rebuild vector store.")
        else:
            print(f"[INFO] No existing vector store found at {VECTOR_DB_PATH} or directory is empty. Building new one.")

    # Build new store
    print("[INFO] Building new vector store from Firestore data...")
    if firestore_db is None:
        print("❌ Cannot build new vector store: Firestore connection not initialized.")
        vector_store = None
        return

    alumni_documents = load_data_from_firestore()
    if not alumni_documents:
        print("⚠️ No documents loaded from Firestore to build vector store.")
        vector_store = None
        return

    all_chunks = []
    print(f"[INFO] Chunking {len(alumni_documents)} documents...")
    for doc in alumni_documents:
        chunks = chunk_text(doc.page_content)
        if not chunks:
            print(f"[WARN] No chunks created for document ID: {doc.metadata.get('alumni_id')}")
            continue

        base_metadata = _to_chroma_metadata(doc.metadata)
        # chunk_index must count chunks *within* this document: together with
        # alumni_id it identifies a chunk, so reusing the document's position
        # for every chunk made all chunks of one alumnus look identical.
        for chunk_index, chunk in enumerate(chunks):
            chunk.metadata = {
                **base_metadata,  # Fresh dict per chunk; nothing is shared
                'chunk_index': chunk_index,
                'content_length': len(chunk.page_content)
            }
        all_chunks.extend(chunks)

    if not all_chunks:
        print("⚠️ No chunks created from documents. Vector store will be empty.")
        vector_store = None
        return

    print(f"[INFO] Adding {len(all_chunks)} chunks to Chroma DB...")
    try:
        # Start from a clean directory so stale data cannot mix with the rebuild.
        if os.path.exists(VECTOR_DB_PATH):
            print(f"[INFO] Clearing existing directory before rebuild: {VECTOR_DB_PATH}")
            # ignore_errors=True: best effort in case of permission issues
            shutil.rmtree(VECTOR_DB_PATH, ignore_errors=True)
            os.makedirs(VECTOR_DB_PATH, exist_ok=True)

        vector_store = Chroma.from_documents(
            documents=all_chunks,
            embedding=embeddings,
            persist_directory=VECTOR_DB_PATH
        )
        # Explicit persist where the wrapper still offers it; the guard keeps
        # a successfully built store usable if the method is absent.
        if hasattr(vector_store, "persist"):
            vector_store.persist()
        print(f"✅ Vector store created and persisted with {len(all_chunks)} chunks.")
    except Exception as e:
        print(f"❌ Error building vector store: {e}")
        traceback.print_exc()
        vector_store = None
# --- Query Expansion and Enhancement ---
def expand_query(original_query: str) -> List[str]:
    """Generate query variations to widen vector-store retrieval.

    The original query always comes first. It is followed by three variations
    per recognised topic keyword, three per detected person name (two or more
    consecutive capitalised words), and two generic rephrasings. Keywords and
    names are processed in order of first appearance, so the result is
    deterministic for a given query.

    Args:
        original_query: The (already cleaned) question text.

    Returns:
        A duplicate-free list of query strings, original query first.
    """
    print(f"[INFO] Expanding query for RAG retrieval: '{original_query}'")
    variations = [original_query]

    # Lookarounds instead of \b: a trailing \b can never match after "c++"
    # followed by a space or punctuation, because "+" is not a word character.
    keyword_pattern = r'(?<!\w)(python|java|c\+\+|machine learning|data science|computer science|electrical engineering|business|finance|healthcare|tech|cloud computing|renewable energy|embedded systems|skills|experience|education|work|profile|contact|location|major)(?!\w)'
    keywords = re.findall(keyword_pattern, original_query, re.IGNORECASE)
    names = re.findall(r'\b(?:Mr\.|Ms\.|Dr\.)?\s*([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)\b', original_query)  # Basic name detection

    # Order-preserving, case-insensitive de-duplication ("Python" == "python");
    # the spelling of the first occurrence is kept.
    unique_keywords = {}
    for keyword in keywords:
        unique_keywords.setdefault(keyword.casefold(), keyword)

    for keyword in unique_keywords.values():
        variations.extend([
            f"Alumni with expertise in {keyword}",
            f"Profiles mentioning {keyword}",
            f"Information about {keyword} skills/experience"
        ])

    for name in dict.fromkeys(names):  # Order-preserving de-duplication
        variations.extend([
            f"Profile details for {name}",
            f"Information about {name}'s background",
            f"{name}'s skills and experience"
        ])

    # General variations
    variations.extend([
        f"Search results for {original_query}",
        f"Relevant information for {original_query}"
    ])

    # Drop duplicates while keeping first-seen order.
    return list(dict.fromkeys(variations))

# --- Core Retrieval Logic (used by the wrapper) ---
def perform_filtered_retrieval(query: str, metadata_filter: Optional[Dict] = None, k: int = 10) -> List[Document]:
    """Run one similarity search against the global vector store.

    Args:
        query: Text to search for.
        metadata_filter: Optional metadata filter forwarded to the store.
        k: Maximum number of documents to return for this query.

    Returns:
        The matching documents, or an empty list when the store is not
        initialized or the search raises (the error is logged).
    """
    print(f"[INFO] Searching vector store with query: '{query}' and filter: {metadata_filter}")

    store_ready = vector_store is not None
    if not store_ready:
        print("❌ Search failed: Vector store not initialized.")
        return []

    try:
        # k applies per call; callers that fan out over several query
        # variations combine and trim the results themselves.
        return vector_store.similarity_search(query, k=k, filter=metadata_filter)
    except Exception as search_error:
        print(f"❌ Error during similarity search: {search_error}")
        traceback.print_exc()
        return []

# --- Custom Retriever Class for Langchain ---
from langchain.schema.retriever import BaseRetriever
from typing import List
from langchain_core.callbacks import CallbackManagerForRetrieverRun

class CustomFilteredRetriever(BaseRetriever):
    """Retriever that fans a query out into variations and merges the results.

    For every variation produced by ``expand_query`` a filtered similarity
    search is run through ``perform_filtered_retrieval``; the unique chunks
    are then ordered by ``content_length`` (longest first) and cut to ``k``.
    """

    # Store supplied at construction. NOTE(review): searches go through
    # perform_filtered_retrieval(), which reads the module-level vector_store,
    # so this field is not consulted by the code below.
    vectorstore: Chroma
    k: int = 10 # Number of documents to return overall
    k_per_query: int = 5 # Number of documents to retrieve per expanded query variation

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun, filter: Optional[Dict] = None
    ) -> List[Document]:
        """Retrieve, de-duplicate and rank documents for *query*.

        Args:
            query: Question text received from the chain.
            run_manager: Callback manager supplied by LangChain (unused here).
            filter: Optional metadata filter applied to every search.

        Returns:
            Up to ``self.k`` unique chunks, longest content first; an empty
            list when nothing was retrieved.
        """
        print(f"[INFO] CustomFilteredRetriever called with query: '{query}' and filter: {filter}")

        expanded_queries = expand_query(query)

        all_retrieved_docs = []
        seen_chunk_keys = set()

        for expanded_q in expanded_queries:
            for doc in perform_filtered_retrieval(expanded_q, filter, k=self.k_per_query):
                # The chunk text is part of the key: alumni_id + chunk_index
                # alone is not unique when a store holds the same chunk_index
                # for several chunks of one alumnus, which would silently drop
                # every chunk but the first. str() keeps a non-string id from
                # raising.
                chunk_key = (
                    str(doc.metadata.get('alumni_id', 'unknown')),
                    doc.metadata.get('chunk_index', -1),
                    doc.page_content,
                )
                if chunk_key in seen_chunk_keys:
                    continue
                seen_chunk_keys.add(chunk_key)
                all_retrieved_docs.append(doc)

        print(f"[INFO] CustomFilteredRetriever collected {len(all_retrieved_docs)} unique documents from {len(expanded_queries)} expanded queries.")

        if not all_retrieved_docs:
            print("⚠️ CustomFilteredRetriever returned no documents.")
            return []

        # Re-rank by content length (simple proxy for level of detail).
        final_reranked_docs = sorted(
            all_retrieved_docs,
            key=lambda x: x.metadata.get('content_length', 0),
            reverse=True
        )
        print(f"✅ CustomFilteredRetriever returning top {min(self.k, len(final_reranked_docs))} documents after combining and re-ranking.")
        return final_reranked_docs[:self.k]

# --- Enhanced RAG Chain Setup ---
def setup_rag_chain():
    """Assemble the conversational RAG chain and store it in the global ``rag_chain``.

    Wires together the custom retriever, a conversation buffer memory and the
    answer prompt (which embeds today's date at build time). Requires the
    vector store and the LLM to be initialized; otherwise it logs and returns
    without touching ``rag_chain``. If assembly raises, ``rag_chain`` is set
    to ``None``.
    """
    global rag_chain
    print("[INFO] Setting up RAG chain...")

    components_ready = vector_store is not None and llm is not None
    if not components_ready:
        print("❌ Cannot set up RAG chain: Required components (Vector Store or LLM) not initialized.")
        return

    try:
        # k: documents handed to the LLM overall; k_per_query: documents
        # fetched for each expanded query variation before merging.
        retriever_settings = {"vectorstore": vector_store, "k": 10, "k_per_query": 5}
        alumni_retriever = CustomFilteredRetriever(**retriever_settings)
        print("✅ Custom filtered retriever created.")

        # output_key tells the memory which chain output to record, since the
        # chain also returns source documents.
        chat_memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )
        print("✅ Conversation buffer memory initialized.")

        current_date = datetime.date.today().strftime("%Y-%m-%d")
        print(f"[INFO] Using current date for prompt: {current_date}")

        # The date is interpolated now; the doubled braces survive as
        # single-brace placeholders for PromptTemplate to fill per request.
        template = f"""You are an intelligent Alumni Information Assistant. Current date: {current_date}. Provide answers based ONLY on the provided context documents.

When asked about skills:
1. ONLY consider explicit mentions in 'Skills' or 'WorkExperience' fields from the context documents.
2. For "skills of [name]", list ONLY what's explicitly mentioned in the retrieved documents for that person.
3. If no relevant documents are retrieved or no skills are explicitly mentioned in the context for the person, state that information is not available or no specific skills were mentioned.

For education:
1. List ALL degrees and completion dates explicitly stated in the context.
2. Do NOT infer degrees or dates not present in the context.

For work experience:
1. Summarize the work experience as described in the context. Note date ranges if present, relative to {current_date}.
2. Do **not** append any “(Information might be outdated)” tags.

For contact information (Email, Phone, Location):
1. Only provide contact information if it is explicitly present in the context documents.
2. If requested information is not in the context, state that it is not available.

General Answer Guidelines:
- Be precise, factual, and directly answer the user's question using ONLY the provided context.
- If the answer is not in the context, politely state that you cannot find the information. Do not make up answers.
- Use bullet points or clear formatting for lists (like skills or degrees).
- Avoid conversational filler outside of the initial greeting and inability to find information.
- Do not mention the source documents or retrieval process in the final answer.
- If a query involves criteria (like graduation year or location) that were used to filter the search results, assume the context provided already meets those criteria, and answer based on the *content* within that filtered context. Do not re-verify the filter criteria from the document content itself unless explicitly asked to list all criteria.

Chat History:
{{chat_history}}

Context:
{{context}}

Question: {{question}}

Answer:"""

        answer_prompt = PromptTemplate(
            template=template,
            input_variables=["chat_history", "context", "question"],
        )
        print("✅ Prompt template created.")

        # return_source_documents=True keeps the retrieved chunks in the
        # result, which is useful for debugging.
        rag_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=alumni_retriever,
            memory=chat_memory,
            combine_docs_chain_kwargs={"prompt": answer_prompt},
            return_source_documents=True,
        )
        print("✅ Enhanced RAG chain setup complete.")

    except Exception as error:
        print(f"❌ Error setting up RAG chain: {error}")
        traceback.print_exc()
        rag_chain = None

# --- Enhanced Query Processing ---
# Terminator shared by every filter pattern. A filter value ends either at a
# standalone " and " / " or " connector (consumed, so it is also stripped from
# the question) or at the end of the query, ignoring trailing punctuation.
# The connector must be a whole word surrounded by whitespace: allowing it to
# match without whitespace truncates values at an embedded "or"/"and"
# ("California" -> "Calif", "New York" -> "New Y").
_FILTER_END = r"(?:\s+(?:and|or)\b\s*|[\s.,;:!?]*$)"

# (metadata field, operator, regex). The single capture group is the value.
# The leading \b keeps trigger phrases from matching inside other words
# (e.g. "is in" inside "this in").
_FILTER_PATTERNS = [
    ("major", "$eq", r"\b(?:major is|majored in)\s+(.+?)" + _FILTER_END),
    ("Location", "$eq", r"\b(?:live in|located in|is in|from)\s+(.+?)" + _FILTER_END),
    ("graduation_year_int", "$lt", r"\bgraduated before\s+(\d{4})" + _FILTER_END),
    ("graduation_year_int", "$gt", r"\bgraduated after\s+(\d{4})" + _FILTER_END),
    # Skills accept a comma-separated list, e.g. "with skills python, java, cloud".
    ("Skills", "$in", r"\b(?:with|has)\s+(?:skills?|experience in)\s+(.+?)" + _FILTER_END),
]

# Filler phrases the LLM tends to prepend; they are removed from the answer.
_ANSWER_BOILERPLATE = (
    r"(?i)based on the provided context,?",
    r"(?i)according to the context,?",
    r"(?i)based on the information provided,?",
    r"(?i)according to the profile,?",
)

# Canned phrases that are normalised to a full sentence. Each pattern also
# consumes an already-present " in the profiles" suffix and a trailing period
# so the replacement never produces "…in the profiles. in the profiles." or "..".
_ANSWER_CANNED = (
    (r"(?i)information not available(?: in the profiles)?\.?",
     "Information not available in the profiles."),
    (r"(?i)no specific skills mentioned(?: in the profiles)?\.?",
     "No specific skills mentioned in the profiles."),
)

_DEFAULT_ANSWER = "Sorry, I couldn't generate a response based on the available information."

# Lower-cased answers that carry no information on their own.
_UNINFORMATIVE_ANSWERS = (
    "",
    "sorry, i couldn't generate a response based on the available information.",
    "information not available in the profiles.",
    "no specific skills mentioned in the profiles.",
)


def _strip_spans(text: str, spans: List[Tuple[int, int]]) -> str:
    """Return *text* without the given (start, end) spans, whitespace-normalised.

    Spans may overlap or be nested; every character covered by any span is dropped.
    """
    kept = []
    cursor = 0
    for start, end in sorted(spans):
        if start > cursor:
            kept.append(text[cursor:start])
        cursor = max(cursor, end)
    kept.append(text[cursor:])
    return re.sub(r"\s+", " ", " ".join(kept)).strip()


def _extract_metadata_filters(query: str) -> Tuple[Dict[str, Any], str]:
    """Pull metadata filters out of *query*.

    Returns:
        (metadata_filter, cleaned_question). ``metadata_filter`` maps a field to
        a plain value for ``$eq`` and to an operator dict for ``$lt``/``$gt``/
        ``$in``. ``cleaned_question`` is the query with the matched filter
        phrases removed; it may be empty if the query consisted only of filters.
    """
    metadata_filter: Dict[str, Any] = {}
    spans: List[Tuple[int, int]] = []

    for field, op, pattern in _FILTER_PATTERNS:
        for match in re.finditer(pattern, query, re.IGNORECASE):
            raw_value = match.group(1).strip()
            start, end = match.span()

            if op in ("$lt", "$gt"):
                # The pattern captured exactly four digits, so int() cannot fail.
                value: Any = int(raw_value)
                bucket = metadata_filter.get(field)
                if not isinstance(bucket, dict):
                    bucket = metadata_filter[field] = {}
                bucket[op] = value  # $lt and $gt on the same field form a range
            elif op == "$in":
                value = [item.strip() for item in raw_value.split(",") if item.strip()]
                if not value:
                    print(f"[WARN] Extracted empty skill list from '{raw_value}'. Skipping filter.")
                    continue
                bucket = metadata_filter.get(field)
                if not isinstance(bucket, dict):
                    bucket = metadata_filter[field] = {}
                collected = bucket.setdefault(op, [])
                for item in value:
                    if item not in collected:  # accumulate without duplicates
                        collected.append(item)
            else:  # "$eq": a later match for the same field overwrites the earlier one
                if not raw_value:
                    continue
                value = raw_value
                metadata_filter[field] = value

            spans.append((start, end))
            print(f"[FILTER] Identified filter: Field='{field}', Operator='{op}', Value='{value}' (Span: {start}-{end}, Matched Text: '{match.group(0)}')")

    return metadata_filter, _strip_spans(query, spans)


def _polish_answer(answer: str, has_sources: bool) -> str:
    """Strip LLM filler phrases from *answer* and normalise canned replies.

    If the result is empty, punctuation-only or one of the canned replies, and
    no source documents were retrieved, a generic "not found" message is
    returned instead.
    """
    for pattern in _ANSWER_BOILERPLATE:
        answer = re.sub(pattern, "", answer).strip()
    for pattern, replacement in _ANSWER_CANNED:
        answer = re.sub(pattern, replacement, answer).strip()

    uninformative = (
        not answer
        or answer.lower() in _UNINFORMATIVE_ANSWERS
        or re.fullmatch(r"[^\w\s]+", answer) is not None
    )
    if uninformative and not has_sources:
        print("[INFO] Answer is uninformative and no documents were retrieved. Returning default message.")
        return "Sorry, I could not find relevant information in the alumni profiles for your query."
    return answer


async def query_alumni_bot(user_query: str, history: List[Tuple[str, str]]):
    """Answer one chat turn: extract metadata filters, query the RAG chain, tidy the reply.

    Args:
        user_query: Raw text typed by the user.
        history: Chat history as (user, bot) pairs; ``None`` is treated as empty.

    Returns:
        A ``("", updated_history)`` tuple: the empty string clears the input
        textbox and ``updated_history`` is ``history`` plus the new exchange.
        For an empty/whitespace-only query the history is returned unchanged.
        Errors are reported as a bot message rather than raised.
    """
    print(f"\n[INFO] Received user query: '{user_query}'")
    history = list(history) if history else []

    if rag_chain is None:
        error_msg = "⚠️ Chatbot system not ready. Please check logs for initialization errors."
        print(error_msg)
        return "", history + [(user_query, error_msg)]

    if not user_query or not user_query.strip():
        print("[INFO] Empty or whitespace-only user query received.")
        return "", history

    original_query = user_query.strip()

    print(f"[INFO] Attempting to extract filters from query: '{original_query}'")
    try:
        metadata_filter, question = _extract_metadata_filters(original_query)
    except Exception as filter_e:
        # Filter extraction is best-effort: fall back to plain RAG on any failure.
        print(f"❌ Error processing filters for query '{original_query}': {filter_e}")
        traceback.print_exc()
        metadata_filter, question = {}, original_query

    if not question:
        print("[WARN] Question became empty after removing filters. Using original query for RAG.")
        question = original_query
    elif question.lower() == original_query.lower() and not metadata_filter:
        print("[INFO] No filters were successfully extracted or applied.")
    else:
        print(f"[INFO] Cleaned question for RAG: '{question}'")

    if metadata_filter:
        print(f"[INFO] Active metadata filters passed to retriever: {metadata_filter}")
    else:
        print("[INFO] No metadata filters were applied.")

    try:
        # NOTE(review): the filter is handed over via the retriever's
        # with_config(); whether the custom retriever actually receives it
        # depends on code outside this function — confirm against the
        # retriever implementation.
        result = rag_chain.invoke(
            {"question": question, "chat_history": history},
            config={
                "configurable": {
                    "retriever": rag_chain.retriever.with_config(
                        search_kwargs={"filter": metadata_filter}
                    )
                }
            },
        )
        print("✅ RAG chain invoke successful.")

        # `or` also covers an explicit None stored under the 'answer' key.
        answer = result.get("answer") or _DEFAULT_ANSWER
        source_documents = result.get("source_documents") or []
        print(f"[INFO] Received answer (first 100 chars): '{answer[:100]}'")
        print(f"[INFO] Retrieved {len(source_documents)} source documents.")

        answer = _polish_answer(answer, has_sources=bool(source_documents))
        return "", history + [(user_query, answer)]

    except Exception as e:
        # Top-level boundary for the UI callback: log and show a friendly message.
        print(f"❌ Query processing error: {e}")
        traceback.print_exc()
        error_msg = "⚠️ An internal error occurred while processing your query. Please try again or rephrase."
        return "", history + [(user_query, error_msg)]

# --- Main Execution ---
if __name__ == "__main__":
    # Script entry point: pick a Chroma persistence directory, initialise all
    # components in dependency order, then serve the Gradio chat UI.
    print("\n--- Starting Enhanced Alumni Chatbot ---")

    # Directory used whenever the Google Drive path cannot be used.
    LOCAL_CHROMA_DIR = "./chroma_db_local_firestore"
    # First bot message; also restored by the "Clear Chat" button.
    GREETING_TEXT = "Hello! I can help you find information about alumni. Ask me anything about their skills, education, or work experience."

    # Initialize environment (Colab Drive mount or local path)
    initial_vector_db_path = VECTOR_DB_PATH  # Store initial value
    try:
        print("[INFO] Checking environment...")
        # Colab is detected through these environment variables.
        if 'COLAB_GPU' in os.environ or 'GOOGLE_COLAB' in os.environ:
            print("[INFO] Running in Google Colab environment.")
            try:
                from google.colab import drive
                if not os.path.exists('/content/drive'):
                    print("[INFO] Attempting to mount Google Drive...")
                    drive.mount('/content/drive')
                    print("✅ Google Drive mounted.")
                else:
                    print("✅ Google Drive already mounted.")
                os.makedirs(VECTOR_DB_PATH, exist_ok=True)
                print(f"✅ Ensured Chroma DB path exists: {VECTOR_DB_PATH}")
            except ImportError:
                print("⚠️ `google.colab` not available despite environment indicators. Using local storage.")
                VECTOR_DB_PATH = LOCAL_CHROMA_DIR
                os.makedirs(VECTOR_DB_PATH, exist_ok=True)
                print(f"✅ Switched to local Chroma DB path: {VECTOR_DB_PATH}")
            except Exception as e:
                print(f"❌ Error during Google Drive mount: {e}")
                traceback.print_exc()
                print("⚠️ Drive mount failed. Falling back to local storage.")
                VECTOR_DB_PATH = LOCAL_CHROMA_DIR
                os.makedirs(VECTOR_DB_PATH, exist_ok=True)
                print(f"✅ Switched to local Chroma DB path: {VECTOR_DB_PATH}")
        else:
            print("[INFO] Not running in Google Colab. Using local storage for Chroma DB.")
            if initial_vector_db_path.startswith("/content/drive/"):
                # The configured default is a Drive path, which only exists in Colab.
                VECTOR_DB_PATH = LOCAL_CHROMA_DIR
                print(f"⚠️ Adjusted Chroma DB path from Drive path to local: {VECTOR_DB_PATH}")
            os.makedirs(VECTOR_DB_PATH, exist_ok=True)
            print(f"✅ Ensured local Chroma DB path exists: {VECTOR_DB_PATH}")

    except Exception as e:
        print(f"❌ Unexpected error during environment check: {e}")
        traceback.print_exc()
        print("⚠️ Proceeding with potential default local path.")
        if VECTOR_DB_PATH.startswith("/content/drive/"):
            VECTOR_DB_PATH = LOCAL_CHROMA_DIR
            print(f"⚠️ Adjusted Chroma DB path due to error: {VECTOR_DB_PATH}")
        os.makedirs(VECTOR_DB_PATH, exist_ok=True)
        print(f"✅ Ensured Chroma DB path exists: {VECTOR_DB_PATH}")

    # Initialize components
    print("\n[INFO] --- Initializing Components ---")
    load_google_api_key()
    initialize_database()
    initialize_embeddings()

    # The vector store needs embeddings; the RAG chain needs both the LLM and the store.
    if embeddings is None:
        print("[ERROR] Embeddings failed to initialize. Cannot proceed with vector store setup or RAG chain.")
    else:
        # force_rebuild=True rebuilds the vector store from Firestore on every run;
        # False loads from the persistence directory if available.
        setup_vector_store(force_rebuild=False)

        initialize_llm()
        if llm is None:
            print("[ERROR] LLM failed to initialize. Cannot proceed with RAG chain setup.")
        elif vector_store is None:
            print("[ERROR] Vector store failed to initialize. Cannot proceed with RAG chain setup.")
        else:
            setup_rag_chain()

    print("[INFO] --- Initialization Complete ---")

    # Launch Gradio interface
    if rag_chain:
        print("\n[INFO] Launching Gradio interface...")
        with gr.Blocks(theme=gr.themes.Soft()) as app:
            gr.Markdown("# Alumni Information Chatbot")
            gr.Markdown("Ask questions about alumni profiles. Examples: 'What are Virginia Hammond's skills?', 'Find alumni who graduated after 2020 and live in California.'")

            chatbot = gr.Chatbot(
                label="Chat History",
                height=500,
                layout="bubble",
                value=[(None, GREETING_TEXT)]
            )

            with gr.Row():
                msg = gr.Textbox(label="Your Question", placeholder="Enter your question here...", scale=4, autofocus=True)
                send_btn = gr.Button("Send", scale=1)

            with gr.Row():
                clear = gr.Button("Clear Chat")
                # Optional, for testing: rebuild the vector store on demand.
                rebuild_btn = gr.Button("Force Rebuild DB")

            # A mix of single-profile lookups and criteria-based searches
            # (major, location, graduation year, skills).
            example_queries = [
                "Tell me everything you know about Virginia Hammond.",
                "What are Virginia Hammond's key skills?",
                "What is Virginia Hammond's contact information?",
                "Tell me about Paul Walker's work experience.",
                "Did Paul Walker study Computer Science?",
                "Find alumni with Python programming skills.",
                "List alumni with experience in machine learning.",
                "Who has experience with cloud computing?",
                "Find alumni skilled in data analysis.",
                "Which alumni know JavaScript?",
                "What are the key skills or areas of expertise mentioned for alumni who studied Computer Science?",
                "List alumni who majored in Electrical Engineering.",
                "Find alumni who graduated before 2015.",
                "Who completed their degree after 2018?",
                "Find alumni who live in California.",
                "Who is located in New York?",
                "List alumni in Texas.",
                "Find alumni who graduated before 2020 and live in California.",
                "List Computer Science majors with Python skills.",
                "Find alumni in tech with more than 5 years of experience.",
                "Find alumni with AI skills who graduated after 2015.",
                "Can you find someone with experience in renewable energy or embedded systems?",
            ]

            gr.Examples(
                examples=example_queries,
                inputs=msg,
                outputs=[msg, chatbot],
                fn=query_alumni_bot,
                cache_examples=False,
                label="Suggested Questions (Click any example to try):"
            )

            def reset_chat():
                """Restore the greeting and drop the chain's stored conversation.

                The chain is built with a memory object; without clearing it the
                previous conversation would still be used after the UI is cleared.
                """
                chain_memory = getattr(rag_chain, "memory", None)
                if chain_memory is not None:
                    chain_memory.clear()
                return [(None, GREETING_TEXT)]

            # Connect actions
            msg.submit(query_alumni_bot, [msg, chatbot], [msg, chatbot], queue=False)
            send_btn.click(query_alumni_bot, [msg, chatbot], [msg, chatbot], queue=False)
            # Return the new value directly: the per-component `gr.Chatbot.update`
            # classmethod no longer exists in Gradio 4+, so calling it raised on click.
            clear.click(reset_chat, None, chatbot, queue=False)
            # NOTE(review): this only rebuilds the vector store; whether the existing
            # RAG chain picks up the rebuilt store depends on how the retriever
            # references it — confirm, and re-run setup_rag_chain() if it does not.
            rebuild_btn.click(lambda: setup_vector_store(force_rebuild=True), None, None, queue=False)

        # debug=True surfaces more detailed Gradio/backend logs;
        # share=True generates a public URL (useful for Colab).
        app.launch(share=True, debug=True)
    else:
        print("\n[ERROR] Chatbot components failed to initialize. Gradio interface will not launch. Please check logs for specific errors during initialization.")



[INFO] --- Configuration ---
[INFO] Loading environment variables...
[INFO] Environment variables loaded.
[INFO] Configured Chroma DB persistence path: /content/drive/MyDrive/chroma_dbs/alumni_chatbot_gemini_firestore
[INFO] Configured Firestore key path (needed for rebuild): /content/firestore_key.json
[INFO] Configured Firestore collection name (needed for rebuild): alumni_profiles
[INFO] Configured Embedding Model: models/embedding-001
[INFO] Configured LLM Model: gemini-1.5-flash
[INFO] --- End Configuration ---

--- Starting Enhanced Alumni Chatbot ---
[INFO] Checking environment...
[INFO] Running in Google Colab environment.
✅ Google Drive already mounted.
✅ Ensured Chroma DB path exists: /content/drive/MyDrive/chroma_dbs/alumni_chatbot_gemini_firestore

[INFO] --- Initializing Components ---
[INFO] Attempting to load Google API Key...
✅ Google API Key loaded from Colab Secrets.
[INFO] Attempting to initialize Firebase Admin and connect to Firestore...
✅ Firebase app already init

  chatbot = gr.Chatbot(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a0d62bf80bc18c3722.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



[INFO] Received user query: 'can you give information about John carter ? '
[INFO] Attempting to extract filters from query: 'can you give information about John carter ?'
[INFO] No filters were successfully extracted or applied.
[INFO] No metadata filters were applied.
[INFO] CustomFilteredRetriever called with query: 'can you give information about John carter ?' and filter: None
[INFO] Expanding query for RAG retrieval: 'can you give information about John carter ?'
[INFO] Searching vector store with query: 'can you give information about John carter ?' and filter: None
[INFO] Searching vector store with query: 'Search results for can you give information about John carter ?' and filter: None
[INFO] Searching vector store with query: 'Relevant information for can you give information about John carter ?' and filter: None
[INFO] CustomFilteredRetriever collected 5 unique documents from 3 expanded queries.
✅ CustomFilteredRetriever returning top 5 documents after combining and re-ran