In [1]:
# Cell 1: Imports
import os
import glob
import shutil

# Standard LangChain Imports
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# Retrievers
from langchain_community.retrievers import BM25Retriever

# Try importing EnsembleRetriever from s
from langchain_classic.retrievers import EnsembleRetriever

print("‚úÖ Imports successful!")

‚úÖ Imports successful!


In [2]:
# Cell 2: Setup Embeddings & Splitter
print("üì• Loading Hugging Face Embedding Model...")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Define headers to split on
headers_to_split_on = [
    ("#", "DocName"),
    ("##", "Section"),
    ("###", "SubSection"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
print("‚úÖ Models & Splitter Ready.")

üì• Loading Hugging Face Embedding Model...
‚úÖ Models & Splitter Ready.


In [3]:
# Cell 3: Process Files (Recursive)
all_splits = []
data_folder = "data" 

# FIX: Use recursive search to find files in subfolders (like data/departments/)
# "root_dir/**/*.md" will find matching files in any subdirectory
md_files = glob.glob(os.path.join(data_folder, "**/*.md"), recursive=True)

print(f"üìÇ Found {len(md_files)} Markdown files in '{data_folder}' and its subfolders.")

for file_path in md_files:
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            file_content = f.read()
        
        # Split text based on headers
        splits = markdown_splitter.split_text(file_content)
        
        # Metadata Injection
        for split in splits:
            # Add source filename
            split.metadata["source"] = os.path.basename(file_path)
            
            # Create a context string from headers
            header_context = ""
            if "DocName" in split.metadata:
                header_context += f"Document: {split.metadata['DocName']}\n"
            if "Section" in split.metadata:
                header_context += f"Section: {split.metadata['Section']}\n"
            if "SubSection" in split.metadata:
                header_context += f"Topic: {split.metadata['SubSection']}\n"
            
            # Prepend to content
            split.page_content = header_context + "\n" + split.page_content
            
        all_splits.extend(splits)
        print(f"   ‚úÖ Processed {os.path.basename(file_path)} -> {len(splits)} chunks.")
        
    except Exception as e:
        print(f"   ‚ùå Error reading {file_path}: {e}")

print(f"Total Chunks to Index: {len(all_splits)}")

üìÇ Found 23 Markdown files in 'data' and its subfolders.
   ‚úÖ Processed about_college_accreditations.md -> 6 chunks.
   ‚úÖ Processed academic_regulations.md -> 8 chunks.
   ‚úÖ Processed admissions_process.md -> 12 chunks.
   ‚úÖ Processed campus_facilities.md -> 7 chunks.
   ‚úÖ Processed departments_overview.md -> 9 chunks.
   ‚úÖ Processed eligibility_criteria.md -> 7 chunks.
   ‚úÖ Processed fee_structure.md -> 8 chunks.
   ‚úÖ Processed governance_and_contact.md -> 18 chunks.
   ‚úÖ Processed hostel_transport.md -> 7 chunks.
   ‚úÖ Processed placements_statistics.md -> 7 chunks.
   ‚úÖ Processed reserach_innovation.md -> 4 chunks.
   ‚úÖ Processed student_life.md -> 6 chunks.
   ‚úÖ Processed ai_ds_department.md -> 11 chunks.
   ‚úÖ Processed ai_ml_department.md -> 11 chunks.
   ‚úÖ Processed civil_department.md -> 12 chunks.
   ‚úÖ Processed cse_department.md -> 13 chunks.
   ‚úÖ Processed cyber_department.md -> 10 chunks.
   ‚úÖ Processed ece_department.md -> 13 chunks.
   

In [4]:
# Cell 4: Create Vector Database
if os.path.exists("./chroma_db"):
    shutil.rmtree("./chroma_db")
    print("   üóëÔ∏è  Cleared old database.")

print(f"‚è≥ Ingesting {len(all_splits)} chunks into ChromaDB...")
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)
print("üéâ Database Created Successfully!")

   üóëÔ∏è  Cleared old database.
‚è≥ Ingesting 226 chunks into ChromaDB...
üéâ Database Created Successfully!


In [5]:
# Cell 5: Hybrid Retriever Setup

# 1. Vector Retriever (Semantic)
vector_retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# 2. Keyword Retriever (BM25)
# Note: We pass 'all_splits' directly to ensure 100% indexing
bm25_retriever = BM25Retriever.from_documents(all_splits)
bm25_retriever.k = 3

# 3. Ensemble (Hybrid)
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.5, 0.5]
)

print("‚úÖ Hybrid Retrieval System Online")

‚úÖ Hybrid Retrieval System Online


In [6]:
# Cell 6: Test Query
query = "who is hod of aids"
print(f"\nüîé SEARCHING FOR: '{query}'")
print("="*60)

results = ensemble_retriever.invoke(query)

for i, doc in enumerate(results):
    print(f"üîπ Result #{i+1}")
    print(f"   Source: {doc.metadata.get('source', 'Unknown')}")
    # Print first 200 characters to verify context injection worked
    print(f"   üìÑ Content: {doc.page_content}...") 
    print("-" * 50)


üîé SEARCHING FOR: 'who is hod of aids'
üîπ Result #1
   Source: ece_department.md
   üìÑ Content: Document: Department of Electronics & Communication Engineering (ECE) - Overview
Section: 2. Vision & Mission (ECE)
Topic: Vision

"To produce **innovative, ethical and socially responsible graduates** in the field of Electronics and Communication Engineering who can address the global and environmental challenges through excellence in technical education and research."...
--------------------------------------------------
üîπ Result #2
   Source: ai_ds_department.md
   üìÑ Content: Document: Department of CSE - Artificial Intelligence & Data Science (AI&DS)
Section: 3. Leadership & Contact

* **HOD :** **Dr. P.Sudhakar** (M.Tech, Ph.D).
* **Search Tags:** **AIDS_HOD, CONTACT_AIDS, AI_COORDINATOR.**
* **Contact Email:** hod-ai-ds@rcee.ac.in (Standardized) / admissions@rcee.ac.in
* **Key Faculty / Counselors:**
* **Ms. Reshma Aman** (Assistant Professor).
* **Mr. K. Rajendhra** (Assi