In [1]:
# Cell 5 (Fixed): Master Ingestion with Header Injection
import os
import glob
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:

# 1. Embedding model
# Model identifier handed to HuggingFaceEmbeddings; kept in one place so it is easy to swap.
embedding_model_name = "all-MiniLM-L6-v2"
print("üì• Loading Hugging Face Embedding Model...")
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# 2. Markdown splitter
# Each (header marker, metadata key) pair names a header level to split on and the
# key under which that header's text is later read from each chunk's metadata.
headers_to_split_on = [("#", "DocName"), ("##", "Section"), ("###", "SubSection")]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)

In [None]:
# 3. Process Files
def build_header_context(metadata):
    """Build the header lines that get prepended to a chunk's text.

    Args:
        metadata: Mapping of header keys to header text for one chunk
            (the keys "Section" and "SubSection" are the ones consulted).

    Returns:
        A string with one "Section: ..." line and/or one "Role/Topic: ..." line
        (each newline-terminated), or "" when neither key is present.
    """
    context_lines = []
    if "Section" in metadata:
        context_lines.append(f"Section: {metadata['Section']}\n")
    if "SubSection" in metadata:
        context_lines.append(f"Role/Topic: {metadata['SubSection']}\n")
    return "".join(context_lines)


all_splits = []
data_folder = "data"
# glob.glob returns matches in arbitrary order; sorting makes the chunk order
# (and therefore everything built from all_splits) reproducible across runs.
md_files = sorted(glob.glob(os.path.join(data_folder, "*.md")))

print(f"üìÇ Found {len(md_files)} Markdown files.")
if not md_files:
    # Make the empty case loud: every later cell depends on all_splits being non-empty.
    print(f"   WARNING: no Markdown files in {data_folder!r}; nothing will be ingested.")

for file_path in md_files:
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            file_content = f.read()

        splits = markdown_splitter.split_text(file_content)

        for split in splits:
            # Record which file the chunk came from.
            split.metadata["source"] = file_path
            # Inject the header metadata back into the chunk text so it is part of
            # what gets embedded and keyword-indexed, not just side metadata.
            split.page_content = build_header_context(split.metadata) + split.page_content

        all_splits.extend(splits)
        print(f"   ‚úÖ Processed {os.path.basename(file_path)} -> {len(splits)} chunks.")

    except Exception as e:
        # Best-effort ingestion: report the failing file and continue with the rest.
        print(f"   ‚ùå Error reading {file_path}: {e}")

In [None]:
# 4. Re-create Database (Clean Start)
import shutil

# Single source of truth for the on-disk location of the vector store.
PERSIST_DIR = "./chroma_db"

# Validate BEFORE deleting anything: if ingestion produced no chunks (e.g. the
# notebook was started from the wrong working directory), wiping the existing
# database would destroy a good index and leave nothing to replace it.
if not all_splits:
    raise ValueError(
        "No chunks to ingest (all_splits is empty); refusing to wipe the existing "
        f"database at {PERSIST_DIR}. Check the output of the file-processing cell."
    )

if os.path.exists(PERSIST_DIR):
    shutil.rmtree(PERSIST_DIR)
    print("   üóëÔ∏è  Cleared old database.")

print(f"‚è≥ Ingesting {len(all_splits)} chunks...")
# Embeds every chunk with `embeddings` and persists the collection under PERSIST_DIR.
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    persist_directory=PERSIST_DIR,
)
print("üéâ Database Updated! Headers are now visible to the AI.")

In [None]:
# Cell 11: The "Best of Both Worlds" - Hybrid Search
from langchain_classic.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# 1. Setup Vector Retriever (The "Concept" Brain)
# Dense retrieval: returns the 3 chunks whose embeddings are closest to the query.
vector_retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# 2. Setup Keyword Retriever (The "Exact Match" Brain)
# Read the WHOLE collection to build the keyword index. A similarity search with a
# fixed k only returns the k chunks nearest to the probe text, so any corpus larger
# than k would silently end up with an incomplete BM25 index.
stored = vectordb.get(include=["documents", "metadatas"])
stored_texts = stored["documents"] or []
if not stored_texts:
    raise ValueError("Vector store is empty; cannot build the BM25 keyword index.")
# Normalise missing per-chunk metadata to an empty dict before building documents.
stored_metadatas = [meta or {} for meta in (stored["metadatas"] or [None] * len(stored_texts))]

bm25_retriever = BM25Retriever.from_texts(stored_texts, metadatas=stored_metadatas)
bm25_retriever.k = 3
# Kept for backward compatibility: the documents the keyword index was built from.
raw_docs = bm25_retriever.docs

# 3. Create the Hybrid (Ensemble)
# weights=[0.5, 0.5] gives the keyword and vector rankings equal influence.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.5, 0.5]
)

In [None]:

print("‚úÖ Hybrid System Online: Vectors + Keywords")

# 4. Smoke test: send one query through the hybrid retriever and print every hit.
query = "top companies for placments"
print(f"\nüîé HYBRID SEARCH FOR: '{query}'")
print("=" * 60)

results = ensemble_retriever.invoke(query)

for rank, hit in enumerate(results, start=1):
    print(f"üîπ Result #{rank}")
    # "source" is the path of the Markdown file the chunk was split from.
    print("Source: ", hit.metadata['source'])
    # The chunk text is printed in full, followed by a literal "...".
    print(f"   üìÑ Content: {hit.page_content}...")
    print("-" * 50)