In [1]:
# ==========================================
# CELL 1: SETUP & IMPORTS
# ==========================================
import json
import os
import pickle
import numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- CONFIGURATION ---
# Input corpus and the directory that receives the persisted artifacts.
KB_PATH = "./data/raw/knowledge_base_augmented.json"
MODELS_DIR = "./models"

# Embedding model and the size of its context window, in tokens.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
MODEL_TOKEN_LIMIT = 512

# Rough characters-per-token ratio, used to turn the token window into a
# character budget that later cells compare text lengths against.
EST_CHAR_PER_TOKEN = 4
SAFE_CHAR_LIMIT = EST_CHAR_PER_TOKEN * MODEL_TOKEN_LIMIT

# Ensure the artifact directory exists (no error if it is already there).
os.makedirs(name=MODELS_DIR, exist_ok=True)

print("‚úÖ Setup complete. Ready to build the Retrieval Engine.")

‚úÖ Setup complete. Ready to build the Retrieval Engine.


In [2]:
print("üîç Analyzing Paragraph Distributions...")

# Paragraphs with this many characters or fewer are treated as artifacts
# (headers, stray newlines) and excluded from the statistics.
MIN_PARAGRAPH_CHARS = 50

# 1. Load Data
with open(KB_PATH, "r", encoding="utf-8") as f:
    kb_data = json.load(f)

# 2. Extract Paragraphs
all_paragraph_lengths = []  # character length of every paragraph that passes the filter
oversized_paragraphs = []   # details of paragraphs longer than SAFE_CHAR_LIMIT

for doc in kb_data:
    # We assume double newline is the standard paragraph separator
    for raw_paragraph in doc['full_text'].split('\n\n'):
        clean_p = raw_paragraph.strip()
        p_len = len(clean_p)

        # Filter out empty or tiny artifacts (like headers or stray newlines)
        if p_len <= MIN_PARAGRAPH_CHARS:
            continue

        all_paragraph_lengths.append(p_len)

        # Record if it exceeds our model's safety limit
        if p_len > SAFE_CHAR_LIMIT:
            oversized_paragraphs.append({
                "doc": doc['title'],
                "length": p_len,
                "preview": clean_p[:100] + "..."
            })

# np.max / np.percentile raise an opaque error on an empty sequence, so fail
# early with a message that points at the actual problem.
if not all_paragraph_lengths:
    raise ValueError(
        f"No paragraphs longer than {MIN_PARAGRAPH_CHARS} characters were found in {KB_PATH}."
    )

# 3. Calculate Stats
max_p = np.max(all_paragraph_lengths)
avg_p = np.mean(all_paragraph_lengths)
median_p = np.median(all_paragraph_lengths)
percentile_99 = np.percentile(all_paragraph_lengths, 99)

# 4. Report
print(f"\nüìä DATASET STATISTICS (Paragraphs)")
print(f"   ‚Ä¢ Total Paragraphs:     {len(all_paragraph_lengths)}")
print(f"   ‚Ä¢ Average Length:       {avg_p:.0f} chars")
print(f"   ‚Ä¢ Median Length:        {median_p:.0f} chars")
print(f"   ‚Ä¢ Largest Paragraph:    {max_p} chars")
print(f"   ‚Ä¢ 99th Percentile:      {percentile_99:.0f} chars")
print("-" * 40)

# Report the model that is actually configured rather than a hard-coded name.
print(f"ü§ñ MODEL CONSTRAINTS ({EMBEDDING_MODEL_NAME})")
print(f"   ‚Ä¢ Max Context:          {MODEL_TOKEN_LIMIT} tokens")
print(f"   ‚Ä¢ Est. Char Limit:      ~{SAFE_CHAR_LIMIT} chars")
print("-" * 40)

# 5. The Verdict
# oversized_paragraphs is non-empty exactly when max_p > SAFE_CHAR_LIMIT,
# so indexing [0] below is always safe.
if oversized_paragraphs:
    print(f"‚ö†Ô∏è WARNING: Found {len(oversized_paragraphs)} paragraphs larger than the model limit.")
    # Use the same chars-per-token estimate that produced SAFE_CHAR_LIMIT.
    print(f"   The largest is {max_p} chars (approx {max_p/EST_CHAR_PER_TOKEN:.0f} tokens).")
    print(f"   Example: '{oversized_paragraphs[0]['preview']}' from {oversized_paragraphs[0]['doc']}")
else:
    print(f"‚úÖ SUCCESS: All paragraphs fit within the model window!")
    print("   Simple paragraph splitting is safe.")

üîç Analyzing Paragraph Distributions...

üìä DATASET STATISTICS (Paragraphs)
   ‚Ä¢ Total Paragraphs:     38566
   ‚Ä¢ Average Length:       1038 chars
   ‚Ä¢ Median Length:        774 chars
   ‚Ä¢ Largest Paragraph:    12798 chars
   ‚Ä¢ 99th Percentile:      4322 chars
----------------------------------------
ü§ñ MODEL CONSTRAINTS (mxbai-embed-large-v1)
   ‚Ä¢ Max Context:          512 tokens
   ‚Ä¢ Est. Char Limit:      ~2048 chars
----------------------------------------
   The largest is 12798 chars (approx 3200 tokens).
   Example: 'Beyonc√© Giselle Knowles-Carter (  bee-ON-say; born September 4, 1981) is an American singer, songwri...' from Beyonc√©


In [3]:
# ==========================================
# CELL 2: INTELLIGENT CHUNKING
# ==========================================
print("üî™ Starting Chunking Process...")

# Strategy: sizes are measured in characters (length_function=len).
# At roughly EST_CHAR_PER_TOKEN characters per token, 1500 chars is about
# 375 tokens, which stays below MODEL_TOKEN_LIMIT.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 250

# Guard against someone raising CHUNK_SIZE past the estimated model window.
assert CHUNK_SIZE <= SAFE_CHAR_LIMIT, (
    f"CHUNK_SIZE ({CHUNK_SIZE}) exceeds the estimated model limit ({SAFE_CHAR_LIMIT} chars)"
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    # Separators are listed from coarsest (paragraph break) to finest (single character).
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
)

# Load Data (re-read here so this cell does not depend on the analysis cell)
with open(KB_PATH, "r", encoding="utf-8") as f:
    kb_data = json.load(f)

chunks = []
# Despite its name this counts chunks, not documents: it is the running
# chunk_id and always equals the chunk's position in `chunks`.
doc_id_counter = 0

print(f"üìö Processing {len(kb_data)} articles...")

for doc in tqdm(kb_data, desc="Chunking"):
    title = doc['title']
    full_text = doc['full_text']
    url = doc['source_url']
    original_id = doc['id']

    doc_splits = text_splitter.split_text(full_text)

    for i, split_text in enumerate(doc_splits):
        chunks.append({
            "chunk_id": doc_id_counter,   # global id across all documents
            "doc_id": original_id,        # id of the source article
            "title": title,
            "text": split_text,
            "source_url": url,
            "chunk_index": i              # position of the chunk within its article
        })
        doc_id_counter += 1

print("-" * 40)
print(f"‚úÖ CHUNKING COMPLETE")
print(f"   ‚Ä¢ Total Chunks: {len(chunks)}")

üî™ Starting Chunking Process...
üìö Processing 477 articles...


Chunking: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 477/477 [00:00<00:00, 3999.23it/s]

----------------------------------------
‚úÖ CHUNKING COMPLETE
   ‚Ä¢ Total Chunks: 39141





In [4]:
# ==========================================
# CELL 3: VECTORIZATION
# ==========================================
print(f"üß† Loading Model: {EMBEDDING_MODEL_NAME}...")

# 1. Load Model
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
# Ask the model for its output size instead of hard-coding it, so the message
# stays correct if EMBEDDING_MODEL_NAME is changed.
embedding_dim = model.get_sentence_embedding_dimension()

# 2. Batch Encoding
batch_size = 64
all_texts = [c['text'] for c in chunks]

if not all_texts:
    raise ValueError("No chunks to encode; run the chunking cell first.")

print(f"‚ö° Encoding {len(chunks)} chunks (Dimensions: {embedding_dim})...")

embeddings = model.encode(
    all_texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    # Unit-length vectors make the inner product equal to cosine similarity.
    normalize_embeddings=True
)

# Row i of `embeddings` must describe chunks[i]; later cells look chunks up
# by their row position.
assert embeddings.shape[0] == len(chunks), (
    f"Expected {len(chunks)} embeddings, got {embeddings.shape[0]}"
)

print("-" * 40)
print(f"‚úÖ EMBEDDING COMPLETE")
print(f"   ‚Ä¢ Matrix Shape: {embeddings.shape}")

üß† Loading Model: BAAI/bge-small-en-v1.5...
‚ö° Encoding 39141 chunks (Dimensions: 384)...


Batches:   0%|          | 0/612 [00:00<?, ?it/s]

----------------------------------------
‚úÖ EMBEDDING COMPLETE
   ‚Ä¢ Matrix Shape: (39141, 384)


In [5]:
# ==========================================
# CELL 4: INDEXING & PERSISTENCE
# ==========================================
print("üóÑÔ∏è Building FAISS Index...")

# The index row number is used to look chunks up in `chunks`, so the two
# must line up one-to-one before anything is written to disk.
if embeddings.ndim != 2 or embeddings.shape[0] != len(chunks):
    raise ValueError(
        f"Embedding matrix shape {embeddings.shape} does not match {len(chunks)} chunks"
    )

# 1. Initialize FAISS Index
# Take the dimension from the data instead of hard-coding it, so a different
# embedding model does not break this cell.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

# 2. Add Vectors
# FAISS expects a C-contiguous float32 matrix.
index_vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
index.add(index_vectors)

if index.ntotal != len(chunks):
    raise RuntimeError(
        f"Index holds {index.ntotal} vectors but there are {len(chunks)} chunks"
    )

print(f"   ‚Ä¢ Index contains {index.ntotal} vectors.")

# 3. Save Artifacts
index_path = f"{MODELS_DIR}/faiss_index.bin"
metadata_path = f"{MODELS_DIR}/chunk_metadata.pkl"

faiss.write_index(index, index_path)

# NOTE: the metadata is pickled; only unpickle this file from a trusted source.
with open(metadata_path, "wb") as f:
    pickle.dump(chunks, f)

print("-" * 40)
print("üéâ SYSTEM PERSISTED SUCCESSFULLY")
print(f"   ‚Ä¢ Index File:    {index_path}")
print(f"   ‚Ä¢ Metadata File: {metadata_path}")

üóÑÔ∏è Building FAISS Index...
   ‚Ä¢ Index contains 39141 vectors.
----------------------------------------
üéâ SYSTEM PERSISTED SUCCESSFULLY
   ‚Ä¢ Index File:    ./models/faiss_index.bin
   ‚Ä¢ Metadata File: ./models/chunk_metadata.pkl


In [8]:
# ==========================================
# CELL 5: HEALTH CHECK
# ==========================================
print("ü©∫ RUNNING HEALTH CHECK...")

query_text = "Who managed Destiny's Child?"

# The query is prefixed with a retrieval instruction before encoding; the
# chunks themselves were encoded without any prefix.
query_prompt = f"Represent this sentence for searching relevant passages: {query_text}"
query_vector = model.encode([query_prompt], normalize_embeddings=True)

k = 5
# D holds the inner-product scores and I the matching row positions, both of shape (1, k).
D, I = index.search(query_vector, k)

print(f"\n‚ùì Test Query: '{query_text}'")
print("-" * 40)

hits_shown = 0
for rank, (idx, score) in enumerate(zip(I[0], D[0]), start=1):
    # FAISS pads the result with -1 when the index holds fewer than k vectors.
    # Indexing chunks[-1] would silently show the last chunk as if it were a hit.
    if idx < 0:
        continue

    retrieved_chunk = chunks[idx]
    hits_shown += 1

    print(f"ü•á Rank {rank} (Score: {score:.4f})")
    print(f"   Source: {retrieved_chunk['title']}")
    print(f"   Text:   {retrieved_chunk['text'][:150]}...")
    print("")

# Do not report the engine as healthy when the search returned nothing.
if hits_shown == 0:
    raise RuntimeError("Health check failed: the index returned no results.")

print("‚úÖ Retrieval Engine is ONLINE.")

ü©∫ RUNNING HEALTH CHECK...

‚ùì Test Query: 'Who managed Destiny's Child?'
----------------------------------------
ü•á Rank 1 (Score: 0.6261)
   Source: Beyonc√©
   Text:   Following several lineup changes, Destiny's Child ultimately comprised Beyonc√©, Rowland, and Michelle Williams. In early 2001, while the group were co...

ü•á Rank 2 (Score: 0.6242)
   Source: Beyonc√©
   Text:   In November 2003, Beyonc√© embarked on the European Dangerously in Love Tour and North American Verizon Ladies First Tour alongside Missy Elliott and A...

ü•á Rank 3 (Score: 0.6144)
   Source: Beyonc√©
   Text:   Beyonc√© Giselle Knowles was born in Houston, Texas, to Celestine Ann "Tina" Knowles (n√©e Beyinc√©), a hairdresser and salon owner, and Mathew Knowles, ...

ü•á Rank 4 (Score: 0.6084)
   Source: Beyonc√©
   Text:   On January 7, 2012, Beyonc√© gave birth to a daughter, Blue Ivy Carter, at Lenox Hill Hospital in New York under heavy security. Two days later, Jay Z ...

ü•á Rank 5 (Score: 