In [1]:
# ==============================================================================
# Imports
# ==============================================================================
import os
import pickle
import re
import numpy as np
from typing import List, Dict, Any
from pathlib import Path

# 1. For Dense Embedding (The "Meaning" Search)
from sentence_transformers import SentenceTransformer

# 2. For Sparse Embedding (The "Keyword" Search)
# Note: We only need the tokenizer logic here, but we import the library to ensure it's installed.
from rank_bm25 import BM25Okapi

# ==============================================================================
# Config
# ==============================================================================
# Pickle read by load_input(); annotated there as one list of transcript
# strings per customer (List[List[str]]).
INPUT_PKL  = "repackaged_transcript_data.pkl"
# Pickle holding the list of per-customer hybrid records; main() rewrites it
# after every processed customer and reads it back to resume.
OUTPUT_PKL = "transcript_embeddings_hybrid.pkl"

# Upper bound on how many customers main() handles, counted from the start of
# the input list (the effective total is capped by the input length).
CUSTOMERS_TO_PROCESS = 50

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ==============================================================================
# Initialize Dense Model (Qwen)
# ==============================================================================
# Load the dense embedding model once for the whole notebook. The resulting
# `embedding_model` is a module-level global that main() passes to
# embed_transcripts_dense().
print("Loading Qwen3 model... (Dense Embeddings)")

# NOTE(review): trust_remote_code=True allows the loader to run Python code
# shipped with the model repository -- confirm it is really needed for this
# checkpoint and that the repository is trusted.
# NOTE(review): padding_side="left" is forwarded to the tokenizer; presumably
# this matches the model's pooling strategy -- confirm against the model card.
embedding_model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B",
    trust_remote_code=True,
    tokenizer_kwargs={"padding_side": "left"}
)

print("Qwen model loaded.")

In [None]:
# ==============================================================================
# Helpers
# ==============================================================================

def atomic_pickle(obj, path: str):
    """Pickle ``obj`` to ``path`` without ever exposing a half-written file.

    The data is first written to ``<path>.tmp``, flushed and fsynced, and only
    then moved over ``path`` with ``os.replace``. If anything fails along the
    way, the temporary file is removed and the previous contents of ``path``
    (if any) are left untouched.

    Args:
        obj: Any picklable object.
        path: Destination file; a ``str`` or any ``os.PathLike`` (e.g. ``Path``).

    Raises:
        Whatever ``open``, ``pickle.dump`` or ``os.replace`` raise; the original
        exception is re-raised after the temporary file has been cleaned up.
    """
    # Normalise Path objects so the ".tmp" suffix concatenation works.
    path = os.fspath(path)
    tmp = path + ".tmp"
    try:
        with open(tmp, "wb") as f:
            pickle.dump(obj, f)
            # Push the bytes to disk before the rename so that a crash cannot
            # leave `path` pointing at an empty or truncated file.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Includes KeyboardInterrupt (kernel interrupt): never leave a stale
        # temporary file behind.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise

def load_input() -> List[List[str]]:
    """Unpickle and return the contents of ``INPUT_PKL``.

    The return annotation describes the expected layout (one list of
    transcript strings per customer); the loaded object is not validated.
    Unpickling can execute arbitrary code, so only use this with trusted files.

    Raises:
        FileNotFoundError: If ``INPUT_PKL`` does not exist.
    """
    with open(INPUT_PKL, "rb") as source:
        customer_transcripts = pickle.load(source)
    return customer_transcripts

def load_existing() -> List[Dict[str, Any]]:
    """Return the records already stored in ``OUTPUT_PKL``.

    When the output file does not exist yet, an empty list is returned.
    Unpickling can execute arbitrary code, so only use this with trusted files.
    """
    if os.path.exists(OUTPUT_PKL):
        with open(OUTPUT_PKL, "rb") as saved:
            return pickle.load(saved)
    return []

# --- Dense Logic (Qwen) ---
def embed_transcripts_dense(model, transcripts: List[str]) -> List[List[float]]:
    """Encode ``transcripts`` with ``model`` and return plain Python lists.

    Args:
        model: Object exposing ``encode(transcripts)`` whose result has a
            ``.tolist()`` method (e.g. a NumPy array).
        transcripts: Texts to embed.

    Returns:
        ``model.encode(transcripts).tolist()`` -- per the annotation, one list
        of floats per transcript.
    """
    return model.encode(transcripts).tolist()

# --- Sparse Logic (BM25) ---
def tokenize_text(text: str) -> List[str]:
    """
    Splits text into tokens for BM25.
    Logic: Lowercase -> Replace every char that is not a-z, 0-9 or whitespace
    with a space -> Split on any run of whitespace.

    Splitting uses ``str.split()`` with no separator so that newlines and tabs
    (which the regex deliberately keeps) also act as token boundaries and never
    end up inside a token.

    Note: only ASCII letters and digits survive; other characters (including
    accented letters) are treated as separators.

    Args:
        text: Raw transcript text.

    Returns:
        Lowercase alphanumeric tokens in their original order; ``[]`` for
        empty or whitespace-only input.
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # No-argument split() handles spaces, tabs and newlines and drops empties.
    return text.split()

In [None]:
# ==============================================================================
# Main Hybrid Generation
# ==============================================================================
def main():
    """Build and persist hybrid (dense + BM25-token) records per customer.

    Loads the input transcripts, skips customers whose index is already present
    in the output pickle, and for every remaining customer among the first
    ``CUSTOMERS_TO_PROCESS`` creates a record with these keys:

    * ``customer_id``      -- position of the customer in the input list
    * ``transcripts``      -- the raw transcript strings
    * ``dense_embeddings`` -- result of ``embed_transcripts_dense``
    * ``bm25_tokens``      -- ``tokenize_text`` applied to each transcript

    The full record list is re-saved after each customer so that an interrupted
    run can be resumed. Processing stops at the first error; in that case the
    final message reports an incomplete run instead of claiming success.

    Raises:
        FileNotFoundError: If the input pickle does not exist.
    """
    # 1. Load Data
    try:
        customer_data = load_input()
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find '{INPUT_PKL}'.")

    # 2. Resume Logic
    processed = load_existing()
    done_ids = {rec["customer_id"] for rec in processed}
    total = min(CUSTOMERS_TO_PROCESS, len(customer_data))

    print(f"Starting HYBRID processing for {total} customers...")

    # Tracks whether the loop was aborted so the closing message is truthful.
    failed = False

    for i, transcripts in enumerate(customer_data[:total]):
        if i in done_ids:
            # Only report every 10th skipped customer to keep the output short.
            if (i+1) % 10 == 0:
                print(f"Skipping {i+1} (Done)")
            continue

        try:
            # --- A. Generate Dense Embeddings (Qwen) ---
            # Result: List of Lists of Floats (e.g. [[0.1, 0.5...], ...])
            dense_vectors = embed_transcripts_dense(embedding_model, transcripts)

            # --- B. Generate Sparse Tokens (BM25) ---
            # Result: List of Lists of Strings (e.g. [['hello', 'world'], ...])
            sparse_tokens = [tokenize_text(t) for t in transcripts]

            # --- C. Create Hybrid Record ---
            rec = {
                "customer_id": i,
                "transcripts": transcripts,      # Raw Text (for reading)
                "dense_embeddings": dense_vectors, # For Semantic Search
                "bm25_tokens": sparse_tokens     # For Keyword Search
            }

            processed.append(rec)
            # Checkpoint after every customer so a later failure loses nothing.
            atomic_pickle(processed, OUTPUT_PKL)

            print(f"Processed customer {i+1}/{total}")

        except Exception as e:
            # Best effort: report, stop, and keep what has been saved so far.
            print(f"Error on customer {i+1}: {e}")
            failed = True
            break

    if failed:
        print(f"\nStopped early after an error; results are incomplete. "
              f"Re-run to resume from '{OUTPUT_PKL}'.")
    else:
        print(f"\nDone. Hybrid data saved to '{OUTPUT_PKL}'.")

# Run the pipeline only when this code executes as the top-level module,
# not when it is imported.
if __name__ == "__main__":
    main()

In [None]:
# ==============================================================================
# Check file contents / Validation
# ==============================================================================
import pickle
import numpy as np

# Pickle inspected by main_check(); same filename as OUTPUT_PKL in the config cell.
FILE_TO_CHECK = "transcript_embeddings_hybrid.pkl"

def main_check(path=None):
    """Print a structural sanity check of the first record in a hybrid pickle.

    Reports the number of records, the keys of the first record, the count and
    dimensionality of its dense embeddings, and a sample of its BM25 tokens.
    The final status line says "Ready" only when both the dense and the sparse
    section are present.

    Args:
        path: Pickle file to inspect. When omitted, ``FILE_TO_CHECK`` is used.

    Returns:
        None. All findings are printed.
    """
    if path is None:
        path = FILE_TO_CHECK

    if not os.path.exists(path):
        print("File not found.")
        return

    # Unpickling can execute arbitrary code; only inspect trusted files.
    # The context manager guarantees the file handle is closed.
    with open(path, "rb") as f:
        data = pickle.load(f)
    print(f"Loaded {len(data)} records.")

    if len(data) == 0:
        return

    # Check the first record
    rec = data[0]
    print("\n--- Hybrid Record Structure Check ---")
    print(f"Keys found: {list(rec.keys())}")

    # Check Dense
    dense = rec.get("dense_embeddings", [])
    has_dense = bool(dense)
    if has_dense:
        dims = len(dense[0])
        print("\n[Dense] Embeddings present? Yes.")
        print(f"[Dense] Count: {len(dense)}")
        print(f"[Dense] Dimensions: {dims} (Expected ~1024 for Qwen)")
    else:
        print("\n[Dense] MISSING!")

    # Check Sparse
    sparse = rec.get("bm25_tokens", [])
    has_sparse = bool(sparse)
    if has_sparse:
        print("\n[Sparse] Tokens present? Yes.")
        print(f"[Sparse] Count: {len(sparse)}")
        print(f"[Sparse] First 5 tokens of transcript 0: {sparse[0][:5]}")
    else:
        print("\n[Sparse] MISSING!")

    # Only claim readiness when both halves of the hybrid record exist.
    if has_dense and has_sparse:
        print("\nStatus: Ready for Hybrid RAG.")
    else:
        print("\nStatus: NOT ready - see MISSING section(s) above.")

# Run the validation only when this code executes as the top-level module,
# not when it is imported.
if __name__ == "__main__":
    main_check()