In [1]:
# ==============================================================================
# Imports
# ==============================================================================
import os
import pickle
import time
import re # We use Regex for cleaner word splitting
from typing import List, Dict, Any
from pathlib import Path

# NEW: The library for BM25 ranking
from rank_bm25 import BM25Okapi

# ==============================================================================
# Config
# ==============================================================================
# Pickle file read by load_input(); annotated there as one list of transcript
# strings per customer.
INPUT_PKL  = "repackaged_transcript_data.pkl"

# We change the name to indicate these are NOT vector embeddings anymore.
# main() rewrites this file after every processed customer, and load_existing()
# reads it back so an interrupted run can resume.
OUTPUT_PKL = "transcript_data_bm25_ready.pkl"

# Upper bound on how many customers main() processes (it takes the minimum of
# this value and the number of loaded records).
CUSTOMERS_TO_PROCESS = 50

# We don't need retries/sleep because this runs locally on your CPU instantly.

In [2]:
# ==============================================================================
# Helpers
# ==============================================================================

def atomic_pickle(obj, path: str):
    """Pickle `obj` to `path` without ever exposing a half-written file.

    The data is first written to a sibling ``<path>.tmp`` file and then moved
    over the destination with ``os.replace``, so readers see either the old
    file or the complete new one.

    Args:
        obj: Any picklable object.
        path: Destination file. A ``str`` or any ``os.PathLike`` (for example
            ``pathlib.Path``) is accepted.

    Raises:
        Whatever ``pickle.dump``, ``open`` or ``os.replace`` raise. On failure
        the temporary file is removed and an existing destination is left
        untouched.
    """
    # os.fspath lets callers pass pathlib.Path; plain `path + ".tmp"` would
    # raise TypeError for non-str path objects.
    target = os.fspath(path)
    tmp = target + ".tmp"
    try:
        with open(tmp, "wb") as f:
            pickle.dump(obj, f)
        os.replace(tmp, target)
    except BaseException:
        # Don't leave a partial temp file lying around after a failed save.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise

def load_input() -> List[List[str]]:
    """Unpickle and return the contents of the file named by INPUT_PKL.

    The return annotation describes the expected payload (one list of
    transcript strings per customer); the function itself returns whatever
    object the pickle contains. A missing file raises FileNotFoundError.
    """
    with Path(INPUT_PKL).open("rb") as handle:
        customer_transcripts = pickle.load(handle)
    return customer_transcripts

def load_existing() -> List[Dict[str, Any]]:
    """Return the records already stored in OUTPUT_PKL, or [] if it is absent.

    Used to pick up where a previous run stopped: when the output file exists
    its unpickled contents are returned, otherwise an empty list is returned.
    """
    if os.path.exists(OUTPUT_PKL):
        with open(OUTPUT_PKL, "rb") as handle:
            return pickle.load(handle)
    return []

def simple_tokenize(text: str) -> List[str]:
    """
    Splits text into words (tokens) for BM25.

    1. Lowercase everything (so 'Help' and 'help' are the same).
    2. Treat every character outside a-z / 0-9 as a separator. This covers
       punctuation and ALL whitespace (spaces, tabs, newlines), so a
       multi-line transcript never yields tokens with embedded line breaks.
    3. Return the remaining alphanumeric runs, in order.

    Args:
        text: Raw transcript or query string.

    Returns:
        List of lowercase alphanumeric tokens; empty if none are found.
    """
    # Extracting the runs directly avoids the split(" ") pitfall, where
    # "a\nb" stayed glued together as one token.
    return re.findall(r'[a-z0-9]+', text.lower())

In [3]:
# ==============================================================================
# Main Execution
# ==============================================================================
def main():
    """Tokenize customer transcripts for BM25 and checkpoint them to OUTPUT_PKL.

    Steps:
      1. Load the input pickle via load_input().
      2. Load previously saved records via load_existing() and skip every
         customer whose index is already present (resume support).
      3. For each remaining customer among the first CUSTOMERS_TO_PROCESS,
         tokenize every transcript, append a record with keys
         'customer_id', 'transcripts' and 'bm25_tokens', and rewrite the
         output pickle.

    An error while processing a customer stops the loop (best effort); the
    final message then reports an early stop instead of claiming success.

    Raises:
        FileNotFoundError: if the input pickle does not exist.
    """
    # 1. Load Data
    try:
        customer_data = load_input()
    except FileNotFoundError as err:
        # Chain explicitly so the original OS error stays visible in the traceback.
        raise FileNotFoundError(f"Could not find '{INPUT_PKL}'.") from err
    print(f"Loaded {len(customer_data)} customer records.")

    # 2. Resume Logic
    processed = load_existing()
    done_ids = {rec["customer_id"] for rec in processed}
    total = min(CUSTOMERS_TO_PROCESS, len(customer_data))

    print(f"Processing {total} customers for BM25 (Tokenization)...")

    # Number of records known to be on disk; only advanced after a successful save.
    saved_count = len(processed)
    failed_at = None

    # 3. Loop through customers
    for i, transcripts in enumerate(customer_data[:total]):
        if i in done_ids:
            continue

        try:
            # We keep the raw 'transcripts' for reading later and add
            # 'bm25_tokens' (TOKENS, not vectors) for the search engine.
            rec = {
                "customer_id": i,
                "transcripts": transcripts,
                "bm25_tokens": [simple_tokenize(t) for t in transcripts],
            }
            processed.append(rec)

            # Save after every customer so an interrupted run can resume.
            atomic_pickle(processed, OUTPUT_PKL)
            saved_count = len(processed)

            # Print progress every 10 customers (it's very fast now)
            if (i+1) % 10 == 0:
                print(f"Processed customer {i+1}/{total}")

        except Exception as e:
            print(f"Error on customer {i+1}: {e}")
            failed_at = i
            break

    if failed_at is None:
        print(f"\nDone. Saved {len(processed)} customers to '{OUTPUT_PKL}'.")
    else:
        # Don't report success after a failure; state what is actually on disk.
        print(
            f"\nStopped early at customer {failed_at + 1}. "
            f"{saved_count} customers are saved in '{OUTPUT_PKL}'; re-run to resume."
        )

# Run the tokenization pipeline when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Loaded 50 customer records.
Processing 50 customers for BM25 (Tokenization)...

Done. Saved 50 customers to 'transcript_data_bm25_ready.pkl'.


In [4]:
# ==============================================================================
# Check file contents / Validation
# ==============================================================================
import pickle
import numpy as np
from rank_bm25 import BM25Okapi

# Path of the pickle to validate. This is the same filename that OUTPUT_PKL
# points to, i.e. the file written by main().
INPUT_PKL_CHECK = "transcript_data_bm25_ready.pkl"

def load_pickle(path: str) -> List[Dict[str, Any]]:
    """Unpickle and return the object stored in the file at `path`.

    The annotation reflects the expected payload (a list of record dicts);
    the function returns whatever the pickle actually contains.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload

# Re-define tokenizer for the test query
def simple_tokenize(text: str) -> List[str]:
    """Tokenize a query the same way transcripts are tokenized for BM25.

    Lowercases `text` and returns its runs of a-z / 0-9 characters in order.
    Every other character, including punctuation and all whitespace (spaces,
    tabs, newlines), acts as a separator. Keep this in sync with the
    tokenizer used to build 'bm25_tokens', otherwise query terms will not
    match the indexed tokens.

    Args:
        text: Raw query (or transcript) string.

    Returns:
        List of lowercase alphanumeric tokens; empty if none are found.
    """
    # Extracting the runs directly avoids the split(" ") pitfall, where
    # "a\nb" stayed glued together as one token.
    return re.findall(r'[a-z0-9]+', text.lower())

def main_check():
    """Sanity-check the BM25-ready pickle and run one sample BM25 query.

    Loads INPUT_PKL_CHECK, prints the structure of the first record, builds a
    BM25Okapi index over that record's token lists, scores a fixed test query
    against them and prints the best-matching transcript.

    Returns early (after printing a message) when the file is missing, when
    it holds no records, or when the first record has no tokens to index.
    """
    if not os.path.exists(INPUT_PKL_CHECK):
        print("File not found.")
        return

    data = load_pickle(INPUT_PKL_CHECK)
    print(f"Loaded {len(data)} records for checking.")

    # Guard: data[0] below would raise IndexError on an empty file.
    if not data:
        print("No records to check.")
        return

    # 1. Structure Check
    first_rec = data[0]
    print("\nKeys in record:", first_rec.keys())

    # Check if we have 4 lists of tokens
    tokens = first_rec["bm25_tokens"]
    print(f"Number of token lists: {len(tokens)} (Should be 4)")

    # Guard: an empty corpus (or one made only of empty documents) cannot be
    # indexed/scored meaningfully, and tokens[0] would fail on an empty list.
    if not any(tokens):
        print("No tokens available for this record; skipping BM25 test.")
        return

    print(f"First transcript token count: {len(tokens[0])}")
    print(f"First 5 tokens of transcript 0: {tokens[0][:5]}")

    # 2. REAL BM25 TEST
    # We will simulate how the RAG will work later.
    print("\n--- Simulating BM25 Search for Customer 0 ---")

    # A. Build the index for this specific customer
    bm25 = BM25Okapi(tokens)

    # B. Define a test query (tokenized exactly like the transcripts)
    query_str = "billing issue with my credit card"
    query_tokens = simple_tokenize(query_str)
    print(f"Test Query: '{query_str}'")
    print(f"Query Tokens: {query_tokens}")

    # C. Get scores for the 4 transcripts
    scores = bm25.get_scores(query_tokens)
    print(f"\nBM25 Scores for the 4 transcripts: {scores}")

    # D. Find the winner (plain int so it is safe as a list index)
    best_idx = int(np.argmax(scores))
    print(f"Winner is Transcript Index: {best_idx}")
    print(f"Winner Score: {scores[best_idx]:.4f}")

    # Print a snippet of the winner
    winner_text = first_rec["transcripts"][best_idx]
    print(f"Winner Text Snippet: {winner_text[:100]}...")

# Run the validation when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__":
    main_check()

Loaded 50 records for checking.

Keys in record: dict_keys(['customer_id', 'transcripts', 'bm25_tokens'])
Number of token lists: 4 (Should be 4)
First transcript token count: 397
First 5 tokens of transcript 0: ['thank', 'you', 'for', 'calling', 'organization']

--- Simulating BM25 Search for Customer 0 ---
Test Query: 'billing issue with my credit card'
Query Tokens: ['billing', 'issue', 'with', 'my', 'credit', 'card']

BM25 Scores for the 4 transcripts: [0.22333747 0.13214417 0.19896963 2.67638632]
Winner is Transcript Index: 3
Winner Score: 2.6764
Winner Text Snippet: Hello? Hello? Hello? Hello, Hi, this is [PERSON_NAME]. [PERSON_NAME], I'm sorry to bother you right ...
