In [None]:
# ==============================================================================
# Imports
# ==============================================================================
import os
import pickle
import time
import torch # Imported to manage CPU/GPU settings if needed
from typing import List, Dict, Any
from pathlib import Path

# NEW: Import the library needed for local Hugging Face models
from sentence_transformers import SentenceTransformer

# ==============================================================================
# Config
# ==============================================================================
# File paths for your data
# INPUT_PKL is the pickle read by load_input(); OUTPUT_PKL is the pickle that
# main() writes after every customer and that load_existing() reads to resume.
INPUT_PKL  = "repackaged_transcript_data.pkl"
OUTPUT_PKL = "transcript_embeddings.pkl"

# Processing settings
# Upper bound on the number of customers main() handles; main() caps it at the
# number of records actually present in the input.
CUSTOMERS_TO_PROCESS = 50



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\pfeil\anaconda3\envs\local-rag\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\pfeil\anaconda3\envs\local-rag\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\pfeil\anaconda3\envs\local-rag\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\pfeil\anaconda3\envs\local-rag\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
 

In [None]:
# ==============================================================================
# Model Initialization
# ==============================================================================

def load_local_model():
    """
    Builds and returns a SentenceTransformer for "Qwen/Qwen3-Embedding-0.6B".

    The model is created with `trust_remote_code=True` and with the tokenizer
    configured to pad on the left side. A status line is printed before the
    (potentially slow) load starts.

    Returns:
        The constructed SentenceTransformer instance.
    """
    model_id = "Qwen/Qwen3-Embedding-0.6B"
    print("Loading Qwen3 model... this may take a moment to download on first run.")

    # NOTE(review): trust_remote_code=True lets the model repository run its own
    # Python code at load time; confirm it is still required for this model.
    tokenizer_options = {"padding_side": "left"}
    return SentenceTransformer(
        model_id,
        trust_remote_code=True,
        tokenizer_kwargs=tokenizer_options,
    )

# Initialize the model immediately to ensure it downloads/loads correctly
# before we start processing data.
# `embedding_model` is a module-level name; main() passes it to embed_transcripts().
embedding_model = load_local_model()
print("Model loaded successfully.")

Loading Qwen3 model... this may take a moment to download on first run.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [None]:
# ==============================================================================
# Helpers
# ==============================================================================

def atomic_pickle(obj, path: str):
    """
    Pickles `obj` to `path` without ever leaving a half-written file at `path`.

    The data is first written to a sibling file named `<path>.tmp`, flushed and
    fsynced, and only then moved over `path` with `os.replace`. If anything goes
    wrong, the temporary file is removed and the previous contents of `path`
    (if any) are left untouched.

    Args:
        obj: Any picklable object.
        path: Destination file path (a `str` or any `os.PathLike`).

    Raises:
        Whatever `open`, `pickle.dump`, or `os.replace` raise; the exception is
        re-raised after the temporary file has been cleaned up.
    """
    # os.fspath lets callers pass pathlib.Path as well as str.
    tmp = os.fspath(path) + ".tmp"
    try:
        with open(tmp, "wb") as f:
            pickle.dump(obj, f)
            # Push the bytes to disk before the rename so a crash right after
            # os.replace cannot expose an empty or truncated file.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error is
        # what the caller needs to see, so a failed cleanup is ignored.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise

def load_input() -> List[List[str]]:
    """
    Reads and returns the unpickled contents of INPUT_PKL.

    NOTE: unpickling can execute arbitrary code, so INPUT_PKL should only ever
    point at a file from a trusted source.
    """
    with open(INPUT_PKL, mode="rb") as source:
        customer_transcripts = pickle.load(source)
    return customer_transcripts

def load_existing() -> List[Dict[str, Any]]:
    """
    Returns the previously saved contents of OUTPUT_PKL so a run can resume.

    When OUTPUT_PKL does not exist yet, an empty list is returned instead.
    """
    if os.path.exists(OUTPUT_PKL):
        with open(OUTPUT_PKL, "rb") as saved:
            return pickle.load(saved)
    return []

def embed_transcripts(model, transcripts: List[str]) -> List[List[float]]:
    """
    Encodes a batch of transcript texts with `model` and returns plain lists.

    All texts go to `model.encode` in one call with no further arguments; in
    particular no `prompt_name` is passed, because the transcripts are the
    stored documents rather than search queries.

    Args:
        model: Object exposing `encode(list_of_texts)` whose result has `.tolist()`.
        transcripts: The texts to embed.

    Returns:
        The result of `model.encode(transcripts).tolist()`.
    """
    # .tolist() turns the array/tensor result into plain Python lists so the
    # embeddings can be stored without depending on the array library.
    return model.encode(transcripts).tolist()

In [None]:
# ==============================================================================
# Main Execution
# ==============================================================================
def main():
    """
    Embeds the transcripts of up to CUSTOMERS_TO_PROCESS customers and stores
    the results in OUTPUT_PKL, resuming from whatever an earlier run saved.

    Steps:
      1. Load the input records with load_input().
      2. Load already-saved records with load_existing() and collect their
         "customer_id" values so those customers are skipped.
      3. For each remaining customer (identified by its index in the input),
         embed its transcripts with the module-level `embedding_model`, append
         a {"customer_id", "transcripts", "embeddings"} record, and rewrite
         OUTPUT_PKL via atomic_pickle() right away.

    Processing stops at the first error (embedding or saving). In that case the
    final message reports an early stop, and the count it shows only includes
    records that were actually written to OUTPUT_PKL.

    Raises:
        FileNotFoundError: if INPUT_PKL does not exist.
    """
    # 1. Load input data
    try:
        customer_data = load_input()
    except FileNotFoundError as err:
        # Chain explicitly so the original OS-level error stays in the traceback.
        raise FileNotFoundError(f"Could not find '{INPUT_PKL}'. Make sure the file exists.") from err
    print(f"Loaded {len(customer_data)} customer records from '{INPUT_PKL}'.")

    # 2. Resume support: load existing output and find which IDs are done
    processed = load_existing()
    done_ids = {rec["customer_id"] for rec in processed}

    # Determine how many we still need to do
    total = min(CUSTOMERS_TO_PROCESS, len(customer_data))
    print(f"Starting processing for {total} customers. Already done: {len(done_ids)}")

    # 3. Loop through customers
    stopped_early = False
    for i, transcripts in enumerate(customer_data[:total]):
        # Skip if we already did this customer
        if i in done_ids:
            # Only print every 10th skipped to avoid clogging screen
            if i % 10 == 0:
                print(f"Skipping customer {i} (already processed)")
            continue

        # Embed the customer's whole transcript list in one model call.
        try:
            embs = embed_transcripts(embedding_model, transcripts)
        except Exception as e:
            print(f"Error on customer {i+1}: {e}")
            stopped_early = True
            break

        processed.append({
            "customer_id": i,
            "transcripts": transcripts,
            "embeddings": embs,
        })

        # Save immediately (atomic) so a crash loses at most the current customer.
        try:
            atomic_pickle(processed, OUTPUT_PKL)
        except Exception as e:
            # The record never reached disk: drop it so `processed` keeps
            # mirroring the file and the final count stays truthful.
            processed.pop()
            print(f"Error on customer {i+1}: {e}")
            stopped_early = True
            break

        # Simple progress print
        print(f"Processed customer {i+1}/{total}")

    if stopped_early:
        print(f"\nStopped early after an error. {len(processed)} customers are saved in '{OUTPUT_PKL}'.")
    else:
        print(f"\nDone. Saved {len(processed)} customers to '{OUTPUT_PKL}'.")

# Run the embedding pipeline only when this code executes as the main module
# (it is not triggered if the code is imported from elsewhere).
if __name__ == "__main__":
    main()

In [None]:
# ==============================================================================
# Check file contents / Validation
# ==============================================================================
import statistics
import numpy as np
import pickle

# We read the file we just created
# (the same file name that OUTPUT_PKL is set to in the config cell).
INPUT_PKL_CHECK = "transcript_embeddings.pkl"

def load_pickle(path: str) -> List[Dict[str, Any]]:
    """
    Unpickles and returns the object stored in the file at `path`.

    NOTE: only use this on files you trust; unpickling can run arbitrary code.
    """
    with open(path, mode="rb") as handle:
        records = pickle.load(handle)
    return records

def cosine_matrix(X: np.ndarray) -> np.ndarray:
    """
    Returns the pairwise cosine-similarity matrix of the rows of `X`.

    Each row is divided by its L2 norm plus 1e-12 (the small constant keeps
    all-zero rows from dividing by zero), and the scaled rows are multiplied
    with their own transpose, giving an (n_rows, n_rows) result.
    """
    row_lengths = np.linalg.norm(X, axis=1, keepdims=True)
    unit_rows = X / (row_lengths + 1e-12)
    return np.matmul(unit_rows, unit_rows.T)

def _print_sample_record(sample) -> None:
    """
    Prints a short sanity check for one record: its customer ID, the number of
    embeddings, the first embedding's dimension, and the pairwise cosine
    similarity matrix of its embeddings.

    Records that cannot be inspected (not a dict, missing or empty embeddings,
    embeddings that do not form a rectangular numeric table) are reported with
    a message instead of raising.
    """
    print("\n--- Sample Record Check ---")

    embeddings = sample.get("embeddings") if isinstance(sample, dict) else None
    if not isinstance(embeddings, list) or not embeddings:
        print("Sample record has no embeddings to inspect; skipping sample check.")
        return

    try:
        E = np.array(embeddings, dtype=np.float32)
    except (TypeError, ValueError):
        E = None
    if E is None or E.ndim != 2:
        print("Sample record embeddings are not a rectangular list of vectors; skipping sample check.")
        return

    print(f"Customer ID: {sample.get('customer_id')}")
    print(f"Number of embeddings: {len(embeddings)}")
    print(f"First embedding dimension: {len(embeddings[0])}")

    # Calculate similarity for the first customer to ensure vectors aren't all zeros
    print("\nCosine Similarity Matrix for Customer 0:")
    print(cosine_matrix(E))


def main_check():
    """
    Validates the embeddings file named by INPUT_PKL_CHECK and prints a report.

    The report covers:
      * structural problems (record is not a dict, or it does not hold exactly
        4 transcripts / 4 embeddings),
      * the set of embedding dimensions found,
      * mean and minimum L2 norm of all embeddings,
      * a sample check of the first record (see _print_sample_record).

    If the file does not exist, a hint is printed and the function returns.
    Nothing is returned; all results go to stdout.
    """
    if not os.path.exists(INPUT_PKL_CHECK):
        print(f"File {INPUT_PKL_CHECK} not found. Run the previous cells first.")
        return

    data = load_pickle(INPUT_PKL_CHECK)
    n_customers = len(data)
    print(f"Loaded {n_customers} customer records from '{INPUT_PKL_CHECK}'.")

    # Storage for stats
    bad_struct = []   # (customer id or record index, problem description)
    dims = []         # length of every embedding vector seen
    norms = []        # L2 norm of every embedding vector seen
    lens_chars = []   # character length of every transcript seen

    for rec_idx, rec in enumerate(data):
        if not isinstance(rec, dict):
            bad_struct.append((rec_idx, "record not dict"))
            continue

        cid = rec.get("customer_id")
        transcripts = rec.get("transcripts")
        embeddings = rec.get("embeddings")

        # Basic validation: We expect 4 transcripts per customer
        if not isinstance(transcripts, list) or len(transcripts) != 4:
            bad_struct.append((cid, "transcripts != 4"))
        if not isinstance(embeddings, list) or len(embeddings) != 4:
            bad_struct.append((cid, "embeddings != 4"))

        # Collect Length Stats
        for t in (transcripts or []):
            if isinstance(t, str):
                lens_chars.append(len(t))

        # Collect Embedding Stats
        for v in (embeddings or []):
            if isinstance(v, (list, tuple, np.ndarray)):
                dims.append(len(v))
                norms.append(float(np.linalg.norm(np.asarray(v, dtype=np.float32))))

    if bad_struct:
        print(f"Found {len(bad_struct)} structural issues: {bad_struct[:5]}")
    else:
        print("All records have 4 transcripts and 4 embeddings.")

    if dims:
        unique_dims = sorted(set(dims))
        # Explanation: Qwen3-0.6B usually has 1024 dimensions.
        print(f"Embedding dimension(s) found: {unique_dims} (Qwen3-0.6B usually returns 1024)")

    if norms:
        # The em dash is written as an escape so the message cannot be garbled
        # again if the file is re-saved with a different text encoding.
        print(f"Embedding L2-norm \u2014 mean: {statistics.mean(norms):.3f}, min: {min(norms):.3f}")

    # Show one example
    if n_customers:
        _print_sample_record(data[0])

# Run the validation report only when this code executes as the main module
# (it is not triggered if the code is imported from elsewhere).
if __name__ == "__main__":
    main_check()