## MS MACRO

In [1]:
import torch
import pandas as pd
import random
import time
import numpy as np
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
DEVICE

'cuda'

Load Datasets

In [4]:
# Load Ground Truth & Candidate Pool
BASE = "ms_macro"
QRELS_FILE = f"{BASE}/qrels.dev.tsv"
CANDS_FILE = f"{BASE}/top1000.dev"   # BM25 top-1000 pool

In [5]:
# Load Ground Truth [question id, unused, passage id, ground truth]
qrels = pd.read_csv(
    QRELS_FILE,
    sep="\t",
    header=None,
    names=["qid", "unused", "pid", "rel"]
)[["qid", "pid", "rel"]]


In [6]:
# Load BM25 candidates [question id passage id, query, passage]
top1000 = pd.read_csv(
    CANDS_FILE,
    sep="\t",
    header=None,
    names=["qid", "pid", "query", "passage"]
)

In [7]:
print("===== Shape of each datasets =====")
print("qrels:", qrels.shape, "| unique qids:", qrels["qid"].nunique())
print("top1000:", top1000.shape, "| unique qids:", top1000["qid"].nunique())

print("\n===== Ground Truth where it says 'relevant' =====")
print(qrels["rel"].value_counts(dropna=False))


print("\n===== Ground Truth Datasets Looking =====")
print(qrels.head())


print("\n===== BM25 Candidate Datasets Looking =====")
print(top1000.head(100))

print("\n===== Making Sure BM25 has actually 1000 passage for each query =====")
count = top1000[top1000['qid'] == 1082792].shape[0] # 12 is random qid here
print(f"Number of rows for Query 188714: {count}")

===== Shape of each datasets =====
qrels: (59273, 3) | unique qids: 55578
top1000: (6668967, 4) | unique qids: 6980

===== Ground Truth where it says 'relevant' =====
rel
1    59273
Name: count, dtype: int64

===== Ground Truth Datasets Looking =====
       qid      pid  rel
0  1102432  2026790    1
1  1102431  7066866    1
2  1102431  7066867    1
3  1090282  7066900    1
4    39449  7066905    1

===== BM25 Candidate Datasets Looking =====
        qid      pid                                              query  \
0    188714  1000052         foods and supplements to lower blood sugar   
1   1082792  1000084  what does the golgi apparatus do to the protei...   
2    995526  1000094           where is the federal penitentiary in ind   
3    199776  1000115               health benefits of eating vegetarian   
4    660957  1000115              what foods are good if you have gout?   
..      ...      ...                                                ...   
95   684459  1001292         

Combine them (all in variable df)

In [8]:
# 1. Check the Answer Key
# adding new colomn 'rel' into df of the top 1000 candidate datasets
# the qid and pid from the qrels are combined into the qid and pid from top 1000 candidate datasets
df = top1000.merge(qrels, on=["qid", "pid"], how="left") # ["qid", "pid", "query", "passage", "rel"]

# 2. Mark wrong answers as 0
# The answer key only contains the Correct (1) answers.
# So, we fill all the empty spots (NaN) with 0.
df["rel"] = df["rel"].fillna(0).astype(int) 


In [9]:
print("Total candidates:", len(df))
print("Distinct queries:", df["qid"].nunique())
print("Relevant pairs in candidates:", int((df["rel"] == 1).sum()))


Total candidates: 6668967
Distinct queries: 6980
Relevant pairs in candidates: 6005


Create 100 Query 
Conditions: 
1. Find query that has exactly 1 relevance mark from ground truth, of total number of Q_LIMIT (no. query, select 100 default)
2. Total passage (relevant + non-relevant) must sum to maximum of K = 1000
3. skip that query it has no these conditions

In [10]:
# 1. Calculate stats for every query
# We count how many relevant items (sum) and how many total candidates (count) each QID has.
q_stats = df.groupby("qid").agg(
    relevant_count=("rel", "sum"),
    total_candidates=("pid", "count")
)

# 2. Filter: Find QIDs that match your strict criteria
# Condition A: Exactly 1 relevant answer
# Condition B: Exactly 1000 candidates (standard for MS MARCO)
valid_qids = q_stats[
    (q_stats["relevant_count"] == 1) & 
    (q_stats["total_candidates"] == 1000)
].index.tolist()

print(f"Found {len(valid_qids)} queries that match your criteria.")

# 3. Randomly select 100 QIDs
# (We use a seed so you get the same 100 queries every time you run this)
random.seed(42)
selected_qids = random.sample(valid_qids, 100)

# 4. Create the final Evaluation Dataset
# We filter the big 'df' to keep only the rows belonging to our 100 chosen queries.
eval_set = df[df["qid"].isin(selected_qids)].copy()

# 5. Verify the Result
print("\n===== Evaluation Set Created =====")
print("Total Rows:", len(eval_set))      # Should be 100,000 (100 queries * 1000 docs)
print("Unique Queries:", eval_set["qid"].nunique()) # Should be 100
print("Total Relevant:", eval_set["rel"].sum())     # Should be 100 (1 per query)

# Show a preview
print(eval_set.head())

Found 5161 queries that match your criteria.

===== Evaluation Set Created =====
Total Rows: 100000
Unique Queries: 100
Total Relevant: 100
       qid      pid                                         query  \
16  995825  1000492  where is the graphic card located in the cpu   
17  995825  1000494  where is the graphic card located in the cpu   
85  480064  1001246                 price chopper locations in ct   
88  480064  1001252                 price chopper locations in ct   
91  480064  1001253                 price chopper locations in ct   

                                              passage  rel  
16  For example, a “PC Expansion Card” maybe the j...    0  
17  The Common Cards & Buses. The most common type...    0  
85  When I want a T-bone steak or chuck steak for ...    0  
88  Ew...Price Chopper has gone downhill. I shoppe...    0  
91  I love going to Price Chopper because I can fi...    0  


Experiment Setups

In [11]:
# --- CONFIGURATION ---
FEED_SIZE = 32   # Batch size (Try 16 or 32 for Pi, 64 or 128 for HP Z2)
MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

# Load Model
model = CrossEncoder(MODEL_NAME, token=False)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=False)

# Prepare Data: Convert DataFrame to a clean list of dictionaries for easier processing
# We keep the original index to map scores back later
data_records = eval_set.reset_index().to_dict('records')

# Pre-compute token lengths for the "Proposed" sorting strategy
# (We do this once to avoid measuring tokenization overhead in the sorting time)
print("Pre-computing token lengths for sorting...")
for row in tqdm(data_records):
    # We simulate the exact input the model sees: [CLS] query [SEP] passage [SEP]
    tokens = tokenizer(row['query'], row['passage'], truncation=True, max_length=512)
    row['token_len'] = len(tokens['input_ids'])


def run_benchmark(records, method_name, sort_by_length=False):
    """
    Runs the reranker with specific batching logic and tracks latency/padding.
    """
    
    # 1. Group by Query (So we can measure Per-Query Latency)
    #    We assume the input 'records' is a list of all 100,000 rows.
    grouped_data = {}
    for r in records:
        if r['qid'] not in grouped_data:
            grouped_data[r['qid']] = []
        grouped_data[r['qid']].append(r)
        
    query_latencies = []
    total_padding_waste = []
    total_actual_tokens = []
    total_chunks = 0
    
    # Storage for results: {index: score}
    score_map = {}
    
    start_global = time.time()
    
    # Process one query at a time (to get min/max/median per query)
    for qid, docs in tqdm(grouped_data.items(), desc=f"Running {method_name}"):
        
        t0 = time.time()
        
        # --- A. SORTING STRATEGY ---
        if sort_by_length:
            # PROPOSED: Sort docs by length (Shortest to Longest)
            # This minimizes padding because similar lengths are batched together
            docs.sort(key=lambda x: x['token_len'])
        else:
            # VANILLA: Random shuffle (simulate typical dataloader)
            # We use a fixed seed for reproducibility
            np.random.RandomState(42).shuffle(docs)
            
        # --- B. BATCHING LOOP ---
        # Create pairs for the model: [[q, p], [q, p]...]
        pairs = [[d['query'], d['passage']] for d in docs]
        indices = [d['index'] for d in docs] # Keep track of original ID
        
        for i in range(0, len(pairs), FEED_SIZE):
            batch_pairs = pairs[i : i + FEED_SIZE]
            batch_indices = indices[i : i + FEED_SIZE]
            
            # 1. Measure Padding Waste (Simulation)
            # We explicitly tokenize the batch to see what the model WOULD do
            encoded = tokenizer(
                batch_pairs, 
                padding=True, 
                truncation=True, 
                max_length=512, 
                return_tensors='pt'
            )
            
            # Calc stats
            batch_max_len = encoded['input_ids'].shape[1]
            batch_size_actual = encoded['input_ids'].shape[0]
            
            # Count non-padding tokens (attention_mask is 1 for real tokens)
            actual_toks = torch.sum(encoded['attention_mask']).item()
            total_slots = batch_max_len * batch_size_actual
            waste = total_slots - actual_toks
            
            total_actual_tokens.append(actual_toks)
            total_padding_waste.append(waste)
            total_chunks += 1
            
            # 2. Actual Inference
            # We perform the actual prediction to get the latency
            batch_scores = model.predict(batch_pairs, batch_size=FEED_SIZE, show_progress_bar=False)
            
            # Store scores
            for idx, score in zip(batch_indices, batch_scores):
                score_map[idx] = score

        # Measure Latency for this one query
        query_latencies.append((time.time() - t0) * 1000) # ms

    total_wall_time = time.time() - start_global
    
    return {
        "scores": score_map,
        "latencies": query_latencies,
        "wall_time": total_wall_time,
        "chunks": total_chunks,
        "avg_actual": np.mean(total_actual_tokens) / FEED_SIZE, # Approx per chunk
        "avg_padded": (np.sum(total_actual_tokens) + np.sum(total_padding_waste)) / total_chunks / FEED_SIZE, # Approx
        "waste_pct": (np.sum(total_padding_waste) / (np.sum(total_actual_tokens) + np.sum(total_padding_waste))) * 100
    }

def calculate_metrics(df, score_col):
    """
    Calculates MRR@10/100 and HitRate@10/100
    """
    mrr10, mrr100, hit10, hit100 = [], [], [], []
    
    for qid, group in df.groupby("qid"):
        # Sort by the Cross Encoder Score (High to Low)
        sorted_group = group.sort_values(score_col, ascending=False).reset_index(drop=True)
        
        # Find where the relevant doc (rel=1) is
        # We assume only 1 relevant doc per query for this dataset
        relevant_rank = sorted_group.index[sorted_group['rel'] == 1].tolist()
        
        if not relevant_rank:
            # Should not happen in our filtered dataset
            mrr10.append(0); mrr100.append(0); hit10.append(0); hit100.append(0)
            continue
            
        rank = relevant_rank[0] + 1 # 1-based index
        
        # MRR
        mrr10.append(1/rank if rank <= 10 else 0)
        mrr100.append(1/rank if rank <= 100 else 0)
        
        # Hit Rate
        hit10.append(1 if rank <= 10 else 0)
        hit100.append(1 if rank <= 100 else 0)
        
    return np.mean(mrr10), np.mean(mrr100), np.mean(hit10), np.mean(hit100)

Loading weights: 100%|██████████| 105/105 [00:00<00:00, 1588.80it/s, Materializing param=classifier.weight]                                    
BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Pre-computing token lengths for sorting...


100%|██████████| 100000/100000 [00:11<00:00, 8433.64it/s]


Vanilla Cross Encoder (Random Order)

In [12]:
# --- 1. RUN VANILLA (Random Order) ---
results_vanilla = run_benchmark(data_records, "VANILLA", sort_by_length=False)

# Store scores back to dataframe
eval_set['score_vanilla'] = eval_set.index.map(results_vanilla['scores'])

# Calc Metrics
mrr10_v, mrr100_v, hit10_v, hit100_v = calculate_metrics(eval_set, 'score_vanilla')

# Print Vanilla Report
print("\n" + "="*40)
print(f"[VANILLA] Queries: {eval_set['qid'].nunique()} | total pairs: {len(eval_set):,}")
print(f"[VANILLA] Per-query CE latency (feed {FEED_SIZE} at a time):")
lats = results_vanilla['latencies']
print(f"  mean = {np.mean(lats):.1f} ms   median = {np.median(lats):.1f} ms   min/max = {np.min(lats):.1f}/{np.max(lats):.1f} ms")
print(f"  total wall time = {results_vanilla['wall_time']:.2f} s")

print(f"\n[VANILLA] Token padding waste (per {FEED_SIZE}-sized chunk):")
print(f"  avg padded length  = {results_vanilla['avg_padded']:.1f} tokens")
print(f"  avg actual length  = {results_vanilla['avg_actual']:.1f} tokens")
print(f"  avg padding waste  = {results_vanilla['waste_pct']:.1f}% (lower is better)")
print(f"  chunks processed   = {results_vanilla['chunks']:,}")

print("\n[VANILLA] === Metrics ===")
print(f"MRR@10      : {mrr10_v:.4f}")
print(f"MRR@100     : {mrr100_v:.4f}")
print(f"HitRate@10  : {hit10_v:.4f}")
print(f"HitRate@100 : {hit100_v:.4f}")


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Running VANILLA: 100%|██████████| 100/100 [00:59<00:00,  1.69it/s]


[VANILLA] Queries: 100 | total pairs: 100,000
[VANILLA] Per-query CE latency (feed 32 at a time):
  mean = 590.8 ms   median = 558.8 ms   min/max = 418.6/899.3 ms
  total wall time = 59.15 s

[VANILLA] Token padding waste (per 32-sized chunk):
  avg padded length  = 161.4 tokens
  avg actual length  = 82.7 tokens
  avg padding waste  = 48.8% (lower is better)
  chunks processed   = 3,200

[VANILLA] === Metrics ===
MRR@10      : 0.4336
MRR@100     : 0.4431
HitRate@10  : 0.7600
HitRate@100 : 0.9600





Proposed Cross Encoder (Sorted Order)

In [13]:
# --- 2. RUN PROPOSED (Sorted Order) ---
results_proposed = run_benchmark(data_records, "PROPOSED", sort_by_length=True)

# Store scores
eval_set['score_proposed'] = eval_set.index.map(results_proposed['scores'])

# Calc Metrics (Should be identical or very close to Vanilla)
mrr10_p, mrr100_p, hit10_p, hit100_p = calculate_metrics(eval_set, 'score_proposed')

# Print Proposed Report
print("\n" + "="*40)
print(f"[PROPOSED] Queries: {eval_set['qid'].nunique()} | total pairs: {len(eval_set):,}")
print(f"[PROPOSED] Per-query CE latency (feed {FEED_SIZE} at a time):")
lats_p = results_proposed['latencies']
print(f"  mean = {np.mean(lats_p):.1f} ms   median = {np.median(lats_p):.1f} ms   min/max = {np.min(lats_p):.1f}/{np.max(lats_p):.1f} ms")
print(f"  total wall time = {results_proposed['wall_time']:.2f} s")

print(f"\n[PROPOSED] Token padding waste (per {FEED_SIZE}-sized chunk):")
print(f"  avg padded length  = {results_proposed['avg_padded']:.1f} tokens")
print(f"  avg actual length  = {results_proposed['avg_actual']:.1f} tokens")
print(f"  avg padding waste  = {results_proposed['waste_pct']:.1f}% (lower is better)")
print(f"  chunks processed   = {results_proposed['chunks']:,}")

print("\n[PROPOSED] === Metrics ===")
print(f"MRR@10      : {mrr10_v:.4f}")
print(f"MRR@100     : {mrr100_v:.4f}")
print(f"HitRate@10  : {hit10_v:.4f}")
print(f"HitRate@100 : {hit100_v:.4f}")

Running PROPOSED: 100%|██████████| 100/100 [00:33<00:00,  3.02it/s]


[PROPOSED] Queries: 100 | total pairs: 100,000
[PROPOSED] Per-query CE latency (feed 32 at a time):
  mean = 330.2 ms   median = 328.4 ms   min/max = 247.5/536.1 ms
  total wall time = 33.09 s

[PROPOSED] Token padding waste (per 32-sized chunk):
  avg padded length  = 85.3 tokens
  avg actual length  = 82.7 tokens
  avg padding waste  = 3.1% (lower is better)
  chunks processed   = 3,200

[PROPOSED] === Metrics ===
MRR@10      : 0.4336
MRR@100     : 0.4431
HitRate@10  : 0.7600
HitRate@100 : 0.9600





## Retail Dataset

In [14]:
import os
import json
import time
import numpy as np
import pandas as pd
import torch
import chromadb
from tqdm import tqdm
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer

# --- HUGGING FACE TOKEN FIX ---
os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1" 

# --- CONFIGURATION ---
FEED_SIZE = 32      # Batch size
DATASET_PATH = "retail_qna_eval_100.json"
CHROMA_PATH = "db"
MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

# --- LlamaIndex & LangChain Imports ---
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex, Settings, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

print("Libraries loaded successfully.")

Libraries loaded successfully.


In [15]:
# 1. Setup Embedding Model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model
Settings.llm = None

# 2. Connect to ChromaDB
print(f"Connecting to ChromaDB at {CHROMA_PATH}...")
db2 = chromadb.PersistentClient(path=CHROMA_PATH)
chroma_collection = db2.get_or_create_collection("retail_qna")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# 3. Create Dense Retriever (LlamaIndex)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
dense_retriever = VectorIndexRetriever(index=index, similarity_top_k=50)
dense_query_engine = RetrieverQueryEngine(retriever=dense_retriever)

# 4. Create Sparse Retriever (BM25)
# We pull all docs from Chroma to build the BM25 index in memory
print("Building BM25 Index from Chroma documents...")
all_docs = chroma_collection.get()
documents = [Document(page_content=text, metadata={}) for text in all_docs['documents']]
bm25_retriever = BM25Retriever.from_documents(documents=documents, k=50)

print(f"Retrievers Ready. Total Documents in DB: {len(documents)}")

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1532.40it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


LLM is explicitly disabled. Using MockLLM.
Connecting to ChromaDB at db...
Building BM25 Index from Chroma documents...
Retrievers Ready. Total Documents in DB: 169


In [16]:
# 1. Load Queries
with open(DATASET_PATH, 'r') as f:
    query_data = json.load(f)

# Extract just the text strings
queries = [item['text'] for item in query_data]
print(f"Loaded {len(queries)} queries from {DATASET_PATH}")

# 2. Run Retrieval Loop (Hybrid: Dense + Sparse)
retrieved_records = []
global_idx = 0

print("Retrieving candidates for all queries...")
for q_idx, query_text in tqdm(enumerate(queries), total=len(queries)):
    
    # A. Dense Retrieval
    dense_results = dense_query_engine.retrieve(query_text)
    dense_texts = [n.node.get_text() for n in dense_results]
    
    # B. Sparse Retrieval (BM25)
    sparse_results = bm25_retriever.invoke(query_text) # .invoke is newer than get_relevant_documents
    sparse_texts = [doc.page_content for doc in sparse_results]
    
    # C. Combine & Deduplicate
    # We use a set to ensure we don't rerank the same passage twice
    unique_candidates = list(set(dense_texts + sparse_texts))
    
    # D. Store for Reranking
    for passage in unique_candidates:
        retrieved_records.append({
            'index': global_idx,
            'qid': q_idx,           # Group by Query ID
            'query': query_text,
            'passage': passage
        })
        global_idx += 1

print(f"Total pairs to rerank: {len(retrieved_records)}")

Loaded 100 queries from retail_qna_eval_100.json
Retrieving candidates for all queries...


100%|██████████| 100/100 [00:01<00:00, 76.01it/s]

Total pairs to rerank: 7601





In [17]:
# --- Load Cross Encoder ---
# Using the token fix to ensure it loads
print("Loading Cross-Encoder...")
model = CrossEncoder(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- Pre-compute Token Lengths (Crucial for Proposed Method) ---
print("Pre-computing token lengths...")
for row in tqdm(retrieved_records):
    # Fast tokenization (no padding yet) just to get length
    tokens = tokenizer(row['query'], row['passage'], truncation=True, max_length=512)
    row['token_len'] = len(tokens['input_ids'])

# --- Define Benchmark Function ---
def run_benchmark(records, method_name, sort_by_length=False):
    grouped_data = {}
    for r in records:
        if r['qid'] not in grouped_data: grouped_data[r['qid']] = []
        grouped_data[r['qid']].append(r)
        
    query_latencies = []
    total_padding_waste = []
    total_actual_tokens = []
    total_chunks = 0
    start_global = time.time()
    
    for qid, docs in tqdm(grouped_data.items(), desc=f"Running {method_name}"):
        t0 = time.time()
        
        # STRATEGY: Sort vs Shuffle
        if sort_by_length:
            docs.sort(key=lambda x: x['token_len']) # PROPOSED
        else:
            np.random.RandomState(42).shuffle(docs) # VANILLA
            
        pairs = [[d['query'], d['passage']] for d in docs]
        
        # BATCHING LOOP
        for i in range(0, len(pairs), FEED_SIZE):
            batch_pairs = pairs[i : i + FEED_SIZE]
            
            # 1. Measure Padding Waste
            encoded = tokenizer(batch_pairs, padding=True, truncation=True, max_length=512, return_tensors='pt')
            actual = torch.sum(encoded['attention_mask']).item()
            waste = (encoded['input_ids'].shape[0] * encoded['input_ids'].shape[1]) - actual
            
            total_actual_tokens.append(actual)
            total_padding_waste.append(waste)
            total_chunks += 1
            
            # 2. Run Inference
            _ = model.predict(batch_pairs, batch_size=FEED_SIZE, show_progress_bar=False)

        query_latencies.append((time.time() - t0) * 1000)

    total_wall_time = time.time() - start_global
    
    return {
        "latencies": query_latencies,
        "wall_time": total_wall_time,
        "chunks": total_chunks,
        "avg_actual": np.mean(total_actual_tokens) / FEED_SIZE,
        "avg_padded": (np.sum(total_actual_tokens) + np.sum(total_padding_waste)) / total_chunks / FEED_SIZE,
        "waste_pct": (np.sum(total_padding_waste) / (np.sum(total_actual_tokens) + np.sum(total_padding_waste))) * 100
    }

# --- EXECUTE BENCHMARKS ---

# 1. VANILLA
results_vanilla = run_benchmark(retrieved_records, "VANILLA", sort_by_length=False)

# 2. PROPOSED
results_proposed = run_benchmark(retrieved_records, "PROPOSED", sort_by_length=True)

# --- PRINT REPORTS ---

print("\n" + "="*40)
print(f"[VANILLA] Queries: {len(queries)} | total pairs: {len(retrieved_records):,}")
print(f"[VANILLA] Per-query CE latency (feed {FEED_SIZE} at a time):")
lats_v = results_vanilla['latencies']
print(f"  mean = {np.mean(lats_v):.1f} ms   median = {np.median(lats_v):.1f} ms   min/max = {np.min(lats_v):.1f}/{np.max(lats_v):.1f} ms")
print(f"  total wall time = {results_vanilla['wall_time']:.2f} s")
print(f"\n[VANILLA] Token padding waste (per {FEED_SIZE}-sized chunk):")
print(f"  avg padded length  = {results_vanilla['avg_padded']:.1f} tokens")
print(f"  avg actual length  = {results_vanilla['avg_actual']:.1f} tokens")
print(f"  avg padding waste  = {results_vanilla['waste_pct']:.1f}% (lower is better)")
print(f"  chunks processed   = {results_vanilla['chunks']:,}")


print("\n" + "="*40)
print(f"[PROPOSED] Queries: {len(queries)} | total pairs: {len(retrieved_records):,}")
print(f"[PROPOSED] Per-query CE latency (feed {FEED_SIZE} at a time):")
lats_p = results_proposed['latencies']
print(f"  mean = {np.mean(lats_p):.1f} ms   median = {np.median(lats_p):.1f} ms   min/max = {np.min(lats_p):.1f}/{np.max(lats_p):.1f} ms")
print(f"  total wall time = {results_proposed['wall_time']:.2f} s")
print(f"\n[PROPOSED] Token padding waste (per {FEED_SIZE}-sized chunk):")
print(f"  avg padded length  = {results_proposed['avg_padded']:.1f} tokens")
print(f"  avg actual length  = {results_proposed['avg_actual']:.1f} tokens")
print(f"  avg padding waste  = {results_proposed['waste_pct']:.1f}% (lower is better)")
print(f"  chunks processed   = {results_proposed['chunks']:,}")

Loading Cross-Encoder...


Loading weights: 100%|██████████| 105/105 [00:00<00:00, 1555.34it/s, Materializing param=classifier.weight]                                    
BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Pre-computing token lengths...


100%|██████████| 7601/7601 [00:03<00:00, 2316.07it/s]
Running VANILLA: 100%|██████████| 100/100 [00:14<00:00,  6.87it/s]
Running PROPOSED: 100%|██████████| 100/100 [00:08<00:00, 12.15it/s]


[VANILLA] Queries: 100 | total pairs: 7,601
[VANILLA] Per-query CE latency (feed 32 at a time):
  mean = 144.8 ms   median = 148.4 ms   min/max = 102.0/472.8 ms
  total wall time = 14.56 s

[VANILLA] Token padding waste (per 32-sized chunk):
  avg padded length  = 398.9 tokens
  avg actual length  = 186.6 tokens
  avg padding waste  = 53.2% (lower is better)
  chunks processed   = 300

[PROPOSED] Queries: 100 | total pairs: 7,601
[PROPOSED] Per-query CE latency (feed 32 at a time):
  mean = 82.0 ms   median = 81.9 ms   min/max = 67.6/95.5 ms
  total wall time = 8.23 s

[PROPOSED] Token padding waste (per 32-sized chunk):
  avg padded length  = 266.2 tokens
  avg actual length  = 186.6 tokens
  avg padding waste  = 29.9% (lower is better)
  chunks processed   = 300



