In [1]:
# Single-cell DTDR + "Double" Composite-DTDR RAG Experiment
# --------------------------------------------------------
# Demonstrates: DTDR is a computational representation (compute + retrieval in transform domain),
# and that composing orthogonal transforms (e.g., Hadamard then DCT) still supports high-quality RAG retrieval.
#
# Requires: sentence-transformers, numpy, scipy. (scipy.linalg.hadamard is always needed; scipy.fft.dct is used for the DCT stage when it can be imported.)

# ---- Environment bootstrap (safe re-run) ----

# Probe for Google Colab by attempting to import `google.colab`.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not in Colab". A bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit raised while the probe runs.
    IN_COLAB = False

import numpy as np
import random

# NOTE(review): FAST_MODE is not read anywhere in the visible part of this file.
FAST_MODE = True

# Initial seeding of both RNGs. SEED is reassigned to 123 (and both RNGs are
# re-seeded) further down in this cell, so 42 only governs random draws made
# before that point.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

import sys
import subprocess

def ensure(pkg):
    """Install *pkg* with pip, using the interpreter that runs this script.

    Raises ``subprocess.CalledProcessError`` when pip exits with a non-zero
    status (behaviour of ``subprocess.check_call``).
    """
    pip_command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(pip_command)

# pip distribution names the bootstrap makes sure are installed.
required = ["numpy", "scipy", "matplotlib", "scikit-learn", "requests"]

# Some distributions are imported under a different module name than the one
# pip knows them by. Probing the pip name ("scikit-learn") can never succeed,
# which would trigger a pip run on every execution of this cell.
_IMPORT_NAMES = {"scikit-learn": "sklearn"}

for r in required:
    try:
        __import__(_IMPORT_NAMES.get(r, r))
    except ImportError:
        ensure(r)

import os, re, math, time, random, textwrap, urllib.request
from dataclasses import dataclass
from typing import List, Tuple, Dict

import numpy as np
from scipy.linalg import hadamard

from sentence_transformers import SentenceTransformer

# Optional DCT. scipy itself is not optional here (scipy.linalg.hadamard is
# imported unconditionally above); this guard only covers scipy.fft.dct being
# unavailable.
try:
    from scipy.fft import dct as scipy_dct
    HAVE_DCT = True
except Exception:
    # Without a DCT, the "dct" stage in apply_transform_vector /
    # apply_transform_matrix falls back to the Hadamard matrix.
    HAVE_DCT = False

# Seed used for the rest of the experiment. It replaces the SEED = 42 set during
# the bootstrap and is the value later passed to the corruption helpers.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)

# Local directory (relative to the working directory) where the book texts are
# cached; created if missing.
BOOKS_DIR = "books"
os.makedirs(BOOKS_DIR, exist_ok=True)

# (local file name, source URL) pairs for the Project Gutenberg plain-text books
# that make up the retrieval corpus.
GUTENBERG = [
    ("alice_in_wonderland.txt", "https://www.gutenberg.org/cache/epub/11/pg11.txt"),
    ("pride_and_prejudice.txt", "https://www.gutenberg.org/cache/epub/1342/pg1342.txt"),
    ("frankenstein.txt", "https://www.gutenberg.org/cache/epub/84/pg84.txt"),
    ("moby_dick.txt", "https://www.gutenberg.org/cache/epub/2701/pg2701.txt"),
    ("sherlock_holmes.txt", "https://www.gutenberg.org/cache/epub/1661/pg1661.txt"),
    ("dracula.txt", "https://www.gutenberg.org/cache/epub/345/pg345.txt"),
]

def download(url: str, path: str, timeout: int = 30) -> bool:
    """Fetch *url* and save its body to *path* as UTF-8 text.

    The payload is decoded as UTF-8, falling back to Latin-1 if that fails.
    Every exception raised along the way is reported as a ``[WARN]`` line on
    stdout instead of propagating.

    Args:
        url: Address to fetch.
        path: Destination file; overwritten if it exists.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        True if the file was written, False if anything failed.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent":"Mozilla/5.0"})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        try:
            decoded = payload.decode("utf-8")
        except UnicodeDecodeError:
            decoded = payload.decode("latin-1")
        with open(path, "w", encoding="utf-8", errors="ignore") as sink:
            sink.write(decoded)
    except Exception as e:
        print(f"[WARN] Download failed: {url} ({e})")
        return False
    return True

def read_text_file(path: str) -> str:
    """Return the full contents of *path* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

def clean_gutenberg(text: str) -> str:
    """Cut the text down to the span between the Gutenberg START/END markers.

    If both markers are found and START ends before END begins, only the text
    between them is kept. The slice starts right after the matched words, so
    the remainder of the START marker line stays in the result. In every case
    runs of whitespace are collapsed to single spaces and the result is
    stripped.
    """
    head = re.search(r"\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG", text, flags=re.IGNORECASE)
    tail = re.search(r"\*\*\*\s*END OF (THIS|THE) PROJECT GUTENBERG", text, flags=re.IGNORECASE)
    body = text
    if head is not None and tail is not None:
        lo, hi = head.end(), tail.start()
        if lo < hi:
            body = text[lo:hi]
    return re.sub(r"\s+", " ", body).strip()

def chunk_text(text: str, chunk_chars: int = 1200, overlap: int = 150, min_len: int = 250) -> List[str]:
    """Split *text* into overlapping character windows.

    Windows start every ``max(1, chunk_chars - overlap)`` characters and span up
    to ``chunk_chars`` characters. Each window is stripped, and windows shorter
    than ``min_len`` after stripping are discarded.
    """
    stride = max(1, chunk_chars - overlap)
    windows = (text[pos:pos + chunk_chars].strip() for pos in range(0, len(text), stride))
    return [window for window in windows if len(window) >= min_len]

print("Books directory:", os.path.abspath(BOOKS_DIR))
for fname, url in GUTENBERG:
    path = os.path.join(BOOKS_DIR, fname)
    # A cached file of at least 2000 bytes is treated as a good copy.
    if os.path.exists(path) and os.path.getsize(path) >= 2000:
        continue
    print("Downloading:", fname)
    download(url, path)

# Keep only the .txt files that belong to our Gutenberg list (this skips e.g.
# demo_book_*.txt), sorted by path.
include_books = {fn for fn, _ in GUTENBERG}
book_paths = sorted(
    os.path.join(BOOKS_DIR, entry)
    for entry in os.listdir(BOOKS_DIR)
    if entry.lower().endswith(".txt") and entry in include_books
)

print("Found .txt books:", len(book_paths))
print("First few:", book_paths[:3])

@dataclass
class Passage:
    """One retrievable chunk of a book."""
    # Base file name of the book the chunk came from.
    book: str
    # Position of the chunk within its book's chunk list (0-based).
    idx: int
    # The chunk text.
    text: str

def build_passages(paths: List[str], chunk_chars: int = 1200, overlap: int = 150,
                   max_passages_per_book: int = 250, min_len: int = 250) -> List[Passage]:
    """Read, clean and chunk each book into a flat list of ``Passage`` records.

    Args:
        paths: Paths of the text files to process; output follows this order.
        chunk_chars: Window length in characters, forwarded to ``chunk_text``.
        overlap: Characters shared by consecutive windows, forwarded to
            ``chunk_text``.
        max_passages_per_book: Keep at most this many leading chunks per book.
        min_len: Minimum stripped chunk length, forwarded to ``chunk_text``.
            Previously hard-coded to 250, which remains the default.

    Returns:
        Passages of all books; ``idx`` restarts at 0 for every book.

    Prints one summary line per book.
    """
    out: List[Passage] = []
    for p in paths:
        bn = os.path.basename(p)
        txt = clean_gutenberg(read_text_file(p))
        chunks = chunk_text(txt, chunk_chars=chunk_chars, overlap=overlap, min_len=min_len)
        chunks = chunks[:max_passages_per_book]
        print(f"  {bn}: {len(chunks)} passages")
        out.extend(Passage(book=bn, idx=i, text=ch) for i, ch in enumerate(chunks))
    return out

# Chunk every book into passages. `texts` and `books` are parallel lists indexed
# the same way as `passages`.
passages = build_passages(book_paths, chunk_chars=1200, overlap=150, max_passages_per_book=250)
texts = [p.text for p in passages]
books = [p.book for p in passages]
print("Total passages:", len(passages))
# Sanity check on corpus size. Note that `assert` statements are skipped when
# Python runs with -O.
assert len(passages) > 200, "Too few passages; increase max_passages_per_book."

# ---------------- Embeddings ----------------
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
print("Loading embedding model:", MODEL_NAME)
model = SentenceTransformer(MODEL_NAME)

# Encode all passages once and time it. normalize_embeddings=True presumably
# yields unit-length rows, so the dot products used later act as cosine
# similarity -- confirm against the sentence-transformers documentation.
t0 = time.time()
emb = model.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
emb = np.asarray(emb, dtype=np.float32)
t1 = time.time()
print(f"Embeddings: {emb.shape}  time={t1-t0:.1f}s")

# ---------------- DTDR core ----------------
def next_pow2(n: int) -> int:
    """Return the smallest power of two that is >= ``n`` (1 when ``n <= 1``).

    Integer bit arithmetic is used instead of ``math.log2``: the float
    logarithm loses precision for large ``n`` (``log2(2**60 + 1)`` evaluates to
    exactly 60.0), which made the previous version return a power of two
    smaller than ``n``.
    """
    if n <= 1:
        return 1
    # math.ceil keeps non-integer inputs working: they round up before the bit trick.
    return 1 << (math.ceil(n) - 1).bit_length()

def make_hadamard(n: int) -> np.ndarray:
    """Return scipy's ``hadamard(n)`` as float32, scaled by ``1 / sqrt(n)``.

    With that scaling the rows have unit length. scipy's ``hadamard`` requires
    ``n`` to be a power of two (see its documentation).
    """
    mat = hadamard(n).astype(np.float32)
    # In-place division, same as `mat /= np.sqrt(n)`.
    np.divide(mat, np.sqrt(n), out=mat)
    return mat

def pad_to_dh(X: np.ndarray, d_h: int) -> np.ndarray:
    """Zero-pad the columns of a 2-D array on the right up to width ``d_h``.

    When ``X`` already has ``d_h`` columns the input array itself is returned
    (no copy). Otherwise a new array of the same dtype is allocated.
    """
    n_rows, n_cols = X.shape
    if n_cols == d_h:
        return X
    padded = np.zeros((n_rows, d_h), dtype=X.dtype)
    padded[:, :n_cols] = X
    return padded

def quantize_blockwise(U: np.ndarray, block: int = 16) -> Tuple[np.ndarray, np.ndarray]:
    """Quantize each row of ``U`` to int8 with one symmetric scale per block.

    Columns are grouped into consecutive blocks of ``block`` entries (the last
    block may be shorter). For every row and block the scale is
    ``(max |value| + 1e-12) / 127``, and values are stored as
    ``round(value / scale)`` clipped to ``[-127, 127]``.

    Returns:
        ``(codes, scales)``: ``codes`` is int8 with the shape of ``U``;
        ``scales`` is float32 with shape ``(rows, number_of_blocks)``.
    """
    n_rows, width = U.shape
    nblocks = (width + block - 1) // block
    qmax = 127
    codes = np.zeros((n_rows, width), dtype=np.int8)
    scales = np.zeros((n_rows, nblocks), dtype=np.float32)
    for b in range(nblocks):
        lo = b * block
        hi = min(width, lo + block)
        tile = U[:, lo:hi]
        # The tiny epsilon keeps the scale non-zero for all-zero blocks.
        step = (np.max(np.abs(tile), axis=1) + 1e-12) / qmax
        scales[:, b] = step
        rounded = np.round(tile / step[:, None])
        codes[:, lo:hi] = np.clip(rounded, -qmax, qmax).astype(np.int8)
    return codes, scales

def apply_transform_vector(v: np.ndarray, kind: str, H: np.ndarray) -> np.ndarray:
    """Apply one named transform to a single vector and return float32.

    Args:
        v: Input vector; its length must match ``H`` for the Hadamard paths.
        kind: ``"hadamard"`` computes ``H @ v``. ``"dct"`` applies scipy's DCT
            with ``norm="ortho"``, or ``H @ v`` when scipy's DCT is unavailable.
        H: Hadamard matrix used by the ``"hadamard"`` path and the fallback.

    Raises:
        ValueError: If ``kind`` is not one of the names above.
    """
    if kind == "hadamard":
        return (H @ v).astype(np.float32)
    if kind == "dct":
        if not HAVE_DCT:
            # fallback: use another Hadamard instead (still orthogonal)
            # NOTE(review): if H is symmetric with H @ H == I (as for a
            # normalized Sylvester-type Hadamard), a ("hadamard", "dct")
            # pipeline cancels out in this fallback -- confirm that is acceptable.
            return (H @ v).astype(np.float32)
        return scipy_dct(v.astype(np.float32), norm="ortho").astype(np.float32)
    # Build the message explicitly: ValueError("...", kind) stores a 2-tuple and
    # renders as "('Unknown transform kind:', 'x')".
    raise ValueError(f"Unknown transform kind: {kind!r}")

def apply_transform_matrix(X: np.ndarray, kind: str, H: np.ndarray) -> np.ndarray:
    """Apply one named transform to every row of ``X`` and return float32.

    Args:
        X: 2-D array with one vector per row.
        kind: ``"hadamard"`` computes ``X @ H.T``. ``"dct"`` applies scipy's DCT
            along axis 1 with ``norm="ortho"``, or ``X @ H.T`` when scipy's DCT
            is unavailable.
        H: Hadamard matrix used by the ``"hadamard"`` path and the fallback.

    Raises:
        ValueError: If ``kind`` is not one of the names above.
    """
    if kind == "hadamard":
        return (X @ H.T).astype(np.float32)
    if kind == "dct":
        if not HAVE_DCT:
            # Fallback to the Hadamard matrix when no DCT is available.
            # NOTE(review): if H is symmetric with H @ H == I, a
            # ("hadamard", "dct") pipeline cancels out in this fallback --
            # confirm that is acceptable.
            return (X @ H.T).astype(np.float32)
        # DCT along last axis for each row
        return scipy_dct(X.astype(np.float32), axis=1, norm="ortho").astype(np.float32)
    # Build the message explicitly: ValueError("...", kind) stores a 2-tuple and
    # renders as "('Unknown transform kind:', 'x')".
    raise ValueError(f"Unknown transform kind: {kind!r}")

def build_dtdr(X: np.ndarray, H: np.ndarray, block: int, transforms: Tuple[str, ...]) -> Tuple[np.ndarray, np.ndarray]:
    """Transform the rows of ``X`` stage by stage, then quantize the result.

    Each name in ``transforms`` is applied in order via
    ``apply_transform_matrix``; the final coefficients go through
    ``quantize_blockwise`` with the given ``block`` size.

    Returns:
        The ``(codes, scales)`` pair produced by ``quantize_blockwise``.
    """
    coeffs = X
    for stage in transforms:
        coeffs = apply_transform_matrix(coeffs, stage, H)
    codes, block_scales = quantize_blockwise(coeffs, block=block)
    return codes, block_scales

def dt_scores(qemb: np.ndarray, qcoef: np.ndarray, scales: np.ndarray, H: np.ndarray, block: int, transforms: Tuple[str, ...]) -> np.ndarray:
    """Score every stored row against a query in the transform domain.

    The query (expected to be already padded to the coefficient width by the
    caller) is pushed through the same ``transforms`` as the stored data. The
    score of a row is the sum over blocks of
    ``scale[row, block] * dot(codes[row, block], query[block])``.

    Returns:
        float32 array with one score per row of ``qcoef``.
    """
    query = qemb.astype(np.float32)
    for stage in transforms:
        query = apply_transform_vector(query, stage, H)

    codes = qcoef.astype(np.float32)
    block_scales = scales.astype(np.float32)
    totals = np.zeros((codes.shape[0],), dtype=np.float32)
    for b in range(block_scales.shape[1]):
        lo = b * block
        hi = min(codes.shape[1], lo + block)
        partial = (codes[:, lo:hi] * query[lo:hi]).sum(axis=1)
        totals += partial * block_scales[:, b]
    return totals

def topk(scores: np.ndarray, k: int) -> np.ndarray:
    """Return the indices of the ``k`` largest scores, best first.

    When ``k`` is at least the number of scores, all indices are returned in
    descending score order.
    """
    k = int(k)
    negated = -scores
    if k >= scores.shape[0]:
        return np.argsort(negated)
    # Partial selection first, then order just the k selected entries.
    candidates = np.argpartition(negated, k)[:k]
    order = np.argsort(negated[candidates])
    return candidates[order]

# Parameters
# BLOCK: number of coefficients that share one quantization scale.
BLOCK = 16
# d_in: embedding width; d_h: next power of two >= d_in, used as the transform width.
d_in = emb.shape[1]
d_h = next_pow2(d_in)
H = make_hadamard(d_h)
# Embeddings zero-padded on the right to d_h columns.
X = pad_to_dh(emb, d_h)

# Single DTDR = Hadamard
single_tf = ("hadamard",)
q_single, s_single = build_dtdr(X, H, BLOCK, single_tf)

# "Double" Composite DTDR = Hadamard then DCT (or Hadamard fallback if no DCT)
double_tf = ("hadamard", "dct")
q_double, s_double = build_dtdr(X, H, BLOCK, double_tf)

# Report the shapes of both quantized indexes and whether a real DCT was used.
print(f"DTDR (single) built: q={q_single.shape} scales={s_single.shape} blocks={s_single.shape[1]}  tf={single_tf}")
print(f"DTDR (double) built: q={q_double.shape} scales={s_double.shape} blocks={s_double.shape[1]}  tf={double_tf}  (DCT={'yes' if HAVE_DCT else 'no-fallback'})")

# Float baseline for overlap
def float_scores(qemb_small: np.ndarray) -> np.ndarray:
    """Dot product of every unquantized passage embedding with the query.

    ``qemb_small`` is the unpadded query embedding (same width as ``emb``).
    """
    query = qemb_small.astype(np.float32)
    return emb @ query

# ---------------- RAG-style queries ----------------
# Evaluation set: (question, file name of the book expected among the hits,
# anchor substrings looked for in the retrieved passage text).
RAG_QUERIES = [
    ("Who is the White Rabbit and what is he doing?", "alice_in_wonderland.txt", ["white rabbit", "rabbit"]),
    ("What does Elizabeth think of Mr. Darcy early on?", "pride_and_prejudice.txt", ["darcy", "elizabeth"]),
    ("Who created the creature and what was the consequence?", "frankenstein.txt", ["frankenstein", "creature"]),
    ("What is Captain Ahab obsessed with?", "moby_dick.txt", ["ahab", "whale"]),
    ("What is Sherlock Holmes known for in solving mysteries?", "sherlock_holmes.txt", ["holmes", "watson"]),
    ("Who is Count Dracula and what is his nature?", "dracula.txt", ["dracula", "count"]),
]

# Lower-cased passage texts for case-insensitive anchor matching; indexed like
# `texts`. (clean_gutenberg already collapsed whitespace, so the re.sub here is
# mostly a safeguard.)
norm_passages = [re.sub(r"\s+", " ", t.lower()) for t in texts]

def book_hit_at_k(idxs: np.ndarray, expected_book: str) -> int:
    """Return 1 if any retrieved passage index belongs to ``expected_book``, else 0."""
    for i in idxs:
        if books[i] == expected_book:
            return 1
    return 0

def anchor_hit_at_k(idxs: np.ndarray, anchors: List[str]) -> int:
    """Return 1 if any retrieved passage contains any anchor substring, else 0.

    Matching is case-insensitive: anchors are lower-cased and compared against
    the lower-cased passage texts in ``norm_passages``.
    """
    needles = [a.lower() for a in anchors]
    retrieved_texts = (norm_passages[i] for i in idxs)
    found = any(any(needle in body for needle in needles) for body in retrieved_texts)
    return 1 if found else 0

def eval_rag(qcoef: np.ndarray, scales: np.ndarray, transforms: Tuple[str, ...], label: str, K: int = 8) -> Dict[str, float]:
    """Evaluate transform-domain retrieval on ``RAG_QUERIES``.

    For each question the top-``K`` passages from the quantized index are
    compared with the top-``K`` of the float baseline, and checked for the
    expected book and the anchor substrings.

    Returns:
        A dict with the given ``label`` under ``"mode"``, ``K`` as a float, and
        the three metrics averaged over all questions.
    """
    overlap_fracs, book_flags, anchor_flags = [], [], []
    for question, exp_book, anchors in RAG_QUERIES:
        q_vec = model.encode([question], normalize_embeddings=True)[0].astype(np.float32)
        reference = topk(float_scores(q_vec), K)

        # The quantized index works at width d_h, so zero-pad the query to match.
        q_padded = np.zeros((d_h,), dtype=np.float32)
        q_padded[:d_in] = q_vec
        retrieved = topk(dt_scores(q_padded, qcoef, scales, H, BLOCK, transforms), K)

        shared = set(reference.tolist()) & set(retrieved.tolist())
        overlap_fracs.append(len(shared) / len(reference))
        book_flags.append(book_hit_at_k(retrieved, exp_book))
        anchor_flags.append(anchor_hit_at_k(retrieved, anchors))

    summary = {
        "mode": label,
        "K": float(K),
        "mean_overlap_vs_float": float(np.mean(overlap_fracs)),
        "book_hit_rate@K": float(np.mean(book_flags)),
        "anchor_hit_rate@K": float(np.mean(anchor_flags)),
    }
    return summary

# Baseline run on the uncorrupted indexes, single vs composite transform.
print("\nRAG metrics (clean):")
print("single:", eval_rag(q_single, s_single, single_tf, "single_clean", K=8))
print("double:", eval_rag(q_double, s_double, double_tf, "double_clean", K=8))

# ---------------- Corruption ----------------
def corrupt_dropout(q: np.ndarray, drop_frac: float, seed: int = 123) -> np.ndarray:
    """Return a copy of ``q`` with randomly chosen entries set to zero.

    Each entry is zeroed independently when a uniform draw in [0, 1) falls
    below ``drop_frac``. The input array is not modified; the same ``seed`` and
    shape always select the same positions.
    """
    rng = np.random.default_rng(seed)
    damaged = q.copy()
    damaged[rng.random(damaged.shape) < drop_frac] = 0
    return damaged

def corrupt_block_loss(q: np.ndarray, block_frac: float, seed: int = 123) -> np.ndarray:
    """Return a copy of ``q`` with whole column blocks zeroed in every row.

    Columns are grouped into blocks of the module-level ``BLOCK`` width. For a
    positive ``block_frac``, ``ceil(block_frac * nblocks)`` blocks (at least
    one) are picked without replacement and cleared. A non-positive
    ``block_frac`` returns an unchanged copy.
    """
    rng = np.random.default_rng(seed)
    damaged = q.copy()
    _, width = damaged.shape
    nblocks = (width + BLOCK - 1) // BLOCK
    if block_frac <= 0:
        return damaged
    wanted = max(1, int(math.ceil(block_frac * nblocks)))
    lost_blocks = rng.choice(nblocks, size=min(nblocks, wanted), replace=False)
    for b in lost_blocks:
        lo = b * BLOCK
        hi = min(width, (b + 1) * BLOCK)
        damaged[:, lo:hi] = 0
    return damaged

# Corruption levels swept below (fraction of entries or of blocks).
levels = [0.0, 0.01, 0.05, 0.10, 0.20]

# Both indexes are corrupted with the same seed; since the arrays have the same
# shape, the same positions are zeroed in each.
print("\nCorruption sweep (dropout) — SINGLE vs DOUBLE:")
for lvl in levels:
    qs = corrupt_dropout(q_single, lvl, seed=SEED)
    qd = corrupt_dropout(q_double, lvl, seed=SEED)
    print(lvl,
          "single", eval_rag(qs, s_single, single_tf, f"single_dropout_{lvl}", K=8),
          "double", eval_rag(qd, s_double, double_tf, f"double_dropout_{lvl}", K=8))

# Only the int8 codes are corrupted; the per-block scales are passed through intact.
print("\nCorruption sweep (block loss) — SINGLE vs DOUBLE:")
for lvl in levels:
    qs = corrupt_block_loss(q_single, lvl, seed=SEED)
    qd = corrupt_block_loss(q_double, lvl, seed=SEED)
    print(lvl,
          "single", eval_rag(qs, s_single, single_tf, f"single_blockloss_{lvl}", K=8),
          "double", eval_rag(qd, s_double, double_tf, f"double_blockloss_{lvl}", K=8))

# ---------------- Show example prompts ----------------
def show_example(question: str, expected_book: str, anchors: List[str],
                 qcoef: np.ndarray, scales: np.ndarray, transforms: Tuple[str, ...],
                 title: str, K: int = 5):
    """Print the top-``K`` passages for one question plus a sample RAG prompt.

    Retrieval runs against the given quantized index (``qcoef``, ``scales``,
    ``transforms``). ``expected_book`` and ``anchors`` are only echoed for the
    reader. The prompt context is built from the three best passages.
    """
    q_vec = model.encode([question], normalize_embeddings=True)[0].astype(np.float32)
    q_padded = np.zeros((d_h,), dtype=np.float32)
    q_padded[:d_in] = q_vec
    idx = topk(dt_scores(q_padded, qcoef, scales, H, BLOCK, transforms), K)
    hits = [passages[i] for i in idx]
    rule = "-"*92

    print("\n" + "="*92)
    print(title)
    print("QUESTION:", question)
    print("Expected:", expected_book, "| Anchors:", anchors)
    print(rule)
    for rank, hit in enumerate(hits, 1):
        print(f"{rank:>2}. {hit.book} [chunk {hit.idx}]")
        print(textwrap.shorten(hit.text.replace("\n"," "), width=260, placeholder=" …"))
        print()

    context = "\n\n".join(hit.text for hit in hits[:3])
    print("RAG PROMPT (example):")
    print(rule)
    print("Question:", question)
    print("Context:\n", textwrap.shorten(context.replace("\n"," "), width=650, placeholder=" …"))

# Show retrieved passages for the first query with both clean indexes.
q0, exp0, anc0 = RAG_QUERIES[0]
show_example(q0, exp0, anc0, q_single, s_single, single_tf, title="SINGLE DTDR (clean)", K=5)
show_example(q0, exp0, anc0, q_double, s_double, double_tf, title=f"DOUBLE/COMPOSITE DTDR (clean)  (DCT={'yes' if HAVE_DCT else 'fallback'})", K=5)

print("\nDone.")


  from .autonotebook import tqdm as notebook_tqdm


Books directory: G:\train_jw\experiments\books
Downloading: alice_in_wonderland.txt
Downloading: pride_and_prejudice.txt
Downloading: frankenstein.txt
Downloading: moby_dick.txt
Downloading: sherlock_holmes.txt
Downloading: dracula.txt
Found .txt books: 6
First few: ['books\\alice_in_wonderland.txt', 'books\\dracula.txt', 'books\\frankenstein.txt']
  alice_in_wonderland.txt: 137 passages
  dracula.txt: 250 passages
  frankenstein.txt: 250 passages
  moby_dick.txt: 250 passages
  pride_and_prejudice.txt: 250 passages
  sherlock_holmes.txt: 250 passages
Total passages: 1387
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 22/22 [00:01<00:00, 16.02it/s]


Embeddings: (1387, 384)  time=1.4s
DTDR (single) built: q=(1387, 512) scales=(1387, 32) blocks=32  tf=('hadamard',)
DTDR (double) built: q=(1387, 512) scales=(1387, 32) blocks=32  tf=('hadamard', 'dct')  (DCT=yes)

RAG metrics (clean):
single: {'mode': 'single_clean', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
double: {'mode': 'double_clean', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}

Corruption sweep (dropout) — SINGLE vs DOUBLE:
0.0 single {'mode': 'single_dropout_0.0', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0} double {'mode': 'double_dropout_0.0', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.01 single {'mode': 'single_dropout_0.01', 'K': 8.0, 'mean_overlap_vs_float': 0.9375, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0} double {'mode': 'double_dropout_0.01', 'K': 8.0, 'mean_overlap_vs_float':

In [2]:
# Ablation / Negative Control: "No-transform INT8" vs DTDR
# -------------------------------------------------------
# Purpose: show that the transform step is not incidental.
# We quantize the *raw* embedding vectors directly (no orthogonal transform),
# then do dot-product similarity in the same "quantized-domain" way.

import numpy as np
import math

# --- helper: same blockwise int8 quantizer as `quantize_blockwise` in the previous cell, redefined here under its own name for the raw (no-transform) path.
def quantize_blockwise_raw(U: np.ndarray, block: int = 16):
    """Quantize each row of ``U`` to int8 using one symmetric scale per block.

    Blocks are consecutive runs of ``block`` columns (the last may be shorter).
    Per row and block, ``scale = (max |value| + 1e-12) / 127`` and the stored
    code is ``round(value / scale)`` clipped to ``[-127, 127]``.

    Returns:
        ``(codes, scales)``: int8 codes shaped like ``U`` and float32 scales of
        shape ``(rows, number_of_blocks)``.
    """
    n_rows, n_cols = U.shape
    nblocks = (n_cols + block - 1) // block
    qmax = 127
    codes = np.zeros((n_rows, n_cols), dtype=np.int8)
    scales = np.zeros((n_rows, nblocks), dtype=np.float32)
    for b in range(nblocks):
        start = b * block
        stop = min(n_cols, start + block)
        segment = U[:, start:stop]
        peak = np.max(np.abs(segment), axis=1) + 1e-12
        unit = peak / qmax
        scales[:, b] = unit
        levels_f = np.round(segment / unit[:, None])
        codes[:, start:stop] = np.clip(levels_f, -qmax, qmax).astype(np.int8)
    return codes, scales

def scores_quantized_no_transform(qpad: np.ndarray, qcoef: np.ndarray, scales: np.ndarray, block: int):
    """Score every quantized row against an untransformed, padded query.

    ``qpad`` must already have the same width as ``qcoef``. The score of a row
    is the sum over blocks of
    ``scale[row, block] * dot(codes[row, block], qpad[block])``.

    Returns:
        float32 array with one score per row of ``qcoef``.
    """
    codes = qcoef.astype(np.float32)
    block_scales = scales.astype(np.float32)
    totals = np.zeros((codes.shape[0],), dtype=np.float32)
    for b in range(block_scales.shape[1]):
        start = b * block
        stop = min(codes.shape[1], start + block)
        weighted = codes[:, start:stop] * qpad[start:stop]
        totals += weighted.sum(axis=1) * block_scales[:, b]
    return totals

# reuse these from your prior cell
# BLOCK, d_h, d_in, emb, model, topk, float_scores, passages, books, norm_passages, RAG_QUERIES
# (SEED is also taken from the prior cell by the sweeps below.)

# Build "no-transform" representation: quantize padded embeddings directly
# The embeddings are zero-padded to d_h columns, so columns d_in..d_h-1 of
# X_raw are all zero.
X_raw = np.zeros((emb.shape[0], d_h), dtype=np.float32)
X_raw[:, :d_in] = emb.astype(np.float32)

q_raw, s_raw = quantize_blockwise_raw(X_raw, block=BLOCK)
print(f"NO-TRANSFORM INT8 built: q={q_raw.shape} scales={s_raw.shape} blocks={s_raw.shape[1]}")

def eval_rag_no_transform(qcoef: np.ndarray, scales: np.ndarray, label: str, K: int = 8):
    """Evaluate the no-transform int8 index on ``RAG_QUERIES``.

    For each question the top-``K`` passages from the quantized index are
    compared with the top-``K`` of the float baseline, and checked for the
    expected book and for the (case-insensitive) anchor substrings.

    Returns:
        A dict with the given ``label`` under ``"mode"``, ``K`` as a float, and
        the three metrics averaged over all questions.
    """
    overlap_fracs, book_flags, anchor_flags = [], [], []
    for question, exp_book, anchors in RAG_QUERIES:
        q_vec = model.encode([question], normalize_embeddings=True)[0].astype(np.float32)
        reference = topk(float_scores(q_vec), K)

        # Zero-pad the query to the width of the quantized index.
        q_padded = np.zeros((d_h,), dtype=np.float32)
        q_padded[:d_in] = q_vec
        retrieved = topk(scores_quantized_no_transform(q_padded, qcoef, scales, BLOCK), K)

        shared = set(reference.tolist()) & set(retrieved.tolist())
        overlap_fracs.append(len(shared) / len(reference))
        book_flags.append(int(any(books[i] == exp_book for i in retrieved)))
        needles = [a.lower() for a in anchors]
        anchor_found = any(any(n in norm_passages[i] for n in needles) for i in retrieved)
        anchor_flags.append(int(anchor_found))

    summary = {
        "mode": label,
        "K": float(K),
        "mean_overlap_vs_float": float(np.mean(overlap_fracs)),
        "book_hit_rate@K": float(np.mean(book_flags)),
        "anchor_hit_rate@K": float(np.mean(anchor_flags)),
    }
    return summary

# Baseline run on the uncorrupted no-transform index.
print("\nRAG metrics (clean) — No-transform INT8:")
print(eval_rag_no_transform(q_raw, s_raw, "raw_clean", K=8))

# Corruption utilities (reuse if present, else define)
def corrupt_dropout(q: np.ndarray, drop_frac: float, seed: int = 123) -> np.ndarray:
    """Zero out a random subset of entries in a copy of ``q``.

    An entry is cleared when its uniform draw in [0, 1) is below ``drop_frac``.
    ``q`` itself is left untouched, and a given ``seed`` and shape always
    produce the same pattern.
    """
    rng = np.random.default_rng(seed)
    result = q.copy()
    drop_mask = rng.random(result.shape) < drop_frac
    result[drop_mask] = 0
    return result

def corrupt_block_loss(q: np.ndarray, block_frac: float, seed: int = 123) -> np.ndarray:
    """Zero out whole column blocks (all rows) in a copy of ``q``.

    Blocks are ``BLOCK`` columns wide (module-level constant). When
    ``block_frac`` is positive, ``ceil(block_frac * nblocks)`` blocks, but at
    least one, are drawn without replacement and cleared. Otherwise an
    unchanged copy is returned.
    """
    rng = np.random.default_rng(seed)
    result = q.copy()
    _, n_cols = result.shape
    nblocks = (n_cols + BLOCK - 1) // BLOCK
    if block_frac <= 0:
        return result
    n_lost = max(1, int(math.ceil(block_frac * nblocks)))
    for b in rng.choice(nblocks, size=min(nblocks, n_lost), replace=False):
        start = b * BLOCK
        stop = min(n_cols, (b + 1) * BLOCK)
        result[:, start:stop] = 0
    return result

# Same corruption levels and seed as the DTDR sweeps in the previous cell.
levels = [0.0, 0.01, 0.05, 0.10, 0.20]

# NOTE(review): columns d_in..d_h-1 of the raw index are zero padding, so
# corruption that lands there removes no information. Keep this in mind when
# comparing these numbers with the DTDR sweeps.
print("\nCorruption sweep (dropout) — No-transform INT8:")
for lvl in levels:
    qr = corrupt_dropout(q_raw, lvl, seed=SEED)
    print(lvl, eval_rag_no_transform(qr, s_raw, f"raw_dropout_{lvl}", K=8))

print("\nCorruption sweep (block loss) — No-transform INT8:")
for lvl in levels:
    qr = corrupt_block_loss(q_raw, lvl, seed=SEED)
    print(lvl, eval_rag_no_transform(qr, s_raw, f"raw_blockloss_{lvl}", K=8))

print("\nDone (ablation).")


NO-TRANSFORM INT8 built: q=(1387, 512) scales=(1387, 32) blocks=32

RAG metrics (clean) — No-transform INT8:
{'mode': 'raw_clean', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}

Corruption sweep (dropout) — No-transform INT8:
0.0 {'mode': 'raw_dropout_0.0', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.01 {'mode': 'raw_dropout_0.01', 'K': 8.0, 'mean_overlap_vs_float': 0.9375, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.05 {'mode': 'raw_dropout_0.05', 'K': 8.0, 'mean_overlap_vs_float': 0.8541666666666666, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.1 {'mode': 'raw_dropout_0.1', 'K': 8.0, 'mean_overlap_vs_float': 0.8125, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.2 {'mode': 'raw_dropout_0.2', 'K': 8.0, 'mean_overlap_vs_float': 0.6666666666666666, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}

Corruption sweep (block loss) — No-transform INT8:
0.0 {'mode': 'raw_blockloss_