In [1]:
# DTDR-RAG Experiment (single cell)
# Purpose: Demonstrate DTDR supports RAG-style retrieval (natural-language questions -> passage retrieval)
#          directly in quantized transform domain, with graceful degradation under corruption.
#
# Requirements:
#   pip install sentence-transformers scipy numpy
#
# Notes:
# - Uses Project Gutenberg texts (public domain) for a realistic corpus.
# - Evaluates retrieval quality using RAG-relevant proxies:
#     (1) overlap with float embedding baseline,
#     (2) "book-hit@K" against known source book for each question,
#     (3) simple anchor-term hit-rate in retrieved context.
# - Does NOT run an LLM (keeps the experiment self-contained and still patent-appropriate).

import os, re, math, time, random, textwrap
from dataclasses import dataclass
from typing import List, Dict, Tuple
import numpy as np
from scipy.linalg import hadamard
from sentence_transformers import SentenceTransformer

# Fix the seeds of Python's and NumPy's global RNGs; SEED is also passed
# explicitly to the corruption helpers further down.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)

# Local directory holding the downloaded book texts (created if absent).
BOOKS_DIR = "books"
os.makedirs(BOOKS_DIR, exist_ok=True)

# (local filename, download URL) for every book that makes up the corpus.
GUTENBERG = [
    ("alice_in_wonderland.txt", "https://www.gutenberg.org/cache/epub/11/pg11.txt"),
    ("pride_and_prejudice.txt", "https://www.gutenberg.org/cache/epub/1342/pg1342.txt"),
    ("frankenstein.txt", "https://www.gutenberg.org/cache/epub/84/pg84.txt"),
    ("moby_dick.txt", "https://www.gutenberg.org/cache/epub/2701/pg2701.txt"),
    ("sherlock_holmes.txt", "https://www.gutenberg.org/cache/epub/1661/pg1661.txt"),
    ("dracula.txt", "https://www.gutenberg.org/cache/epub/345/pg345.txt"),
]

def download(url: str, path: str, timeout: int = 30) -> bool:
    """Fetch *url* and store its body at *path* as UTF-8 text.

    The payload is decoded as UTF-8, falling back to Latin-1 when that
    fails. This is a best-effort helper: any exception is reported with a
    ``[WARN]`` line on stdout and turned into a ``False`` return value.

    Args:
        url: Address to fetch.
        path: Destination file; overwritten if it exists.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        True when the file was written, False on any failure.
    """
    import urllib.request

    try:
        request = urllib.request.Request(url, headers={"User-Agent":"Mozilla/5.0"})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        try:
            decoded = payload.decode("utf-8")
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a code point, so this cannot fail.
            decoded = payload.decode("latin-1")
        with open(path, "w", encoding="utf-8", errors="ignore") as sink:
            sink.write(decoded)
    except Exception as e:
        print(f"[WARN] Download failed: {url} ({e})")
        return False
    return True

def read_text_file(path: str) -> str:
    """Return the whole file at *path* decoded as UTF-8, dropping undecodable bytes."""
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

def clean_gutenberg(text: str) -> str:
    """Strip Project Gutenberg boilerplate and collapse whitespace.

    Everything up to and including the ``*** START OF THE/THIS PROJECT
    GUTENBERG ... ***`` marker is removed, as is everything from the
    ``*** END OF THE/THIS PROJECT GUTENBERG`` marker onwards. Each marker is
    handled on its own, so a file carrying only one of them is still trimmed
    on that side. Text without markers is only whitespace-normalised.

    Args:
        text: Raw file contents.

    Returns:
        The body text with every whitespace run replaced by one space and no
        leading/trailing whitespace.
    """
    # The optional tail swallows the rest of the marker ("EBOOK <TITLE> ***")
    # when it closes on the same line, so the title banner does not leak into
    # the first passage.
    start = re.search(
        r"\*\*\*\s*START OF (?:THIS|THE) PROJECT GUTENBERG(?:[^*\r\n]*\*\*\*)?",
        text,
        flags=re.IGNORECASE,
    )
    if start:
        text = text[start.end():]
    # Searched after the header cut, so the footer always follows the header.
    end = re.search(
        r"\*\*\*\s*END OF (?:THIS|THE) PROJECT GUTENBERG",
        text,
        flags=re.IGNORECASE,
    )
    if end:
        text = text[:end.start()]
    return re.sub(r"\s+", " ", text).strip()

def chunk_text(text: str, chunk_chars: int = 1200, overlap: int = 150, min_len: int = 250) -> List[str]:
    """Cut *text* into overlapping character windows.

    Windows are ``chunk_chars`` long and start every
    ``max(1, chunk_chars - overlap)`` characters. Each window is stripped of
    surrounding whitespace and kept only if at least ``min_len`` characters
    remain.

    Returns:
        The kept windows, in document order.
    """
    stride = max(1, chunk_chars - overlap)
    windows = (
        text[pos:pos + chunk_chars].strip()
        for pos in range(0, len(text), stride)
    )
    return [window for window in windows if len(window) >= min_len]

# Fetch every corpus book that is absent locally or smaller than 2000 bytes
# (a file that small is treated as a failed earlier download).
print("Books directory:", os.path.abspath(BOOKS_DIR))
for fname, url in GUTENBERG:
    path = os.path.join(BOOKS_DIR, fname)
    if os.path.exists(path) and os.path.getsize(path) >= 2000:
        continue
    print("Downloading:", fname)
    download(url, path)

# Every .txt file in the directory, in sorted order.
book_paths = sorted(
    os.path.join(BOOKS_DIR, fn)
    for fn in os.listdir(BOOKS_DIR)
    if fn.lower().endswith(".txt")
)
print("Found .txt books:", len(book_paths))
print("First few:", book_paths[:3])

@dataclass
class Passage:
    """One retrievable chunk of text cut from a single book."""
    book: str  # basename of the source file the chunk was cut from
    idx: int   # position of the chunk within that book's list of chunks
    text: str  # the chunk contents

def build_passages(book_paths: List[str], chunk_chars=1200, overlap=150,
                   include_books=None, max_passages_per_book=250) -> List[Passage]:
    """Read, clean and chunk each book into ``Passage`` records.

    Args:
        book_paths: Paths of the text files to process, in the order given.
        chunk_chars: Window length passed to ``chunk_text``.
        overlap: Window overlap passed to ``chunk_text``.
        include_books: Optional collection of file basenames; when given,
            files whose basename is not in it are skipped.
        max_passages_per_book: Only the first this-many chunks of each book
            are kept.

    Returns:
        All passages, grouped by book in input order. A per-book count is
        printed as a side effect.
    """
    collected: List[Passage] = []
    for book_path in book_paths:
        bn = os.path.basename(book_path)
        if include_books is not None and bn not in include_books:
            continue
        cleaned = clean_gutenberg(read_text_file(book_path))
        chunks = chunk_text(cleaned, chunk_chars=chunk_chars, overlap=overlap, min_len=250)[:max_passages_per_book]
        print(f"  {bn}: {len(chunks)} passages")
        collected.extend(
            Passage(book=bn, idx=position, text=chunk)
            for position, chunk in enumerate(chunks)
        )
    return collected

# Use only the core Gutenberg set (ignore demo books if present)
include_books = set(entry[0] for entry in GUTENBERG)
passages = build_passages(
    book_paths,
    chunk_chars=1200,
    overlap=150,
    include_books=include_books,
    max_passages_per_book=250,
)
assert len(passages) > 200, "Too few passages; increase max_passages_per_book or add books."
# Parallel lists: texts[i] and books[i] both describe passages[i].
texts = [passage.text for passage in passages]
books = [passage.book for passage in passages]
print("Total passages:", len(passages))

# ---------- Embeddings ----------
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
print("Loading embedding model:", MODEL_NAME)
model = SentenceTransformer(MODEL_NAME)

# Encode every passage; normalize_embeddings=True is requested so the dot
# products used for scoring below act as cosine similarities.
t0 = time.time()
emb = np.asarray(
    model.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True),
    dtype=np.float32,
)
t1 = time.time()
print(f"Embeddings: {emb.shape}  time={t1-t0:.1f}s")

# ---------- DTDR build ----------
def next_pow2(n: int) -> int:
    """Return the smallest power of two that is >= *n* (1 for n <= 1).

    Uses integer bit arithmetic rather than floating-point ``log2``, which
    loses precision for large integers and could return a value below *n*.
    """
    n = int(n)
    if n <= 1:
        return 1
    # (n - 1).bit_length() is the exponent of the first power of two >= n.
    return 1 << (n - 1).bit_length()

def make_hadamard(n: int) -> np.ndarray:
    """Build an n-by-n float32 Hadamard matrix scaled by 1/sqrt(n).

    Raises:
        ValueError: if *n* has more than one bit set, i.e. is not a power of two.
    """
    is_power_of_two = (n & (n - 1)) == 0
    if not is_power_of_two:
        raise ValueError("Hadamard size must be power of 2")
    scaled = hadamard(n).astype(np.float32)
    # The 1/sqrt(n) factor makes the rows orthonormal.
    scaled /= np.sqrt(n)
    return scaled

def pad_to_dh(X: np.ndarray, d_h: int) -> np.ndarray:
    """Zero-pad the columns of 2-D *X* on the right up to width *d_h*.

    When *X* already has width *d_h* it is returned as-is (same object, no
    copy); otherwise a new array of the same dtype is returned.
    """
    rows, width = X.shape
    if width == d_h:
        return X
    filler = np.zeros((rows, d_h - width), dtype=X.dtype)
    return np.concatenate([X, filler], axis=1)

def dt_transform(X: np.ndarray, H: np.ndarray) -> np.ndarray:
    """Apply the transform matrix *H* to each row of *X* (computes ``X @ H.T``)."""
    return np.matmul(X, H.T)

def quantize_blockwise(U: np.ndarray, bits: int = 8, block: int = 16) -> Tuple[np.ndarray, np.ndarray]:
    """Quantize each row of *U* to signed integers with one scale per block.

    Columns are split into consecutive groups of ``block`` (the last group may
    be shorter). Within each row and group, values are divided by
    ``max(|values|) / qmax`` and rounded, where ``qmax = 2**(bits-1) - 1``.
    The requested bit width is honoured; with the default ``bits=8`` the codes
    span [-127, 127].

    Args:
        U: 2-D array of shape (N, D).
        bits: Signed code width, 2..8. Codes are always stored as int8.
        block: Number of columns sharing one scale; must be >= 1.

    Returns:
        ``(q, scales)`` where ``q`` is int8 of shape (N, D) and ``scales`` is
        float32 of shape (N, ceil(D / block)). ``q * scale`` (per block)
        approximates ``U``.

    Raises:
        ValueError: if ``bits`` is outside 2..8 or ``block`` is < 1.
    """
    if not 2 <= bits <= 8:
        raise ValueError("bits must be between 2 and 8 (codes are stored as int8)")
    if block < 1:
        raise ValueError("block must be a positive integer")
    N, D = U.shape
    nblocks = (D + block - 1) // block
    qdtype = np.int8
    qmax = (1 << (bits - 1)) - 1
    scales = np.zeros((N, nblocks), dtype=np.float32)
    q = np.zeros((N, D), dtype=qdtype)
    for b in range(nblocks):
        j0, j1 = b * block, min(D, (b + 1) * block)
        chunk = U[:, j0:j1]
        # The small epsilon keeps the scale non-zero for an all-zero block.
        amax = np.max(np.abs(chunk), axis=1) + 1e-12
        s = amax / qmax
        scales[:, b] = s
        q[:, j0:j1] = np.clip(np.round(chunk / s[:, None]), -qmax, qmax).astype(qdtype)
    return q, scales

BITS = 8
BLOCK = 16  # << RAG experiment default: 16 (good graded block-loss behavior)

# make_hadamard only accepts power-of-two sizes, so the embedding width is
# rounded up and the embeddings are zero-padded to match.
N = emb.shape[0]
d_in = emb.shape[1]
d_h = next_pow2(d_in)
H = make_hadamard(d_h)

# Transform the padded embeddings, then quantize block by block.
U = dt_transform(pad_to_dh(emb, d_h), H)
qcoef, scales = quantize_blockwise(U, block=BLOCK, bits=BITS)
print(f"DTDR built: qcoef={qcoef.shape} dtype={qcoef.dtype} scales={scales.shape} d_h={d_h} blocks={scales.shape[1]}")

def topk(a: np.ndarray, k: int) -> np.ndarray:
    """Return the indices of the *k* largest entries of 1-D *a*, best first.

    When *k* is at least the length of *a*, a full descending argsort is
    returned instead.
    """
    k = int(k)
    negated = -a
    if k >= a.shape[0]:
        return np.argsort(negated)
    # Partition first to isolate the k best, then order only those k.
    candidates = np.argpartition(negated, k)[:k]
    ranking = np.argsort(negated[candidates])
    return candidates[ranking]

def float_scores(qemb: np.ndarray) -> np.ndarray:
    """Score every passage against *qemb* using the unquantized embeddings ``emb``."""
    query = qemb.astype(np.float32)
    return emb @ query

def dt_scores(qemb: np.ndarray, qcoef_eval: np.ndarray, scales_eval: np.ndarray) -> np.ndarray:
    """Score every stored passage against *qemb* from its quantized codes.

    The query is padded and transformed with the module-level ``H``; the
    stored integer codes are never expanded back into full float vectors.
    For each block, the code/query dot product is weighted by that row's
    block scale and accumulated.

    Args:
        qemb: 1-D query embedding.
        qcoef_eval: Integer codes, one row per passage.
        scales_eval: Per-row, per-block scales; its column count sets the
            number of blocks, each ``BLOCK`` columns wide.

    Returns:
        float32 vector with one score per row of ``qcoef_eval``.
    """
    query_coefs = dt_transform(pad_to_dh(qemb[None, :], d_h), H)[0]  # (d_h,)
    codes = qcoef_eval.astype(np.float32)
    block_scales = scales_eval.astype(np.float32)
    totals = np.zeros((codes.shape[0],), dtype=np.float32)
    for b in range(block_scales.shape[1]):
        cols = slice(b * BLOCK, min(d_h, (b + 1) * BLOCK))
        partial = (codes[:, cols] * query_coefs[cols]).sum(axis=1)
        totals += partial * block_scales[:, b]
    return totals

# ---------- RAG-style query set (known source book + anchors) ----------
# Each item: (question, expected_book_filename, anchor_terms)
# - expected_book_filename must match a filename in GUTENBERG; it is compared
#   against the book recorded on each retrieved passage.
# - "anchor_terms" are a conservative proxy for context relevance (not a generation metric).
#   They are matched as lowercase substrings, so a short term such as "count"
#   also matches inside longer words.
RAG_QUERIES = [
    ("Who is the White Rabbit and what is he doing?", "alice_in_wonderland.txt", ["white rabbit", "rabbit"]),
    ("What does Elizabeth think of Mr. Darcy early on?", "pride_and_prejudice.txt", ["darcy", "elizabeth"]),
    ("Who created the creature and what was the consequence?", "frankenstein.txt", ["frankenstein", "creature"]),
    ("What is Captain Ahab obsessed with?", "moby_dick.txt", ["ahab", "whale"]),
    ("What is Sherlock Holmes known for in solving mysteries?", "sherlock_holmes.txt", ["holmes", "watson"]),
    ("Who is Count Dracula and what is his nature?", "dracula.txt", ["dracula", "count"]),
]

def normalize_text(s: str) -> str:
    """Lowercase *s* and replace every run of whitespace with a single space."""
    lowered = s.lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed

# Normalized copy of every passage, computed once for the anchor-term checks.
norm_passages = list(map(normalize_text, texts))

def book_hit_at_k(idxs: np.ndarray, expected_book: str) -> int:
    """Return 1 if any retrieved passage index comes from *expected_book*, else 0."""
    for i in idxs:
        if books[i] == expected_book:
            return 1
    return 0

def anchor_hit_at_k(idxs: np.ndarray, anchors: List[str]) -> int:
    """Return 1 if any retrieved passage contains any anchor term, else 0.

    Matching is a plain substring test of the lowercased anchor against the
    normalized passage text.
    """
    needles = tuple(term.lower() for term in anchors)
    found = any(
        needle in norm_passages[i]
        for i in idxs
        for needle in needles
    )
    return int(found)

def eval_rag(qcoef_eval: np.ndarray, scales_eval: np.ndarray, label: str, K: int = 8) -> Dict[str, float]:
    """Evaluate DTDR retrieval over ``RAG_QUERIES`` for one index variant.

    For every question the top-K passages from the quantized index are
    compared with the top-K from the float embeddings, and checked for the
    expected source book and for anchor terms.

    Args:
        qcoef_eval: Integer codes to score against (possibly corrupted).
        scales_eval: Block scales that go with ``qcoef_eval``.
        label: Free-form tag copied into the result under ``"mode"``.
        K: Number of passages retrieved, capped at the corpus size.

    Returns:
        Dict with the label, K (as a float), and the mean over all queries of
        the overlap fraction, book hit and anchor hit.
    """
    K = min(K, len(passages))
    overlap_fracs: List[float] = []
    book_flags: List[int] = []
    anchor_flags: List[int] = []
    for question, exp_book, anchors in RAG_QUERIES:
        qvec = model.encode([question], normalize_embeddings=True)[0].astype(np.float32)
        reference_ids = topk(float_scores(qvec), K)
        dtdr_ids = topk(dt_scores(qvec, qcoef_eval, scales_eval), K)
        shared = set(reference_ids.tolist()) & set(dtdr_ids.tolist())
        overlap_fracs.append(len(shared) / len(reference_ids))
        book_flags.append(book_hit_at_k(dtdr_ids, exp_book))
        anchor_flags.append(anchor_hit_at_k(dtdr_ids, anchors))
    metrics = {"mode": label, "K": float(K)}
    metrics["mean_overlap_vs_float"] = float(np.mean(overlap_fracs))
    metrics["book_hit_rate@K"] = float(np.mean(book_flags))
    metrics["anchor_hit_rate@K"] = float(np.mean(anchor_flags))
    return metrics

# Reference run: the uncorrupted quantized index compared with the float embeddings.
print("\nRAG retrieval metrics (clean):")
print(eval_rag(qcoef, scales, "clean", K=8))

# ---------- Corruption ----------
def corrupt_dropout(q: np.ndarray, drop_frac: float, seed: int = 123) -> np.ndarray:
    """Return a copy of *q* with randomly chosen entries set to zero.

    Each entry is zeroed independently with probability ``drop_frac`` using
    a generator seeded with *seed*; *q* itself is left untouched.
    """
    damaged = q.copy()
    generator = np.random.default_rng(seed)
    # One uniform draw per entry; entries whose draw falls below drop_frac are dropped.
    damaged[generator.random(damaged.shape) < drop_frac] = 0
    return damaged

def corrupt_block_loss(q: np.ndarray, block_frac: float, seed: int = 123, block: "int | None" = None) -> np.ndarray:
    """Return a copy of *q* with whole column blocks zeroed in every row.

    Columns are grouped into consecutive blocks of ``block`` (the last may be
    shorter). ``ceil(block_frac * nblocks)`` distinct blocks — at least one,
    at most all — are chosen with a generator seeded by *seed* and set to zero
    across all rows. *q* itself is left untouched.

    Args:
        q: 2-D array of codes.
        block_frac: Fraction of blocks to drop; values <= 0 return a plain copy.
        seed: Seed for the block selection.
        block: Block width in columns. Defaults to the module-level ``BLOCK``
            so existing callers keep their behaviour.

    Returns:
        The corrupted copy.
    """
    if block is None:
        block = BLOCK
    rng = np.random.default_rng(seed)
    out = q.copy()
    _, D = out.shape
    if block_frac <= 0:
        return out
    nblocks = (D + block - 1) // block
    # Rounded up, so any positive fraction removes at least one whole block.
    n_drop = min(nblocks, max(1, int(math.ceil(block_frac * nblocks))))
    for b in rng.choice(nblocks, size=n_drop, replace=False):
        out[:, b * block:min(D, (b + 1) * block)] = 0
    return out

# Corruption levels to sweep. Level 0.0 leaves the codes unchanged, so its row
# serves as a check against the clean metrics.
levels = [0.0, 0.01, 0.05, 0.10, 0.20]
print("\nRAG metrics under dropout corruption:")
for lvl in levels:
    # Only the integer codes are corrupted; the block scales stay intact.
    q2 = corrupt_dropout(qcoef, lvl, seed=SEED)
    print(lvl, eval_rag(q2, scales, f"dropout_{lvl}", K=8))

print("\nRAG metrics under block-loss corruption:")
for lvl in levels:
    # corrupt_block_loss rounds the fraction up to whole blocks, so every
    # positive level drops at least one block.
    q2 = corrupt_block_loss(qcoef, lvl, seed=SEED)
    print(lvl, eval_rag(q2, scales, f"blockloss_{lvl}", K=8))

# ---------- Show example RAG prompts (question + retrieved context) ----------
def show_rag_example(question: str, expected_book: str, anchors: List[str], qcoef_eval=qcoef, scales_eval=scales, K: int = 5):
    """Print the top-K DTDR retrieval results for *question* plus a sample prompt.

    Each hit is listed with its book and chunk number and a shortened
    preview. The first three hits are then joined into a context block and
    printed as a minimal question-plus-context prompt. Nothing is returned.

    Args:
        question: Natural-language query to embed and retrieve for.
        expected_book: Printed for reference only.
        anchors: Printed for reference only.
        qcoef_eval: Integer codes to retrieve from (defaults to the index
            that exists when this function is defined).
        scales_eval: Block scales that go with ``qcoef_eval``.
        K: Number of passages to list.
    """
    thin_rule = "-"*90
    qvec = model.encode([question], normalize_embeddings=True)[0].astype(np.float32)
    idx = topk(dt_scores(qvec, qcoef_eval, scales_eval), K)

    print("\n" + "="*90)
    print("QUESTION:", question)
    print("Expected source:", expected_book, " | Anchors:", anchors)
    print(thin_rule)
    for r, hit in enumerate(idx, 1):
        ps = passages[hit]
        print(f"{r:>2}. {ps.book} [chunk {ps.idx}]")
        print(textwrap.shorten(ps.text.replace("\n"," "), width=260, placeholder=" …"))
        print()

    # A minimal “RAG prompt” example built from at most the three best hits
    top_texts = [passages[hit].text for hit in idx[:3]]
    context = "\n\n".join(top_texts)
    print("RAG PROMPT (example):")
    print(thin_rule)
    print("Question:", question)
    print("Context:\n", textwrap.shorten(context.replace("\n"," "), width=600, placeholder=" …"))

# Show one clean example (the first query in RAG_QUERIES)
q0, exp0, anc0 = RAG_QUERIES[0]
show_rag_example(q0, exp0, anc0, qcoef_eval=qcoef, scales_eval=scales, K=5)

# Show the same example under heavier corruption (demonstrates graceful degradation in RAG setting)
# Here 10% of the integer codes are zeroed; the block scales are reused unchanged.
qcor = corrupt_dropout(qcoef, 0.10, seed=SEED)
show_rag_example(q0, exp0, anc0, qcoef_eval=qcor, scales_eval=scales, K=5)

print("\nDone.")


  from .autonotebook import tqdm as notebook_tqdm


Books directory: G:\train_jw\books
Found .txt books: 6
First few: ['books\\alice_in_wonderland.txt', 'books\\dracula.txt', 'books\\frankenstein.txt']
  alice_in_wonderland.txt: 137 passages
  dracula.txt: 250 passages
  frankenstein.txt: 250 passages
  moby_dick.txt: 250 passages
  pride_and_prejudice.txt: 250 passages
  sherlock_holmes.txt: 250 passages
Total passages: 1387
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 22/22 [00:01<00:00, 16.35it/s]


Embeddings: (1387, 384)  time=1.4s
DTDR built: qcoef=(1387, 512) dtype=int8 scales=(1387, 32) d_h=512 blocks=32

RAG retrieval metrics (clean):
{'mode': 'clean', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}

RAG metrics under dropout corruption:
0.0 {'mode': 'dropout_0.0', 'K': 8.0, 'mean_overlap_vs_float': 1.0, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.01 {'mode': 'dropout_0.01', 'K': 8.0, 'mean_overlap_vs_float': 0.9375, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.05 {'mode': 'dropout_0.05', 'K': 8.0, 'mean_overlap_vs_float': 0.8541666666666666, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.1 {'mode': 'dropout_0.1', 'K': 8.0, 'mean_overlap_vs_float': 0.75, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}
0.2 {'mode': 'dropout_0.2', 'K': 8.0, 'mean_overlap_vs_float': 0.7708333333333334, 'book_hit_rate@K': 1.0, 'anchor_hit_rate@K': 1.0}

RAG metrics under block-loss corruption:
0.0 {'mode': 'blockloss_0.0', 'K': 8.0,