In [1]:
# AI Red Team Bytes — Notebook 06
# Privacy hygiene: canary checks, redaction, and rate limiting
#
# Goal:
# - Demonstrate safe ways to detect memorization risk (canaries)
# - Show PII redaction before processing/logging
# - Show rate limiting to reduce probing attacks
#
# SAFE demo.

import re
import time
from collections import defaultdict

# Marker strings that should never show up in model output; any output
# containing one of them is treated as a leak by the canary check below.
CANARIES = [
    "CANARY_9F3A_UNIQUE_PHRASE_DO_NOT_REPEAT",
    "CANARY_1B7D_INTERNAL_ONLY_MARKER",
]

def contains_canary(text: str) -> bool:
    """
    Report whether any known canary marker occurs in text.

    The check is a plain, case-sensitive substring search against every
    entry of CANARIES and stops at the first hit.
    """
    for marker in CANARIES:
        if marker in text:
            return True
    return False

def privacy_audit(outputs: list[str]) -> str:
    """
    Check a batch of outputs for canary leakage and return a verdict string.

    Every output is passed through contains_canary(). If at least one of
    them contains a canary, the "PRIVACY RISK" message is returned, which
    signals potential memorization/privacy risk; otherwise the "OK"
    message is returned. An empty batch yields the "OK" message.
    """
    # Count hits rather than collecting them: only "zero or not" matters.
    leak_count = sum(1 for candidate in outputs if contains_canary(candidate))
    if leak_count == 0:
        return "OK: No canary leakage detected."
    return "PRIVACY RISK: Canary leaked in outputs."

# Email addresses. The local part also accepts "+" and "%" so that tagged
# addresses such as "john+tag@example.com" are redacted as a whole instead
# of leaving the part before the "+" behind.
_EMAIL_RE = re.compile(r"\b[\w.+%-]+@[\w.-]+\.\w+\b")

# Stand-alone runs of exactly 12 digits (example ID format).
_ID12_RE = re.compile(r"\b\d{12}\b")

def redact_pii(text: str) -> str:
    """
    Redacts basic identifiers and returns the redacted copy of text:
    - email addresses          -> "[REDACTED_EMAIL]"
    - 12-digit IDs (example)   -> "[REDACTED_ID]"

    Emails are handled first on purpose. If the ID pass ran first, an
    address whose local part is a 12-digit number would be rewritten to
    "[REDACTED_ID]@domain", which the email pattern no longer matches,
    so the domain would be left in the output.
    """
    text = _EMAIL_RE.sub("[REDACTED_EMAIL]", text)
    text = _ID12_RE.sub("[REDACTED_ID]", text)
    return text

# Per-user reading of time.monotonic() taken when that user's most recent
# request was *allowed*. The values are only meaningful relative to each
# other inside this process; they are not wall-clock timestamps.
LAST_CALL = defaultdict(float)

def rate_limit(user_id: str, min_interval: float = 0.5) -> bool:
    """
    Blocks requests if they arrive too quickly.
    This reduces high-frequency probing that can power inference attacks.

    Args:
        user_id: Key identifying the caller; each key is limited separately.
        min_interval: Minimum number of seconds that must have passed since
            the caller's last allowed request.

    Returns:
        True if the request is allowed (and recorded as the caller's latest
        allowed request), False if it is blocked. Blocked requests do not
        reset the timer. The first request for a given user_id is always
        allowed.
    """
    # A monotonic clock cannot go backwards, so a system clock adjustment
    # cannot make the elapsed time negative and lock a user out.
    now = time.monotonic()
    # .get() distinguishes "never seen" from a stored reading and avoids
    # inserting a default entry as a side effect of the lookup.
    previous = LAST_CALL.get(user_id)
    if previous is not None and now - previous < min_interval:
        return False
    LAST_CALL[user_id] = now
    return True

def run_demo():
    """
    Print a short walkthrough of the three helpers in this notebook:
    the canary audit, PII redaction, and the per-user rate limiter.
    """
    print("=== Demo 06: Privacy hygiene (canary + redaction + rate limit) ===\n")

    # Part 1: audit a batch in which the last entry embeds a canary.
    sample_outputs = ["Normal model output.", "Another normal output."]
    sample_outputs.append(f"Accidental leak: {CANARIES[0]}")
    audit_verdict = privacy_audit(sample_outputs)
    print("Canary audit result:", audit_verdict)
    print()

    # Part 2: show a message before and after redaction.
    message = "Email me at test@example.com and my employee ID is 123456789012."
    redacted_message = redact_pii(message)
    print("Raw text     :", message)
    print("Redacted text:", redacted_message)
    print()

    # Part 3: three requests spaced 0.2 s apart against a 1.0 s minimum
    # interval; the gap is deliberately too short so blocking is triggered.
    demo_user = "user_001"
    pause_seconds = 0.2
    for i in range(3):
        verdict = rate_limit(demo_user, min_interval=1.0)
        print(f"Request {i+1} allowed?", verdict)
        time.sleep(pause_seconds)

# Run the demo as soon as this cell/module is executed.
run_demo()


=== Demo 06: Privacy hygiene (canary + redaction + rate limit) ===

Canary audit result: PRIVACY RISK: Canary leaked in outputs.

Raw text     : Email me at test@example.com and my employee ID is 123456789012.
Redacted text: Email me at [REDACTED_EMAIL] and my employee ID is [REDACTED_ID].

Request 1 allowed? True
Request 2 allowed? False
Request 3 allowed? False
