In [None]:
#@title üéß Download Narration Audio & Play Introduction
import os as _os
if not _os.path.exists("/content/narration"):
    !pip install -q gdown
    import gdown
    gdown.download(id="1HXCxUPrYtInnadQfEbZ0gts10ZmgZl9x", output="/content/narration.zip", quiet=False)
    !unzip -q /content/narration.zip -d /content/narration
    !rm /content/narration.zip
    print(f"Loaded {len(_os.listdir('/content/narration'))} narration segments")
else:
    print("Narration audio already loaded.")

from IPython.display import Audio, display
display(Audio("/content/narration/00_intro.mp3"))

In [None]:
#@title 🎧 Listen: Setup
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/01_setup.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

In [None]:
# üîß Setup: Run this cell first!
# Check GPU availability and install dependencies

import torch
import sys

# Check GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("‚ö†Ô∏è No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime ‚Üí Change runtime type ‚Üí GPU")

print(f"\nüì¶ Python {sys.version.split()[0]}")
print(f"üî• PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"üé≤ Random seed set to {SEED}")

%matplotlib inline

In [None]:
#@title 🎧 Listen: Title Overview
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/02_title_overview.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

# 🚀 Context Failure Modes: When Good Models Get Bad Context

**Part 2 of the Vizuara Context Engineering Series**

In Part 1, we learned that context engineering is the art of filling an LLM's context window with exactly the right information. But what happens when we get it *wrong*?

In this notebook, we will get our hands dirty with Drew Breunig's taxonomy of four context failure modes. We will not just read about them — we will **simulate each one**, measure the damage, and build a diagnostic tool that can detect these failures in any LLM application.

**What you will learn:**
- How a single hallucination can poison an entire conversation (Context Poisoning)
- Why more context often makes models *worse*, not better (Context Distraction)
- How irrelevant information degrades model performance (Context Confusion)
- Why contradictory instructions cause catastrophic failures (Context Clash)
- How to build a Context Health Checker that diagnoses all four failure modes

**Time:** ~25 minutes
**Prerequisites:** Basic Python, familiarity with text similarity concepts
**API keys needed:** None! Everything runs locally with classical NLP.

In [None]:
#@title 🎧 Listen: Surgeon Analogy
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/03_surgeon_analogy.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

## 1. Why Does This Matter?

Imagine you are a brilliant surgeon about to operate. Your skills are world-class. But someone has:
- Swapped one of your X-rays with another patient's (poisoning)
- Buried the critical lab result on page 47 of a 60-page report (distraction)
- Included the patient's dental records, tax returns, and vacation photos in the file (confusion)
- Given you two conflicting surgical plans from two different doctors (clash)

Would you expect a good outcome? Of course not. **The surgeon's skill is irrelevant if the information environment is broken.**

This is exactly what happens to LLMs every day. The model might be GPT-4, Claude, or Gemini — it does not matter. Bad context produces bad outputs. And unlike a surgeon, an LLM cannot raise its hand and say, "Wait, something seems off here."

Let us build the tools to detect these failures before they cause damage.

In [None]:
#@title 🎧 Listen: Imports Setup
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/04_imports_setup.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

## 2. Building Intuition — Setup and Imports

Before we dive into the failure modes, let us set up our environment. We will use only standard scientific Python libraries — no API keys needed.

In [None]:
# Install any missing dependencies (all standard libraries)
!pip install -q numpy matplotlib scikit-learn

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Set consistent styling for all our plots
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'lines.linewidth': 2,
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'grid.alpha': 0.3,
})

# Color palette for our failure modes
COLORS = {
    'poisoning': '#e74c3c',    # Red
    'distraction': '#f39c12',  # Orange
    'confusion': '#9b59b6',    # Purple
    'clash': '#3498db',        # Blue
    'healthy': '#2ecc71',      # Green
}

print("Setup complete! All libraries loaded.")
print("No API keys needed ‚Äî everything runs locally.")

In [None]:
#@title 🎧 Listen: Math Foundations
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/05_math_foundations.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

## 3. The Mathematics — Measuring Context Quality

Before we simulate failures, we need a way to **measure** context quality. We will use two key metrics from information retrieval:

**TF-IDF (Term Frequency - Inverse Document Frequency)** converts text into numerical vectors based on word importance. Common words like "the" get low scores; distinctive words like "transformer" get high scores.

**Cosine Similarity** measures how similar two text vectors are, on a scale from 0 (completely different) to 1 (identical).

Think of it this way: if your query is "How does attention work in transformers?" and a context chunk talks about "self-attention mechanisms in transformer architectures," the cosine similarity will be high. If the chunk talks about "recipes for chocolate cake," the similarity will be near zero.

These two tools are our stethoscope — they let us diagnose context health without needing an actual LLM.

In [None]:
#@title 🎧 Listen: Measurement Functions
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/06_measurement_functions.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

In [None]:
def compute_relevance(query, documents):
    """
    Compute the relevance of each document to a query using TF-IDF + cosine similarity.

    This is the foundation of our context health analysis. In a real RAG system,
    you'd use neural embeddings — but TF-IDF captures the same intuition:
    relevant documents share important words with the query.

    Args:
        query: The question or task (string)
        documents: Context chunks (any sequence/iterable of strings; may be empty)

    Returns:
        similarities: 1-D array with one relevance score per document, in the
            same order as `documents`. Empty array when `documents` is empty.
    """
    # Materialise once so tuples/generators work and so we can test for emptiness.
    documents = list(documents)
    if not documents:
        # cosine_similarity rejects a zero-row matrix, so short-circuit here.
        return np.array([])

    vectorizer = TfidfVectorizer(stop_words='english')
    # Fit on query + documents together so they share one vocabulary and IDF weights.
    tfidf_matrix = vectorizer.fit_transform([query, *documents])

    # Cosine similarity between query (row 0) and each document (rows 1..n)
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return similarities


def signal_to_noise_ratio(query, documents):
    """
    Compute the signal-to-noise ratio of a context.

    Signal = average relevance of the top-K most relevant chunks (top 30%, at least 1)
    Noise = average relevance of everything else

    A healthy context has high SNR (lots of signal, little noise).
    A sick context has low SNR (noise drowns the signal).

    Args:
        query: The question or task (string)
        documents: List of context chunks (list of strings; may be empty)

    Returns:
        (snr, similarities): the ratio and the per-document relevance scores.
        For an empty context this is (0.0, empty array).
    """
    # Check for an empty context BEFORE scoring: the TF-IDF/cosine step cannot
    # handle zero documents, so a check placed after it would never be reached.
    if len(documents) == 0:
        return 0.0, np.array([])

    similarities = compute_relevance(query, documents)

    if len(similarities) == 0:
        return 0.0, similarities

    # Top 30% are "signal," rest is "noise"
    k = max(1, int(len(similarities) * 0.3))
    sorted_sims = np.sort(similarities)[::-1]

    signal = np.mean(sorted_sims[:k])
    # With nothing left over after the top-K, fall back to the noise floor.
    noise = np.mean(sorted_sims[k:]) if len(sorted_sims) > k else 0.001

    snr = signal / max(noise, 0.001)  # Floor the noise to avoid division by zero
    return snr, similarities


# Quick demo: score five chunks (three on-topic, two off-topic) against one query
demo_query = "How does the attention mechanism work in transformer models?"
demo_docs = [
    "The self-attention mechanism allows transformers to weigh the importance of different input tokens.",
    "Batch normalization normalizes layer inputs to stabilize training in deep networks.",
    "Attention computes query, key, and value matrices to determine token relationships.",
    "The recipe for chocolate cake requires flour, sugar, eggs, and cocoa powder.",
    "Multi-head attention runs several attention operations in parallel for richer representations.",
]

snr, sims = signal_to_noise_ratio(demo_query, demo_docs)
print(f"Query: '{demo_query[:60]}...'")
print(f"\nRelevance scores:")
for doc, sim in zip(demo_docs, sims):
    relevance = "RELEVANT" if sim > 0.1 else "NOISE"
    # Text bar chart: one block per 1/40 of similarity
    bar = "█" * int(sim * 40)
    print(f"  [{relevance:>8}] {sim:.3f} {bar} | {doc[:65]}...")
print(f"\nSignal-to-Noise Ratio: {snr:.2f}")
print(f"Interpretation: {'Healthy context' if snr > 3 else 'Context needs improvement' if snr > 1.5 else 'Noisy context!'}")

In [None]:
#@title 🎧 Listen: Poisoning Intro
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/07_poisoning_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

## 4. Let's Build It — Simulating the Four Failure Modes

Now for the main event. We will simulate each of Drew Breunig's four failure modes, measure the damage quantitatively, and visualize the results.

### Failure Mode 1: Context Poisoning

Context poisoning happens when a hallucination or error enters the context and gets referenced repeatedly. It is like a student writing the wrong formula on their reference sheet — every subsequent answer built on that formula will be wrong, and they will never know why.

**The real-world case:** Google built a Gemini-based agent to play Pokemon. The agent hallucinated about game state — it believed it had items it did not have, and objectives that did not exist. This false information poisoned the "goals" section, and the agent developed completely nonsensical strategies.

Let us simulate this. We will start with a clean, accurate context and inject a false fact. Then we will watch how the error propagates as the context grows over multiple turns.

In [None]:
#@title 🎧 Listen: Poisoning Simulation
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/08_poisoning_simulation.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

In [None]:
def simulate_poisoning(num_turns=10, poison_turn=2):
    """
    Model how one false statement degrades a growing conversation context.

    A fixed list of true neural-network facts is revealed turn by turn. On
    turn `poison_turn` a wrong statement is appended (standing in for a
    hallucination); on every later turn further statements derived from it
    are appended alongside the remaining true facts. This mirrors the Gemini
    Pokemon case: one hallucinated game state entry corrupted all downstream
    reasoning.

    Args:
        num_turns: Number of conversation turns to simulate.
        poison_turn: Zero-based turn at which the false fact enters.

    Returns:
        (context_accuracy, context_contents, poison_turn) where
        context_accuracy[i] is the fraction of turn i's context entries that
        are true facts, context_contents[i] is that turn's context (list of
        strings), and poison_turn is echoed back for plotting.
    """
    # Ground truth knowledge base
    true_facts = [
        "Transformers use self-attention to process sequences in parallel.",
        "The learning rate controls the step size during gradient descent.",
        "Batch normalization normalizes layer inputs for stable training.",
        "Dropout randomly deactivates neurons to prevent overfitting.",
        "Residual connections allow gradients to flow through deep networks.",
        "Adam optimizer combines momentum with adaptive learning rates.",
        "Layer normalization normalizes across features within a single example.",
        "Weight decay adds L2 regularization to prevent large weight values.",
        "Gradient clipping prevents exploding gradients in deep networks.",
        "Cross-entropy loss measures divergence between predicted and true distributions.",
    ]

    # The poison: a plausible-sounding but WRONG fact
    poison_fact = "Transformers process tokens sequentially from left to right like RNNs."

    # Follow-on statements that are only "true" if the poison is believed
    poison_derivatives = [
        "Since transformers are sequential, they cannot parallelize training across tokens.",
        "The left-to-right processing in transformers limits context to previous tokens only.",
        "Transformer training is slow because each token must wait for the previous one.",
        "Bidirectional context is impossible in transformers due to sequential processing.",
        "Scaling transformers requires reducing sequence length because of sequential bottleneck.",
    ]

    def _context_at(turn):
        # Before the poison: the context is a growing prefix of true facts.
        if turn < poison_turn:
            return true_facts[:turn + 2]

        clean_prefix = true_facts[:poison_turn + 1]
        if turn == poison_turn:
            # The turn on which the false fact is injected.
            return clean_prefix + [poison_fact]

        # After the poison: one more derivative per turn (capped at the list length),
        # plus the true facts that would have arrived in the meantime.
        derivative_count = min(turn - poison_turn, len(poison_derivatives))
        later_truths = true_facts[poison_turn + 1:turn]
        return (clean_prefix
                + [poison_fact]
                + poison_derivatives[:derivative_count]
                + later_truths)

    context_contents = [_context_at(turn) for turn in range(num_turns)]

    # Accuracy per turn: share of context entries that belong to the ground truth.
    context_accuracy = [
        sum(entry in true_facts for entry in snapshot) / len(snapshot)
        for snapshot in context_contents
    ]

    return context_accuracy, context_contents, poison_turn


# Run the simulation
accuracy_over_turns, contexts, poison_turn = simulate_poisoning(num_turns=12, poison_turn=3)

# Visualize the degradation: green bars before the poison, red from the poison turn onward
fig, ax = plt.subplots(figsize=(10, 5))

turns = range(len(accuracy_over_turns))
colors_per_turn = [COLORS['healthy'] if t < poison_turn else COLORS['poisoning'] for t in turns]

bars = ax.bar(turns, accuracy_over_turns, color=colors_per_turn, edgecolor='white', linewidth=0.5)
ax.axvline(x=poison_turn - 0.5, color=COLORS['poisoning'], linestyle='--', linewidth=2,
           label=f'Poison injected (turn {poison_turn})')

ax.set_xlabel('Conversation Turn')
ax.set_ylabel('Context Accuracy (fraction of true facts)')
ax.set_title('Context Poisoning: How One Hallucination Degrades Everything')
ax.set_ylim(0, 1.05)
ax.legend(fontsize=11)

# Annotate the key insight. Point at a bar a few turns after the poison, clamped to the
# last simulated turn so a shorter run does not index past the end of the list.
annot_turn = min(poison_turn + 4, len(accuracy_over_turns) - 1)
ax.annotate('Error compounds\nover time!',
            xy=(annot_turn, accuracy_over_turns[annot_turn]),
            xytext=(poison_turn + 5, 0.85),
            fontsize=11, color=COLORS['poisoning'], fontweight='bold',
            arrowprops=dict(arrowstyle='->', color=COLORS['poisoning'], lw=2))

plt.tight_layout()
plt.show()

print(f"\nContext accuracy BEFORE poisoning: {accuracy_over_turns[poison_turn-1]:.0%}")
print(f"Context accuracy AFTER poisoning (final turn): {accuracy_over_turns[-1]:.0%}")
print(f"Degradation: {accuracy_over_turns[poison_turn-1] - accuracy_over_turns[-1]:.0%} drop")
print(f"\n💡 Key insight: The poison doesn't just add one wrong fact —")
print(f"   it generates DERIVATIVE errors that compound over turns.")

In [None]:
#@title 🎧 Listen: Poisoning Reflection
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/09_poisoning_reflection.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

In [None]:
#@title 🎧 Listen: Distraction Reflection
from IPython.display import Audio, display
import os as _os

# NOTE(review): this clip is segment 12, but the cell sits before the distraction
# intro/simulation clips (segments 10 and 11) that appear later in the notebook.
# Confirm the intended cell order.
_f = "/content/narration/12_distraction_reflection.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

In [None]:
#@title 🎧 Listen: Confusion Reflection
from IPython.display import Audio, display
import os as _os

# NOTE(review): this clip is segment 15, but the cell sits before the confusion
# intro/simulation clips (segments 13 and 14) that appear later in the notebook.
# Confirm the intended cell order.
_f = "/content/narration/15_confusion_reflection.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

### 🤔 Think About It

Notice how the accuracy does not just drop by one fact — it **cascades**. Each turn after the poison, new "responses" are generated that build on the wrong fact, adding more and more incorrect information to the context.

This is exactly what happened with Google's Pokemon agent. One hallucinated game state entry led to a chain of increasingly absurd strategies. The agent was not "stupid" — it was reasoning perfectly logically from a poisoned premise.

**The lesson:** In any multi-turn LLM application, you need mechanisms to validate context entries against ground truth. We will build exactly that in our TODO section.

In [None]:
#@title 🎧 Listen: Distraction Intro
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/10_distraction_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

### Failure Mode 2: Context Distraction — "Lost in the Middle"

This failure mode is backed by one of the most cited LLM research papers of 2023: "Lost in the Middle" by Liu et al. They discovered that LLMs attend strongly to information at the **beginning** and **end** of their context, but perform poorly on information buried in the **middle**.

The implication is counterintuitive: **more context can make models worse**, not better. Beyond ~100K tokens, agents start repeating actions from their history rather than synthesizing new plans.

Let us recreate this famous U-shaped curve.

In [None]:
#@title 🎧 Listen: Distraction Simulation
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/11_distraction_simulation.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

In [None]:
def simulate_lost_in_the_middle(num_positions=20, num_trials=50):
    """
    Simulate the 'Lost in the Middle' effect (Liu et al. 2023).

    The scenario is a long context made of 'distractor' paragraphs plus one
    'target' paragraph containing the answer, with the target placed at each
    possible position in turn. No LLM is run and no text is actually
    processed: the retrieval score is a synthetic U-shaped function of the
    target's position (high at the beginning and end, low in the middle)
    plus Gaussian noise drawn from NumPy's global RNG, clipped to [0, 1].

    Args:
        num_positions: Number of slots the target can occupy.
        num_trials: Noisy samples drawn per position.

    Returns:
        (positions, retrieval_accuracies, retrieval_std):
        positions is np.arange(num_positions); the other two are lists holding
        the mean and standard deviation of the noisy scores at each position.
    """
    # --- Illustrative scenario only: these strings describe the task being modelled
    # --- and do not enter the computation below.
    # The target fact we want the model to find
    target = "The critical breakthrough in transformer efficiency was the introduction of flash attention, which reduces memory from quadratic to linear by computing attention in blocks."

    # Distractor paragraphs (plausible ML content that is NOT the answer)
    distractors = [
        "Neural networks use backpropagation to compute gradients through the chain rule.",
        "Convolutional neural networks apply learned filters to detect spatial features.",
        "Recurrent networks maintain hidden state across sequential time steps.",
        "Generative adversarial networks train a generator and discriminator in opposition.",
        "Variational autoencoders learn latent representations through probabilistic encoding.",
        "Graph neural networks propagate information along edges between nodes.",
        "Reinforcement learning agents maximize cumulative reward through trial and error.",
        "Transfer learning reuses pretrained features for downstream tasks with less data.",
        "Data augmentation artificially expands training sets through random transformations.",
        "Gradient descent iteratively minimizes the loss function by following the negative gradient.",
        "Batch processing groups multiple samples together for efficient parallel computation.",
        "Regularization techniques like weight decay prevent models from memorizing training data.",
        "Hyperparameter tuning searches for optimal model configuration across a defined space.",
        "Ensemble methods combine predictions from multiple models to reduce variance.",
        "Knowledge distillation transfers learned representations from large to small models.",
        "Curriculum learning presents training examples in order of increasing difficulty.",
        "Few-shot learning enables models to generalize from very limited labeled examples.",
        "Continual learning addresses catastrophic forgetting when training on new tasks.",
        "Self-supervised learning creates labels from the data itself without human annotation.",
        "Neural architecture search automates the design of network topologies.",
    ]

    query = "What was the critical breakthrough in transformer efficiency?"

    # --- Simulation: model the U-shaped attention pattern as a function of position.
    positions = np.arange(num_positions)
    # With a single position there is no span to normalise over; use 1 so the
    # normalised position is 0 instead of dividing by zero.
    position_span = max(num_positions - 1, 1)

    retrieval_accuracies = []
    retrieval_std = []

    for pos in positions:
        normalized_pos = pos / position_span  # 0 (start of context) to 1 (end)

        # U-shaped attention curve: a Gaussian bump at each edge on top of a
        # low baseline, capped at 1.0. It depends only on position, so it is
        # computed once per position rather than once per trial.
        attention_score = (
            0.9 * np.exp(-8 * (normalized_pos - 0)**2) +   # Beginning peak
            0.85 * np.exp(-8 * (normalized_pos - 1)**2) +  # End peak
            0.15                                              # Baseline
        )
        attention_score = min(attention_score, 1.0)

        trial_scores = []
        for _ in range(num_trials):
            # Add noise to simulate variability, then keep the score in [0, 1].
            noisy_score = attention_score + np.random.normal(0, 0.08)
            noisy_score = np.clip(noisy_score, 0, 1)

            # The noisy score itself is recorded (there is no pass/fail threshold).
            trial_scores.append(noisy_score)

        retrieval_accuracies.append(np.mean(trial_scores))
        retrieval_std.append(np.std(trial_scores))

    return positions, retrieval_accuracies, retrieval_std


# Run the simulation
positions, accuracies, stds = simulate_lost_in_the_middle(num_positions=20, num_trials=100)

# 📊 Visualization: The famous U-shaped curve (mean score per position, shaded by ±1 std)
fig, ax = plt.subplots(figsize=(10, 5))

ax.fill_between(positions,
                np.array(accuracies) - np.array(stds),
                np.array(accuracies) + np.array(stds),
                alpha=0.2, color=COLORS['distraction'])
ax.plot(positions, accuracies, 'o-', color=COLORS['distraction'], markersize=6, linewidth=2)

# Annotate the U-shape (indices 1, 10 and 18 assume the 20-position run above)
ax.annotate('Strong attention\nat beginning',
            xy=(1, accuracies[1]), xytext=(4, 0.95),
            fontsize=10, fontweight='bold', color='#27ae60',
            arrowprops=dict(arrowstyle='->', color='#27ae60', lw=1.5))

ax.annotate('"Lost in\nthe Middle"',
            xy=(10, accuracies[10]), xytext=(12, 0.35),
            fontsize=11, fontweight='bold', color=COLORS['poisoning'],
            arrowprops=dict(arrowstyle='->', color=COLORS['poisoning'], lw=2))

ax.annotate('Strong attention\nat end',
            xy=(18, accuracies[18]), xytext=(14, 0.92),
            fontsize=10, fontweight='bold', color='#27ae60',
            arrowprops=dict(arrowstyle='->', color='#27ae60', lw=1.5))

ax.set_xlabel('Position of Key Information in Context')
ax.set_ylabel('Retrieval Accuracy (simulated)')
ax.set_title('Context Distraction: The "Lost in the Middle" Effect (Liu et al. 2023)')
ax.set_ylim(0, 1.1)
ax.set_xticks(range(0, 20, 2))
ax.axhline(y=0.5, color='gray', linestyle=':', alpha=0.5, label='50% accuracy baseline')
ax.legend()

plt.tight_layout()
plt.show()

# Print the key finding
best_pos = positions[np.argmax(accuracies)]
worst_pos = positions[np.argmin(accuracies)]
print(f"\n📊 Results:")
print(f"  Best retrieval position:  {best_pos} (accuracy: {max(accuracies):.2f})")
print(f"  Worst retrieval position: {worst_pos} (accuracy: {min(accuracies):.2f})")
print(f"  Drop from best to worst:  {max(accuracies) - min(accuracies):.2f}")
print(f"\n💡 Key insight: Information placed in the middle of a long context")
print(f"   is up to {((max(accuracies) - min(accuracies)) / max(accuracies) * 100):.0f}% less likely to be retrieved correctly.")
print(f"   More context is NOT always better!")

### 🤔 Think About It

The U-shaped curve tells us something profound: LLMs are not uniform information processors. They have attention biases — strong at the edges, weak in the middle. This means:

1. **Put your most important information first** (beginning of context)
2. **Recency also helps** (end of context)
3. **Never bury critical facts in the middle** of a long context

This is why reranking matters in RAG pipelines — you want to push the most relevant chunks to the top of the context, not let them get lost in the middle of 50 retrieved documents.

In [None]:
#@title 🎧 Listen: Confusion Intro
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/13_confusion_intro.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

### Failure Mode 3: Context Confusion — When More Tools Means More Failure

This is perhaps the most counterintuitive failure mode: giving the model **more capabilities** can make it **worse**. Researchers tested a quantized Llama 3.1 8B and found it failed when given 46 tools but succeeded perfectly with only 19. The extra 27 tools — none of which were needed — confused the model enough to cause complete failure.

The analogy: imagine trying to cook dinner, but instead of just having the recipe and ingredients you need, someone has dumped every kitchen gadget from a professional restaurant onto your counter. The food processor, the sous vide machine, the commercial mixer, the pasta extruder — none of which you need for a simple stir-fry. The clutter itself causes mistakes.

Let us simulate this.

In [None]:
#@title 🎧 Listen: Confusion Simulation
from IPython.display import Audio, display
import os as _os

# Play this section's narration clip if it exists; otherwise point the reader at the download cell.
_f = "/content/narration/14_confusion_simulation.mp3"
if _os.path.exists(_f):
    display(Audio(_f))
else:
    print("Run the first cell to download narration audio.")

In [None]:
def simulate_context_confusion(max_tools=50, step=2, num_trials=30):
    """
    Simulate context confusion: irrelevant tool descriptions degrade tool selection.

    A fixed query is matched against a growing tool list. For every list size, each
    trial scores all tool descriptions by TF-IDF cosine similarity to the query,
    perturbs the scores with Gaussian noise whose scale grows with the number of
    irrelevant tools, and "selects" the three highest-scoring tools. Accuracy is the
    fraction of those three picks that belong to the relevant set.

    Args:
        max_tools: Upper bound on the total number of tools (relevant + irrelevant)
            in the largest configuration. Capped at the size of the built-in
            catalogue (5 relevant + 45 irrelevant tools).
        step: Number of irrelevant tools added between successive configurations.
        num_trials: Number of noisy selection trials per configuration.

    Returns:
        tool_counts: Total number of tools in each configuration.
        accuracies_mean: Mean selection accuracy per configuration.
        accuracies_std: Standard deviation of selection accuracy per configuration.

    Note:
        Noise is drawn from NumPy's global random state (np.random.normal), so the
        curve is only reproducible if that state was seeded beforehand.
    """
    # The query the simulated model has to serve
    query = "Calculate the total revenue from Q3 sales data and generate a bar chart."

    # Relevant tools (would actually help with this task)
    relevant_tools = [
        {"name": "calculate_sum", "desc": "Computes the sum of numerical values in a dataset column."},
        {"name": "filter_by_date", "desc": "Filters records to a specific date range like quarterly periods."},
        {"name": "create_bar_chart", "desc": "Generates a bar chart visualization from categorical data."},
        {"name": "load_csv", "desc": "Loads data from a CSV file into a structured table format."},
        {"name": "export_report", "desc": "Exports analysis results as a formatted PDF report."},
    ]

    # Irrelevant tools (noise: these have nothing to do with the task)
    irrelevant_tools = [
        {"name": "send_email", "desc": "Sends an email message to specified recipients."},
        {"name": "resize_image", "desc": "Resizes an image to specified pixel dimensions."},
        {"name": "translate_text", "desc": "Translates text from one language to another."},
        {"name": "compress_file", "desc": "Compresses files into a zip archive format."},
        {"name": "play_audio", "desc": "Plays an audio file through the system speakers."},
        {"name": "set_alarm", "desc": "Sets a timer or alarm for a specified time."},
        {"name": "check_weather", "desc": "Retrieves current weather conditions for a location."},
        {"name": "convert_currency", "desc": "Converts amounts between different currencies."},
        {"name": "merge_pdfs", "desc": "Combines multiple PDF documents into a single file."},
        {"name": "crop_video", "desc": "Trims a video file to a specified time range."},
        {"name": "spell_check", "desc": "Checks text for spelling errors and suggests corrections."},
        {"name": "generate_qr", "desc": "Creates a QR code from a URL or text string."},
        {"name": "scrape_webpage", "desc": "Extracts text content from a webpage URL."},
        {"name": "encrypt_data", "desc": "Encrypts data using AES-256 encryption algorithm."},
        {"name": "schedule_meeting", "desc": "Creates a calendar event with specified attendees."},
        {"name": "parse_xml", "desc": "Parses XML formatted data into a tree structure."},
        {"name": "bluetooth_scan", "desc": "Scans for nearby Bluetooth devices and lists them."},
        {"name": "manage_contacts", "desc": "Adds, updates, or deletes entries in a contact list."},
        {"name": "run_diagnostics", "desc": "Runs system hardware diagnostics and reports status."},
        {"name": "stream_video", "desc": "Streams video content from a URL to the display."},
        {"name": "backup_database", "desc": "Creates a backup copy of the entire database."},
        {"name": "calibrate_sensor", "desc": "Calibrates a connected sensor to baseline readings."},
        {"name": "print_document", "desc": "Sends a document to the default printer."},
        {"name": "defrag_disk", "desc": "Defragments the hard drive to improve read performance."},
        {"name": "monitor_network", "desc": "Monitors network traffic and bandwidth usage."},
        {"name": "update_firmware", "desc": "Flashes new firmware to a connected hardware device."},
        {"name": "record_screen", "desc": "Captures a video recording of the screen display."},
        {"name": "sync_cloud", "desc": "Synchronizes local files with cloud storage service."},
        {"name": "clean_cache", "desc": "Clears temporary cached files to free disk space."},
        {"name": "manage_users", "desc": "Administers user accounts and access permissions."},
        {"name": "analyze_logs", "desc": "Parses and summarizes system log files for errors."},
        {"name": "test_api", "desc": "Sends test requests to an API endpoint and validates responses."},
        {"name": "optimize_images", "desc": "Reduces image file sizes while preserving quality."},
        {"name": "migrate_database", "desc": "Transfers data between different database systems."},
        {"name": "configure_firewall", "desc": "Sets up network firewall rules for traffic filtering."},
        {"name": "convert_format", "desc": "Converts files between different document formats."},
        {"name": "track_inventory", "desc": "Manages and tracks physical inventory stock levels."},
        {"name": "process_payments", "desc": "Handles credit card and payment processing transactions."},
        {"name": "manage_dns", "desc": "Configures domain name system records for a domain."},
        {"name": "profile_code", "desc": "Analyzes code execution to find performance bottlenecks."},
        {"name": "scan_malware", "desc": "Scans files and directories for malicious software."},
        {"name": "generate_invoice", "desc": "Creates a formatted invoice document with line items."},
        {"name": "compress_video", "desc": "Reduces video file size using codec compression."},
        {"name": "manage_cron", "desc": "Schedules and manages recurring system tasks."},
        {"name": "audit_security", "desc": "Performs a security audit of system configurations."},
    ]

    top_k = 3  # The simulated model always selects its three best-scoring tools
    relevant_names = {t['name'] for t in relevant_tools}

    tool_counts = []
    accuracies_mean = []
    accuracies_std = []

    # Never request more distractors than the catalogue holds. Without the cap, a
    # large max_tools would keep raising the noise level (and append duplicate
    # tool counts) while the sliced tool list silently stopped growing.
    max_irrelevant = min(max_tools - len(relevant_tools), len(irrelevant_tools))

    # Test with increasing numbers of total tools
    for num_irrelevant in range(0, max_irrelevant + 1, step):
        total_tools = relevant_tools + irrelevant_tools[:num_irrelevant]
        tool_descs = [t['desc'] for t in total_tools]
        tool_names = [t['name'] for t in total_tools]

        # The clean TF-IDF similarities depend only on the tool list, not on the
        # trial, so they are computed once per configuration. Row 0 is the query.
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([query] + tool_descs)
        similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

        # Noise models the selector's uncertainty and grows with the distractor count
        noise_scale = 0.02 + 0.003 * num_irrelevant

        trial_accuracies = []
        for _ in range(num_trials):
            noisy_sims = similarities + np.random.normal(0, noise_scale, len(similarities))

            # Indices of the top-k noisy scores, best first
            selected_indices = np.argsort(noisy_sims)[::-1][:top_k]

            # Accuracy: fraction of selected tools that are actually relevant
            correct = sum(1 for i in selected_indices if tool_names[i] in relevant_names)
            trial_accuracies.append(correct / top_k)

        tool_counts.append(len(total_tools))
        accuracies_mean.append(np.mean(trial_accuracies))
        accuracies_std.append(np.std(trial_accuracies))

    return tool_counts, accuracies_mean, accuracies_std


# Run the simulation
tool_counts, acc_mean, acc_std = simulate_context_confusion(max_tools=50, step=2, num_trials=50)

# Arrays turn the +/- one-standard-deviation band into element-wise arithmetic.
acc_mean = np.array(acc_mean)
acc_std = np.array(acc_std)

# Simulated configurations closest to the 19- and 46-tool reference points.
# The simulated grid need not contain those exact counts, so every value read
# off the curve goes through these indices (always valid positions in tool_counts).
idx_19 = min(range(len(tool_counts)), key=lambda i: abs(tool_counts[i] - 19))
idx_46 = min(range(len(tool_counts)), key=lambda i: abs(tool_counts[i] - 46))

# Visualization: accuracy vs. number of tools
fig, ax = plt.subplots(figsize=(10, 5))

ax.fill_between(tool_counts, acc_mean - acc_std, acc_mean + acc_std,
                alpha=0.2, color=COLORS['confusion'])
ax.plot(tool_counts, acc_mean, 'o-', color=COLORS['confusion'], markersize=5, linewidth=2)

# Reference lines sit at the tool counts reported in the research
ax.axvline(x=19, color=COLORS['healthy'], linestyle='--', alpha=0.7, linewidth=2, label='19 tools (succeeds)')
ax.axvline(x=46, color=COLORS['poisoning'], linestyle='--', alpha=0.7, linewidth=2, label='46 tools (fails)')

ax.set_xlabel('Number of Tools in Context')
ax.set_ylabel('Tool Selection Accuracy')
ax.set_title('Context Confusion: More Tools = Worse Performance')
ax.set_ylim(0, 1.1)
ax.legend(fontsize=11)

# Each arrow targets the simulated point it reports (x and y from the same index),
# so the arrow head lands on the curve rather than beside it.
ax.annotate('Llama 3.1 8B succeeded\nwith 19 tools',
            xy=(tool_counts[idx_19], acc_mean[idx_19]),
            xytext=(22, 0.95),
            fontsize=10, fontweight='bold', color=COLORS['healthy'],
            arrowprops=dict(arrowstyle='->', color=COLORS['healthy'], lw=1.5))

ax.annotate('...but FAILED\nwith 46 tools',
            xy=(tool_counts[idx_46], acc_mean[idx_46]),
            xytext=(35, 0.3),
            fontsize=10, fontweight='bold', color=COLORS['poisoning'],
            arrowprops=dict(arrowstyle='->', color=COLORS['poisoning'], lw=1.5))

plt.tight_layout()
plt.show()

print(f"\nüìä Results:")
print(f"  Accuracy with ~19 tools: {acc_mean[idx_19]:.2f}")
print(f"  Accuracy with ~46 tools: {acc_mean[idx_46]:.2f}")
print(f"  Degradation: {(acc_mean[idx_19] - acc_mean[idx_46]) / acc_mean[idx_19] * 100:.0f}%")
print(f"\nüí° Key insight: The 27 extra tools weren't just useless ‚Äî they actively")
print(f"   HARMED performance. Less is more in context engineering.")

### 🤔 Think About It

This result has profound implications for anyone building AI agents. Every tool you add to your agent's toolkit has a cost — not just in tokens, but in **decision quality**. Each irrelevant tool is noise that the model must process alongside the signal.

The practical takeaway: curate your tool sets aggressively. Use **progressive disclosure** — only show the model tools that are relevant to the current step. An agent that dynamically loads 5 relevant tools will outperform one that permanently carries 50.

In [None]:
#@title üéß Listen: Clash Intro
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it.
_f = "/content/narration/16_clash_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

### Failure Mode 4: Context Clash — Contradictory Instructions

The final failure mode occurs when different parts of the context contain contradictory instructions. Microsoft and Salesforce found that "sharding" prompts — splitting instructions across multiple sections — dropped performance by 39% on average. The o3 model plummeted from 98.1% to 64.1% accuracy simply because instructions were fragmented.

The analogy: imagine your boss sends you two emails. The first says, "Respond to all customer complaints within 2 hours with a detailed technical explanation." The second says, "Keep all responses under 50 words and avoid technical jargon." You cannot satisfy both. The conflict itself is the failure — even before you start working.

In [None]:
#@title üéß Listen: Clash Simulation
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it.
_f = "/content/narration/17_clash_simulation.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

In [None]:
def simulate_context_clash():
    """
    Simulate context clash with pairs of contradictory instructions.

    Each pair holds two instructions that cannot both be satisfied. The TF-IDF
    cosine similarity between the two instructions of a pair is used as its
    "clash score": the more vocabulary the contradicting instructions share,
    the more confusing the pair is assumed to be. A simulated performance curve
    is then built by activating the pairs one after another, each one
    multiplying performance by a penalty factor.

    Returns:
        instruction_pairs: List of (instruction_a, instruction_b, category) tuples.
        clash_scores: Cosine similarity between A_i and B_i for each pair.
        categories: Category label of each pair.
        num_clash_levels: range(0, n + 1), the number of active clashing pairs.
        performance_scores: Simulated performance at each clash level
            (index 0 is the clash-free baseline).
        sim_matrix: Cosine-similarity matrix over all 2n instructions, ordered
            as all A instructions followed by all B instructions.
        n: Number of instruction pairs.
    """
    # Pairs of instructions: first is the "base", second is the "clash"
    instruction_pairs = [
        # (Instruction A, Instruction B, clash category)
        ("Always respond with detailed technical explanations including code examples.",
         "Keep all responses under 50 words. Never include code.",
         "Verbosity"),

        ("Use formal academic language with citations and references.",
         "Write casually like you're texting a friend. Use slang and emojis.",
         "Tone"),

        ("Never make claims without citing peer-reviewed sources.",
         "Be creative and speculative. Explore bold hypotheses freely.",
         "Rigor"),

        ("Process all data locally. Never send user data to external services.",
         "Use the cloud API to analyze user data for personalized recommendations.",
         "Privacy"),

        ("Always present multiple perspectives and let the user decide.",
         "Give direct, definitive answers. Users want confidence, not hedging.",
         "Decisiveness"),

        ("Prioritize speed. Give the fastest possible response.",
         "Take your time to think through every angle before responding.",
         "Speed"),

        ("Focus exclusively on the user's specific question. Stay on topic.",
         "Proactively provide related context, background, and tangential insights.",
         "Scope"),

        ("Always ask clarifying questions before proceeding.",
         "Never ask questions. Infer the user's intent and act immediately.",
         "Interaction"),
    ]

    # Build the full instruction set (as if all were in one system prompt)
    all_instructions_a = [pair[0] for pair in instruction_pairs]
    all_instructions_b = [pair[1] for pair in instruction_pairs]
    categories = [pair[2] for pair in instruction_pairs]

    # One shared vocabulary for every instruction; rows 0..n-1 are the A side,
    # rows n..2n-1 the B side. The full matrix is returned so callers can also
    # inspect cross-pair similarities.
    all_instructions = all_instructions_a + all_instructions_b
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(all_instructions)
    sim_matrix = cosine_similarity(tfidf_matrix)

    n = len(instruction_pairs)

    # Clash score of pair i = similarity between A_i and its counterpart B_i
    clash_scores = [sim_matrix[i, n + i] for i in range(n)]

    # Simulate performance degradation with increasing clashes
    num_clash_levels = range(0, n + 1)
    base_performance = 0.95  # Performance with clean, consistent instructions

    # Level 0 is the baseline; level k applies the penalties of the first k pairs.
    # Penalties compound multiplicatively, so a running product covers all levels.
    performance_scores = [base_performance]
    degradation = 1.0
    for score in clash_scores:
        # Higher similarity between contradicting instructions = worse confusion,
        # so the multiplier must shrink as the score grows:
        # 0.95 for unrelated wording (score 0) down to 0.85 for identical wording (score 1).
        clash_penalty = 0.95 - 0.10 * score
        degradation *= clash_penalty
        performance_scores.append(base_performance * degradation)

    return instruction_pairs, clash_scores, categories, num_clash_levels, performance_scores, sim_matrix, n


# Run the simulation
pairs, clash_scores, categories, clash_levels, perf_scores, sim_matrix, n = simulate_context_clash()

# Severity cut-offs on the clash score, shared by the bar colours and the printed report
HIGH_CLASH_THRESHOLD = 0.15
MEDIUM_CLASH_THRESHOLD = 0.08
PREVIEW_CHARS = 70  # Longest instruction prefix shown in the printed report

# Visualization: per-category clash scores (left) and simulated performance curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: Clash scores by category
ax1 = axes[0]
bars = ax1.barh(categories, clash_scores, color=COLORS['clash'], edgecolor='white')
ax1.set_xlabel('Contradiction Score (TF-IDF Similarity)')
ax1.set_title('Contradiction Strength by Category')
# Leave headroom to the right of the longest bar. Skipped when every score is
# zero, because identical lower and upper limits make matplotlib emit a warning.
x_max = max(clash_scores) * 1.3
if x_max > 0:
    ax1.set_xlim(0, x_max)

# Color bars by severity
for bar, score in zip(bars, clash_scores):
    if score > HIGH_CLASH_THRESHOLD:
        bar.set_color(COLORS['poisoning'])
    elif score > MEDIUM_CLASH_THRESHOLD:
        bar.set_color(COLORS['distraction'])
    else:
        bar.set_color(COLORS['clash'])

# Right panel: Performance vs number of clashing instruction pairs
ax2 = axes[1]
ax2.plot(list(clash_levels), perf_scores, 'o-', color=COLORS['clash'], markersize=8, linewidth=2)
ax2.fill_between(list(clash_levels), perf_scores, alpha=0.15, color=COLORS['clash'])

# Mark the Microsoft/Salesforce finding
ax2.axhline(y=0.641, color=COLORS['poisoning'], linestyle=':', alpha=0.7,
            label='o3 with fragmented instructions (64.1%)')
ax2.axhline(y=0.981, color=COLORS['healthy'], linestyle=':', alpha=0.7,
            label='o3 with unified instructions (98.1%)')

ax2.set_xlabel('Number of Contradictory Instruction Pairs')
ax2.set_ylabel('Simulated Task Performance')
ax2.set_title('Performance Degrades with More Clashes')
ax2.set_ylim(0, 1.05)
ax2.legend(fontsize=9, loc='lower left')

plt.tight_layout()
plt.show()

print("\nüìä Instruction Clash Analysis:")
for cat, score, (a, b, _) in zip(categories, clash_scores, pairs):
    severity = "HIGH" if score > HIGH_CLASH_THRESHOLD else "MEDIUM" if score > MEDIUM_CLASH_THRESHOLD else "LOW"
    # Only mark a preview with an ellipsis when text was actually cut off
    a_preview = a if len(a) <= PREVIEW_CHARS else a[:PREVIEW_CHARS] + "..."
    b_preview = b if len(b) <= PREVIEW_CHARS else b[:PREVIEW_CHARS] + "..."
    print(f"\n  [{severity:>6}] {cat} (score: {score:.3f})")
    print(f"    A: \"{a_preview}\"")
    print(f"    B: \"{b_preview}\"")

print(f"\nüí° Key insight: o3 dropped from 98.1% to 64.1% accuracy ‚Äî a 34-point")
print(f"   collapse ‚Äî just from fragmenting instructions. Unified > sharded.")

In [None]:
#@title üéß Listen: Your Turn Intro
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it.
_f = "/content/narration/18_your_turn_intro.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 5. Your Turn — Build the Detection Tools 🔧

Now it is your turn. You have seen the four failure modes in action. Let us build the tools to **detect** them automatically. These functions will form the core of our Context Health Checker.

In [None]:
#@title üéß Listen: Todo1
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it.
_f = "/content/narration/19_todo1.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

### TODO 1: Detect Context Poisoning

Your task: implement `detect_poisoning(context_statements, known_facts)` that checks if any statements in the context contradict known facts.

**Approach:**
1. Use TF-IDF to vectorize all statements and known facts together
2. For each context statement, find its most similar known fact
3. If a statement is *topically related* (similarity > 0.1) but *not a match* (similarity < 0.5), flag it as potentially poisoned — it is talking about the same topic but saying something different
4. Return a list of flagged statements with their confidence scores

In [None]:
def detect_poisoning(context_statements, known_facts, topic_threshold=0.1, match_threshold=0.5):
    """
    Detect potential context poisoning by comparing context statements
    against a knowledge base of known facts.

    A statement is flagged as potentially poisoned if its best-matching fact is:
    - topically related (similarity > topic_threshold), but
    - NOT a close match (similarity < match_threshold).
    This suggests it discusses the same topic but makes a different (possibly wrong) claim.

    Args:
        context_statements: List of statements currently in the context
        known_facts: List of verified true facts
        topic_threshold: Minimum similarity to be considered same-topic (default: 0.1)
        match_threshold: Minimum similarity to be considered a match (default: 0.5)

    Returns:
        List of dicts, one per context statement and in the same order, with keys
        'statement', 'closest_fact', 'similarity' (float) and 'flagged' (bool).
        An empty context yields an empty list. With no known facts there is
        nothing to contradict, so every statement is returned unflagged with
        'closest_fact' set to None and 'similarity' 0.0.
    """
    # ============ TODO ============
    context_statements = list(context_statements)
    known_facts = list(known_facts)

    # Empty inputs would otherwise crash inside cosine_similarity / np.argmax
    if not context_statements:
        return []
    if not known_facts:
        return [
            {'statement': statement, 'closest_fact': None, 'similarity': 0.0, 'flagged': False}
            for statement in context_statements
        ]

    # Step 1: Fit one TF-IDF vocabulary over statements and facts together so
    # both sides are embedded in the same vector space
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(context_statements + known_facts)

    # Step 2: The first n_context rows are the statements, the rest are the facts
    n_context = len(context_statements)
    context_vectors = tfidf_matrix[:n_context]
    fact_vectors = tfidf_matrix[n_context:]

    # Step 3: sim_matrix[i, j] = similarity of statement i to fact j
    sim_matrix = cosine_similarity(context_vectors, fact_vectors)

    # Step 4: For each context statement, find the most similar known fact
    results = []
    for statement, fact_similarities in zip(context_statements, sim_matrix):
        best_fact_idx = int(np.argmax(fact_similarities))
        # Plain Python scalars keep the result consistent with measure_confusion_score
        best_similarity = float(fact_similarities[best_fact_idx])

        # Step 5: Flag if topically related but not a close match
        flagged = bool(topic_threshold < best_similarity < match_threshold)

        results.append({
            'statement': statement,
            'closest_fact': known_facts[best_fact_idx],
            'similarity': best_similarity,
            'flagged': flagged
        })

    return results
    # ============ END TODO ============


# Try the detector on a small knowledge base with a mix of true, false and unrelated statements
known_facts = [
    "Transformers use self-attention to process sequences in parallel.",
    "The learning rate controls the step size during gradient descent.",
    "Dropout randomly deactivates neurons to prevent overfitting.",
    "Adam optimizer combines momentum with adaptive learning rates.",
    "Batch normalization normalizes layer inputs to stabilize training.",
]

# Expected outcome per statement is noted on the right
context = [
    "Transformers use self-attention to process sequences in parallel.",            # true      -> not flagged
    "Transformers process tokens sequentially from left to right like RNNs.",       # false     -> flagged
    "The learning rate controls the step size during gradient descent.",            # true      -> not flagged
    "Dropout makes all neurons always active to maximize network capacity.",        # false     -> flagged
    "Convolutional neural networks detect spatial features using learned filters.", # unrelated -> not flagged
]

results = detect_poisoning(context, known_facts)

print("üîç Poisoning Detection Results:")
print("=" * 80)
flagged_count = 0
for result in results:
    is_flagged = result['flagged']
    if is_flagged:
        status = "‚ö†Ô∏è  POISONED?"
        flagged_count += 1
    else:
        status = "‚úÖ OK"
    print(f"\n{status} (similarity: {result['similarity']:.3f})")
    print(f"  Statement: \"{result['statement'][:75]}\"")
    if is_flagged:
        # Show the fact the statement collides with
        print(f"  Closest fact: \"{result['closest_fact'][:75]}\"")
        print(f"  ‚Üí Talks about same topic but says something DIFFERENT!")

print(f"\nüìä Summary: {flagged_count}/{len(context)} statements flagged as potentially poisoned")

In [None]:
#@title üéß Listen: Todo2
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it.
_f = "/content/narration/20_todo2.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

### TODO 2: Measure Context Confusion Score

Your task: implement `measure_confusion_score(context_chunks, query)` that measures what fraction of the context is irrelevant to the query.

**Approach:**
1. Compute TF-IDF relevance of each chunk to the query
2. A chunk is "irrelevant" if its similarity to the query is below a threshold
3. The confusion score = (number of irrelevant chunks) / (total chunks)
4. A score of 0.0 means perfect context; 1.0 means entirely irrelevant noise

In [None]:
def measure_confusion_score(context_chunks, query, relevance_threshold=0.05):
    """
    Score how much of a context is noise with respect to a query.

    Every chunk is compared to the query by TF-IDF cosine similarity. Chunks
    scoring below `relevance_threshold` count as irrelevant, and the confusion
    score is the irrelevant share of all chunks.

    Args:
        context_chunks: List of text chunks in the current context
        query: The current query/task
        relevance_threshold: Minimum similarity for a chunk to count as relevant (default: 0.05)

    Returns:
        confusion_score: Float from 0.0 (all relevant) to 1.0 (all noise);
            0.0 when there are no chunks at all
        chunk_relevances: List of (chunk, relevance_score, is_relevant) tuples,
            in the order of `context_chunks`
    """
    # ============ TODO ============
    # Nothing in the context means nothing to be confused by
    if not context_chunks:
        return 0.0, []

    # Row 0 of the TF-IDF matrix is the query, the remaining rows are the chunks
    corpus = [query] + context_chunks
    tfidf = TfidfVectorizer(stop_words='english').fit_transform(corpus)
    scores = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()

    # Pair every chunk with its score and the relevant/irrelevant verdict
    chunk_relevances = [
        (chunk, float(score), score >= relevance_threshold)
        for chunk, score in zip(context_chunks, scores)
    ]

    # Confusion score = share of chunks that fell below the threshold
    noise_total = sum(1 for _, _, keep in chunk_relevances if not keep)
    confusion_score = noise_total / len(context_chunks)

    return confusion_score, chunk_relevances
    # ============ END TODO ============


# Try the scorer on a context that mixes on-topic and off-topic chunks
query = "How does the attention mechanism work in transformer models?"

chunks = [
    "Self-attention computes query, key, and value matrices to determine token relationships.",
    "Multi-head attention runs multiple attention operations in parallel.",
    "The recipe for chocolate cake requires flour, sugar, eggs, and cocoa powder.",
    "Attention scores are computed as softmax of the dot product of queries and keys.",
    "The 2024 Olympic Games were held in Paris, France.",
    "Stock market indices showed mixed results in the third quarter.",
    "Scaled dot-product attention divides by the square root of the key dimension.",
    "The best Italian restaurants in New York are located in Little Italy.",
    "Weather forecasts predict rain for the upcoming weekend.",
    "Positional encoding adds position information since attention is permutation-invariant.",
]

confusion, details = measure_confusion_score(chunks, query)

print(f"üîç Context Confusion Analysis for query:")
print(f"   \"{query}\"")
print("=" * 80)

noise_count = 0
for chunk_text, score, relevant in details:
    if relevant:
        status = "‚úÖ RELEVANT"
    else:
        status = "‚ùå NOISE"
        noise_count += 1
    # Text bar whose length is proportional to the relevance score
    bar = "‚ñà" * int(score * 60)
    print(f"\n  [{status}] relevance: {score:.3f} {bar}")
    print(f"    \"{chunk_text[:75]}\"")

print(f"\nüìä Confusion Score: {confusion:.1%}")
print(f"   {noise_count}/{len(chunks)} chunks are irrelevant noise")
if confusion <= 0.3:
    print(f"   ‚úÖ Context is reasonably focused. Good signal-to-noise ratio.")
elif confusion <= 0.5:
    print(f"   ‚ö†Ô∏è  CAUTION: Significant noise in context. Consider pruning irrelevant chunks.")
else:
    print(f"   ‚ö†Ô∏è  WARNING: More than half the context is noise! This will degrade performance.")

In [None]:
#@title üéß Listen: Todo3
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it.
_f = "/content/narration/21_todo3.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

### TODO 3: Detect Context Clashes

Your task: implement `detect_clashes(instructions)` that finds contradictory instruction pairs.

**Approach:**
1. Vectorize all instructions using TF-IDF
2. Compute pairwise cosine similarity between all instructions
3. Two instructions "clash" if they are topically related (moderate similarity) — this means they address the same concern but likely with different directives
4. Return the most likely clashing pairs

The key insight: truly contradictory instructions often share many of the same keywords (same topic) but use them in opposing ways. Pure keyword overlap (high similarity) is fine — that just means instructions are consistent. It is the moderate-similarity pairs that are suspicious.

In [None]:
def detect_clashes(instructions, clash_range=(0.08, 0.45)):
    """
    Detect potentially clashing instruction pairs.

    Instructions that are moderately similar (same topic, different directive)
    are the most likely to clash. Very high similarity = consistent/redundant.
    Very low similarity = different topics entirely (no clash risk).

    Args:
        instructions: List of instruction strings
        clash_range: (min, max) TF-IDF cosine-similarity range, inclusive on
            both ends, that suggests a clash

    Returns:
        clashes: List of dicts sorted by descending risk, each with keys
            'instruction_a', 'instruction_b': the two instruction strings
            'similarity': their cosine similarity (float)
            'risk': float in [0, 1]; 1.0 at the centre of clash_range, falling
                linearly to 0.0 at either edge
            'index_a', 'index_b': positions of the pair in `instructions`
                (index_a < index_b)
        Fewer than two instructions yields an empty list.
    """
    # ============ TODO ============
    if len(instructions) < 2:
        return []

    # Step 1: Vectorize all instructions
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(instructions)

    # Step 2: Compute pairwise similarities
    sim_matrix = cosine_similarity(tfidf_matrix)

    lo, hi = clash_range
    mid = (lo + hi) / 2
    half_width = mid - lo

    # Step 3: Find pairs in the clash range (upper triangle only: each pair once)
    clashes = []
    n = len(instructions)
    for i in range(n):
        for j in range(i + 1, n):
            sim = float(sim_matrix[i, j])
            if not (lo <= sim <= hi):
                continue

            if half_width > 0:
                # Risk peaks at the centre of the range and fades towards the edges
                risk = 1.0 - abs(sim - mid) / half_width
                risk = max(0.0, min(1.0, risk))
            else:
                # Degenerate range (min == max): the only admissible similarity
                # is the centre itself, so avoid dividing by zero
                risk = 1.0

            clashes.append({
                'instruction_a': instructions[i],
                'instruction_b': instructions[j],
                'similarity': sim,
                'risk': risk,
                'index_a': i,
                'index_b': j,
            })

    # Sort by risk (highest first); the sort is stable, so ties keep pair order
    clashes.sort(key=lambda x: x['risk'], reverse=True)
    return clashes
    # ============ END TODO ============


# Try the detector on five deliberately contradictory instruction pairs
instructions = [
    "Always respond with detailed technical explanations including code examples.",
    "Keep all responses under 50 words. Never include code.",
    "Use formal academic language with citations.",
    "Write casually like you're texting a friend.",
    "Never make claims without peer-reviewed sources.",
    "Be creative and speculative. Explore bold hypotheses.",
    "Focus exclusively on the user's question. Stay on topic.",
    "Proactively provide related context and tangential insights.",
    "Always ask clarifying questions before proceeding.",
    "Never ask questions. Infer intent and act immediately.",
]

clashes = detect_clashes(instructions)

print("üîç Context Clash Detection Results:")
print("=" * 80)

if clashes:
    # Report only the eight highest-risk pairs
    for rank, clash in enumerate(clashes[:8], start=1):
        risk = clash['risk']
        if risk > 0.7:
            risk_label, risk_color = "HIGH", "üî¥"
        elif risk > 0.4:
            risk_label, risk_color = "MEDIUM", "üü°"
        else:
            risk_label, risk_color = "LOW", "üü¢"
        print(f"\n{risk_color} Clash #{rank} [{risk_label}] (similarity: {clash['similarity']:.3f}, risk: {risk:.2f})")
        print(f"  A: \"{clash['instruction_a'][:75]}\"")
        print(f"  B: \"{clash['instruction_b'][:75]}\"")
else:
    print("  No clashes detected!")

print(f"\nüìä Summary: {len(clashes)} potential clashes detected in {len(instructions)} instructions")
if len(clashes) > 5:
    print(f"   ‚ö†Ô∏è  WARNING: Many conflicting instructions! Consider consolidating your prompt.")

In [None]:
#@title üéß Listen: Health Checker Class
import os as _os
from IPython.display import Audio, display

# Play this section's narration clip if the download cell has already fetched it.
_f = "/content/narration/22_health_checker_class.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 6. Putting It All Together — The Context Health Checker

Now let us combine all four detection tools into a single diagnostic class. This is the tool you will take with you — a context health checker that scores any LLM context on all four failure modes and gives actionable recommendations.

In [None]:
class ContextHealthChecker:
    """
    Diagnostic tool that scores an LLM context on four failure modes:
    1. Poisoning: false facts that compound over turns
    2. Distraction: important info buried in long context
    3. Confusion: irrelevant noise drowning the signal
    4. Clash: contradictory instructions

    Every check returns a dict with at least a 'score' (risk, 0.0 = none)
    and a 'status' that is one of 'HEALTHY', 'WARNING', 'CRITICAL' or
    'NO_DATA' (the check had nothing to look at).

    Usage:
        checker = ContextHealthChecker()
        report = checker.diagnose(context_chunks, query, known_facts, instructions)
        checker.print_report(report)
    """

    def __init__(self, poisoning_thresholds=(0.1, 0.5),
                 relevance_threshold=0.05,
                 clash_range=(0.08, 0.45)):
        """
        Store the thresholds that are forwarded to the detection helpers.

        Args:
            poisoning_thresholds: (topic_threshold, match_threshold) pair passed
                to detect_poisoning.
            relevance_threshold: cut-off passed to measure_confusion_score.
            clash_range: value passed to detect_clashes as its second argument.
        """
        self.topic_threshold, self.match_threshold = poisoning_thresholds
        self.relevance_threshold = relevance_threshold
        self.clash_range = clash_range

    def _check_poisoning(self, context_statements, known_facts):
        """
        Check for potential context poisoning.

        The score is the fraction of context statements that detect_poisoning
        flags. Returns a NO_DATA result when either list is empty.
        """
        if not known_facts or not context_statements:
            return {'score': 0.0, 'flagged': [], 'status': 'NO_DATA'}

        results = detect_poisoning(
            context_statements, known_facts,
            self.topic_threshold, self.match_threshold
        )
        flagged = [r for r in results if r['flagged']]
        # context_statements is non-empty here (see the guard above).
        score = len(flagged) / len(context_statements)
        return {
            'score': score,
            'flagged': flagged,
            'total_checked': len(context_statements),
            'status': 'CRITICAL' if score > 0.3 else 'WARNING' if score > 0.1 else 'HEALTHY'
        }

    def _check_distraction(self, context_chunks, query):
        """
        Check for the 'Lost in the Middle' risk.

        A base risk is derived from the number of chunks and is multiplied by
        1.5 (capped at 1.0) when the chunk that compute_relevance ranks highest
        sits strictly inside the middle 60% of the context.
        """
        if not context_chunks:
            return {'score': 0.0, 'status': 'NO_DATA'}

        num_chunks = len(context_chunks)

        # Risk increases with context length
        # Based on Liu et al.: problems start around 10+ chunks
        if num_chunks <= 5:
            risk = 0.1
        elif num_chunks <= 10:
            risk = 0.3
        elif num_chunks <= 20:
            risk = 0.6
        else:
            risk = min(0.95, 0.6 + 0.01 * (num_chunks - 20))

        # Locate the highest-scoring chunk. np.argmax returns a NumPy integer;
        # convert it so the report holds plain Python numbers.
        relevances = compute_relevance(query, context_chunks)
        top_idx = int(np.argmax(relevances))
        # max(..., 1) keeps a single-chunk context from dividing by zero.
        position_ratio = top_idx / max(num_chunks - 1, 1)

        # Is the most relevant chunk in the middle 60% (danger zone)?
        in_danger_zone = 0.2 < position_ratio < 0.8
        if in_danger_zone:
            risk = min(1.0, risk * 1.5)

        return {
            'score': risk,
            'num_chunks': num_chunks,
            'most_relevant_position': top_idx,
            'most_relevant_position_ratio': position_ratio,
            'in_danger_zone': in_danger_zone,
            'status': 'CRITICAL' if risk > 0.7 else 'WARNING' if risk > 0.4 else 'HEALTHY'
        }

    def _check_confusion(self, context_chunks, query):
        """
        Check for context confusion (irrelevant noise).

        'worst_offenders' holds up to three (chunk, score) pairs for the
        irrelevant chunks with the lowest scores.
        """
        if not context_chunks:
            return {'score': 0.0, 'status': 'NO_DATA'}

        confusion_score, chunk_details = measure_confusion_score(
            context_chunks, query, self.relevance_threshold
        )

        # Each detail unpacks to three values; the last is used as the
        # "is relevant" flag. NOTE(review): the middle value is assumed to be
        # the relevance score, lower meaning less relevant. Confirm against
        # measure_confusion_score.
        irrelevant = sorted(
            ((chunk, score) for chunk, score, is_relevant in chunk_details
             if not is_relevant),
            key=lambda pair: pair[1],
        )
        return {
            'score': confusion_score,
            'irrelevant_count': len(irrelevant),
            'total_chunks': len(context_chunks),
            # Sorted ascending above, so this really is the three lowest
            # scores rather than the first three in context order.
            'worst_offenders': irrelevant[:3],
            'status': 'CRITICAL' if confusion_score > 0.5 else 'WARNING' if confusion_score > 0.3 else 'HEALTHY'
        }

    def _check_clash(self, instructions):
        """
        Check for contradictory instructions.

        The score is twice the share of instruction pairs that detect_clashes
        reports, capped at 1.0. Fewer than two instructions yields NO_DATA.
        """
        if not instructions or len(instructions) < 2:
            return {'score': 0.0, 'clashes': [], 'status': 'NO_DATA'}

        clashes = detect_clashes(instructions, self.clash_range)

        # Score based on the number of clashes relative to all possible pairs
        if not clashes:
            score = 0.0
        else:
            max_possible = len(instructions) * (len(instructions) - 1) / 2
            score = min(1.0, len(clashes) / max_possible * 2)  # Scale up

        return {
            'score': score,
            'clashes': clashes,
            'num_clashes': len(clashes),
            'num_instructions': len(instructions),
            'status': 'CRITICAL' if score > 0.5 else 'WARNING' if score > 0.2 else 'HEALTHY'
        }

    def diagnose(self, context_chunks, query, known_facts=None, instructions=None):
        """
        Run a full diagnostic on the context.

        Args:
            context_chunks: List of text chunks in the current context
            query: The current query/task
            known_facts: Optional list of known-true facts for poisoning check
            instructions: Optional list of instructions for clash check

        Returns:
            Dictionary with results for each failure mode + overall health score.
            'overall_health' is 1.0 when every check reports NO_DATA.
        """
        poisoning = self._check_poisoning(context_chunks, known_facts or [])
        distraction = self._check_distraction(context_chunks, query)
        confusion = self._check_confusion(context_chunks, query)
        clash = self._check_clash(instructions or [])

        # Overall health: weighted average of (1 - risk). Checks without data
        # are left out so they do not pull the average toward "healthy".
        scores = []
        weights = []
        for check, weight in [(poisoning, 3), (distraction, 2), (confusion, 2), (clash, 3)]:
            if check['status'] != 'NO_DATA':
                scores.append(1.0 - check['score'])
                weights.append(weight)

        overall_health = np.average(scores, weights=weights) if scores else 1.0

        return {
            'poisoning': poisoning,
            'distraction': distraction,
            'confusion': confusion,
            'clash': clash,
            'overall_health': overall_health,
            'overall_status': ('CRITICAL' if overall_health < 0.5 else
                             'WARNING' if overall_health < 0.7 else 'HEALTHY'),
        }

    def print_report(self, report):
        """
        Print a formatted health report.

        Args:
            report: Dictionary in the shape returned by diagnose().
        """
        status_emoji = {
            'HEALTHY': '‚úÖ',
            'WARNING': '‚ö†Ô∏è ',
            'CRITICAL': 'üî¥',
            'NO_DATA': '‚¨ú'
        }
        needs_action = ('WARNING', 'CRITICAL')

        print("\n" + "=" * 60)
        print("       CONTEXT HEALTH REPORT")
        print("=" * 60)

        # Overall
        overall = report['overall_status']
        print(f"\n{status_emoji[overall]} Overall Health: {report['overall_health']:.0%} ({overall})")

        # Poisoning
        p = report['poisoning']
        print(f"\n{status_emoji[p['status']]} Poisoning Risk: {p['score']:.0%}")
        if p['status'] != 'NO_DATA' and p['flagged']:
            for entry in p['flagged'][:2]:
                print(f"    ‚Üí Suspicious: \"{entry['statement'][:60]}...\"")

        # Distraction
        d = report['distraction']
        print(f"\n{status_emoji[d['status']]} Distraction Risk: {d['score']:.0%}")
        if d['status'] != 'NO_DATA':
            print(f"    Context length: {d['num_chunks']} chunks")
            if d.get('in_danger_zone'):
                print(f"    ‚Üí Key info at position {d['most_relevant_position']} (DANGER ZONE!)")

        # Confusion
        c = report['confusion']
        print(f"\n{status_emoji[c['status']]} Confusion Score: {c['score']:.0%}")
        if c['status'] != 'NO_DATA':
            print(f"    {c['irrelevant_count']}/{c['total_chunks']} chunks are noise")

        # Clash
        cl = report['clash']
        print(f"\n{status_emoji[cl['status']]} Clash Risk: {cl['score']:.0%}")
        if cl['status'] != 'NO_DATA' and cl['clashes']:
            print(f"    {cl['num_clashes']} contradictory pairs found")

        # Recommendations (numbered by failure mode, so gaps are expected)
        print(f"\n{'‚îÄ' * 60}")
        print("RECOMMENDATIONS:")
        if p['status'] in needs_action:
            print("  1. Add fact-checking against a knowledge base")
            print("     before including LLM outputs in context.")
        if d['status'] in needs_action:
            print("  2. Shorten context or move key info to the")
            print("     beginning. Consider context compression.")
        if c['status'] in needs_action:
            print("  3. Prune irrelevant chunks. Use progressive")
            print("     disclosure ‚Äî only load what's needed now.")
        if cl['status'] in needs_action:
            print("  4. Consolidate contradictory instructions into")
            print("     a single, unified directive.")

        # The weighted average can still read HEALTHY while one check is
        # flagged; do not print "no major issues" underneath a recommendation.
        any_flagged = any(section['status'] in needs_action for section in (p, d, c, cl))
        if report['overall_status'] == 'HEALTHY' and not any_flagged:
            print("  Your context looks healthy! No major issues detected.")

        print("=" * 60)

In [None]:
#@title üéß Listen: Sick Context Demo
from IPython.display import Audio, display
import os as _os
# Show the player for this section's clip, or point the reader at the download cell.
_f = "/content/narration/23_sick_context_demo.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

Now let us run the health checker on a realistic scenario — a context that has all four problems:

In [None]:
# Build a deliberately "sick" context that exercises all four checks, then run the checker on it
query = "How do transformer models handle long-range dependencies?"

# Context chunks: mix of relevant, irrelevant, and poisoned content.
# The order matters: the distraction check looks at where the best-matching chunk sits.
context_chunks = [
    # Relevant
    "Self-attention allows each token to attend to every other token regardless of distance.",
    "Positional encoding adds position information since attention is permutation-invariant.",
    "Multi-head attention enables the model to jointly attend to information from different subspaces.",

    # Poisoned (wrong facts; they contradict the first two entries of known_facts below)
    "Transformers process tokens sequentially from left to right, limiting long-range dependency handling.",
    "Attention complexity is linear in sequence length, making transformers efficient for any length.",

    # Irrelevant noise
    "The best pizza restaurants in New York include Di Fara and Lucali.",
    "The 2024 Paris Olympics featured 329 events across 32 sports.",
    "Python was created by Guido van Rossum and released in 1991.",
    "Mount Everest stands at 8,849 meters above sea level.",

    # More relevant, but placed last in the list, after the noise
    "Flash attention reduces memory from quadratic to linear by computing attention in blocks.",
    "Sparse attention mechanisms attend to a subset of tokens to handle longer sequences.",
]

# Known-true facts for poisoning detection (the last two repeat relevant chunks verbatim)
known_facts = [
    "Transformers use self-attention to process ALL tokens in parallel, not sequentially.",
    "Standard attention has quadratic complexity O(n^2) in sequence length.",
    "Self-attention allows each token to attend to every other token regardless of distance.",
    "Positional encoding adds position information since attention is permutation-invariant.",
]

# Instructions with clashes, written as three opposing pairs
instructions = [
    "Provide detailed technical explanations with mathematical formulas.",
    "Keep responses brief and avoid technical jargon.",
    "Always cite specific research papers when making claims.",
    "Speak conversationally without academic references.",
    "Focus only on the user's specific question.",
    "Proactively explain related concepts and background.",
]

# Run the diagnostic with the checker's default thresholds
checker = ContextHealthChecker()
report = checker.diagnose(context_chunks, query, known_facts, instructions)
checker.print_report(report)

In [None]:
#@title üéß Listen: Dashboard
from IPython.display import Audio, display
import os as _os
# Show the player for this section's clip, or point the reader at the download cell.
_f = "/content/narration/24_dashboard.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 7. Final Output — The 2x2 Failure Mode Dashboard 🎯

Let us bring everything together into a single, comprehensive visualization. This dashboard shows all four failure modes side by side, making it easy to compare their effects and diagnose context problems at a glance.

In [None]:
def create_failure_mode_dashboard():
    """
    Draw, save, and show a 2x2 figure with one panel per context failure mode.

    Panels, in row-major order:
      1. Poisoning: per-turn accuracy bars from simulate_poisoning.
      2. Distraction: accuracy vs. position curve from simulate_lost_in_the_middle.
      3. Confusion: accuracy vs. tool count from simulate_context_confusion.
      4. Clash: heatmap of pairwise TF-IDF cosine similarity between eight
         sample instructions.

    The figure is written to 'context_failure_modes_dashboard.png' and a short
    legend for the panels is printed. Nothing is returned.
    """
    # Keyword sets shared by all four panels.
    axis_label_kw = {'fontsize': 10}
    panel_title_kw = {'fontsize': 13, 'fontweight': 'bold'}

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('The Four Context Failure Modes ‚Äî A Complete Diagnostic Dashboard',
                 fontsize=16, fontweight='bold', y=1.02)
    (poison_ax, distract_ax), (confuse_ax, clash_ax) = axes

    # Panel 1 (top-left): poisoning. Bars switch colour from the injection turn on.
    turn_accuracy, _, injected_at = simulate_poisoning(num_turns=12, poison_turn=3)
    turn_ids = range(len(turn_accuracy))
    bar_colors = []
    for turn in turn_ids:
        phase = 'healthy' if turn < injected_at else 'poisoning'
        bar_colors.append(COLORS[phase])
    poison_ax.bar(turn_ids, turn_accuracy, color=bar_colors, edgecolor='white', linewidth=0.5)
    # Dashed divider sits on the boundary just before the injection turn's bar.
    poison_ax.axvline(x=injected_at - 0.5, color='black', linestyle='--', linewidth=1.5, alpha=0.5)
    poison_ax.set_xlabel('Conversation Turn', **axis_label_kw)
    poison_ax.set_ylabel('Context Accuracy', **axis_label_kw)
    poison_ax.set_title('1. Context Poisoning', color=COLORS['poisoning'], **panel_title_kw)
    poison_ax.set_ylim(0, 1.05)
    poison_ax.text(injected_at + 0.3, 0.95, 'Poison\ninjected',
                   fontsize=9, color=COLORS['poisoning'], fontweight='bold')
    poison_ax.text(0.5, 0.02, 'Errors compound over turns',
                   fontsize=8, fontstyle='italic', color='gray',
                   transform=poison_ax.transAxes, ha='center')

    # Panel 2 (top-right): distraction. Mean curve with a +/- one-std band.
    positions, acc_litm, stds_litm = simulate_lost_in_the_middle(num_positions=20, num_trials=80)
    litm_mean = np.array(acc_litm)
    litm_spread = np.array(stds_litm)
    distract_ax.fill_between(positions, litm_mean - litm_spread, litm_mean + litm_spread,
                             alpha=0.15, color=COLORS['distraction'])
    distract_ax.plot(positions, acc_litm, 'o-', color=COLORS['distraction'], markersize=4, linewidth=2)
    distract_ax.set_xlabel('Position of Key Info in Context', **axis_label_kw)
    distract_ax.set_ylabel('Retrieval Accuracy', **axis_label_kw)
    distract_ax.set_title('2. Context Distraction', color=COLORS['distraction'], **panel_title_kw)
    distract_ax.set_ylim(0, 1.1)
    # Shaded band and caption mark the middle stretch of positions.
    distract_ax.axvspan(4, 15, alpha=0.08, color=COLORS['poisoning'])
    distract_ax.text(10, 0.15, '"Lost in\nthe Middle"',
                     fontsize=10, fontweight='bold', color=COLORS['poisoning'],
                     ha='center', fontstyle='italic')

    # Panel 3 (bottom-left): confusion. Accuracy against the number of tools.
    tool_cts, raw_acc_conf, raw_std_conf = simulate_context_confusion(max_tools=50, step=2, num_trials=40)
    conf_mean = np.array(raw_acc_conf)
    conf_spread = np.array(raw_std_conf)
    confuse_ax.fill_between(tool_cts, conf_mean - conf_spread, conf_mean + conf_spread,
                            alpha=0.15, color=COLORS['confusion'])
    confuse_ax.plot(tool_cts, conf_mean, 'o-', color=COLORS['confusion'], markersize=4, linewidth=2)
    # Two reference lines at 19 and 46 tools, each with a small numeric label.
    confuse_ax.axvline(x=19, color=COLORS['healthy'], linestyle='--', alpha=0.6, linewidth=1.5)
    confuse_ax.axvline(x=46, color=COLORS['poisoning'], linestyle='--', alpha=0.6, linewidth=1.5)
    confuse_ax.set_xlabel('Number of Tools in Context', **axis_label_kw)
    confuse_ax.set_ylabel('Tool Selection Accuracy', **axis_label_kw)
    confuse_ax.set_title('3. Context Confusion', color=COLORS['confusion'], **panel_title_kw)
    confuse_ax.set_ylim(0, 1.1)
    confuse_ax.text(17, 0.15, '19', fontsize=9, color=COLORS['healthy'], fontweight='bold', ha='center')
    confuse_ax.text(44, 0.15, '46', fontsize=9, color=COLORS['poisoning'], fontweight='bold', ha='center')

    # Panel 4 (bottom-right): clash. Similarity heatmap over sample instructions.
    short_labels = ['Detail', 'Brief', 'Formal', 'Casual', 'Cite', 'Speculate', 'Focus', 'Expand']
    instruction_texts = [
        "Always respond with detailed technical explanations including code examples.",
        "Keep all responses under 50 words. Never include code.",
        "Use formal academic language with citations and references.",
        "Write casually like you're texting a friend. Use slang and emojis.",
        "Never make claims without citing peer-reviewed sources.",
        "Be creative and speculative. Explore bold hypotheses freely.",
        "Focus exclusively on the user's specific question. Stay on topic.",
        "Proactively provide related context, background, and tangential insights.",
    ]
    tick_positions = range(len(short_labels))

    tfidf_mat = TfidfVectorizer(stop_words='english').fit_transform(instruction_texts)
    clash_matrix = cosine_similarity(tfidf_mat)
    # Self-similarity is always 1 and carries no information, so blank the diagonal.
    np.fill_diagonal(clash_matrix, np.nan)

    im = clash_ax.imshow(clash_matrix, cmap='YlOrRd', vmin=0, vmax=0.4, aspect='auto')
    clash_ax.set_xticks(tick_positions)
    clash_ax.set_yticks(tick_positions)
    clash_ax.set_xticklabels(short_labels, rotation=45, ha='right', fontsize=9)
    clash_ax.set_yticklabels(short_labels, fontsize=9)
    clash_ax.set_title('4. Context Clash', color=COLORS['clash'], **panel_title_kw)

    # Write the value into every off-diagonal cell above 0.1; stronger cells
    # get bold text, and white text once the background is dark enough.
    for (row, col), sim in np.ndenumerate(clash_matrix):
        if row == col or np.isnan(sim) or sim <= 0.1:
            continue
        weight = 'bold' if sim > 0.2 else 'normal'
        ink = 'white' if sim > 0.25 else 'black'
        clash_ax.text(col, row, f'{sim:.2f}', ha='center', va='center',
                      fontsize=7, fontweight=weight, color=ink)

    plt.colorbar(im, ax=clash_ax, label='Contradiction Score', shrink=0.8)

    plt.tight_layout()
    plt.savefig('context_failure_modes_dashboard.png', dpi=150, bbox_inches='tight',
                facecolor='white', edgecolor='none')
    plt.show()

    # Closing legend, one line per panel.
    legend_lines = (
        "\nüéØ Dashboard saved as 'context_failure_modes_dashboard.png'",
        "\nWhat each panel tells you:",
        "  1. POISONING  ‚Üí One wrong fact cascades into many wrong answers",
        "  2. DISTRACTION ‚Üí Information in the middle of long context gets lost",
        "  3. CONFUSION  ‚Üí More tools/info beyond what's needed HURTS performance",
        "  4. CLASH      ‚Üí Contradictory instructions cause unpredictable behavior",
    )
    for line in legend_lines:
        print(line)

# Generate the final dashboard
create_failure_mode_dashboard()

In [None]:
#@title üéß Listen: Reflection
from IPython.display import Audio, display
import os as _os
# Show the player for this section's clip, or point the reader at the download cell.
_f = "/content/narration/25_reflection.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

## 8. Reflection 💡

Let us step back and think about what we have built and learned.

### What We Built
1. **Context quality metrics** — TF-IDF similarity and signal-to-noise ratio to measure context health
2. **Four failure mode simulators** — each one demonstrates a real, documented failure pattern
3. **Three detection functions** — poisoning detection, confusion scoring, and clash detection
4. **A Context Health Checker** — a reusable diagnostic tool that combines all four checks

### The Four Failure Modes — Summary

| Failure Mode | What Goes Wrong | Real-World Example | Our Detection |
|---|---|---|---|
| **Poisoning** | False facts compound over turns | Gemini Pokemon agent | TF-IDF fact verification |
| **Distraction** | Key info lost in long context | "Lost in the Middle" (Liu 2023) | Position-based risk scoring |
| **Confusion** | Irrelevant noise drowns signal | Llama 3.1 fails at 46 tools | Relevance fraction scoring |
| **Clash** | Contradictory instructions | o3 drops from 98% to 64% | Pairwise similarity detection |

### Key Takeaways

1. **More is not always better.** More context, more tools, more instructions — all can make models *worse*, not better. Context engineering is as much about what you *exclude* as what you include.

2. **Position matters.** Where you place information in the context affects whether the model can find it. Put the most important information first.

3. **Errors compound.** A single hallucination does not just add one wrong answer — it generates derivative errors that accumulate over turns. Validate context entries against ground truth.

4. **Contradictions are catastrophic.** Fragmenting or contradicting instructions causes disproportionate damage. Unified, consistent instructions dramatically outperform scattered ones.

### What's Next?

In Part 3 of this series, we will learn the **four core strategies** for engineering context well — Write, Select, Compress, and Isolate — and build tools that implement each one. If this notebook taught you what can go wrong, the next one teaches you how to get it right.

In [None]:
#@title üéß Listen: Closing
from IPython.display import Audio, display
import os as _os
# Show the player for this section's clip, or point the reader at the download cell.
_f = "/content/narration/26_closing.mp3"
if not _os.path.exists(_f):
    print("Run the first cell to download narration audio.")
else:
    display(Audio(_f))

In [None]:
# Completion banner: prints a static recap of what this notebook built.
# Note: this cell only prints text; it does not check that earlier cells ran.
print("=" * 60)
print("  ‚úÖ NOTEBOOK COMPLETE ‚Äî Context Failure Modes")
print("=" * 60)
print()
print("  What we built:")
print("    ‚Ä¢ compute_relevance()        ‚Äî TF-IDF context scoring")
print("    ‚Ä¢ signal_to_noise_ratio()    ‚Äî SNR measurement")
print("    ‚Ä¢ simulate_poisoning()       ‚Äî Failure mode 1 simulator")
# Name matches the function the dashboard actually calls.
print("    ‚Ä¢ simulate_lost_in_the_middle() ‚Äî Failure mode 2 simulator")
print("    ‚Ä¢ simulate_context_confusion() ‚Äî Failure mode 3 simulator")
print("    ‚Ä¢ simulate_context_clash()   ‚Äî Failure mode 4 simulator")
print("    ‚Ä¢ detect_poisoning()         ‚Äî TODO 1 ‚úì")
print("    ‚Ä¢ measure_confusion_score()  ‚Äî TODO 2 ‚úì")
print("    ‚Ä¢ detect_clashes()           ‚Äî TODO 3 ‚úì")
print("    ‚Ä¢ ContextHealthChecker       ‚Äî Full diagnostic class")
print()
print("  Visualizations generated:")
print("    ‚Ä¢ Poisoning degradation curve")
print("    ‚Ä¢ Lost in the Middle U-curve")
print("    ‚Ä¢ Tool confusion accuracy plot")
print("    ‚Ä¢ Instruction clash heatmap")
print("    ‚Ä¢ 2x2 failure mode dashboard")
print()
print("  No API keys were needed ‚Äî everything ran locally!")
print("  Time estimate: ~25 minutes")
print()
print("  ‚Üí Next: Part 3 ‚Äî Four Core Strategies (Write, Select,")
print("    Compress, Isolate)")
print("=" * 60)

In [None]:
#@title üí¨ AI Teaching Assistant ‚Äî Click ‚ñ∂ to start
#@markdown This AI chatbot reads your notebook and can answer questions about any concept, code, or exercise.

import json as _json
import requests as _requests
from google.colab import output as _output
from IPython.display import display, HTML as _HTML, Markdown as _Markdown

# --- Read notebook content for context ---
def _get_notebook_context():
    try:
        from google.colab import _message
        nb = _message.blocking_request("get_ipynb", request="", timeout_sec=10)
        cells = nb.get("ipynb", {}).get("cells", [])
        parts = []
        for cell in cells:
            src = "".join(cell.get("source", []))
            tags = cell.get("metadata", {}).get("tags", [])
            if "chatbot" in tags:
                continue
            if src.strip():
                ct = cell.get("cell_type", "unknown")
                parts.append(f"[{ct.upper()}]\n{src}")
        return "\n\n---\n\n".join(parts)
    except Exception:
        return "Notebook content unavailable."

# Notebook text captured once, when this cell runs; sent with every question.
_NOTEBOOK_CONTEXT = _get_notebook_context()
# Running transcript of {'role', 'content'} messages appended by _notebook_chat.
_CHAT_HISTORY = []
# Endpoint that _notebook_chat POSTs each question to.
_API_URL = "https://course-creator-brown.vercel.app/api/chat"

def _notebook_chat(question):
    global _CHAT_HISTORY
    try:
        resp = _requests.post(_API_URL, json={
            'question': question,
            'context': _NOTEBOOK_CONTEXT[:100000],
            'history': _CHAT_HISTORY[-10:],
        }, timeout=60)
        data = resp.json()
        answer = data.get('answer', 'Sorry, I could not generate a response.')
        _CHAT_HISTORY.append({'role': 'user', 'content': question})
        _CHAT_HISTORY.append({'role': 'assistant', 'content': answer})
        return answer
    except Exception as e:
        return f'Error connecting to teaching assistant: {str(e)}'

# Register _notebook_chat under the name 'notebook_chat' with google.colab.output
# (presumably so the chat widget's JavaScript can invoke it; confirm against the widget code).
_output.register_callback('notebook_chat', _notebook_chat)

def ask(question):
    """Put a question to the teaching assistant and render the reply as Markdown.

    Intended for use from any notebook cell. The reply is displayed in the
    cell output; nothing is returned.
    """
    display(_Markdown(_notebook_chat(question)))

# Confirm setup in the cell output and name the two ways to ask a question:
# the chat widget rendered below, or ask(...) from any cell.
print("\u2705 AI Teaching Assistant is ready!")
print("\U0001f4a1 Use the chat below, or call ask(\'your question\') in any cell.")

# --- Display chat widget ---
# Self-contained HTML/CSS/JS chat UI. Questions are sent to the kernel through
# the 'notebook_chat' callback and the reply is rendered as lightweight
# Markdown.
#
# The literal is a RAW string on purpose: the JavaScript contains regex escapes
# (\n, \w, \s, \d, \*). In a normal string Python would turn each \n into a
# real line break inside a regex literal, which is a JS syntax error that
# stops the whole script from loading.
#
# NOTE(review): the widget reads r.data['application/json'], while the
# registered callback returns a plain str. Colab's own examples return
# IPython.display.JSON for that key -- confirm a str is delivered there too.
display(_HTML(r'''<style>
  .vc-wrap{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;max-width:100%;border-radius:16px;overflow:hidden;box-shadow:0 4px 24px rgba(0,0,0,.12);background:#fff;border:1px solid #e5e7eb}
  .vc-hdr{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);color:#fff;padding:16px 20px;display:flex;align-items:center;gap:12px}
  .vc-avatar{width:42px;height:42px;background:rgba(255,255,255,.2);border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:22px}
  .vc-hdr h3{font-size:16px;font-weight:600;margin:0}
  .vc-hdr p{font-size:12px;opacity:.85;margin:2px 0 0}
  .vc-msgs{height:420px;overflow-y:auto;padding:16px;background:#f8f9fb;display:flex;flex-direction:column;gap:10px}
  .vc-msg{display:flex;flex-direction:column;animation:vc-fade .25s ease}
  .vc-msg.user{align-items:flex-end}
  .vc-msg.bot{align-items:flex-start}
  .vc-bbl{max-width:85%;padding:10px 14px;border-radius:16px;font-size:14px;line-height:1.55;word-wrap:break-word}
  .vc-msg.user .vc-bbl{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);color:#fff;border-bottom-right-radius:4px}
  .vc-msg.bot .vc-bbl{background:#fff;color:#1a1a2e;border:1px solid #e8e8e8;border-bottom-left-radius:4px}
  .vc-bbl code{background:rgba(0,0,0,.07);padding:2px 6px;border-radius:4px;font-size:13px;font-family:'Fira Code',monospace}
  .vc-bbl pre{background:#1e1e2e;color:#cdd6f4;padding:12px;border-radius:8px;overflow-x:auto;margin:8px 0;font-size:13px}
  .vc-bbl pre code{background:none;padding:0;color:inherit}
  .vc-bbl h3,.vc-bbl h4{margin:10px 0 4px;font-size:15px}
  .vc-bbl ul,.vc-bbl ol{margin:4px 0;padding-left:20px}
  .vc-bbl li{margin:2px 0}
  .vc-chips{display:flex;flex-wrap:wrap;gap:8px;padding:0 16px 12px;background:#f8f9fb}
  .vc-chip{background:#fff;border:1px solid #d1d5db;border-radius:20px;padding:6px 14px;font-size:12px;cursor:pointer;transition:all .15s;color:#4b5563}
  .vc-chip:hover{border-color:#667eea;color:#667eea;background:#f0f0ff}
  .vc-input{display:flex;padding:12px 16px;background:#fff;border-top:1px solid #eee;gap:8px}
  .vc-input input{flex:1;padding:10px 16px;border:2px solid #e8e8e8;border-radius:24px;font-size:14px;outline:none;transition:border-color .2s}
  .vc-input input:focus{border-color:#667eea}
  .vc-input button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);color:#fff;border:none;border-radius:50%;width:42px;height:42px;cursor:pointer;display:flex;align-items:center;justify-content:center;font-size:18px;transition:transform .1s}
  .vc-input button:hover{transform:scale(1.05)}
  .vc-input button:disabled{opacity:.5;cursor:not-allowed;transform:none}
  .vc-typing{display:flex;gap:5px;padding:4px 0}
  .vc-typing span{width:8px;height:8px;background:#667eea;border-radius:50%;animation:vc-bounce 1.4s infinite ease-in-out}
  .vc-typing span:nth-child(2){animation-delay:.2s}
  .vc-typing span:nth-child(3){animation-delay:.4s}
  @keyframes vc-bounce{0%,80%,100%{transform:scale(0)}40%{transform:scale(1)}}
  @keyframes vc-fade{from{opacity:0;transform:translateY(8px)}to{opacity:1;transform:translateY(0)}}
  .vc-note{text-align:center;font-size:11px;color:#9ca3af;padding:8px 16px 12px;background:#fff}
</style>
<div class="vc-wrap">
  <div class="vc-hdr">
    <div class="vc-avatar">&#129302;</div>
    <div>
      <h3>Vizuara Teaching Assistant</h3>
      <p>Ask me anything about this notebook</p>
    </div>
  </div>
  <div class="vc-msgs" id="vcMsgs">
    <div class="vc-msg bot">
      <div class="vc-bbl">&#128075; Hi! I've read through this entire notebook. Ask me about any concept, code block, or exercise &mdash; I'm here to help you learn!</div>
    </div>
  </div>
  <div class="vc-chips" id="vcChips">
    <span class="vc-chip" onclick="vcAsk(this.textContent)">Explain the main concept</span>
    <span class="vc-chip" onclick="vcAsk(this.textContent)">Help with the TODO exercise</span>
    <span class="vc-chip" onclick="vcAsk(this.textContent)">Summarize what I learned</span>
  </div>
  <div class="vc-input">
    <input type="text" id="vcIn" placeholder="Ask about concepts, code, exercises..." />
    <button id="vcSend" onclick="vcSendMsg()">&#10148;</button>
  </div>
  <div class="vc-note">AI-generated &middot; Verify important information &middot; <a href="#" onclick="vcClear();return false" style="color:#667eea">Clear chat</a></div>
</div>
<script>
(function(){
  var msgs=document.getElementById('vcMsgs'),inp=document.getElementById('vcIn'),
      btn=document.getElementById('vcSend'),chips=document.getElementById('vcChips');

  function esc(s){var d=document.createElement('div');d.textContent=s;return d.innerHTML}

  function md(t){
    // Code is rendered first and parked behind private-use placeholders so the
    // emphasis/heading/list rules below never rewrite what is inside it.
    var held=[];
    function hold(html){held.push(html);return '\uE000'+(held.length-1)+'\uE000'}
    var s=t.replace(/\uE000/g,'')
      .replace(/```(\w*)\n([\s\S]*?)```/g,function(_,l,c){return hold('<pre><code>'+esc(c)+'</code></pre>')})
      .replace(/`([^`]+)`/g,function(_,c){return hold('<code>'+esc(c)+'</code>')});
    // Everything outside code is HTML-escaped before markup is added, so text
    // such as "a < b" or "<div>" in an answer is shown literally.
    return esc(s)
      .replace(/\*\*([^*]+)\*\*/g,'<strong>$1</strong>')
      .replace(/\*([^*]+)\*/g,'<em>$1</em>')
      .replace(/^#### (.+)$/gm,'<h4>$1</h4>')
      .replace(/^### (.+)$/gm,'<h4>$1</h4>')
      .replace(/^## (.+)$/gm,'<h3>$1</h3>')
      .replace(/^\d+\. (.+)$/gm,'<li>$1</li>')
      .replace(/^- (.+)$/gm,'<li>$1</li>')
      .replace(/\n\n/g,'<br><br>')
      .replace(/\n/g,'<br>')
      .replace(/\uE000(\d+)\uE000/g,function(_,i){return held[+i]});
  }

  function addMsg(text,isUser){
    var m=document.createElement('div');m.className='vc-msg '+(isUser?'user':'bot');
    var b=document.createElement('div');b.className='vc-bbl';
    b.innerHTML=isUser?esc(text):md(text);
    m.appendChild(b);msgs.appendChild(m);msgs.scrollTop=msgs.scrollHeight;
  }

  function showTyping(){
    var m=document.createElement('div');m.className='vc-msg bot';m.id='vcTyping';
    m.innerHTML='<div class="vc-bbl"><div class="vc-typing"><span></span><span></span><span></span></div></div>';
    msgs.appendChild(m);msgs.scrollTop=msgs.scrollHeight;
  }

  function hideTyping(){var e=document.getElementById('vcTyping');if(e)e.remove()}

  window.vcSendMsg=function(){
    // btn.disabled doubles as the "request in flight" flag, so pressing Enter
    // cannot start a second request while one is pending.
    var q=inp.value.trim();if(!q||btn.disabled)return;
    inp.value='';chips.style.display='none';
    addMsg(q,true);showTyping();btn.disabled=true;
    google.colab.kernel.invokeFunction('notebook_chat',[q],{})
      .then(function(r){
        hideTyping();
        var a=r.data['application/json'];
        addMsg(typeof a==='string'?a:JSON.stringify(a),false);
      })
      .catch(function(){
        hideTyping();
        addMsg('Sorry, I encountered an error. Please check your internet connection and try again.',false);
      })
      .finally(function(){btn.disabled=false;inp.focus()});
  };

  window.vcAsk=function(q){inp.value=q;vcSendMsg()};
  window.vcClear=function(){
    msgs.innerHTML='<div class="vc-msg bot"><div class="vc-bbl">&#128075; Chat cleared. Ask me anything!</div></div>';
    chips.style.display='flex';
  };

  inp.addEventListener('keypress',function(e){if(e.key==='Enter')vcSendMsg()});
  inp.focus();
})();
</script>'''))