# ClipCard Simulation: AI/ML Governance

**Purpose:** This notebook runs a Monte Carlo simulation to estimate the effectiveness of ClipCard parameters for AI/ML model releases.

It simulates model releases and checks whether randomly generated metrics (such as `pii_leak_count` or `jailbreak_rate`) would trigger the `kill_criteria` defined in a ClipCard. This helps quantify the risk reduction provided by the card's specific configuration.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob

# Load the example ClipCard from the examples directory.
# If the file is absent, fall back to a small synthetic card so the notebook
# still runs end to end.
EXAMPLE_CARD_PATH = '../../examples/test.clipcard.json'

try:
    # JSON is UTF-8 by specification; passing the encoding explicitly avoids
    # locale-dependent decoding (e.g. cp1252 on Windows).
    with open(EXAMPLE_CARD_PATH, encoding='utf-8') as f:
        example_card = json.load(f)
except FileNotFoundError:
    print("No example ClipCard found. Using synthetic data.")
    example_card = {
        "id": "SIM-AIML-001",
        "kill_criteria": [
            {"condition": "pii_leak_count >= 1", "action": "Rollback immediately"},
            {"condition": "jailbreak_rate > 0.005", "action": "Reduce cohort to 1%"}
        ],
        "authority_window": {"scope_limit": "5% cohort", "time_limit": "30d"}
    }
else:
    # Only the file read is guarded by the try block. A card that lacks an
    # "id" field should not abort the notebook just because of this message.
    print(f"Loaded example ClipCard: {example_card.get('id', '<missing id>')}")

### Simulation Parameters

Configure simulation runs and metric distributions

In [None]:
# Simulation configuration
SEED = 42             # Fixed seed so Restart & Run All reproduces the same numbers
N_SIMULATIONS = 1000  # Number of release simulations
# NOTE(review): in the cells visible here COHORT_SIZE is only echoed in printed
# summaries; the simulation cell does not scale its interaction count by it.
COHORT_SIZE = 0.05    # 5% of users
BASE_PII_LEAK_RATE = 0.0001  # Base probability of PII leak per interaction
BASE_JAILBREAK_RATE = 0.003   # Base jailbreak rate (offset added to the sampled noise)

# The simulation draws from NumPy's global RNG (np.random.*), so seed that
# generator here. Re-run this cell before re-running the simulation cell to
# get the same random stream again.
np.random.seed(SEED)

print(f"Running {N_SIMULATIONS} simulations...")
print(f"Cohort size: {COHORT_SIZE*100}%")
print(f"Base PII leak rate: {BASE_PII_LEAK_RATE}")
print(f"Base jailbreak rate: {BASE_JAILBREAK_RATE}")
print(f"Random seed: {SEED}")

### Monte Carlo Simulation

Run simulations and track kill criteria triggers

In [None]:
# Kill thresholds used by the checks below. They mirror the conditions of the
# synthetic fallback ClipCard ("pii_leak_count >= 1", "jailbreak_rate > 0.005").
# They are NOT parsed from example_card, so adjust them by hand if a loaded
# card defines different limits.
PII_LEAK_KILL_THRESHOLD = 1
JAILBREAK_KILL_THRESHOLD = 0.005

# Poisson mean for the number of interactions in one simulated 24-hour period.
MEAN_DAILY_INTERACTIONS = 10000

results = []

for i in range(N_SIMULATIONS):
    # Simulate metrics for a 24-hour period
    interactions = np.random.poisson(MEAN_DAILY_INTERACTIONS)  # Number of interactions

    # Simulate PII leaks (Poisson process): expected leaks = per-interaction
    # leak probability times the number of interactions.
    pii_leaks = np.random.poisson(BASE_PII_LEAK_RATE * interactions)

    # Simulate jailbreak rate: Beta(2, 500) noise (mean 2/502, roughly 0.004)
    # shifted up by the configured base rate.
    jailbreak_rate = np.random.beta(2, 500) + BASE_JAILBREAK_RATE

    # Check kill criteria
    pii_trigger = pii_leaks >= PII_LEAK_KILL_THRESHOLD
    jailbreak_trigger = jailbreak_rate > JAILBREAK_KILL_THRESHOLD
    any_trigger = pii_trigger or jailbreak_trigger

    results.append({
        'sim_id': i,
        'interactions': interactions,
        'pii_leaks': pii_leaks,
        'jailbreak_rate': jailbreak_rate,
        'pii_trigger': pii_trigger,
        'jailbreak_trigger': jailbreak_trigger,
        'any_trigger': any_trigger
    })

# Build the frame once from the collected records (one row per simulation).
df = pd.DataFrame(results)
assert len(df) == N_SIMULATIONS, "expected one result row per simulation"

print("\nSimulation complete!")
print(f"PII leak triggers: {df['pii_trigger'].sum()} ({df['pii_trigger'].mean()*100:.1f}%)")
print(f"Jailbreak triggers: {df['jailbreak_trigger'].sum()} ({df['jailbreak_trigger'].mean()*100:.1f}%)")
print(f"Any trigger: {df['any_trigger'].sum()} ({df['any_trigger'].mean()*100:.1f}%)")

### Visualization

Plot trigger distributions and risk reduction

In [None]:
# One row of three panels: two metric histograms plus a trigger-count bar chart.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax_pii, ax_jailbreak, ax_triggers = axes

# Both histogram panels share the same recipe; only the column, the kill
# threshold marker and the labels differ.
histogram_panels = [
    (ax_pii, 'pii_leaks', 1, 'PII Leaks per 24h', 'PII Leak Distribution'),
    (ax_jailbreak, 'jailbreak_rate', 0.005, 'Jailbreak Rate', 'Jailbreak Rate Distribution'),
]
for ax, column, kill_threshold, x_label, panel_title in histogram_panels:
    ax.hist(df[column], bins=30, alpha=0.7, edgecolor='black')
    # Dashed red line marks the value used by the kill check for this metric.
    ax.axvline(kill_threshold, color='red', linestyle='--', label='Kill threshold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    ax.legend()

# Trigger summary: how many simulated periods tripped each criterion.
trigger_labels = ['PII Leak', 'Jailbreak', 'Any Trigger']
trigger_counts = [
    df[trigger_column].sum()
    for trigger_column in ('pii_trigger', 'jailbreak_trigger', 'any_trigger')
]
ax_triggers.bar(trigger_labels, trigger_counts, alpha=0.7, edgecolor='black')
ax_triggers.set_ylabel('Number of Triggers')
ax_triggers.set_title(f'Kill Criteria Triggers (n={N_SIMULATIONS})')
# Reference line at 5% of all simulations.
ax_triggers.axhline(N_SIMULATIONS * 0.05, color='orange', linestyle='--', label='5% threshold')
ax_triggers.legend()

plt.tight_layout()
plt.show()

# Plain-text reading of the trigger rates computed above.
print(f"\n** Interpretation **")
print(f"In {N_SIMULATIONS} simulated 24h periods with a {COHORT_SIZE*100}% cohort:")
print(f"- Kill criteria would trigger {df['any_trigger'].mean()*100:.1f}% of the time")
print(f"- This suggests the authority window provides {(1-df['any_trigger'].mean())*100:.1f}% safe operation probability")
print(f"- Estimated incident prevention: {df['any_trigger'].sum()} potential incidents caught early")