# ClipCard Simulation: DevOps/SRE

**Purpose:** This notebook runs a Monte Carlo simulation to estimate the effectiveness of ClipCard parameters for service deployments.

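It simulates deployments and checks whether randomly generated metrics (`error_rate` and `latency_p99`) would trigger the `kill_criteria` defined in a ClipCard. This helps quantify how often the card's configuration would halt a rollout. Note that the simulation currently compares against fixed thresholds (`error_rate > 0.01`, `latency_p99 > 500`) that match the synthetic fallback card; the `condition` strings of a loaded card are not parsed.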

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob

# Prefer the example ClipCard stored under the repository's examples
# directory. If that file is missing, substitute an inline synthetic card so
# the rest of the notebook can still run.
card_path = '../../examples/test.clipcard.json'

try:
    with open(card_path, 'r') as card_file:
        example_card = json.load(card_file)
except FileNotFoundError:
    print("No example ClipCard found. Using synthetic data.")
    # Synthetic stand-in: two kill criteria plus an authority window.
    example_card = dict(
        id="SIM-DEVOPS-001",
        kill_criteria=[
            dict(condition="error_rate > 0.01", action="Rollback immediately"),
            dict(condition="latency_p99 > 500", action="Reduce traffic to 1%"),
        ],
        authority_window=dict(scope_limit="5% deployment", time_limit="30d"),
    )
else:
    # Only reached when the file was opened and parsed successfully.
    print(f"Loaded example ClipCard: {example_card['id']}")

### Simulation Parameters

Configure the number of simulation runs, the deployment cohort size, and the baseline values that anchor the simulated metric distributions.

In [None]:
# --- Tunable knobs for the Monte Carlo run ---------------------------------
N_SIMULATIONS = 1_000      # how many deployments to simulate
COHORT_SIZE = 0.05         # fraction of traffic in the deployment (5%)
BASE_ERROR_RATE = 0.003    # baseline error rate
BASE_LATENCY_P99 = 250     # baseline p99 latency, in milliseconds

# Echo the active configuration so it is recorded alongside the results.
config_summary = (
    f"Running {N_SIMULATIONS} simulations...",
    f"Deployment size: {COHORT_SIZE*100}%",
    f"Base error rate: {BASE_ERROR_RATE}",
    f"Base p99 latency: {BASE_LATENCY_P99}ms",
)
print(*config_summary, sep="\n")

### Monte Carlo Simulation

Run the simulations and record, for each one, whether the error-rate or the p99-latency kill criterion would have triggered.

In [None]:
def run_simulation(n_simulations, base_error_rate, base_latency_p99,
                   error_rate_threshold=0.01, latency_p99_threshold=500,
                   rng=None):
    """Simulate deployments and flag which ones would hit a kill criterion.

    Each simulated deployment represents one 24-hour period. All draws are
    made in a single vectorised call per metric instead of a Python loop.

    Parameters
    ----------
    n_simulations : int
        Number of deployments to simulate (rows in the result).
    base_error_rate : float
        Constant offset added to every Beta(2, 500) error-rate draw.
    base_latency_p99 : float
        Mean of the simulated p99 latency in ms; latency is drawn from
        Gamma(shape=2, scale=base_latency_p99 / 2).
    error_rate_threshold : float, optional
        A deployment triggers when ``error_rate`` is strictly greater than this.
    latency_p99_threshold : float, optional
        A deployment triggers when ``latency_p99`` is strictly greater than this.
    rng : numpy.random.Generator, optional
        Source of randomness. A fresh, unseeded generator is used when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per simulation with columns ``sim_id``, ``requests``,
        ``error_rate``, ``latency_p99``, ``error_trigger``,
        ``latency_trigger`` and ``any_trigger``.
    """
    if rng is None:
        rng = np.random.default_rng()

    # Request volume is recorded per simulation but does not feed the triggers.
    requests = rng.poisson(100000, size=n_simulations)
    error_rate = rng.beta(2, 500, size=n_simulations) + base_error_rate
    latency_p99 = rng.gamma(2, base_latency_p99 / 2, size=n_simulations)

    error_trigger = error_rate > error_rate_threshold
    latency_trigger = latency_p99 > latency_p99_threshold

    return pd.DataFrame({
        'sim_id': np.arange(n_simulations),
        'requests': requests,
        'error_rate': error_rate,
        'latency_p99': latency_p99,
        'error_trigger': error_trigger,
        'latency_trigger': latency_trigger,
        'any_trigger': error_trigger | latency_trigger,
    })


# Fixed seed so that Restart Kernel -> Run All reproduces the same numbers.
SEED = 42

df = run_simulation(
    N_SIMULATIONS,
    BASE_ERROR_RATE,
    BASE_LATENCY_P99,
    rng=np.random.default_rng(SEED),
)
# Keep the per-simulation record list available under its original name.
results = df.to_dict('records')

print("\nSimulation complete!")
print(f"Error rate triggers: {df['error_trigger'].sum()} ({df['error_trigger'].mean()*100:.1f}%)")
print(f"Latency triggers: {df['latency_trigger'].sum()} ({df['latency_trigger'].mean()*100:.1f}%)")
print(f"Any trigger: {df['any_trigger'].sum()} ({df['any_trigger'].mean()*100:.1f}%)")

### Visualization

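Plot the simulated error-rate and p99-latency distributions against their kill thresholds, and summarize how many simulations triggered each criterion.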

In [None]:
# Thresholds drawn on the histograms; these are the same values the
# simulation compares against (error_rate > 0.01, latency_p99 > 500).
ERROR_RATE_KILL_THRESHOLD = 0.01
LATENCY_P99_KILL_THRESHOLD = 500


def plot_metric_distribution(ax, values, threshold, xlabel, title):
    """Draw a histogram of ``values`` on ``ax`` with a dashed kill-threshold line.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    values : array-like of simulated metric values.
    threshold : x-position of the vertical kill-threshold marker.
    xlabel : label for the x axis.
    title : panel title.
    """
    ax.hist(values, bins=30, alpha=0.7, edgecolor='black')
    ax.axvline(threshold, color='red', linestyle='--', label='Kill threshold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()


fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Panels 1 and 2: metric distributions with their kill thresholds.
plot_metric_distribution(
    axes[0], df['error_rate'], ERROR_RATE_KILL_THRESHOLD,
    'Error Rate', 'Error Rate Distribution',
)
plot_metric_distribution(
    axes[1], df['latency_p99'], LATENCY_P99_KILL_THRESHOLD,
    'P99 Latency (ms)', 'Latency Distribution',
)

# Panel 3: how many simulations tripped each criterion (and either one).
trigger_labels = ['Error Rate', 'Latency', 'Any Trigger']
trigger_counts = [
    df['error_trigger'].sum(),
    df['latency_trigger'].sum(),
    df['any_trigger'].sum(),
]
axes[2].bar(trigger_labels, trigger_counts, alpha=0.7, edgecolor='black')
axes[2].set_ylabel('Number of Triggers')
axes[2].set_title(f'Kill Criteria Triggers (n={N_SIMULATIONS})')
# Reference line at 5% of all simulations.
axes[2].axhline(N_SIMULATIONS * 0.05, color='orange', linestyle='--', label='5% threshold')
axes[2].legend()

plt.tight_layout()
plt.show()

# Textual summary of the simulated trigger rate.
print("\n** Interpretation **")
print(f"In {N_SIMULATIONS} simulated 24h periods with a {COHORT_SIZE*100}% deployment:")
print(f"- Kill criteria would trigger {df['any_trigger'].mean()*100:.1f}% of the time")
print(f"- This suggests the authority window provides {(1-df['any_trigger'].mean())*100:.1f}% safe operation probability")
print(f"- Estimated incident prevention: {df['any_trigger'].sum()} potential incidents caught early")