In [None]:
import json
import sys
from pathlib import Path

In [None]:
# Directory holding the mock JSON fixtures read by _load_json.
try:
    DATA_DIR = Path(__file__).parent / "mock_data"
except NameError:
    # __file__ is not defined when this cell runs inside a notebook kernel;
    # fall back to the current working directory in that case.
    DATA_DIR = Path.cwd() / "mock_data"

In [None]:
def _load_json(filename: str) -> dict:
    """Read and parse a JSON file located under ``DATA_DIR``.

    Args:
        filename: Name of the file, relative to ``DATA_DIR``.

    Returns:
        The decoded JSON document. Annotated as ``dict``; the top-level
        type is not checked here.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale default.
    with open(DATA_DIR / filename, encoding="utf-8") as f:
        return json.load(f)

In [None]:
def metrics_agent_investigate(state: SharedState) -> SharedState:
    """Turn the raw ``payment-service`` metrics on ``state`` into findings.

    Reads ``state.raw_metrics`` and records, through ``state.add_finding``,
    up to five findings: memory leak, latency degradation, error-rate surge,
    database connection-pool saturation and pod health. Each finding is only
    produced when the data it is built from is present. Afterwards a
    ``STATUS_UPDATE`` carrying the number of findings is dispatched from the
    metrics agent to the commander via ``CommunicationBus.dispatch``.

    Args:
        state: Shared investigation state. This function uses its
            ``raw_metrics`` mapping and its ``add_finding`` /
            ``get_findings_by_agent`` methods.

    Returns:
        The same ``state`` object that was passed in.

    Raises:
        KeyError: If a metrics section is present but lacks a field that is
            read unconditionally (e.g. ``leak_rate_mb_per_min`` when
            ``leak_detected`` is truthy).
    """
    print("\n [METRICS AGENT] Analyzing infrastructure metrics…")

    # NOTE(review): several titles/descriptions below embed fixed numbers
    # (1847 pending, 33% capacity, 38s, 88% CPU, ~14 minutes). They are
    # presumably aligned with the mock data set — confirm before pointing
    # this at other data.
    svc = state.raw_metrics.get("services", {}).get("payment-service", {})
    pods = svc.get("pods", {})
    pod_data = pods.get("pay-pod-9b1e", {})
    mem = pod_data.get("memory", {})
    latency = svc.get("latency", {})
    error_rate = svc.get("error_rate", {})
    db = svc.get("database", {})

    # ── Finding: Memory Leak Pattern ──
    leak = mem.get("leak_analysis", {})
    if leak.get("leak_detected"):
        mem_ts = mem.get("timeseries", [])
        description_parts = [
            f"Memory grows at {leak['leak_rate_mb_per_min']} MiB/min.",
            f"GC overhead at {leak['gc_overhead_pct']}% (stop-the-world pauses).",
            f"Pod restarts every ~{leak['estimated_oom_minutes']:.0f} minutes.",
            f"Suspected source: {leak['suspected_source']}.",
        ]
        # The trajectory sentence needs samples 0 and 4; skip it instead of
        # raising IndexError when the timeseries is missing or too short.
        if len(mem_ts) > 4:
            description_parts.append(
                f"Memory trajectory: {mem_ts[0]['value']}→{mem_ts[4]['value']} MiB in first 4 minutes."
            )
        state.add_finding(Finding(
            agent=AgentRole.METRICS_AGENT,
            title="Confirmed memory leak — OOM in ~14 minutes per restart",
            description=" ".join(description_parts),
            severity=Severity.P1_CRITICAL,
            evidence={
                "leak_rate_mb_per_min": leak["leak_rate_mb_per_min"],
                "gc_overhead_pct": leak["gc_overhead_pct"],
                "oom_cycle_minutes": leak["estimated_oom_minutes"],
                "memory_timeseries_sample": mem_ts[:6],
            },
            related_services=["payment-service"],
            confidence=0.96,
        ))

    # ── Finding: Latency Degradation ──
    if latency.get("timeseries"):
        ts = latency["timeseries"]
        p99_start = ts[0]["p99"]
        p99_peak = max(t["p99"] for t in ts)
        # A zero baseline has no meaningful ratio; avoid ZeroDivisionError.
        factor = p99_peak / p99_start if p99_start else None
        title = f"P99 latency spike: {p99_start}ms → {p99_peak}ms"
        if factor is not None:
            title += f" ({factor:.0f}x increase)"
        state.add_finding(Finding(
            agent=AgentRole.METRICS_AGENT,
            title=title,
            description=(
                f"P99 latency degraded from {p99_start}ms to {p99_peak}ms over the incident window. "
                f"P50 also impacted: {ts[0]['p50']}ms → {max(t['p50'] for t in ts)}ms. "
                "Latency shows sawtooth pattern correlated with pod restart cycles."
            ),
            severity=Severity.P1_CRITICAL,
            evidence={
                "p99_start": p99_start,
                "p99_peak": p99_peak,
                "degradation_factor": round(factor, 1) if factor is not None else None,
                "latency_timeseries": ts,
            },
            related_services=["payment-service"],
            confidence=0.98,
        ))

    # ── Finding: Error Rate Surge ──
    if error_rate.get("timeseries"):
        er_ts = error_rate["timeseries"]
        peak_err = max(t["value"] for t in er_ts)
        state.add_finding(Finding(
            agent=AgentRole.METRICS_AGENT,
            title=f"Error rate surged to {peak_err}%",
            description=(
                f"Error rate escalated from {er_ts[0]['value']}% to {peak_err}% "
                f"across the incident window. Currently at {er_ts[-1]['value']}%. "
                "Correlated with memory exhaustion and pod failures."
            ),
            severity=Severity.P1_CRITICAL,
            evidence={"error_rate_timeseries": er_ts, "peak_error_pct": peak_err},
            related_services=["payment-service"],
            confidence=0.97,
        ))

    # ── Finding: DB Connection Pool Saturation ──
    if db.get("connection_pool"):
        pool = db["connection_pool"]
        state.add_finding(Finding(
            agent=AgentRole.METRICS_AGENT,
            title="Database connection pool at 100% with 1847 pending requests",
            description=(
                f"All {pool['max_size']} connections active, 0 idle. "
                f"{pool['pending']} requests queued with avg checkout time of {pool['avg_checkout_time_ms']/1000:.1f}s. "
                "Slow queries (avg 38s) are holding connections, starving other requests."
            ),
            severity=Severity.P1_CRITICAL,
            evidence={
                "pool": pool,
                "slow_queries": db.get("slow_queries", []),
                "replication_lag_ms": db.get("replication_lag_ms"),
            },
            related_services=["payment-service"],
            confidence=0.95,
        ))

    # ── Finding: Pod Health Degradation ──
    unhealthy = [name for name, info in pods.items()
                 if isinstance(info, dict) and info.get("status") in ("CrashLoopBackOff", "OOMKilled")]
    if unhealthy:
        gateway = state.raw_metrics.get("services", {}).get("api-gateway", {})
        upstream = gateway.get("upstream_health", {}).get("payment-service", {})
        state.add_finding(Finding(
            agent=AgentRole.METRICS_AGENT,
            title=f"{len(unhealthy)} pods in critical state — only 33% capacity",
            description=(
                f"Pods in failure state: {unhealthy}. "
                f"Healthy capacity at {upstream.get('healthy_pct', 'N/A')}%. "
                f"Circuit breaker state: {upstream.get('circuit_state', 'N/A')}. "
                "Remaining pods overloaded at >88% CPU."
            ),
            severity=Severity.P1_CRITICAL,
            evidence={
                "unhealthy_pods": unhealthy,
                "healthy_capacity_pct": upstream.get("healthy_pct"),
                "circuit_state": upstream.get("circuit_state"),
            },
            related_services=["payment-service"],
            confidence=0.99,
        ))

    # Report completion to the commander before returning; an earlier stray
    # `return` used to make this status update unreachable.
    met_findings = state.get_findings_by_agent(AgentRole.METRICS_AGENT)
    CommunicationBus.dispatch(
        state, AgentRole.METRICS_AGENT, AgentRole.COMMANDER,
        MessageType.STATUS_UPDATE,
        {"status": "complete", "findings_count": len(met_findings)},
    )

    print(f"   Produced {len(met_findings)} findings")
    return state