# AMD AI Compute Observatory - Advanced Profiling

This notebook covers advanced profiling techniques including kernel traces, GPU counters, and comparative analysis.

© 2026 Sudheer Ibrahim Daniel Devu. All Rights Reserved.

## Setup

In [None]:
import aaco
from aaco.core import Observatory
from aaco.collectors import TimingCollector, CounterCollector, TraceCollector
import numpy as np
import matplotlib.pyplot as plt

# Show which AACO release is installed so results can be tied to a version.
installed_version = aaco.__version__
print(f"AACO version: {installed_version}")

## Configuration Options

AACO supports various profiling configurations for different use cases.

In [None]:
# Full profiling configuration, assembled one section at a time.

# Iteration / warm-up defaults and the trace level ("api", "kernel", or "full").
profiling_section = {
    "default_iterations": 500,
    "default_warmup": 50,
    "trace_level": "full",
}

# Per-collector switches and their options.
collectors_section = {
    "timing": {"enabled": True, "resolution": "nanoseconds"},
    "memory": {"enabled": True, "track_gpu": True, "track_cpu": True},
    "counters": {"enabled": True, "groups": ["compute", "memory", "cache"]},
}

# Options for the analysis stage.
analysis_section = {
    "statistics": {"confidence_level": 0.95, "outlier_detection": "iqr"},
}

full_config = {
    "profiling": profiling_section,
    "collectors": collectors_section,
    "analysis": analysis_section,
}

# Build the observatory from the assembled configuration.
obs = Observatory(config=full_config)
print("Observatory configured for advanced profiling")

## GPU Counter Profiling

Collect hardware performance counters from the GPU.

In [None]:
# Query the counters that CounterCollector reports as available.
available_counters = CounterCollector.available_counters()

# A dict result is summarized by its keys; any other result is shown as-is.
if isinstance(available_counters, dict):
    counter_groups = list(available_counters.keys())
else:
    counter_groups = available_counters
print(f"Available counter groups: {counter_groups}")

In [None]:
# Example counter profiling configuration
counter_config = {
    "compute_counters": [
        "GRBM_GUI_ACTIVE",  # GPU active cycles
        "SQ_WAVES",          # Shader waves
        "SQ_INSTS_VALU",     # Vector ALU instructions
    ],
    "memory_counters": [
        "TCP_TCC_READ_REQ_sum",  # Cache read requests
        "TCP_TCC_WRITE_REQ_sum", # Cache write requests
        "TCC_HIT_sum",           # L2 cache hits
        "TCC_MISS_sum",          # L2 cache misses
    ]
}

print("Counter configuration ready")

## Trace-Level Profiling

Capture detailed execution traces.

In [None]:
# Configure trace collection
trace_config = {
    "trace_level": "full",
    "output_format": "perfetto",  # or "json", "chrome"
    "include_hip_api": True,
    "include_kernel_launches": True,
    "include_memory_operations": True,
}

# Example trace collector usage
# trace_collector = TraceCollector(**trace_config)
print("Trace configuration ready")

## Comparative Analysis

Compare a baseline run against a current run and check whether latency has drifted.

In [None]:
from aaco.analytics import DriftDetector

# Simulate baseline and current latency samples. Each draw is preceded by its
# own fixed seed so the two arrays are reproducible on every run.
np.random.seed(42)
baseline_latencies = np.random.normal(10.0, 0.5, 100)
np.random.seed(43)
current_latencies = np.random.normal(10.5, 0.6, 100)  # Slight regression

# Detect drift between the two samples.
# The outcome is stored under a dedicated name (`drift_result`) rather than a
# generic `result`, so it is not overwritten by the bottleneck-classification
# cell further down and stays valid if cells are re-run out of order.
detector = DriftDetector(method="ewma_cusum")
drift_result = detector.detect(baseline_latencies, current_latencies)

print(f"Drift detected: {drift_result.has_drift}")
print(f"Magnitude: {drift_result.magnitude:.2%}")

In [None]:
# Visualize comparison: box plot on the left, overlaid histograms on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
box_ax, hist_ax = axes

# Box plot comparison.
# The tick labels are applied after plotting instead of through boxplot's
# `labels=` keyword: that keyword is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`), whereas set_xticklabels works on old and new releases.
box_ax.boxplot([baseline_latencies, current_latencies])
box_ax.set_xticklabels(['Baseline', 'Current'])
box_ax.set_ylabel('Latency (ms)')
box_ax.set_title('Latency Comparison')

# Distribution overlay; alpha keeps both histograms visible where they overlap.
hist_ax.hist(baseline_latencies, bins=30, alpha=0.5, label='Baseline', color='blue')
hist_ax.hist(current_latencies, bins=30, alpha=0.5, label='Current', color='#ED1C24')
hist_ax.set_xlabel('Latency (ms)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution Comparison')
hist_ax.legend()

plt.tight_layout()
plt.show()

## Bottleneck Analysis

Identify performance bottlenecks using metrics.

In [None]:
from aaco.analytics import BottleneckClassifier

# Example metrics from a profiling session.
# NOTE(review): gpu_utilization is given as 92.0 while the other three values
# are 0-1 fractions — confirm this is the scale BottleneckClassifier expects.
metrics = dict(
    gpu_utilization=92.0,
    sq_busy=0.88,
    memory_bandwidth_utilization=0.35,
    l2_hit_rate=0.92,
)

# Classify the workload from the metrics above.
classifier = BottleneckClassifier()
result = classifier.classify(metrics)

# Report the category, the confidence as a percentage, then each evidence item.
print(f"Workload Type: {result.category}")
print(f"Confidence: {result.confidence:.2%}")
print("\nEvidence:")
for evidence_item in result.evidence:
    print(f"  - {evidence_item}")

## Optimization Recommendations

In [None]:
# Suggested optimizations, keyed by bottleneck category.
compute_bound_tips = [
    "Consider using mixed precision (FP16/BF16) for compute-intensive operations",
    "Optimize kernel occupancy by adjusting workgroup sizes",
    "Use kernel fusion to reduce launch overhead",
]
memory_bound_tips = [
    "Optimize memory access patterns for coalesced reads",
    "Consider using shared memory for frequently accessed data",
    "Profile memory bandwidth with roofline analysis",
]
launch_overhead_tips = [
    "Use kernel batching or fusion",
    "Implement HIP graphs for repetitive workloads",
    "Reduce small kernel launches",
]

recommendations = {
    "compute_bound": compute_bound_tips,
    "memory_bound": memory_bound_tips,
    "launch_overhead_bound": launch_overhead_tips,
}

# Print a numbered list only when the classified category has an entry;
# categories without one produce no output.
category = result.category
if category in recommendations:
    print(f"\nOptimization Recommendations for {category}:")
    for position, tip in enumerate(recommendations[category], start=1):
        print(f"  {position}. {tip}")

## Exporting Results

Export profiling data in various formats for further analysis.

In [None]:
import json
from pathlib import Path

# Writing to disk is opt-in so that running the notebook never creates files
# unexpectedly. Set SAVE_RESULTS to True to also persist the export.
SAVE_RESULTS = False
output_path = Path("./analysis_results.json")


def _latency_stats(samples):
    """Return the mean and 95th percentile of `samples` as built-in floats.

    The float() casts turn NumPy scalars into plain Python floats before they
    are handed to json.dumps.
    """
    return {
        "mean": float(np.mean(samples)),
        "p95": float(np.percentile(samples, 95)),
    }


# Example export: session metrics, the bottleneck classification, and summary
# statistics for both latency samples.
export_data = {
    "session_id": "demo_session",
    "metrics": metrics,
    "analysis": {
        "bottleneck": result.category,
        # Cast like the other exported numbers so that a NumPy scalar (if the
        # classifier returns one) is exported as a plain float.
        "confidence": float(result.confidence),
    },
    "baseline_stats": _latency_stats(baseline_latencies),
    "current_stats": _latency_stats(current_latencies),
}

# Serialize once and reuse the same text for the optional file and the preview.
export_json = json.dumps(export_data, indent=2)

# Save to file (only when explicitly enabled above).
if SAVE_RESULTS:
    output_path.write_text(export_json, encoding="utf-8")
print(export_json)

## Next Steps

- Learn about [Custom Collectors](03_custom_collectors.ipynb) for specialized profiling
- Explore [Laboratory Mode](04_laboratory_mode.ipynb) for kernel-level analysis with eBPF