# Graph Fusion Analysis

This notebook demonstrates graph-level operator fusion experiments
and benchmarks the performance impact of different fusion strategies.

In [None]:
# Setup
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

# The project root must be on sys.path before the local `src` imports below.
sys.path.insert(0, '..')

from src.manual_fuser import (
    ManualFuser, ComputationGraph, OperatorNode,
    FusionRule, FusionStrategy, get_default_fusion_rules
)
from src.benchmark import (
    FusionBenchmark, compare_fused_vs_unfused,
    create_sample_resnet_block, create_sample_transformer_block
)
from src.graph_analyzer import GraphAnalyzer

## 1. Create Sample Computation Graphs

In [None]:
# Build the two sample graphs that later cells operate on, reporting the
# operator count of each graph right after it is created.
resnet_graph = create_sample_resnet_block()
resnet_op_total = len(resnet_graph.nodes)
print(f"ResNet block: {resnet_op_total} operators")

transformer_graph = create_sample_transformer_block()
transformer_op_total = len(transformer_graph.nodes)
print(f"Transformer block: {transformer_op_total} operators")

## 2. Apply Fusion Rules

In [None]:
# Greedy fuser, populated one rule at a time from the default rule set.
fuser = ManualFuser(strategy=FusionStrategy.GREEDY)

default_rules = get_default_fusion_rules()
for rule in default_rules:
    fuser.add_rule(rule)
    # Echo each registration: rule name, the pattern it matches, and the
    # fused operator it produces.
    print(f"Added rule: {rule.name} ({rule.pattern} -> {rule.fused_op})")

In [None]:
# Fuse the ResNet block with the rule-loaded fuser and capture its statistics.
fused_resnet = fuser.fuse(resnet_graph)
resnet_stats = fuser.get_stats()

# NOTE(review): "Original ops" is read from resnet_graph *after* fusing; this
# assumes fuse() returns a new graph and leaves its input untouched - confirm.
resnet_report_lines = [
    "\nResNet fusion results:",
    f"  Original ops: {len(resnet_graph.nodes)}",
    f"  Fused ops: {len(fused_resnet.nodes)}",
    f"  Fusions applied: {resnet_stats}",
]
print("\n".join(resnet_report_lines))

## 3. Benchmark Fused vs Unfused

In [None]:
# Benchmark the ResNet block via compare_fused_vs_unfused and print the
# resulting comparison object.
benchmark_iterations = 100
comparison = compare_fused_vs_unfused(
    resnet_graph,
    iterations=benchmark_iterations,
)
print(comparison)

In [None]:
# Visualize latency comparison
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: grouped bars of the latency statistics, unfused vs fused.
ax1 = axes[0]
metrics = ['Mean', 'P50', 'P95', 'P99']
unfused_vals = [comparison.unfused.mean_ms, comparison.unfused.median_ms,
                comparison.unfused.p95_ms, comparison.unfused.p99_ms]
fused_vals = [comparison.fused.mean_ms, comparison.fused.median_ms,
              comparison.fused.p95_ms, comparison.fused.p99_ms]

x = np.arange(len(metrics))
width = 0.35  # bar width; each pair is offset by half a width around the tick

ax1.bar(x - width/2, unfused_vals, width, label='Unfused', color='coral')
ax1.bar(x + width/2, fused_vals, width, label='Fused', color='steelblue')
ax1.set_ylabel('Latency (ms)')
ax1.set_title('Latency Comparison')
ax1.set_xticks(x)
ax1.set_xticklabels(metrics)
ax1.legend()

# Right panel: operator count before and after fusion.
ax2 = axes[1]
op_counts = [comparison.original_op_count, comparison.fused_op_count]
ax2.bar(['Original', 'Fused'], op_counts, color=['coral', 'steelblue'])
ax2.set_ylabel('Operator Count')
ax2.set_title('Graph Size Reduction')

# Annotate each bar with its value, placed slightly above the bar top.
for i, v in enumerate(op_counts):
    ax2.text(i, v + 0.1, str(v), ha='center')

plt.tight_layout()

# savefig does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
figure_path = Path('../results/fusion_comparison.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=150)
plt.show()

## 4. Analyze Different Fusion Strategies

In [None]:
# Compare fusion strategies
strategies = [FusionStrategy.GREEDY, FusionStrategy.MAXIMIZE_FUSION]
strategy_results = {}

for strategy in strategies:
    # Use a dedicated name so the GREEDY `fuser` built in section 2 is not
    # overwritten; otherwise re-running the earlier fusion cell after this one
    # would silently use the last strategy in the list.
    strategy_fuser = ManualFuser(strategy=strategy)
    for rule in get_default_fusion_rules():
        strategy_fuser.add_rule(rule)

    fused_graph = strategy_fuser.fuse(resnet_graph)
    # Keyed by the enum's value; this dict is later written out with json.dump.
    strategy_results[strategy.value] = {
        'op_count': len(fused_graph.nodes),
        'fusions': strategy_fuser.get_stats()['total_fusions'],
    }

# A separate loop variable keeps `strategy` from being rebound from an enum
# member to a plain dict key.
for strategy_name, result in strategy_results.items():
    print(f"{strategy_name}: {result['op_count']} ops, {result['fusions']} fusions")

## 5. Custom Fusion Rule Example

In [None]:
# A user-defined rule that collapses a Conv followed by a Relu into one op.
custom_rule = FusionRule(
    name="conv_relu_custom",
    pattern=["Conv", "Relu"],
    fused_op="MyFusedConvRelu",
    priority=15,
    description="Custom Conv + ReLU fusion",
)

# A fuser that knows only about the custom rule.
custom_fuser = ManualFuser()
custom_fuser.add_rule(custom_rule)

# Minimal two-node graph: x -> Conv -> conv_out -> Relu -> y.
conv_node = OperatorNode(id="conv", op_type="Conv", inputs=["x"], outputs=["conv_out"])
relu_node = OperatorNode(id="relu", op_type="Relu", inputs=["conv_out"], outputs=["y"])
test_nodes = [conv_node, relu_node]
test_graph = ComputationGraph(test_nodes)

fused_test = custom_fuser.fuse(test_graph)
print(f"Original ops: {len(test_graph.nodes)}")
print(f"Fused ops: {len(fused_test.nodes)}")

# Report the op type of the first node in the fused graph's node mapping.
first_fused_node = list(fused_test.nodes.values())[0]
print(f"Fused node type: {first_fused_node.op_type}")

## 6. Export Results

In [None]:
# Export benchmark results
results = {
    'resnet_comparison': comparison.to_dict(),
    'strategy_comparison': strategy_results,
}

# open(..., 'w') does not create missing directories, so make sure the output
# folder exists; otherwise a fresh checkout fails here with FileNotFoundError.
results_path = Path('../results/fusion_analysis_results.json')
results_path.parent.mkdir(parents=True, exist_ok=True)

with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

# as_posix() keeps the message in forward-slash form on every platform.
print(f"Results exported to {results_path.as_posix()}")

## Summary

Key findings from the fusion analysis:

1. Conv + BN + ReLU fusion provides significant latency reduction
2. Graph size reduction correlates with memory bandwidth savings
3. Different fusion strategies may be optimal for different architectures