# Decode Stage Roofline Analysis

This notebook analyzes the performance of MLA (Multi-head Latent Attention) and MOE (Mixture of Experts) using roofline models.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from decode_mla_moe import get_Theory_Data, get_gpu_info

# Set plot style: apply the matplotlib style sheet first, then the seaborn
# palette, so the palette is not overwritten by the style sheet's color cycle.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirmation message so the cell has visible output when it succeeds
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Cell 1: Build the theoretical performance table
print("Generating theoretical performance data...")
print("This may take a few minutes...\n")

df = get_Theory_Data()

# Short textual report on what was generated (one line per argument)
print(
    "\nData generation complete!",
    f"Total configurations: {len(df)}",
    f"\nDataFrame shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
display(df.head())

# Keep a CSV copy of the table next to the notebook
df.to_csv(path_or_buf='theory_performance_data.csv', index=False)
print("\nData saved to: theory_performance_data.csv")

Generating theoretical performance data...
This may take a few minutes...

memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
me

Unnamed: 0,GPU,batch,seq_len,TP,EP,mla_result,moe_result,total_time_ms
0,H20,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0058588104630365664, ...",0.141659
1,H800,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.001071362625769105, '...",0.098186
2,B200,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0002363072232843137, ...",0.069899
3,GB200-NVL72,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0002132796719117647, ...",0.069688
4,H20,1,1024,1,2,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0058588104630365664, ...",0.138571



Data saved to: theory_performance_data.csv


In [3]:
# Cell 2: Plot Roofline Model for Attention (FP16), GEMM (FP8), and MOE

def _roofline_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _draw_roofline_panel(ax, ai_range, mem_bw, peak_tflops, precision, title, points):
    """
    Draw one roofline (bandwidth slope + compute ceiling) plus its data points on ``ax``.

    Args:
        ax: Matplotlib axes to draw on
        ai_range: Array of arithmetic intensities (FLOP/Byte) for the x axis
        mem_bw: Memory bandwidth, treated as GB/s (as labelled in the legend)
        peak_tflops: Compute ceiling in TFLOPS
        precision: Text used in the ceiling label, e.g. 'FP16'
        title: Panel title
        points: Iterable of (ai, achieved_tflops, color, marker, label) tuples
    """
    # GB/s * FLOP/Byte = GFLOPS; divide by 1000 so the slope is in TFLOPS and can be
    # compared with the compute ceiling on the same axis.
    mem_bound = mem_bw * ai_range / 1000.0
    roofline = np.minimum(mem_bound, peak_tflops)

    ax.loglog(ai_range, roofline, 'k-', linewidth=2, label='Roofline')
    ax.loglog(ai_range, mem_bound, 'b--', alpha=0.5, label=f'Mem BW ({mem_bw:.0f} GB/s)')
    ax.axhline(peak_tflops, color='r', linestyle='--', alpha=0.5,
               label=f'Peak {precision} ({peak_tflops:.0f} TFLOPS)')
    for ai, achieved, color, marker, label in points:
        ax.scatter(ai, achieved, s=100, c=color, marker=marker, label=label, zorder=5)
    ax.set_xlabel('Arithmetic Intensity (FLOP/Byte)', fontsize=10)
    ax.set_ylabel('Performance (TFLOPS)', fontsize=10)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)


def plot_roofline(df, gpu_name, batch_size, seq_len, tp, ep, figsize=(15, 5)):
    """
    Plot roofline model for a specific configuration.

    Draws three log-log panels (attention FP16, GEMM FP8, MOE) for the first row
    of ``df`` matching the configuration and prints a textual summary.

    Args:
        df: DataFrame with performance data (columns GPU, batch, seq_len, TP, EP,
            mla_result, moe_result, total_time_ms)
        gpu_name: GPU name
        batch_size: Batch size
        seq_len: Sequence length
        tp: Tensor parallelism
        ep: Expert parallelism
        figsize: Figure size

    Returns:
        The matplotlib Figure, or None (after printing a message) when no row
        of ``df`` matches the configuration.

    Raises:
        KeyError: if ``gpu_name`` is missing from the GPU info table.
    """
    from decode_mla_moe import ModelArgs, moe_expert_flops

    # Filter data for specific configuration
    mask = (
        (df['GPU'] == gpu_name)
        & (df['batch'] == batch_size)
        & (df['seq_len'] == seq_len)
        & (df['TP'] == tp)
        & (df['EP'] == ep)
    )
    if not mask.any():
        print(f"No data found for configuration: {gpu_name}, batch={batch_size}, seq_len={seq_len}, TP={tp}, EP={ep}")
        return None

    row = df[mask].iloc[0]
    mla_result = row['mla_result']
    moe_result = row['moe_result']

    # Get GPU info
    gpu_name_mapping = {'B200': 'DGX-B200'}
    gpu_dict = get_gpu_info(filename='./device/gpu_info.csv',
                           decoding_mode=True,
                           gpu_name_mapping=gpu_name_mapping)
    gpu = gpu_dict[gpu_name]

    # GPU specs (units as used by the plot labels; NOTE(review): confirm against get_gpu_info)
    mem_bw = gpu.get_mem_bw()  # GB/s
    fp16_flops = gpu.get_fp16_flops()  # TFLOPS
    fp8_flops = gpu.get_fp8_flops()  # TFLOPS

    # Arithmetic intensity: GFLOP / (MB / 1024) = GFLOP / GB = FLOP/Byte.
    # Achieved throughput: GFLOP / ms = TFLOPS.

    # MLA GEMM (FP8)
    gemm_fp8_gflops = mla_result['gemm_fp8_flpos']  # GFLOP ('flpos' is the key's actual spelling)
    mem_attn_gemm = mla_result['mem_attn_gemm']  # MB
    gemm_time = mla_result['gemm_fp8_time']  # ms
    gemm_ai = _roofline_ratio(gemm_fp8_gflops, mem_attn_gemm / 1024)
    gemm_achieved_flops = _roofline_ratio(gemm_fp8_gflops, gemm_time)

    # MLA Attention (FP16)
    attn_fp16_gflops = mla_result['attn_fp16_flpos']  # GFLOP
    mem_kv = mla_result['mem_KVCache']  # MB
    attn_time = mla_result['attn_fp16_time']  # ms
    attn_ai = _roofline_ratio(attn_fp16_gflops, mem_kv / 1024)
    attn_achieved_flops = _roofline_ratio(attn_fp16_gflops, attn_time)

    # MOE: both expert kinds are related to the same mem_moe figure
    mem_moe = moe_result['mem_moe']  # MB
    moe_shared_time = moe_result['shared_expert_time']  # ms
    moe_routed_time = moe_result['routed_expert_time']  # ms

    args = ModelArgs()

    # Shared expert sees every token of the batch
    shared_gflops = moe_expert_flops(args, batch_size)  # GFLOP
    shared_ai = _roofline_ratio(shared_gflops, mem_moe / 1024)
    shared_achieved_flops = _roofline_ratio(shared_gflops, moe_shared_time)

    # Routed experts: each token is processed by n_activated_experts experts
    routed_tokens = batch_size * args.n_activated_experts
    routed_gflops = moe_expert_flops(args, routed_tokens)  # GFLOP
    routed_ai = _roofline_ratio(routed_gflops, mem_moe / 1024)
    routed_achieved_flops = _roofline_ratio(routed_gflops, moe_routed_time)

    # Create figure with 3 subplots
    fig, axes = plt.subplots(1, 3, figsize=figsize)

    # AI range for plotting roofline
    ai_range = np.logspace(-1, 3, 100)  # 0.1 to 1000 FLOP/Byte

    # One entry per panel: (peak TFLOPS, precision label, title, data points).
    # The MOE panel assumes MOE runs in FP8.
    panels = [
        (fp16_flops, 'FP16', f'Attention FP16 Roofline\n{gpu_name}',
         [(attn_ai, attn_achieved_flops, 'red', 'o', f'Attention (AI={attn_ai:.2f})')]),
        (fp8_flops, 'FP8', f'GEMM FP8 Roofline\n{gpu_name}',
         [(gemm_ai, gemm_achieved_flops, 'green', 's', f'GEMM (AI={gemm_ai:.2f})')]),
        (fp8_flops, 'FP8', f'MOE Roofline\n{gpu_name}',
         [(shared_ai, shared_achieved_flops, 'blue', '^', f'Shared Expert (AI={shared_ai:.2f})'),
          (routed_ai, routed_achieved_flops, 'purple', 'v', f'Routed Expert (AI={routed_ai:.2f})')]),
    ]
    for ax, (peak, precision, title, points) in zip(axes, panels):
        _draw_roofline_panel(ax, ai_range, mem_bw, peak, precision, title, points)

    plt.tight_layout()

    # Print summary
    print(f"\n{'='*80}")
    print(f"Configuration: {gpu_name}, Batch={batch_size}, SeqLen={seq_len}, TP={tp}, EP={ep}")
    print(f"{'='*80}")

    summary = [
        ("Attention FP16", attn_fp16_gflops, mem_kv, attn_ai, attn_achieved_flops, attn_time),
        ("GEMM FP8", gemm_fp8_gflops, mem_attn_gemm, gemm_ai, gemm_achieved_flops, gemm_time),
        ("MOE Shared Expert", shared_gflops, mem_moe, shared_ai, shared_achieved_flops, moe_shared_time),
        ("MOE Routed Expert", routed_gflops, mem_moe, routed_ai, routed_achieved_flops, moe_routed_time),
    ]
    for section, gflops, mem_mb, ai, achieved, time_ms in summary:
        print(f"\n{section}:")
        print(f"  FLOPS: {gflops:.2f} GFLOPS")
        print(f"  Memory: {mem_mb:.2f} MB")
        print(f"  Arithmetic Intensity: {ai:.2f} FLOP/Byte")
        print(f"  Achieved Performance: {achieved:.2f} TFLOPS")
        print(f"  Time: {time_ms:.4f} ms")
    print(f"\nTotal Time: {row['total_time_ms']:.4f} ms")

    return fig


# Example run for one configuration.
# Change the keyword values below to inspect a different GPU / batch / sequence
# length / parallelism combination present in df.
fig = plot_roofline(df=df, gpu_name='H20', batch_size=8, seq_len=4096, tp=1, ep=1)

plt.show()


Configuration: H20, Batch=8, SeqLen=4096, TP=1, EP=1

Attention FP16:
  FLOPS: 11.14 GFLOPS
  Memory: 148.00 MB
  Arithmetic Intensity: 77.08 FLOP/Byte
  Achieved Performance: 125.80 TFLOPS
  Time: 0.0886 ms

GEMM FP8:
  FLOPS: 0.91 GFLOPS
  Memory: 66.44 MB
  Arithmetic Intensity: 14.09 FLOP/Byte
  Achieved Performance: 251.60 TFLOPS
  Time: 0.0036 ms

MOE Shared Expert:
  FLOPS: 0.70 GFLOPS
  Memory: 42.00 MB
  Arithmetic Intensity: 17.18 FLOP/Byte
  Achieved Performance: 15.09 TFLOPS
  Time: 0.0467 ms

MOE Routed Expert:
  FLOPS: 5.64 GFLOPS
  Memory: 42.00 MB
  Arithmetic Intensity: 137.44 FLOP/Byte
  Achieved Performance: 14.85 TFLOPS
  Time: 0.3796 ms

Total Time: 0.5685 ms


  plt.show()


In [None]:
# Interactive widget to explore different configurations
from ipywidgets import interact, widgets

def interactive_roofline(GPU, Batch, SeqLen, TP, EP):
    """Widget callback: draw the roofline panels for the selected configuration.

    The figure is shown only when plot_roofline returned one; it returns
    nothing when the selected configuration has no data.
    """
    figure = plot_roofline(df, GPU, Batch, SeqLen, TP, EP, figsize=(18, 5))
    if not figure:
        return
    plt.show()

# Create widgets
def _config_dropdown(column, preferred, description):
    """
    Build a Dropdown over the sorted distinct values of df[column].

    The initial selection is `preferred` when the data contains it; otherwise the
    first available option is used, because a Dropdown rejects an initial value
    that is not one of its options.
    """
    options = sorted(df[column].unique().tolist())
    if preferred in options:
        value = preferred
    else:
        value = options[0] if options else None
    return widgets.Dropdown(options=options, value=value, description=description)


interact(
    interactive_roofline,
    # GPU keeps the order in which the names appear in df and starts on the first row's GPU
    GPU=widgets.Dropdown(options=df['GPU'].unique().tolist(), value=df['GPU'].iloc[0], description='GPU:'),
    Batch=_config_dropdown('batch', 8, 'Batch:'),
    SeqLen=_config_dropdown('seq_len', 4096, 'Seq Len:'),
    TP=_config_dropdown('TP', 1, 'TP:'),
    EP=_config_dropdown('EP', 1, 'EP:')
)

In [None]:
# Additional analysis: Compare different GPUs
def compare_gpus_roofline(batch_size=8, seq_len=4096, tp=1, ep=1, figsize=(20, 12)):
    """
    Compare roofline models across different GPUs.

    Reads the notebook-level ``df`` and draws one row of three log-log panels
    (attention FP16, GEMM FP8, MOE) for every GPU name it contains. GPUs that
    have no row for the requested configuration are reported and their panels
    are left blank.

    Args:
        batch_size: Batch size to select
        seq_len: Sequence length to select
        tp: Tensor parallelism to select
        ep: Expert parallelism to select
        figsize: Figure size

    Returns:
        The matplotlib Figure, or None when ``df`` lists no GPUs.
    """
    from decode_mla_moe import ModelArgs, moe_expert_flops

    def ratio(numerator, denominator):
        # Guarded division: 0 when the denominator is not positive
        return numerator / denominator if denominator > 0 else 0

    gpus = df['GPU'].unique()
    n_gpus = len(gpus)
    if n_gpus == 0:
        # plt.subplots cannot build a grid with zero rows
        print("No GPUs found in df; nothing to compare.")
        return None

    # squeeze=False keeps `axes` two-dimensional even when there is a single GPU
    fig, axes = plt.subplots(n_gpus, 3, figsize=figsize, squeeze=False)

    gpu_name_mapping = {'B200': 'DGX-B200'}
    gpu_dict = get_gpu_info(filename='./device/gpu_info.csv',
                           decoding_mode=True,
                           gpu_name_mapping=gpu_name_mapping)

    ai_range = np.logspace(-1, 3, 100)

    # The MOE FLOP counts depend only on the model arguments and the batch size,
    # so they are computed once instead of once per GPU.
    # NOTE(review): assumes moe_expert_flops does not mutate `args` - confirm.
    args = ModelArgs()
    shared_gflops = moe_expert_flops(args, batch_size)
    routed_tokens = batch_size * args.n_activated_experts
    routed_gflops = moe_expert_flops(args, routed_tokens)

    for i, gpu_name in enumerate(gpus):
        # Filter data
        mask = (
            (df['GPU'] == gpu_name)
            & (df['batch'] == batch_size)
            & (df['seq_len'] == seq_len)
            & (df['TP'] == tp)
            & (df['EP'] == ep)
        )
        if not mask.any():
            print(f"No data found for configuration: {gpu_name}, batch={batch_size}, seq_len={seq_len}, TP={tp}, EP={ep}")
            for ax in axes[i]:
                ax.set_axis_off()
            continue

        row = df[mask].iloc[0]
        mla_result = row['mla_result']
        moe_result = row['moe_result']

        gpu = gpu_dict[gpu_name]
        mem_bw = gpu.get_mem_bw()  # treated as GB/s - NOTE(review): confirm against get_gpu_info
        fp16_flops = gpu.get_fp16_flops()  # treated as TFLOPS
        fp8_flops = gpu.get_fp8_flops()  # treated as TFLOPS

        # Arithmetic intensity: GFLOP / (MB / 1024) = FLOP/Byte; achieved: GFLOP / ms = TFLOPS
        gemm_fp8_gflops = mla_result['gemm_fp8_flpos']
        gemm_ai = ratio(gemm_fp8_gflops, mla_result['mem_attn_gemm'] / 1024)
        gemm_achieved_flops = ratio(gemm_fp8_gflops, mla_result['gemm_fp8_time'])

        attn_fp16_gflops = mla_result['attn_fp16_flpos']
        attn_ai = ratio(attn_fp16_gflops, mla_result['mem_KVCache'] / 1024)
        attn_achieved_flops = ratio(attn_fp16_gflops, mla_result['attn_fp16_time'])

        mem_moe_gb = moe_result['mem_moe'] / 1024
        shared_ai = ratio(shared_gflops, mem_moe_gb)
        shared_achieved_flops = ratio(shared_gflops, moe_result['shared_expert_time'])
        routed_ai = ratio(routed_gflops, mem_moe_gb)
        routed_achieved_flops = ratio(routed_gflops, moe_result['routed_expert_time'])

        # GB/s * FLOP/Byte = GFLOPS; divide by 1000 so the bandwidth slope is in
        # TFLOPS like the compute ceilings it is combined with.
        mem_bound = mem_bw * ai_range / 1000.0

        # One entry per column: (title, compute ceiling, data points)
        panels = [
            (f'{gpu_name} - Attention FP16', fp16_flops,
             [(attn_ai, attn_achieved_flops, 'red', 'o', None)]),
            (f'{gpu_name} - GEMM FP8', fp8_flops,
             [(gemm_ai, gemm_achieved_flops, 'green', 's', None)]),
            (f'{gpu_name} - MOE', fp8_flops,
             [(shared_ai, shared_achieved_flops, 'blue', '^', 'Shared'),
              (routed_ai, routed_achieved_flops, 'purple', 'v', 'Routed')]),
        ]
        for ax, (title, peak, points) in zip(axes[i], panels):
            ax.loglog(ai_range, np.minimum(mem_bound, peak), 'k-', linewidth=2)
            for ai, achieved, color, marker, label in points:
                ax.scatter(ai, achieved, s=100, c=color, marker=marker, label=label, zorder=5)
            ax.set_title(title, fontweight='bold')
            ax.set_xlabel('AI (FLOP/Byte)')
            ax.set_ylabel('Performance (TFLOPS)')
            # Only the MOE panel has labelled points, so only it gets a legend
            if any(label is not None for *_, label in points):
                ax.legend()
            ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

# Draw the cross-GPU comparison for the reference configuration
fig = compare_gpus_roofline(
    batch_size=8,
    seq_len=4096,
    tp=1,
    ep=1,
)
plt.show()

In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from decode_mla_moe import get_Theory_Data, get_gpu_info

# Set plot style: apply the matplotlib style sheet first, then the seaborn
# palette, so the palette is not overwritten by the style sheet's color cycle.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirmation message so the cell has visible output when it succeeds
print("Libraries imported successfully!")

Libraries imported successfully!


In [5]:
# Cell 1: Build the theoretical performance table
print("Generating theoretical performance data...")
print("This may take a few minutes...\n")

df = get_Theory_Data()

# Short textual report on what was generated (one line per argument)
print(
    "\nData generation complete!",
    f"Total configurations: {len(df)}",
    f"\nDataFrame shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
# Bare last expression so the notebook renders the preview as a table
df.head()

Generating theoretical performance data...
This may take a few minutes...

memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
memory bound
me

Unnamed: 0,GPU,batch,seq_len,TP,EP,mla_result,moe_result,total_time_ms
0,H20,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0058588104630365664, ...",0.141659
1,H800,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.001071362625769105, '...",0.098186
2,B200,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0002363072232843137, ...",0.069899
3,GB200-NVL72,1,1024,1,1,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0002132796719117647, ...",0.069688
4,H20,1,1024,1,2,"{'gemm_fp8_flpos': 0.114294784, 'attn_fp16_flp...","{'shared_expert_time': 0.0058588104630365664, ...",0.138571


In [6]:
# Write the generated table to a CSV file next to the notebook (row index omitted)
df.to_csv(path_or_buf='theory_performance_data.csv', index=False)
print("Data saved to: theory_performance_data.csv")

Data saved to: theory_performance_data.csv


In [7]:
# Cell 2: Plot Roofline Model for Attention (FP16), GEMM (FP8), and MOE

def _roofline_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _draw_roofline_panel(ax, ai_range, mem_bw, peak_tflops, precision, title, points):
    """
    Draw one roofline (bandwidth slope + compute ceiling) plus its data points on ``ax``.

    Args:
        ax: Matplotlib axes to draw on
        ai_range: Array of arithmetic intensities (FLOP/Byte) for the x axis
        mem_bw: Memory bandwidth, treated as GB/s (as labelled in the legend)
        peak_tflops: Compute ceiling in TFLOPS
        precision: Text used in the ceiling label, e.g. 'FP16'
        title: Panel title
        points: Iterable of (ai, achieved_tflops, color, marker, label) tuples
    """
    # GB/s * FLOP/Byte = GFLOPS; divide by 1000 so the slope is in TFLOPS and can be
    # compared with the compute ceiling on the same axis.
    mem_bound = mem_bw * ai_range / 1000.0
    roofline = np.minimum(mem_bound, peak_tflops)

    ax.loglog(ai_range, roofline, 'k-', linewidth=2, label='Roofline')
    ax.loglog(ai_range, mem_bound, 'b--', alpha=0.5, label=f'Mem BW ({mem_bw:.0f} GB/s)')
    ax.axhline(peak_tflops, color='r', linestyle='--', alpha=0.5,
               label=f'Peak {precision} ({peak_tflops:.0f} TFLOPS)')
    for ai, achieved, color, marker, label in points:
        ax.scatter(ai, achieved, s=100, c=color, marker=marker, label=label, zorder=5)
    ax.set_xlabel('Arithmetic Intensity (FLOP/Byte)', fontsize=10)
    ax.set_ylabel('Performance (TFLOPS)', fontsize=10)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)


def plot_roofline(df, gpu_name, batch_size, seq_len, tp, ep, figsize=(15, 5)):
    """
    Plot roofline model for a specific configuration.

    Draws three log-log panels (attention FP16, GEMM FP8, MOE) for the first row
    of ``df`` matching the configuration and prints a textual summary.

    Args:
        df: DataFrame with performance data (columns GPU, batch, seq_len, TP, EP,
            mla_result, moe_result, total_time_ms)
        gpu_name: GPU name
        batch_size: Batch size
        seq_len: Sequence length
        tp: Tensor parallelism
        ep: Expert parallelism
        figsize: Figure size

    Returns:
        The matplotlib Figure, or None (after printing a message) when no row
        of ``df`` matches the configuration.

    Raises:
        KeyError: if ``gpu_name`` is missing from the GPU info table.
    """
    from decode_mla_moe import ModelArgs, moe_expert_flops

    # Filter data for specific configuration
    mask = (
        (df['GPU'] == gpu_name)
        & (df['batch'] == batch_size)
        & (df['seq_len'] == seq_len)
        & (df['TP'] == tp)
        & (df['EP'] == ep)
    )
    if not mask.any():
        print(f"No data found for configuration: {gpu_name}, batch={batch_size}, seq_len={seq_len}, TP={tp}, EP={ep}")
        return None

    row = df[mask].iloc[0]
    mla_result = row['mla_result']
    moe_result = row['moe_result']

    # Get GPU info
    gpu_name_mapping = {'B200': 'DGX-B200'}
    gpu_dict = get_gpu_info(filename='./device/gpu_info.csv',
                           decoding_mode=True,
                           gpu_name_mapping=gpu_name_mapping)
    gpu = gpu_dict[gpu_name]

    # GPU specs (units as used by the plot labels; NOTE(review): confirm against get_gpu_info)
    mem_bw = gpu.get_mem_bw()  # GB/s
    fp16_flops = gpu.get_fp16_flops()  # TFLOPS
    fp8_flops = gpu.get_fp8_flops()  # TFLOPS

    # Arithmetic intensity: GFLOP / (MB / 1024) = GFLOP / GB = FLOP/Byte.
    # Achieved throughput: GFLOP / ms = TFLOPS.

    # MLA GEMM (FP8)
    gemm_fp8_gflops = mla_result['gemm_fp8_flpos']  # GFLOP ('flpos' is the key's actual spelling)
    mem_attn_gemm = mla_result['mem_attn_gemm']  # MB
    gemm_time = mla_result['gemm_fp8_time']  # ms
    gemm_ai = _roofline_ratio(gemm_fp8_gflops, mem_attn_gemm / 1024)
    gemm_achieved_flops = _roofline_ratio(gemm_fp8_gflops, gemm_time)

    # MLA Attention (FP16)
    attn_fp16_gflops = mla_result['attn_fp16_flpos']  # GFLOP
    mem_kv = mla_result['mem_KVCache']  # MB
    attn_time = mla_result['attn_fp16_time']  # ms
    attn_ai = _roofline_ratio(attn_fp16_gflops, mem_kv / 1024)
    attn_achieved_flops = _roofline_ratio(attn_fp16_gflops, attn_time)

    # MOE: both expert kinds are related to the same mem_moe figure
    mem_moe = moe_result['mem_moe']  # MB
    moe_shared_time = moe_result['shared_expert_time']  # ms
    moe_routed_time = moe_result['routed_expert_time']  # ms

    args = ModelArgs()

    # Shared expert sees every token of the batch
    shared_gflops = moe_expert_flops(args, batch_size)  # GFLOP
    shared_ai = _roofline_ratio(shared_gflops, mem_moe / 1024)
    shared_achieved_flops = _roofline_ratio(shared_gflops, moe_shared_time)

    # Routed experts: each token is processed by n_activated_experts experts
    routed_tokens = batch_size * args.n_activated_experts
    routed_gflops = moe_expert_flops(args, routed_tokens)  # GFLOP
    routed_ai = _roofline_ratio(routed_gflops, mem_moe / 1024)
    routed_achieved_flops = _roofline_ratio(routed_gflops, moe_routed_time)

    # Create figure with 3 subplots
    fig, axes = plt.subplots(1, 3, figsize=figsize)

    # AI range for plotting roofline
    ai_range = np.logspace(-1, 3, 100)  # 0.1 to 1000 FLOP/Byte

    # One entry per panel: (peak TFLOPS, precision label, title, data points).
    # The MOE panel assumes MOE runs in FP8.
    panels = [
        (fp16_flops, 'FP16', f'Attention FP16 Roofline\n{gpu_name}',
         [(attn_ai, attn_achieved_flops, 'red', 'o', f'Attention (AI={attn_ai:.2f})')]),
        (fp8_flops, 'FP8', f'GEMM FP8 Roofline\n{gpu_name}',
         [(gemm_ai, gemm_achieved_flops, 'green', 's', f'GEMM (AI={gemm_ai:.2f})')]),
        (fp8_flops, 'FP8', f'MOE Roofline\n{gpu_name}',
         [(shared_ai, shared_achieved_flops, 'blue', '^', f'Shared Expert (AI={shared_ai:.2f})'),
          (routed_ai, routed_achieved_flops, 'purple', 'v', f'Routed Expert (AI={routed_ai:.2f})')]),
    ]
    for ax, (peak, precision, title, points) in zip(axes, panels):
        _draw_roofline_panel(ax, ai_range, mem_bw, peak, precision, title, points)

    plt.tight_layout()

    # Print summary
    print(f"\n{'='*80}")
    print(f"Configuration: {gpu_name}, Batch={batch_size}, SeqLen={seq_len}, TP={tp}, EP={ep}")
    print(f"{'='*80}")

    summary = [
        ("Attention FP16", attn_fp16_gflops, mem_kv, attn_ai, attn_achieved_flops, attn_time),
        ("GEMM FP8", gemm_fp8_gflops, mem_attn_gemm, gemm_ai, gemm_achieved_flops, gemm_time),
        ("MOE Shared Expert", shared_gflops, mem_moe, shared_ai, shared_achieved_flops, moe_shared_time),
        ("MOE Routed Expert", routed_gflops, mem_moe, routed_ai, routed_achieved_flops, moe_routed_time),
    ]
    for section, gflops, mem_mb, ai, achieved, time_ms in summary:
        print(f"\n{section}:")
        print(f"  FLOPS: {gflops:.2f} GFLOPS")
        print(f"  Memory: {mem_mb:.2f} MB")
        print(f"  Arithmetic Intensity: {ai:.2f} FLOP/Byte")
        print(f"  Achieved Performance: {achieved:.2f} TFLOPS")
        print(f"  Time: {time_ms:.4f} ms")
    print(f"\nTotal Time: {row['total_time_ms']:.4f} ms")

    return fig


# Example run for one configuration.
# Change the keyword values below to inspect a different GPU / batch / sequence
# length / parallelism combination present in df.
fig = plot_roofline(df=df, gpu_name='H20', batch_size=8, seq_len=4096, tp=1, ep=1)

plt.show()


Configuration: H20, Batch=8, SeqLen=4096, TP=1, EP=1

Attention FP16:
  FLOPS: 11.14 GFLOPS
  Memory: 148.00 MB
  Arithmetic Intensity: 77.08 FLOP/Byte
  Achieved Performance: 125.80 TFLOPS
  Time: 0.0886 ms

GEMM FP8:
  FLOPS: 0.91 GFLOPS
  Memory: 66.44 MB
  Arithmetic Intensity: 14.09 FLOP/Byte
  Achieved Performance: 251.60 TFLOPS
  Time: 0.0036 ms

MOE Shared Expert:
  FLOPS: 0.70 GFLOPS
  Memory: 42.00 MB
  Arithmetic Intensity: 17.18 FLOP/Byte
  Achieved Performance: 15.09 TFLOPS
  Time: 0.0467 ms

MOE Routed Expert:
  FLOPS: 5.64 GFLOPS
  Memory: 42.00 MB
  Arithmetic Intensity: 137.44 FLOP/Byte
  Achieved Performance: 14.85 TFLOPS
  Time: 0.3796 ms

Total Time: 0.5685 ms


  plt.show()


In [None]:
# Interactive widget to explore different configurations
from ipywidgets import interact, widgets

def interactive_roofline(GPU, Batch, SeqLen, TP, EP):
    """Widget callback: draw the roofline panels for the selected configuration.

    The figure is shown only when plot_roofline returned one; it returns
    nothing when the selected configuration has no data.
    """
    figure = plot_roofline(df, GPU, Batch, SeqLen, TP, EP, figsize=(18, 5))
    if not figure:
        return
    plt.show()

# Create widgets
def _config_dropdown(column, preferred, description):
    """
    Build a Dropdown over the sorted distinct values of df[column].

    The initial selection is `preferred` when the data contains it; otherwise the
    first available option is used, because a Dropdown rejects an initial value
    that is not one of its options.
    """
    options = sorted(df[column].unique().tolist())
    if preferred in options:
        value = preferred
    else:
        value = options[0] if options else None
    return widgets.Dropdown(options=options, value=value, description=description)


interact(
    interactive_roofline,
    # GPU keeps the order in which the names appear in df and starts on the first row's GPU
    GPU=widgets.Dropdown(options=df['GPU'].unique().tolist(), value=df['GPU'].iloc[0], description='GPU:'),
    Batch=_config_dropdown('batch', 8, 'Batch:'),
    SeqLen=_config_dropdown('seq_len', 4096, 'Seq Len:'),
    TP=_config_dropdown('TP', 1, 'TP:'),
    EP=_config_dropdown('EP', 1, 'EP:')
)

In [None]:
# Configure matplotlib to display inline in Jupyter
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from decode_mla_moe import get_Theory_Data, get_gpu_info

# Set plot style: apply the matplotlib style sheet first, then the seaborn
# palette, so the palette is not overwritten by the style sheet's color cycle.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirmation message so the cell has visible output when it succeeds
print("Libraries imported successfully with inline plotting enabled!")

In [None]:
# Modified plot_roofline function that saves to file
def _arithmetic_intensity(gflops, mem_mb):
    """Return ``gflops / (mem_mb / 1024)``, or 0 when ``mem_mb`` is not positive."""
    return gflops / (mem_mb / 1024) if mem_mb > 0 else 0


def _achieved_rate(gflops, time_ms):
    """Return ``gflops / time_ms``, or 0 when ``time_ms`` is not positive."""
    return gflops / time_ms if time_ms > 0 else 0


def _draw_roofline_panel(ax, ai_range, mem_bw, peak_flops, peak_label, title, points):
    """Draw one roofline subplot: the roof, both ceilings and the measured points.

    ``points`` is an iterable of ``(ai, achieved, color, marker, name)`` tuples,
    one per kernel to mark on the panel.
    """
    # NOTE(review): the legend labels mem_bw as GB/s while the y axis is labelled
    # TFLOPS; confirm the units returned by get_mem_bw() make this product comparable
    # with peak_flops.
    mem_bound = mem_bw * ai_range
    roofline = np.minimum(mem_bound, peak_flops)

    ax.loglog(ai_range, roofline, 'k-', linewidth=2, label='Roofline')
    ax.loglog(ai_range, mem_bound, 'b--', alpha=0.5, label=f'Mem BW ({mem_bw:.0f} GB/s)')
    ax.axhline(peak_flops, color='r', linestyle='--', alpha=0.5,
               label=f'{peak_label} ({peak_flops:.0f} TFLOPS)')
    for ai, achieved, color, marker, name in points:
        ax.scatter(ai, achieved, s=100, c=color, marker=marker,
                   label=f'{name} (AI={ai:.2f})', zorder=5)
    ax.set_xlabel('Arithmetic Intensity (FLOP/Byte)', fontsize=10)
    ax.set_ylabel('Performance (TFLOPS)', fontsize=10)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)


def _print_kernel_summary(title, gflops, mem_mb, ai, achieved, time_ms):
    """Print the per-kernel section of the textual summary."""
    print(f"\n{title}:")
    print(f"  FLOPS: {gflops:.2f} GFLOPS")
    print(f"  Memory: {mem_mb:.2f} MB")
    print(f"  Arithmetic Intensity: {ai:.2f} FLOP/Byte")
    print(f"  Achieved Performance: {achieved:.2f} TFLOPS")
    print(f"  Time: {time_ms:.4f} ms")


def plot_roofline_save(df, gpu_name, batch_size, seq_len, tp, ep, figsize=(15, 5), save_path=None):
    """
    Plot roofline model for a specific configuration and save to file.

    Three panels are drawn (Attention FP16, GEMM FP8, MOE), the figure is
    written to disk as a PNG, a textual summary is printed, and the figure is
    closed again so that repeated calls do not accumulate open figures.

    Args:
        df: DataFrame with the columns 'GPU', 'batch', 'seq_len', 'TP', 'EP',
            'mla_result', 'moe_result' and 'total_time_ms'. The two result
            columns hold per-row mappings that are indexed by string key.
        gpu_name: Value to match in the 'GPU' column; also the key looked up
            in the mapping returned by ``get_gpu_info``.
        batch_size: Value to match in the 'batch' column.
        seq_len: Value to match in the 'seq_len' column.
        tp: Value to match in the 'TP' column.
        ep: Value to match in the 'EP' column.
        figsize: Figure size passed to ``plt.subplots``.
        save_path: Output file path. When None, a name is derived from the
            configuration and the file is written to the working directory.

    Returns:
        The path the figure was saved to, or None when no row of ``df``
        matches the requested configuration.
    """
    # Filter data for specific configuration
    mask = (
        (df['GPU'] == gpu_name)
        & (df['batch'] == batch_size)
        & (df['seq_len'] == seq_len)
        & (df['TP'] == tp)
        & (df['EP'] == ep)
    )

    if not mask.any():
        print(f"No data found for configuration: {gpu_name}, batch={batch_size}, seq_len={seq_len}, TP={tp}, EP={ep}")
        return None

    # Only the first matching row is used.
    row = df[mask].iloc[0]
    mla_result = row['mla_result']
    moe_result = row['moe_result']

    # Get GPU info
    gpu_name_mapping = {'B200': 'DGX-B200'}
    gpu_dict = get_gpu_info(filename='./device/gpu_info.csv',
                            decoding_mode=True,
                            gpu_name_mapping=gpu_name_mapping)
    gpu = gpu_dict[gpu_name]

    # GPU specs
    mem_bw = gpu.get_mem_bw()
    fp16_flops = gpu.get_fp16_flops()
    fp8_flops = gpu.get_fp8_flops()

    # Extract data from results (the 'flpos' spelling is the key used by the data).
    gemm_fp8_gflops = mla_result['gemm_fp8_flpos']
    mem_attn_gemm = mla_result['mem_attn_gemm']
    gemm_time = mla_result['gemm_fp8_time']
    gemm_ai = _arithmetic_intensity(gemm_fp8_gflops, mem_attn_gemm)
    gemm_achieved_flops = _achieved_rate(gemm_fp8_gflops, gemm_time)

    attn_fp16_gflops = mla_result['attn_fp16_flpos']
    mem_kv = mla_result['mem_KVCache']
    attn_time = mla_result['attn_fp16_time']
    attn_ai = _arithmetic_intensity(attn_fp16_gflops, mem_kv)
    attn_achieved_flops = _achieved_rate(attn_fp16_gflops, attn_time)

    mem_moe = moe_result['mem_moe']
    moe_shared_time = moe_result['shared_expert_time']
    moe_routed_time = moe_result['routed_expert_time']

    # MOE FLOP counts are not stored in the result row, so they are recomputed
    # here from the model arguments.
    from decode_mla_moe import ModelArgs, moe_expert_flops
    args = ModelArgs()

    shared_gflops = moe_expert_flops(args, batch_size)
    shared_ai = _arithmetic_intensity(shared_gflops, mem_moe)
    shared_achieved_flops = _achieved_rate(shared_gflops, moe_shared_time)

    routed_tokens = batch_size * args.n_activated_experts
    routed_gflops = moe_expert_flops(args, routed_tokens)
    # NOTE(review): shared and routed experts both use mem_moe as their memory
    # traffic — confirm this is intended.
    routed_ai = _arithmetic_intensity(routed_gflops, mem_moe)
    routed_achieved_flops = _achieved_rate(routed_gflops, moe_routed_time)

    # Create figure with 3 subplots
    fig, axes = plt.subplots(1, 3, figsize=figsize)
    try:
        ai_range = np.logspace(-1, 3, 100)

        # Plot 1: Attention FP16 Roofline
        _draw_roofline_panel(
            axes[0], ai_range, mem_bw, fp16_flops, 'Peak FP16',
            f'Attention FP16 Roofline\n{gpu_name}',
            [(attn_ai, attn_achieved_flops, 'red', 'o', 'Attention')],
        )

        # Plot 2: GEMM FP8 Roofline
        _draw_roofline_panel(
            axes[1], ai_range, mem_bw, fp8_flops, 'Peak FP8',
            f'GEMM FP8 Roofline\n{gpu_name}',
            [(gemm_ai, gemm_achieved_flops, 'green', 's', 'GEMM')],
        )

        # Plot 3: MOE Roofline
        _draw_roofline_panel(
            axes[2], ai_range, mem_bw, fp8_flops, 'Peak FP8',
            f'MOE Roofline\n{gpu_name}',
            [
                (shared_ai, shared_achieved_flops, 'blue', '^', 'Shared Expert'),
                (routed_ai, routed_achieved_flops, 'purple', 'v', 'Routed Expert'),
            ],
        )

        # Lay out this figure explicitly instead of whichever figure is current.
        fig.tight_layout()

        # Save to file
        if save_path is None:
            save_path = f'roofline_{gpu_name}_batch{batch_size}_seq{seq_len}_tp{tp}_ep{ep}.png'

        fig.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"\nPlot saved to: {save_path}")

        # Print summary
        print(f"\n{'='*80}")
        print(f"Configuration: {gpu_name}, Batch={batch_size}, SeqLen={seq_len}, TP={tp}, EP={ep}")
        print(f"{'='*80}")
        _print_kernel_summary("Attention FP16", attn_fp16_gflops, mem_kv,
                              attn_ai, attn_achieved_flops, attn_time)
        _print_kernel_summary("GEMM FP8", gemm_fp8_gflops, mem_attn_gemm,
                              gemm_ai, gemm_achieved_flops, gemm_time)
        _print_kernel_summary("MOE Shared Expert", shared_gflops, mem_moe,
                              shared_ai, shared_achieved_flops, moe_shared_time)
        _print_kernel_summary("MOE Routed Expert", routed_gflops, mem_moe,
                              routed_ai, routed_achieved_flops, moe_routed_time)
        print(f"\nTotal Time: {row['total_time_ms']:.4f} ms")
    finally:
        # Close the figure even when saving or printing fails, so a failed call
        # does not leave the figure registered with pyplot.
        plt.close(fig)

    return save_path


# Generate roofline plots for all 4 GPUs
print("Generating roofline plots for all GPUs...")
print("="*80)

target_gpus = ['H20', 'H800', 'B200', 'GB200-NVL72']
saved_files = []
skipped_gpus = []  # GPUs for which plot_roofline_save returned no path
for gpu in target_gpus:
    save_path = plot_roofline_save(
        df=df,
        gpu_name=gpu,
        batch_size=8,
        seq_len=4096,
        tp=1,
        ep=1
    )
    if save_path:
        saved_files.append(save_path)
    else:
        skipped_gpus.append(gpu)

print("\n" + "="*80)
# Only claim full success when every requested GPU actually produced a file.
if skipped_gpus:
    print(f"Generated {len(saved_files)} of {len(target_gpus)} plots; no plot for: {', '.join(skipped_gpus)}")
else:
    print("All plots generated successfully!")
print("Saved files:")
for f in saved_files:
    print(f"  - {f}")
