# GPT-2 FineWeb-Edu Performance Analysis

This notebook provides performance analysis visualizations including:
- Token processing speed and throughput
- GPU memory usage
- Training efficiency metrics
- Resource utilization over time
- Cost and time estimates


In [None]:
# Install required packages
# !pip install wandb matplotlib seaborn numpy pandas torch

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import wandb
import os
from pathlib import Path
from datetime import datetime, timedelta

# Notebook-wide plotting defaults: seaborn grid theme plus a larger default
# canvas and a fixed base font size for every figure created below.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

print("Libraries imported successfully!")


## 1. Load Performance Data

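Load training performance metrics, including token processing speed and GPU memory usage. By default (`use_wandb=False`) this cell generates synthetic data; pass `use_wandb=True` to load a run's logged history from Weights & Biases instead.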


In [None]:
def load_performance_data(use_wandb=False, project_name="gpt-fineweb-demo", run_id=None, seed=None):
    """
    Load performance data from WandB or generate synthetic data.

    Args:
        use_wandb: If True, try to fetch a run's logged history through the
            WandB public API. Any failure is reported on stdout and the
            function falls back to synthetic data.
        project_name: Project path handed to the WandB API.
        run_id: Run to load. When falsy, the first run the API returns for
            the project is used.
        seed: Optional seed for the synthetic noise. With the default
            ``None`` the noise comes from NumPy's global random state (so a
            prior ``np.random.seed(...)`` still applies); with an integer
            the generated frame is identical on every call.

    Returns:
        Either whatever ``run.history()`` returns for the selected WandB
        run, or a synthetic pandas DataFrame with the columns ``step``,
        ``tokens_seen``, ``loss``, ``tokens_per_sec`` and ``gpu_mem_gb``.
    """
    if use_wandb:
        try:
            api = wandb.Api()
            if run_id:
                run = api.run(f"{project_name}/{run_id}")
            else:
                runs = list(api.runs(project_name))
                if not runs:
                    raise ValueError(f"No runs found in project {project_name}")
                run = runs[0]
            return run.history()
        except Exception as e:
            # Deliberate best-effort: the notebook must stay runnable without
            # WandB credentials or network, so report and fall through.
            print(f"WandB API error: {e}")
            print("Generating synthetic performance data...")

    # The module-level functions and a RandomState share the same ``normal``
    # signature, so the unseeded path keeps using NumPy's global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Synthetic run layout: one logged row every `log_interval` steps.
    max_tokens = 200_000_000
    log_interval = 20
    tokens_per_step = 16 * 256

    steps = np.arange(0, max_tokens // tokens_per_step, log_interval)
    tokens_seen = steps * tokens_per_step
    n_rows = len(steps)

    # Token processing speed: saturating ramp from base to target, plus noise.
    base_speed = 4000
    target_speed = 6500
    tokens_per_sec = base_speed + (target_speed - base_speed) * (1 - np.exp(-tokens_seen / 20_000_000))
    tokens_per_sec += rng.normal(0, 300, n_rows)

    # GPU memory: saturating ramp from 3.5 GB towards 4.0 GB plus noise,
    # clipped to the [3.0, 4.5] GB range.
    gpu_mem_gb = 3.5 + 0.5 * (1 - np.exp(-tokens_seen / 10_000_000)) + rng.normal(0, 0.1, n_rows)
    gpu_mem_gb = np.clip(gpu_mem_gb, 3.0, 4.5)

    # Loss for context: exponential blend from 10.8 down towards 4.19.
    decay = np.exp(-tokens_seen / 50_000_000)
    train_loss = 10.8 * decay + 4.19 * (1 - decay)
    train_loss += rng.normal(0, 0.1, n_rows)

    return pd.DataFrame({
        'step': steps,
        'tokens_seen': tokens_seen,
        'loss': train_loss,
        'tokens_per_sec': tokens_per_sec,
        'gpu_mem_gb': gpu_mem_gb,
    })

# Build the dataset every later cell reads (synthetic, since WandB is off)
perf_data = load_performance_data(use_wandb=False)

# Quick preview: row count, head of the frame, and summary statistics
print(f"Loaded {len(perf_data)} performance data points")
print("\nFirst few rows:", perf_data.head(), sep="\n")
print(
    "\nPerformance Statistics:",
    perf_data[['tokens_per_sec', 'gpu_mem_gb']].describe(),
    sep="\n",
)


## 2. Token Processing Speed Analysis


In [None]:
def plot_processing_speed_analysis(perf_data, tokens_or_steps='tokens'):
    """
    Build a 2x2 figure describing token throughput during training.

    Panels (row-major): raw speed with mean/median reference lines, a
    histogram of speeds, raw speed overlaid with a centered rolling mean
    (window of roughly 5% of the rows, at least 1), and a speed-vs-loss
    scatter coloured by the x-axis quantity.

    Args:
        perf_data: DataFrame with 'tokens_per_sec' and 'loss' columns plus
            'tokens_seen' (token mode) or 'step' (step mode).
        tokens_or_steps: 'tokens' puts tokens seen on the x-axis and shows
            ticks in millions; any other value uses the 'step' column.

    Returns:
        The matplotlib Figure containing the four panels.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_trend, ax_hist), (ax_smooth, ax_scatter) = axes

    by_tokens = tokens_or_steps == 'tokens'
    if by_tokens:
        x, x_label = perf_data['tokens_seen'], 'Tokens Seen'
    else:
        x, x_label = perf_data['step'], 'Training Steps'

    speed = perf_data['tokens_per_sec']
    avg_speed = speed.mean()
    median_speed = speed.median()
    speed_colour = '#6A4C93'

    def _finish(ax, xlabel, ylabel, title, legend=True, grid_axis='both'):
        # Shared labelling/legend/grid styling for every panel.
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        if legend:
            ax.legend(fontsize=10)
        ax.grid(True, alpha=0.3, axis=grid_axis)

    def _millions_ticks(ax):
        # Raw token counts are unreadable as ticks; show them as e.g. "50.0M".
        if by_tokens:
            ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x/1e6:.1f}M'))

    # Panel 1: raw speed with mean and median reference lines
    ax_trend.plot(x, speed, color=speed_colour, linewidth=2, alpha=0.8)
    ax_trend.axhline(y=avg_speed, color='r', linestyle='--', linewidth=2, label=f'Mean: {avg_speed:,.0f} tok/s')
    ax_trend.axhline(y=median_speed, color='g', linestyle='--', linewidth=2, label=f'Median: {median_speed:,.0f} tok/s')
    _finish(ax_trend, x_label, 'Tokens per Second', 'Token Processing Speed Over Time')
    _millions_ticks(ax_trend)

    # Panel 2: distribution of speeds
    ax_hist.hist(speed, bins=50, color=speed_colour, alpha=0.7, edgecolor='black')
    ax_hist.axvline(x=avg_speed, color='r', linestyle='--', linewidth=2, label=f'Mean: {avg_speed:,.0f}')
    ax_hist.axvline(x=median_speed, color='g', linestyle='--', linewidth=2, label=f'Median: {median_speed:,.0f}')
    _finish(ax_hist, 'Tokens per Second', 'Frequency', 'Token Processing Speed Distribution', grid_axis='y')

    # Panel 3: raw speed against a centered rolling mean
    window = max(1, len(perf_data) // 20)  # 5% window
    rolling_avg = speed.rolling(window=window, center=True).mean()
    ax_smooth.plot(x, speed, color=speed_colour, alpha=0.3, linewidth=1, label='Raw')
    ax_smooth.plot(x, rolling_avg, color='#C73E1D', linewidth=2, label=f'Rolling Avg (window={window})')
    _finish(ax_smooth, x_label, 'Tokens per Second', 'Token Processing Speed (with Rolling Average)')
    _millions_ticks(ax_smooth)

    # Panel 4: speed against loss, coloured by training progress
    scatter = ax_scatter.scatter(perf_data['loss'], speed,
                                 c=x, cmap='viridis', alpha=0.6, s=30)
    _finish(ax_scatter, 'Training Loss', 'Tokens per Second', 'Processing Speed vs Loss', legend=False)
    plt.colorbar(scatter, ax=ax_scatter, label=x_label)

    plt.tight_layout()
    return fig

# Draw the four throughput panels for the loaded run
fig = plot_processing_speed_analysis(perf_data, tokens_or_steps='tokens')
plt.show()

# Throughput summary, emitted as a single newline-joined report
print("\n".join([
    f"\n=== Token Processing Speed Statistics ===",
    f"Mean: {perf_data['tokens_per_sec'].mean():,.0f} tokens/sec",
    f"Median: {perf_data['tokens_per_sec'].median():,.0f} tokens/sec",
    f"Std Dev: {perf_data['tokens_per_sec'].std():,.0f} tokens/sec",
    f"Min: {perf_data['tokens_per_sec'].min():,.0f} tokens/sec",
    f"Max: {perf_data['tokens_per_sec'].max():,.0f} tokens/sec",
]))


## 3. GPU Memory Usage Analysis


In [None]:
def plot_gpu_memory_analysis(perf_data, tokens_or_steps='tokens'):
    """
    Build a two-panel figure describing GPU memory use during training.

    The left panel plots memory against training progress with mean, max
    and min reference lines; the right panel is a histogram of the same
    values with mean and median markers.

    Args:
        perf_data: DataFrame with a 'gpu_mem_gb' column plus 'tokens_seen'
            (token mode) or 'step' (step mode).
        tokens_or_steps: 'tokens' puts tokens seen on the x-axis and shows
            ticks in millions; any other value uses the 'step' column.

    Returns:
        The matplotlib Figure containing both panels.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    ax_trend, ax_hist = axes

    by_tokens = tokens_or_steps == 'tokens'
    if by_tokens:
        x, x_label = perf_data['tokens_seen'], 'Tokens Seen'
    else:
        x, x_label = perf_data['step'], 'Training Steps'

    memory = perf_data['gpu_mem_gb']
    mem_colour = '#F18F01'

    # Left panel: memory over training progress
    ax_trend.plot(x, memory, color=mem_colour, linewidth=2, alpha=0.8)
    avg_mem = memory.mean()
    max_mem = memory.max()
    min_mem = memory.min()
    reference_lines = [
        (avg_mem, dict(color='r', linewidth=2, label=f'Mean: {avg_mem:.2f} GB')),
        (max_mem, dict(color='orange', linewidth=1, alpha=0.7, label=f'Max: {max_mem:.2f} GB')),
        (min_mem, dict(color='blue', linewidth=1, alpha=0.7, label=f'Min: {min_mem:.2f} GB')),
    ]
    for level, line_style in reference_lines:
        ax_trend.axhline(y=level, linestyle='--', **line_style)
    ax_trend.set_xlabel(x_label, fontsize=12)
    ax_trend.set_ylabel('GPU Memory (GB)', fontsize=12)
    ax_trend.set_title('GPU Memory Usage Over Time', fontsize=14, fontweight='bold')
    ax_trend.legend(fontsize=10)
    ax_trend.grid(True, alpha=0.3)
    if by_tokens:
        # Raw token counts are unreadable as ticks; show them as e.g. "50.0M".
        ax_trend.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x/1e6:.1f}M'))

    # Right panel: distribution of memory readings
    ax_hist.hist(memory, bins=30, color=mem_colour, alpha=0.7, edgecolor='black')
    ax_hist.axvline(x=avg_mem, color='r', linestyle='--', linewidth=2, label=f'Mean: {avg_mem:.2f} GB')
    ax_hist.axvline(x=memory.median(), color='g', linestyle='--', linewidth=2,
                    label=f'Median: {perf_data["gpu_mem_gb"].median():.2f} GB')
    ax_hist.set_xlabel('GPU Memory (GB)', fontsize=12)
    ax_hist.set_ylabel('Frequency', fontsize=12)
    ax_hist.set_title('GPU Memory Usage Distribution', fontsize=14, fontweight='bold')
    ax_hist.legend(fontsize=10)
    ax_hist.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    return fig

# GPU memory is optional in the data; bail out with a notice when it is absent
if 'gpu_mem_gb' not in perf_data.columns:
    print("GPU memory data not available.")
else:
    fig = plot_gpu_memory_analysis(perf_data, tokens_or_steps='tokens')
    plt.show()

    # Memory summary, emitted as a single newline-joined report
    print("\n".join([
        f"\n=== GPU Memory Statistics ===",
        f"Mean: {perf_data['gpu_mem_gb'].mean():.2f} GB",
        f"Median: {perf_data['gpu_mem_gb'].median():.2f} GB",
        f"Std Dev: {perf_data['gpu_mem_gb'].std():.2f} GB",
        f"Min: {perf_data['gpu_mem_gb'].min():.2f} GB",
        f"Max: {perf_data['gpu_mem_gb'].max():.2f} GB",
    ]))


## 4. Training Time and Efficiency Estimates


In [None]:
def calculate_training_estimates(perf_data, total_tokens=200_000_000, token_targets=None):
    """
    Calculate training time estimates and efficiency metrics.

    Every estimate assumes a constant throughput equal to the mean of
    ``perf_data['tokens_per_sec']``, so the cumulative-time curve is a
    straight line by construction.

    Args:
        perf_data: DataFrame with 'tokens_per_sec' and 'tokens_seen' columns.
        total_tokens: Token budget used for the headline estimate and for
            the highlighted point on the cumulative-time plot.
        token_targets: Optional iterable of token counts to tabulate and
            chart. Defaults to 50M, 100M, 200M, 500M and 1B tokens. Targets
            are labelled in whole millions, so distinct targets that round
            to the same label share one bar.

    Returns:
        A tuple ``(fig, estimates_df, metrics)``: the matplotlib Figure, a
        DataFrame with string columns 'Tokens', 'Hours' and 'Days' (one row
        per target), and a dict with 'avg_speed', 'total_hours',
        'total_days', 'tokens_per_hour' and 'tokens_per_day'.

    Raises:
        ValueError: If the mean of 'tokens_per_sec' is not a positive,
            finite number (for example an all-zero or all-NaN column),
            since every estimate divides by it.
    """
    avg_speed = perf_data['tokens_per_sec'].mean()
    # Fail before any figure is created: a zero/NaN speed would otherwise
    # propagate inf/NaN into the table, the metrics and the plots.
    if not np.isfinite(avg_speed) or avg_speed <= 0:
        raise ValueError(
            "Cannot estimate training time: mean 'tokens_per_sec' must be a "
            f"positive, finite number (got {avg_speed})."
        )

    if token_targets is None:
        token_targets = [50_000_000, 100_000_000, 200_000_000, 500_000_000, 1_000_000_000]
    else:
        token_targets = list(token_targets)

    # Time estimates for the headline budget
    total_seconds = total_tokens / avg_speed
    total_hours = total_seconds / 3600
    total_days = total_hours / 24

    # Efficiency metrics
    tokens_per_hour = avg_speed * 3600
    tokens_per_day = tokens_per_hour * 24

    # Hours needed for each target, reused by both the table and the chart
    target_labels = [f"{target/1e6:.0f}M" for target in token_targets]
    hours_list = [target / tokens_per_hour for target in token_targets]
    estimates_df = pd.DataFrame([
        {
            'Tokens': label,
            'Hours': f"{hours:.1f}",
            'Days': f"{hours / 24:.2f}",
        }
        for label, hours in zip(target_labels, hours_list)
    ])

    # Create visualization
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Plot 1: Time estimates bar chart
    ax1 = axes[0]
    ax1.barh(target_labels, hours_list, color='#2E86AB', alpha=0.7)
    ax1.set_xlabel('Training Time (Hours)', fontsize=12)
    ax1.set_ylabel('Token Target', fontsize=12)
    ax1.set_title('Estimated Training Time by Token Count', fontsize=14, fontweight='bold')
    ax1.grid(True, alpha=0.3, axis='x')

    # Value labels at the end of each bar; row i of a categorical barh
    # sits at y == i.
    for i, hours in enumerate(hours_list):
        ax1.text(hours, i, f' {hours:.1f}h', va='center', fontsize=10)

    # Plot 2: Cumulative time implied by the constant average throughput
    ax2 = axes[1]
    x = perf_data['tokens_seen']
    cumulative_hours = x / tokens_per_hour
    ax2.plot(x / 1e6, cumulative_hours, color='#06A77D', linewidth=2)
    ax2.set_xlabel('Tokens Processed (Millions)', fontsize=12)
    ax2.set_ylabel('Cumulative Training Time (Hours)', fontsize=12)
    ax2.set_title('Cumulative Training Time', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)

    # Highlight and annotate the point corresponding to `total_tokens`
    final_hours = total_tokens / tokens_per_hour
    ax2.plot(total_tokens / 1e6, final_hours, 'ro', markersize=10)
    ax2.annotate(f'{final_hours:.1f}h ({final_hours/24:.2f}d)',
                 xy=(total_tokens / 1e6, final_hours),
                 xytext=(10, 10), textcoords='offset points', fontsize=11,
                 bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.7))

    plt.tight_layout()

    return fig, estimates_df, {
        'avg_speed': avg_speed,
        'total_hours': total_hours,
        'total_days': total_days,
        'tokens_per_hour': tokens_per_hour,
        'tokens_per_day': tokens_per_day,
    }

# Derive the time/throughput estimates and the accompanying figure
fig, estimates_df, metrics = calculate_training_estimates(perf_data)

# Efficiency metrics, headline estimate and per-target table in one report
print("\n".join([
    f"\n=== Training Efficiency Metrics ===",
    f"Average Processing Speed: {metrics['avg_speed']:,.0f} tokens/sec",
    f"Tokens per Hour: {metrics['tokens_per_hour']:,.0f}",
    f"Tokens per Day: {metrics['tokens_per_day']:,.0f}",
    f"\n=== Time Estimates for 200M Tokens ===",
    f"Total Time: {metrics['total_hours']:.2f} hours ({metrics['total_days']:.2f} days)",
    "\n=== Time Estimates Table ===",
    estimates_df.to_string(index=False),
]))

plt.show()


## 5. Comprehensive Performance Dashboard


In [None]:
def create_performance_dashboard(perf_data, tokens_or_steps='tokens'):
    """
    Create a comprehensive performance dashboard.

    Lays out a 3x3 grid: processing speed, GPU memory and loss on the top
    row; speed-vs-memory scatter, efficiency and cumulative training time on
    the middle row; and a summary table spanning the bottom row. The GPU
    memory panels are left empty when 'gpu_mem_gb' is absent, and the
    efficiency panel then falls back to plain tokens/sec.

    Args:
        perf_data: DataFrame with 'tokens_per_sec', 'loss' and 'tokens_seen'
            columns, 'step' when step mode is requested, and optionally
            'gpu_mem_gb'.
        tokens_or_steps: 'tokens' plots against tokens seen, in millions;
            any other value plots against the raw 'step' column.

    Returns:
        The matplotlib Figure holding the dashboard.
    """
    fig = plt.figure(figsize=(18, 12))
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    # Choose the x-axis once. Only token counts are rescaled to millions;
    # step indices are plotted unscaled so the 'Training Steps' label holds.
    if tokens_or_steps == 'tokens':
        x = perf_data['tokens_seen'] / 1e6
        x_label = 'Tokens Seen (M)'
        progress_label = 'Tokens (M)'
    else:
        x = perf_data['step']
        x_label = 'Training Steps'
        progress_label = x_label

    has_gpu_mem = 'gpu_mem_gb' in perf_data.columns
    avg_speed = perf_data['tokens_per_sec'].mean()

    # 1. Processing speed (top left)
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.plot(x, perf_data['tokens_per_sec'], color='#6A4C93', linewidth=2)
    ax1.axhline(y=avg_speed, color='r', linestyle='--',
                label=f'Avg: {avg_speed:,.0f}')
    ax1.set_xlabel(x_label, fontsize=10)
    ax1.set_ylabel('Tokens/sec', fontsize=10)
    ax1.set_title('Processing Speed', fontsize=12, fontweight='bold')
    ax1.legend(fontsize=9)
    ax1.grid(True, alpha=0.3)

    # 2. GPU memory (top center)
    if has_gpu_mem:
        avg_mem = perf_data['gpu_mem_gb'].mean()
        ax2 = fig.add_subplot(gs[0, 1])
        ax2.plot(x, perf_data['gpu_mem_gb'], color='#F18F01', linewidth=2)
        ax2.axhline(y=avg_mem, color='r', linestyle='--',
                    label=f'Avg: {avg_mem:.2f} GB')
        ax2.set_xlabel(x_label, fontsize=10)
        ax2.set_ylabel('GPU Memory (GB)', fontsize=10)
        ax2.set_title('GPU Memory Usage', fontsize=12, fontweight='bold')
        ax2.legend(fontsize=9)
        ax2.grid(True, alpha=0.3)

    # 3. Loss for context (top right)
    ax3 = fig.add_subplot(gs[0, 2])
    ax3.plot(x, perf_data['loss'], color='#2E86AB', linewidth=2)
    ax3.set_xlabel(x_label, fontsize=10)
    ax3.set_ylabel('Loss', fontsize=10)
    ax3.set_title('Training Loss', fontsize=12, fontweight='bold')
    ax3.grid(True, alpha=0.3)

    # 4. Speed vs Memory (middle left)
    if has_gpu_mem:
        ax4 = fig.add_subplot(gs[1, 0])
        # Colour by the same (scaled) progress values the colour bar label
        # describes, so its ticks and its label agree.
        scatter = ax4.scatter(perf_data['gpu_mem_gb'], perf_data['tokens_per_sec'],
                              c=x, cmap='viridis', alpha=0.6, s=30)
        ax4.set_xlabel('GPU Memory (GB)', fontsize=10)
        ax4.set_ylabel('Tokens/sec', fontsize=10)
        ax4.set_title('Speed vs Memory', fontsize=12, fontweight='bold')
        ax4.grid(True, alpha=0.3)
        plt.colorbar(scatter, ax=ax4, label=progress_label)

    # 5. Efficiency over time (middle center)
    ax5 = fig.add_subplot(gs[1, 1])
    # Efficiency = tokens/sec per GB of memory (if available)
    if has_gpu_mem:
        efficiency = perf_data['tokens_per_sec'] / perf_data['gpu_mem_gb']
        ax5.plot(x, efficiency, color='#06A77D', linewidth=2)
        ax5.set_ylabel('Efficiency (tok/s per GB)', fontsize=10)
    else:
        ax5.plot(x, perf_data['tokens_per_sec'], color='#06A77D', linewidth=2)
        ax5.set_ylabel('Tokens/sec', fontsize=10)
    ax5.set_xlabel(x_label, fontsize=10)
    ax5.set_title('Training Efficiency', fontsize=12, fontweight='bold')
    ax5.grid(True, alpha=0.3)

    # 6. Cumulative training time (middle right)
    ax6 = fig.add_subplot(gs[1, 2])
    # Elapsed time always derives from tokens processed, whichever quantity
    # is on the x-axis; it assumes the constant average throughput.
    cumulative_hours = perf_data['tokens_seen'] / (avg_speed * 3600)
    ax6.plot(x, cumulative_hours, color='#A23B72', linewidth=2)
    ax6.set_xlabel(x_label, fontsize=10)
    ax6.set_ylabel('Cumulative Hours', fontsize=10)
    ax6.set_title('Cumulative Training Time', fontsize=12, fontweight='bold')
    ax6.grid(True, alpha=0.3)

    # 7. Performance summary table (bottom, spans all columns)
    ax7 = fig.add_subplot(gs[2, :])
    ax7.axis('off')

    total_tokens = perf_data['tokens_seen'].iloc[-1]
    total_hours = total_tokens / (avg_speed * 3600)

    summary_data = {
        'Metric': [
            'Avg Processing Speed',
            'Total Tokens Processed',
            'Total Training Time',
            'Tokens per Hour',
            'Tokens per Day',
        ],
        'Value': [
            f"{avg_speed:,.0f} tokens/sec",
            f"{total_tokens:,.0f}",
            f"{total_hours:.2f} hours ({total_hours/24:.2f} days)",
            f"{avg_speed * 3600:,.0f}",
            f"{avg_speed * 3600 * 24:,.0f}",
        ]
    }

    if has_gpu_mem:
        # Slot the memory row in right after the training-time row.
        summary_data['Metric'].insert(3, 'Avg GPU Memory')
        summary_data['Value'].insert(3, f"{perf_data['gpu_mem_gb'].mean():.2f} GB")

    summary_df = pd.DataFrame(summary_data)
    table = ax7.table(cellText=summary_df.values, colLabels=summary_df.columns,
                      cellLoc='left', loc='center', bbox=[0, 0, 1, 1])
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1, 2)
    ax7.set_title('Performance Summary Statistics', fontsize=14, fontweight='bold', pad=20)

    # Title this figure explicitly rather than whichever figure is current.
    fig.suptitle('GPT-2 FineWeb-Edu Performance Dashboard', fontsize=16, fontweight='bold', y=0.995)
    return fig

# Assemble the full dashboard against tokens seen and display it
fig = create_performance_dashboard(perf_data, 'tokens')
plt.show()
