# Workload-Variant-Autoscaler (WVA) Performance Analysis

This notebook analyzes WVA controller logs to visualize autoscaling behavior, warmup gaps, and SLO compliance.

**Workflow:**
1. Extract metrics from WVA controller logs using the bash script
2. Load and process the CSV data
3. Detect scaling phases and warmup gaps
4. Visualize performance metrics and scaling behavior
5. Perform statistical analysis

## 1. Setup and Configuration

In [None]:
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle
from matplotlib.ticker import MultipleLocator
import subprocess
from datetime import datetime
import os

# Plotting style
plt.style.use('seaborn-v0_8')
%matplotlib inline

# X-axis granularity configuration (in minutes)
X_MAJOR_TICK_INTERVAL = 1.0  # Major tick every 1 minute
X_MINOR_TICK_INTERVAL = 0.5  # Minor tick every 0.5 minutes (30 seconds)

# Configuration
LOG_DIR = f'./txt_logs'
LOG_NAME = 'experiment_rls'
# LOG_NAME = 'rls_blend_new'
LOG_FILE = f'{LOG_DIR}/{LOG_NAME}.txt'  # Change this to your log file
EXTRACT_SCRIPT = './extract_metrics.sh'

# Generate unique experiment directory based on timestamp
EXPERIMENT_NAME = datetime.now().strftime('%Y%m%d_%H%M%S')
EXPERIMENT_DIR = f'experiments/{EXPERIMENT_NAME}_{LOG_NAME}'
OUTPUT_CSV = f'{EXPERIMENT_DIR}/extracted_metrics.csv'

# Create experiment directory structure
os.makedirs(f'{EXPERIMENT_DIR}/plots', exist_ok=True)
os.makedirs(f'{EXPERIMENT_DIR}/data', exist_ok=True)
os.makedirs(f'{EXPERIMENT_DIR}/analysis', exist_ok=True)

print(f"✅ Configuration loaded")
print(f"   Experiment: {EXPERIMENT_NAME}")
print(f"   Experiment directory: {EXPERIMENT_DIR}")
print(f"   Log file: {LOG_FILE}")
print(f"   Extract script: {EXTRACT_SCRIPT}")
print(f"   Output CSV: {OUTPUT_CSV}")

**📊 X-Axis Granularity Settings:**
- **Major ticks** (bold lines with labels): Every **1 minute**
- **Minor ticks** (light grid lines): Every **0.5 minutes** (30 seconds)

💡 *To adjust granularity, change `X_MAJOR_TICK_INTERVAL` and `X_MINOR_TICK_INTERVAL` above:*
- **Default** (coarse) view: 1.0 (major) and 0.5 (minor) for 1-minute intervals
- For **finer** view: Use 0.25 (major) and 0.05 (minor) for 15-second intervals
- For **very fine** view: Use 0.1 (major) and 0.02 (minor) for 6-second intervals

## 2. Extract Metrics from Logs

Run the bash script to parse the WVA controller logs and extract optimization metrics.

In [None]:
# Make the script executable
!chmod +x {EXTRACT_SCRIPT}

# Run the extraction script
print(f"📊 Extracting metrics from {LOG_FILE}...")
result = subprocess.run(
    [EXTRACT_SCRIPT, LOG_FILE],
    capture_output=True,
    text=True
)

# Save to CSV
with open(OUTPUT_CSV, 'w') as f:
    f.write(result.stdout)

print(f"✅ Metrics extracted to {OUTPUT_CSV}")
print(f"   Lines extracted: {len(result.stdout.splitlines())}")

# Preview the first few lines
print("\n📋 Preview of extracted data:")
!head -5 {OUTPUT_CSV}

## 3. Load and Process Data

### 3.1 Diagnostics (Run if you get errors)

In [None]:
print("🔍 Diagnosing extraction issue...\n")

# 1. Check log file: existence, size and a short sample of its content
print(f"1. Log file check:")
log_path = LOG_FILE
BASH_SCRIPT = EXTRACT_SCRIPT
if os.path.exists(log_path):
    size = os.path.getsize(log_path)
    print(f"   ✓ File exists: {log_path}")
    print(f"   ✓ Size: {size:,} bytes")
    
    # Sample first few lines
    print(f"\n   First 5 lines of log file:")
    with open(log_path, 'r') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            # Drop the line ending so the trailing "..." stays on the same output line
            print(f"   {i+1}: {line.rstrip()[:100]}...")
else:
    print(f"   ✗ File not found: {log_path}")

# 2. Check CSV output: existence, size and parsed structure
print(f"\n2. CSV output check:")
if os.path.exists(OUTPUT_CSV):
    size = os.path.getsize(OUTPUT_CSV)
    print(f"   ✓ File exists: {OUTPUT_CSV}")
    print(f"   ✓ Size: {size:,} bytes")
    
    if size > 0:
        # Read CSV and show structure. A file holding only blank lines is
        # non-empty on disk but makes pandas raise EmptyDataError; report that
        # instead of letting the diagnostics cell itself crash.
        try:
            df_test = pd.read_csv(OUTPUT_CSV)
        except pd.errors.EmptyDataError:
            df_test = None
            print(f"   ✗ CSV has no parseable header or data")
        
        if df_test is not None:
            print(f"   ✓ Rows: {len(df_test)}")
            print(f"   ✓ Columns: {list(df_test.columns)}")
            
            if len(df_test) > 0:
                print(f"\n   First row:")
                print(df_test.head(1).to_string())
            else:
                print(f"   ✗ CSV has no data rows (only headers)")
    else:
        print(f"   ✗ CSV is empty (0 bytes)")
else:
    print(f"   ✗ File not found: {OUTPUT_CSV}")

# 3. Check for JSON patterns in log file
print(f"\n3. Log format check:")
if os.path.exists(log_path):
    # Scan line by line rather than loading the whole log into memory; none of
    # the searched markers contains a newline, so the result is the same.
    has_optimization = False
    has_itl = False
    has_ttft = False
    with open(log_path, 'r') as f:
        for line in f:
            has_optimization = has_optimization or 'optimizationMetrics' in line
            has_itl = has_itl or 'itlAverage' in line or '"itl"' in line
            has_ttft = has_ttft or 'ttftAverage' in line or '"ttft"' in line
            if has_optimization and has_itl and has_ttft:
                break  # every marker found, no need to read further
    
    print(f"   optimizationMetrics found: {'✓' if has_optimization else '✗'}")
    print(f"   ITL metrics found: {'✓' if has_itl else '✗'}")
    print(f"   TTFT metrics found: {'✓' if has_ttft else '✗'}")
    
    if not (has_optimization or has_itl):
        print(f"\n   ⚠️  Log file doesn't appear to contain expected metrics.")
        print(f"   Expected format: JSON with 'optimizationMetrics' field")
        print(f"\n   Sample expected format:")
        print(f'   {{"optimizationMetrics":{{"itlAverage":12.5,"ttftAverage":850,"rate":45.2,...}}}}')

print("\n" + "="*70)
print("If you see issues above, check:")
print("  1. Is LOG_FILE pointing to the correct WVA controller log file?")
print("  2. Does the log contain optimizationMetrics JSON entries?")
print("  3. Try running the extraction manually:")
print(f"     {BASH_SCRIPT} {LOG_FILE}")
print("="*70)

In [None]:
# Load the CSV data. pandas raises EmptyDataError for a zero-byte / header-less
# file (e.g. when the extraction script produced no output); treat that as
# "no rows" so the guidance below is shown instead of a bare pandas traceback.
try:
    df = pd.read_csv(OUTPUT_CSV)
except pd.errors.EmptyDataError:
    df = pd.DataFrame()

# Check if data was extracted
if len(df) == 0:
    print("❌ ERROR: No data was extracted from the log file!")
    print("\nPossible issues:")
    print("1. Log file format doesn't match the extraction script")
    print("2. Log file doesn't contain optimization data")
    print("3. Log file path is incorrect")
    print(f"\nLog file: {LOG_FILE}")
    print(f"CSV file: {OUTPUT_CSV}")
    print("\nPlease check:")
    print("- Does the log file exist and have content?")
    print("- Does it contain lines with 'System data prepared for optimization'?")
    print("- Is the extraction script working correctly?")
    print("\nRun this to check the log file:")
    print(f"  !grep -i 'optimization' {LOG_FILE} | head -5")
    raise ValueError("No data extracted - please check log file and extraction script")

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Elapsed time in minutes, measured from the earliest timestamp in the data
df['time_minutes'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds() / 60

# Display basic statistics (the SLO values shown are those of the first row)
print("📊 Data Summary:")
print(f"   Duration: {df['time_minutes'].max():.1f} minutes")
print(f"   Data points: {len(df)}")
print(f"   Replica range: {df['numRep'].min():.0f} → {df['numRep'].max():.0f}")
print(f"   Load range: {df['rate'].min():.1f} → {df['rate'].max():.1f} rpm")
print(f"   SLO ITL: {df['slo_itl'].iloc[0]:.0f} ms")
print(f"   SLO TTFT: {df['slo_ttft'].iloc[0]:.0f} ms")

# Display first few rows
print("\n📋 Data preview:")
df.head(10)

## 4. Detect Scaling Events and Warmup Gaps

In [None]:
def detect_scaling_events(df, slo_threshold=None):
    """
    Detect replica-count changes and warmup gaps (runs of samples above the SLO).

    Rows are processed in their existing order.

    Args:
        df: DataFrame with columns 'time_minutes', 'numRep', 'rate' and
            'itlAverage'; 'slo_itl' is also read when slo_threshold is None.
        slo_threshold: Limit compared against 'itlAverage'. Defaults to the
            first value of df['slo_itl'].

    Returns:
        Tuple (scaling_events, warmup_gaps) of lists of dicts.
        Scaling event keys: 'time', 'from_replicas', 'to_replicas', 'load'.
        Warmup gap keys: 'start_time', 'end_time', 'duration',
        'peak_violation', 'avg_tpot', 'avg_load'. 'end_time' is the time of
        the last sample that is still above the threshold.
    """
    scaling_events = []
    warmup_gaps = []

    # Nothing to analyse; this also avoids an IndexError on the SLO lookup below.
    if len(df) == 0:
        return scaling_events, warmup_gaps

    if slo_threshold is None:
        slo_threshold = df['slo_itl'].iloc[0]

    def _summarize_gap(start, stop):
        """Build the record for the gap covering row positions start..stop-1."""
        start_time = df['time_minutes'].iloc[start]
        end_time = df['time_minutes'].iloc[stop - 1]
        return {
            'start_time': start_time,
            'end_time': end_time,
            'duration': end_time - start_time,
            'peak_violation': df['itlAverage'].iloc[start:stop].max(),
            'avg_tpot': df['itlAverage'].iloc[start:stop].mean(),
            'avg_load': df['rate'].iloc[start:stop].mean()
        }

    # Detect replica changes between consecutive samples
    for i in range(1, len(df)):
        if df['numRep'].iloc[i] != df['numRep'].iloc[i-1]:
            scaling_events.append({
                'time': df['time_minutes'].iloc[i],
                'from_replicas': df['numRep'].iloc[i-1],
                'to_replicas': df['numRep'].iloc[i],
                'load': df['rate'].iloc[i]
            })

    # Detect warmup gaps (TPOT > SLO)
    in_gap = False
    gap_start = None

    for i in range(len(df)):
        tpot = df['itlAverage'].iloc[i]

        if tpot > slo_threshold and not in_gap:
            # Start of warmup gap
            in_gap = True
            gap_start = i
        elif tpot <= slo_threshold and in_gap:
            # End of warmup gap: row i is the first sample back under the SLO
            in_gap = False
            warmup_gaps.append(_summarize_gap(gap_start, i))

    # A violation still in progress at the last sample has no recovering row to
    # close it, so record it here; otherwise it would be dropped silently.
    if in_gap:
        warmup_gaps.append(_summarize_gap(gap_start, len(df)))

    return scaling_events, warmup_gaps

# Run the detection on the loaded metrics, using the SLO taken from the data
scaling_events, warmup_gaps = detect_scaling_events(df)

# One line per replica-count change
print("🔄 Scaling Events Detected:")
for i, event in enumerate(scaling_events, start=1):
    replica_change = f"{event['from_replicas']:.0f} → {event['to_replicas']:.0f} replicas"
    print(f"   {i}. t={event['time']:.1f}min: {replica_change} (load: {event['load']:.1f} rpm)")

# Two lines per warmup gap: time window first, then its statistics
print("\n⚠️  Warmup Gaps Detected (TPOT > SLO):")
for i, gap in enumerate(warmup_gaps, start=1):
    window = f"t={gap['start_time']:.1f}-{gap['end_time']:.1f}min ({gap['duration']:.1f}min)"
    stats = f"Peak: {gap['peak_violation']:.2f}ms, Avg: {gap['avg_tpot']:.2f}ms, Load: {gap['avg_load']:.0f} rpm"
    print(f"   {i}. {window}")
    print(f"      {stats}")

## 5. Visualization: ITL Performance with Warmup Gaps

In [None]:
fig, ax = plt.subplots(figsize=(20, 8))

# Tallest value that must stay visible: actual ITL, predicted ITL and the SLO
# line. Scaling the axis from the actual series alone clips the prediction or
# the SLO target whenever either lies above 1.2x the actual peak.
itl_peak = np.nanmax([df['itlAverage'].max(), df['itl'].max(), df['slo_itl'].iloc[0]])

# Highlight warmup gaps (shaded over the full height of the axis)
for i, gap in enumerate(warmup_gaps):
    color = ['red', 'orange', 'yellow'][i % 3]
    rect = Rectangle(
        (gap['start_time'], 0),
        gap['end_time'] - gap['start_time'],
        itl_peak * 1.2,
        alpha=0.15,
        color=color,
        label=f"Warmup Gap {i+1}"
    )
    ax.add_patch(rect)

# Plot ITL metrics
ax.plot(df['time_minutes'], df['itlAverage'], 'o-', 
        linewidth=3, markersize=6, color='#dc2626', 
        label='Actual ITL', zorder=3)
ax.plot(df['time_minutes'], df['itl'], 's--', 
        linewidth=2, markersize=4, color='#2563eb', 
        label='Predicted ITL', zorder=3)
ax.axhline(y=df['slo_itl'].iloc[0], color='#ef4444', 
           linestyle=':', linewidth=2, label='SLO Target', zorder=2)

# Mark scaling events with a vertical line and a "from→to" replica label
for event in scaling_events:
    ax.axvline(x=event['time'], color='#9ca3af', 
               linestyle='--', alpha=0.7, linewidth=1, zorder=1)
    ax.text(event['time'], itl_peak * 1.1, 
            f"{event['from_replicas']:.0f}→{event['to_replicas']:.0f}",
            ha='center', va='bottom', fontsize=9, color='#6b7280',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

ax.set_xlabel('Time (minutes from start)', fontsize=12)
ax.set_ylabel('Inter-Token Latency (ms)', fontsize=12)
ax.set_title('WVA Performance: ITL vs Time with Warmup Gap Analysis', 
             fontweight='bold', fontsize=16)
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)
ax.set_ylim(0, itl_peak * 1.2)

# Set x-axis granularity
ax.xaxis.set_major_locator(MultipleLocator(X_MAJOR_TICK_INTERVAL))
ax.xaxis.set_minor_locator(MultipleLocator(X_MINOR_TICK_INTERVAL))
ax.grid(True, which='major', alpha=0.3, linewidth=1)
ax.grid(True, which='minor', alpha=0.15, linewidth=0.5)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/itl_analysis.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")

In [None]:
fig, ax = plt.subplots(figsize=(20, 8))

# Tallest value that must stay visible: actual TTFT, predicted TTFT and the SLO
# line. Scaling the axis from the actual series alone clips the prediction or
# the SLO target whenever either lies above 1.2x the actual peak.
ttft_peak = np.nanmax([df['ttftAverage'].max(), df['ttft'].max(), df['slo_ttft'].iloc[0]])

# Highlight warmup gaps (shaded over the full height of the axis)
for i, gap in enumerate(warmup_gaps):
    color = ['red', 'orange', 'yellow'][i % 3]
    rect = Rectangle(
        (gap['start_time'], 0),
        gap['end_time'] - gap['start_time'],
        ttft_peak * 1.2,
        alpha=0.15,
        color=color,
        label=f"Warmup Gap {i+1}"
    )
    ax.add_patch(rect)

# Plot TTFT metrics
ax.plot(df['time_minutes'], df['ttftAverage'], 'o-', 
        linewidth=3, markersize=6, color='#059669', 
        label='Actual TTFT', zorder=3)
ax.plot(df['time_minutes'], df['ttft'], 's--', 
        linewidth=2, markersize=4, color='#0891b2', 
        label='Predicted TTFT', zorder=3)
ax.axhline(y=df['slo_ttft'].iloc[0], color='#10b981', 
           linestyle=':', linewidth=2, label='SLO Target', zorder=2)

# Mark scaling events with a vertical line and a "from→to" replica label
for event in scaling_events:
    ax.axvline(x=event['time'], color='#9ca3af', 
               linestyle='--', alpha=0.7, linewidth=1, zorder=1)
    ax.text(event['time'], ttft_peak * 1.1, 
            f"{event['from_replicas']:.0f}→{event['to_replicas']:.0f}",
            ha='center', va='bottom', fontsize=9, color='#6b7280',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

ax.set_xlabel('Time (minutes from start)', fontsize=12)
ax.set_ylabel('Time to First Token (ms)', fontsize=12)
ax.set_title('WVA Performance: TTFT vs Time with Warmup Gap Analysis', 
             fontweight='bold', fontsize=16)
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)
ax.set_ylim(0, ttft_peak * 1.2)

# Set x-axis granularity
ax.xaxis.set_major_locator(MultipleLocator(X_MAJOR_TICK_INTERVAL))
ax.xaxis.set_minor_locator(MultipleLocator(X_MINOR_TICK_INTERVAL))
ax.grid(True, which='major', alpha=0.3, linewidth=1)
ax.grid(True, which='minor', alpha=0.15, linewidth=0.5)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/ttft_analysis.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")

## 6. Visualization: TTFT Performance with Warmup Gaps

## 7. Visualization: Load Pattern Evolution

In [None]:
fig, ax = plt.subplots(figsize=(20, 6))

# Series used throughout this figure
elapsed_min = df['time_minutes']
arrival_rate = df['rate']
peak_rate = arrival_rate.max()

ax.plot(elapsed_min, arrival_rate, 'o-',
        color='#7c3aed', linewidth=3, markersize=4,
        label='Arrival Rate (rpm)')

# Vertical marker plus a "Scale from→to" caption for every replica change;
# captions are anchored at 90% of the peak rate.
caption_y = peak_rate * 0.9
for event in scaling_events:
    ax.axvline(x=event['time'], color='#f59e0b',
               linestyle='--', alpha=0.7, linewidth=1)
    caption = f"Scale\n{event['from_replicas']:.0f}→{event['to_replicas']:.0f}"
    ax.text(event['time'], caption_y, caption,
            ha='center', va='top', fontsize=9, color='#f59e0b')

ax.set_xlabel('Time (minutes)', fontsize=12)
ax.set_ylabel('Requests per minute', fontsize=12)
ax.set_title(f"Load Pattern Evolution (Peak: {peak_rate:.0f} rpm)",
             fontweight='bold', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

# Tick spacing on the time axis, then separate grid styling per tick level
ax.xaxis.set_major_locator(MultipleLocator(X_MAJOR_TICK_INTERVAL))
ax.xaxis.set_minor_locator(MultipleLocator(X_MINOR_TICK_INTERVAL))
for which, grid_alpha, grid_width in (('major', 0.3, 1), ('minor', 0.15, 0.5)):
    ax.grid(True, which=which, alpha=grid_alpha, linewidth=grid_width)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/load_pattern.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")

## 8. Visualization: ITL vs Replica Scaling Timeline

In [None]:
fig, ax1 = plt.subplots(figsize=(20, 8))
ax2 = ax1.twinx()  # second y-axis sharing the time axis, used for replica count

elapsed_min = df['time_minutes']
replica_color = '#7c3aed'

# Left axis: actual and predicted ITL plus the SLO reference line
line1 = ax1.plot(elapsed_min, df['itlAverage'], 'o-',
                 color='#dc2626', linewidth=3, label='Actual ITL', zorder=3)
line2 = ax1.plot(elapsed_min, df['itl'], 's--',
                 color='#2563eb', linewidth=2, label='Predicted ITL', zorder=3)
ax1.axhline(y=df['slo_itl'].iloc[0], color='#ef4444',
            linestyle=':', linewidth=2, label='SLO')

# Right axis: replica count as a step function
line3 = ax2.step(elapsed_min, df['numRep'], where='post',
                 color=replica_color, linewidth=4, alpha=0.7, label='Replicas')

# Shade every detected warmup gap
for gap in warmup_gaps:
    ax1.axvspan(gap['start_time'], gap['end_time'],
                alpha=0.1, color='red', zorder=1)

ax1.set_xlabel('Time (minutes)', fontsize=12)
ax1.set_ylabel('Inter-Token Latency (ms)', color='black', fontsize=12)
ax2.set_ylabel('Number of Replicas', color=replica_color, fontsize=12)
ax2.tick_params(axis='y', labelcolor=replica_color)

# One legend holding the entries of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

ax1.set_title('ITL vs Replica Scaling Timeline', fontweight='bold', fontsize=16)
ax1.grid(True, alpha=0.3)

# Tick spacing on the time axis, then separate grid styling per tick level
ax1.xaxis.set_major_locator(MultipleLocator(X_MAJOR_TICK_INTERVAL))
ax1.xaxis.set_minor_locator(MultipleLocator(X_MINOR_TICK_INTERVAL))
for which, grid_alpha, grid_width in (('major', 0.3, 1), ('minor', 0.15, 0.5)):
    ax1.grid(True, which=which, alpha=grid_alpha, linewidth=grid_width)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/itl_replicas_timeline.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")

In [None]:
fig, ax1 = plt.subplots(figsize=(20, 8))
ax2 = ax1.twinx()  # second y-axis sharing the time axis, used for replica count

elapsed_min = df['time_minutes']
replica_color = '#7c3aed'

# Left axis: actual and predicted TTFT plus the SLO reference line
line1 = ax1.plot(elapsed_min, df['ttftAverage'], 'o-',
                 color='#059669', linewidth=3, label='Actual TTFT', zorder=3)
line2 = ax1.plot(elapsed_min, df['ttft'], 's--',
                 color='#0891b2', linewidth=2, label='Predicted TTFT', zorder=3)
ax1.axhline(y=df['slo_ttft'].iloc[0], color='#10b981',
            linestyle=':', linewidth=2, label='SLO')

# Right axis: replica count as a step function
line3 = ax2.step(elapsed_min, df['numRep'], where='post',
                 color=replica_color, linewidth=4, alpha=0.7, label='Replicas')

# Shade every detected warmup gap
for gap in warmup_gaps:
    ax1.axvspan(gap['start_time'], gap['end_time'],
                alpha=0.1, color='red', zorder=1)

ax1.set_xlabel('Time (minutes)', fontsize=12)
ax1.set_ylabel('Time to First Token (ms)', color='black', fontsize=12)
ax2.set_ylabel('Number of Replicas', color=replica_color, fontsize=12)
ax2.tick_params(axis='y', labelcolor=replica_color)

# One legend holding the entries of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

ax1.set_title('TTFT vs Replica Scaling Timeline', fontweight='bold', fontsize=16)
ax1.grid(True, alpha=0.3)

# Tick spacing on the time axis, then separate grid styling per tick level
ax1.xaxis.set_major_locator(MultipleLocator(X_MAJOR_TICK_INTERVAL))
ax1.xaxis.set_minor_locator(MultipleLocator(X_MINOR_TICK_INTERVAL))
for which, grid_alpha, grid_width in (('major', 0.3, 1), ('minor', 0.15, 0.5)):
    ax1.grid(True, which=which, alpha=grid_alpha, linewidth=grid_width)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/ttft_replicas_timeline.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")

## 9. Visualization: TTFT vs Replica Scaling Timeline

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 12), sharex=True)

# Tallest value each panel must keep visible: actual series, predicted series
# and the SLO line. Scaling from the actual series alone clips the prediction
# or the SLO target whenever either lies above 1.2x the actual peak.
itl_peak = np.nanmax([df['itlAverage'].max(), df['itl'].max(), df['slo_itl'].iloc[0]])
ttft_peak = np.nanmax([df['ttftAverage'].max(), df['ttft'].max(), df['slo_ttft'].iloc[0]])

# ITL subplot: shade warmup gaps; only the first gap gets a legend entry
for i, gap in enumerate(warmup_gaps):
    color = ['red', 'orange', 'yellow'][i % 3]
    rect = Rectangle(
        (gap['start_time'], 0),
        gap['end_time'] - gap['start_time'],
        itl_peak * 1.2,
        alpha=0.15,
        color=color,
        label=f"Warmup Gap {i+1}" if i == 0 else ""
    )
    ax1.add_patch(rect)

ax1.plot(df['time_minutes'], df['itlAverage'], 'o-', 
         linewidth=3, markersize=6, color='#dc2626', label='Actual ITL', zorder=3)
ax1.plot(df['time_minutes'], df['itl'], 's--', 
         linewidth=2, markersize=4, color='#2563eb', label='Predicted ITL', zorder=3)
ax1.axhline(y=df['slo_itl'].iloc[0], color='#ef4444', 
            linestyle=':', linewidth=2, label='SLO Target', zorder=2)

# Scaling events: vertical line plus "from→to" replica label (labels on the ITL panel only)
for event in scaling_events:
    ax1.axvline(x=event['time'], color='#9ca3af', linestyle='--', alpha=0.7, linewidth=1, zorder=1)
    ax1.text(event['time'], itl_peak * 1.1, 
             f"{event['from_replicas']:.0f}→{event['to_replicas']:.0f}",
             ha='center', va='bottom', fontsize=9, color='#6b7280',
             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

ax1.set_ylabel('Inter-Token Latency (ms)', fontsize=12)
ax1.set_title('Combined ITL and TTFT Performance Analysis', fontweight='bold', fontsize=16)
ax1.legend(loc='upper left')
ax1.grid(True, alpha=0.3)
ax1.set_ylim(0, itl_peak * 1.2)

# TTFT subplot: same warmup gaps, shaded without legend entries
for i, gap in enumerate(warmup_gaps):
    color = ['red', 'orange', 'yellow'][i % 3]
    rect = Rectangle(
        (gap['start_time'], 0),
        gap['end_time'] - gap['start_time'],
        ttft_peak * 1.2,
        alpha=0.15,
        color=color
    )
    ax2.add_patch(rect)

ax2.plot(df['time_minutes'], df['ttftAverage'], 'o-', 
         linewidth=3, markersize=6, color='#059669', label='Actual TTFT', zorder=3)
ax2.plot(df['time_minutes'], df['ttft'], 's--', 
         linewidth=2, markersize=4, color='#0891b2', label='Predicted TTFT', zorder=3)
ax2.axhline(y=df['slo_ttft'].iloc[0], color='#10b981', 
            linestyle=':', linewidth=2, label='SLO Target', zorder=2)

for event in scaling_events:
    ax2.axvline(x=event['time'], color='#9ca3af', linestyle='--', alpha=0.7, linewidth=1, zorder=1)

ax2.set_xlabel('Time (minutes from start)', fontsize=12)
ax2.set_ylabel('Time to First Token (ms)', fontsize=12)
ax2.legend(loc='upper left')
ax2.grid(True, alpha=0.3)
ax2.set_ylim(0, ttft_peak * 1.2)

# Set x-axis granularity (applies to both subplots since sharex=True)
ax2.xaxis.set_major_locator(MultipleLocator(X_MAJOR_TICK_INTERVAL))
ax2.xaxis.set_minor_locator(MultipleLocator(X_MINOR_TICK_INTERVAL))
ax1.grid(True, which='major', alpha=0.3, linewidth=1)
ax1.grid(True, which='minor', alpha=0.15, linewidth=0.5)
ax2.grid(True, which='major', alpha=0.3, linewidth=1)
ax2.grid(True, which='minor', alpha=0.15, linewidth=0.5)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/combined_itl_ttft.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")

## 10. Visualization: Combined ITL & TTFT Analysis

## 11. Statistical Analysis

In [None]:
print("="*60)
print("WVA PERFORMANCE ANALYSIS SUMMARY")
print("="*60)
print(f"\n📊 Experiment Overview:")
print(f"   Experiment ID: {EXPERIMENT_NAME}")
print(f"   Duration: {df['time_minutes'].max():.1f} minutes")
print(f"   Data points: {len(df)}")
print(f"   Scaling pattern: {df['numRep'].min():.0f} → {df['numRep'].max():.0f} replicas")
print(f"   Peak load: {df['rate'].max():.0f} rpm")
print(f"   SLO ITL: {df['slo_itl'].iloc[0]:.0f} ms")
print(f"   SLO TTFT: {df['slo_ttft'].iloc[0]:.0f} ms")

# Per-gap statistics; exceedance is the gap's average ITL minus the first-row SLO
print(f"\n⚠️  Warmup Gap Analysis:")
for i, gap in enumerate(warmup_gaps, 1):
    print(f"\n   Gap {i} (t={gap['start_time']:.1f}-{gap['end_time']:.1f}min):")
    print(f"      Duration: {gap['duration']:.1f} minutes")
    print(f"      Peak ITL violation: {gap['peak_violation']:.2f} ms")
    print(f"      Average ITL: {gap['avg_tpot']:.2f} ms")
    print(f"      Average load: {gap['avg_load']:.0f} rpm")
    print(f"      SLO exceedance: {gap['avg_tpot'] - df['slo_itl'].iloc[0]:.2f} ms")

print(f"\n📈 ITL Performance Metrics:")
print(f"   Mean ITL: {df['itlAverage'].mean():.2f} ms")
print(f"   Median ITL: {df['itlAverage'].median():.2f} ms")
print(f"   Peak ITL: {df['itlAverage'].max():.2f} ms")
print(f"   Min ITL: {df['itlAverage'].min():.2f} ms")
print(f"   Std Dev: {df['itlAverage'].std():.2f} ms")

print(f"\n📈 TTFT Performance Metrics:")
print(f"   Mean TTFT: {df['ttftAverage'].mean():.2f} ms")
print(f"   Median TTFT: {df['ttftAverage'].median():.2f} ms")
print(f"   Peak TTFT: {df['ttftAverage'].max():.2f} ms")
print(f"   Min TTFT: {df['ttftAverage'].min():.2f} ms")
print(f"   Std Dev: {df['ttftAverage'].std():.2f} ms")

# SLO compliance: each sample is compared against the SLO value of its own row
itl_violations = df[df['itlAverage'] > df['slo_itl']]
ttft_violations = df[df['ttftAverage'] > df['slo_ttft']]
itl_compliance_rate = (1 - len(itl_violations) / len(df)) * 100
ttft_compliance_rate = (1 - len(ttft_violations) / len(df)) * 100

print(f"\n✅ SLO Compliance:")
print(f"   ITL Compliance: {itl_compliance_rate:.1f}%")
print(f"   ITL Violations: {len(itl_violations)} / {len(df)} data points")
print(f"   TTFT Compliance: {ttft_compliance_rate:.1f}%")
print(f"   TTFT Violations: {len(ttft_violations)} / {len(df)} data points")

print(f"\n🔄 Scaling Events:")
for i, event in enumerate(scaling_events, 1):
    print(f"   {i}. t={event['time']:.1f}min: {event['from_replicas']:.0f} → {event['to_replicas']:.0f} replicas (load: {event['load']:.1f} rpm)")

print("\n" + "="*60)

# Save summary to file. The text contains "→", which the platform default
# encoding cannot always represent, so the file is written as UTF-8 explicitly.
summary_path = f'{EXPERIMENT_DIR}/analysis/summary.txt'
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("="*60 + "\n")
    f.write("WVA PERFORMANCE ANALYSIS SUMMARY\n")
    f.write("="*60 + "\n")
    f.write(f"\nExperiment ID: {EXPERIMENT_NAME}\n")
    f.write(f"Duration: {df['time_minutes'].max():.1f} minutes\n")
    f.write(f"ITL Compliance: {itl_compliance_rate:.1f}%\n")
    f.write(f"TTFT Compliance: {ttft_compliance_rate:.1f}%\n")
    f.write(f"Peak Load: {df['rate'].max():.0f} rpm\n")
    f.write(f"Scaling Pattern: {df['numRep'].min():.0f} → {df['numRep'].max():.0f} replicas\n")

print(f"\n✅ Summary saved to '{summary_path}'")

## 12. Export Results

Export the processed data and analysis results for further use.

In [None]:
# Write the metrics frame as it currently stands (parsed timestamps, time_minutes column)
processed_csv = f'{EXPERIMENT_DIR}/data/processed_metrics.csv'
df.to_csv(processed_csv, index=False)
print(f"✅ Processed data exported to '{processed_csv}'")

# Warmup gaps are exported only when at least one was detected
if len(warmup_gaps) > 0:
    gaps_csv = f'{EXPERIMENT_DIR}/analysis/warmup_gaps.csv'
    gaps_df = pd.DataFrame(warmup_gaps)
    gaps_df.to_csv(gaps_csv, index=False)
    print(f"✅ Warmup gaps analysis exported to '{gaps_csv}'")

# Same for scaling events
if len(scaling_events) > 0:
    events_csv = f'{EXPERIMENT_DIR}/analysis/scaling_events.csv'
    events_df = pd.DataFrame(scaling_events)
    events_df.to_csv(events_csv, index=False)
    print(f"✅ Scaling events exported to '{events_csv}'")

# Keep a copy of the analysed log next to the results
import shutil
log_copy = f'{EXPERIMENT_DIR}/data/original_log.txt'
shutil.copy(LOG_FILE, log_copy)
print(f"✅ Original log file copied to '{log_copy}'")

# Manifest: a short text description of the experiment and its directory layout
manifest_path = f'{EXPERIMENT_DIR}/manifest.txt'
with open(manifest_path, 'w') as f:
    f.writelines([
        f"Experiment ID: {EXPERIMENT_NAME}\n",
        f"Created: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"Log File: {LOG_FILE}\n",
        f"Duration: {df['time_minutes'].max():.1f} minutes\n",
        f"Data Points: {len(df)}\n",
        "\nDirectory Structure:\n",
        "  - data/: Raw and processed data files\n",
        "  - plots/: Generated visualization plots\n",
        "  - analysis/: Statistical analysis results\n",
    ])

print(f"✅ Manifest created at '{manifest_path}'")
print(f"\n📁 Experiment directory: {EXPERIMENT_DIR}")
print("   All outputs have been organized in subdirectories")

## 13. Custom Analysis (Optional)

Use this cell for custom queries and exploration.

In [None]:
# Example: Find periods where predicted ITL underestimated actual ITL.
# .copy() makes the filtered rows an independent frame, so adding the 'gap'
# column below neither triggers SettingWithCopyWarning nor risks touching df.
itl_underestimation = df[df['itlAverage'] > df['itl']].copy()
itl_underestimation['gap'] = itl_underestimation['itlAverage'] - itl_underestimation['itl']
print("ITL Controller Underestimation Analysis:")
print(f"Occurred in {len(itl_underestimation)} / {len(df)} samples ({len(itl_underestimation)/len(df)*100:.1f}%)")
print(f"Average underestimation: {itl_underestimation['gap'].mean():.2f} ms")

# Show samples with largest ITL underestimation
print("\nTop 5 largest ITL underestimations:")
display(itl_underestimation.nlargest(5, 'gap')[['time_minutes', 'itlAverage', 'itl', 'gap', 'numRep', 'rate']])

# Example: Find periods where predicted TTFT underestimated actual TTFT
ttft_underestimation = df[df['ttftAverage'] > df['ttft']].copy()
ttft_underestimation['gap'] = ttft_underestimation['ttftAverage'] - ttft_underestimation['ttft']
print("\nTTFT Controller Underestimation Analysis:")
print(f"Occurred in {len(ttft_underestimation)} / {len(df)} samples ({len(ttft_underestimation)/len(df)*100:.1f}%)")
print(f"Average underestimation: {ttft_underestimation['gap'].mean():.2f} ms")

# Show samples with largest TTFT underestimation
print("\nTop 5 largest TTFT underestimations:")
display(ttft_underestimation.nlargest(5, 'gap')[['time_minutes', 'ttftAverage', 'ttft', 'gap', 'numRep', 'rate']])

## 14. Experiment Summary

Review the complete experiment directory structure and outputs.

In [None]:
import os
from pathlib import Path

# Closing report: header, on-disk tree of the experiment directory, then a
# static description of the expected outputs.
banner = "=" * 70

print(banner)
print(f"EXPERIMENT {EXPERIMENT_NAME} - COMPLETE")
print(banner)

print(f"\n📁 Experiment Directory: {EXPERIMENT_DIR}")
print("\nDirectory Structure:")

# Render the tree; nesting depth is the number of path separators below EXPERIMENT_DIR.
for current_dir, _subdirs, filenames in os.walk(EXPERIMENT_DIR):
    depth = current_dir.replace(EXPERIMENT_DIR, '').count(os.sep)
    print(f"{' ' * 2 * depth}{os.path.basename(current_dir)}/")
    file_indent = ' ' * 2 * (depth + 1)
    for filename in filenames:
        size_bytes = os.path.getsize(os.path.join(current_dir, filename))
        print(f'{file_indent}📄 {filename} ({size_bytes:,} bytes)')

# NOTE: the listing below is hard-coded text; it is not derived from the files on disk.
output_overview = [
    "\n" + banner,
    "Generated Outputs:",
    banner,
    "\n📊 Plots (6 visualizations):",
    "   1. itl_analysis.png - ITL performance with warmup gaps",
    "   2. ttft_analysis.png - TTFT performance with warmup gaps",
    "   3. load_pattern.png - Arrival rate evolution",
    "   4. itl_replicas_timeline.png - ITL vs replica scaling",
    "   5. ttft_replicas_timeline.png - TTFT vs replica scaling",
    "   6. combined_itl_ttft.png - Combined ITL & TTFT analysis",
    "\n📈 Data Files:",
    "   • extracted_metrics.csv - Raw extracted metrics",
    "   • processed_metrics.csv - Processed data with time calculations",
    "   • original_log.txt - Copy of original log file",
    "\n📋 Analysis Files:",
    "   • warmup_gaps.csv - Detected warmup gap statistics",
    "   • scaling_events.csv - Scaling event log",
    "   • summary.txt - Text summary of key findings",
    "\n📝 Experiment Metadata:",
    "   • manifest.txt - Experiment configuration and metadata",
    "\n" + banner,
    f"✅ All outputs saved to: {EXPERIMENT_DIR}",
    banner,
]
for report_line in output_overview:
    print(report_line)

## 15. Calibration Analysis - Understanding the ITL Gap

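This section analyzes the gap between predicted and observed ITL to identify calibration opportunities. The error is computed as observed − predicted (`itlAverage − itl`), so positive values mean the model underestimated ITL.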

In [None]:
print("="*70)
print("CALIBRATION ANALYSIS: ITL Prediction Gap")
print("="*70)

# Prediction error = observed (itlAverage) - predicted (itl).
# Positive => the model underestimated ITL; negative => it overestimated.
df['itl_error'] = df['itlAverage'] - df['itl']
# A zero denominator would yield +/-inf and poison the mean; map zeros to NaN so
# those samples are skipped by the aggregate statistics instead.
df['itl_error_pct'] = df['itl_error'] / df['itl'].replace(0, np.nan) * 100

# Calculate effective batch size (approximation)
# effectiveBatch ≈ rate per replica / service rate
# For simplicity, we'll use a proxy: rate/numRep
# Samples with zero replicas have no defined per-replica rate -> NaN rather than inf.
df['rate_per_replica'] = df['rate'] / df['numRep'].replace(0, np.nan)

mean_error = df['itl_error'].mean()

print("\n📊 Prediction Error Statistics:")
print(f"   Mean Error: {mean_error:.2f} ms")
print(f"   Median Error: {df['itl_error'].median():.2f} ms")
print(f"   Std Dev Error: {df['itl_error'].std():.2f} ms")
print(f"   Mean % Error: {df['itl_error_pct'].mean():.1f}%")
print(f"   Max Underestimate: {df['itl_error'].max():.2f} ms")
print(f"   Max Overestimate: {df['itl_error'].min():.2f} ms")

# Identify systematic bias
overestimates = df[df['itl_error'] < 0]
underestimates = df[df['itl_error'] > 0]

n_samples = len(df)
# Guard the percentage computation against an empty frame (ZeroDivisionError).
over_pct = len(overestimates) / n_samples * 100 if n_samples else 0.0
under_pct = len(underestimates) / n_samples * 100 if n_samples else 0.0

print("\n⚖️  Bias Analysis:")
print(f"   Overestimates: {len(overestimates)} / {n_samples} ({over_pct:.1f}%)")
print(f"   Underestimates: {len(underestimates)} / {n_samples} ({under_pct:.1f}%)")

# Three-way verdict: a tie (including "no errors at all") must not be reported
# as a tendency to overestimate.
if len(underestimates) > len(overestimates):
    print("   ⚠️  System tends to UNDERESTIMATE ITL (optimistic)")
elif len(underestimates) < len(overestimates):
    print("   ℹ️  System tends to OVERESTIMATE ITL (conservative)")
else:
    print("   ℹ️  No dominant bias: over- and underestimates are balanced")

# Correlation with load (Pearson; NaN pairs are ignored)
print("\n🔗 Correlation Analysis:")
print(f"   Error vs Rate: {df['itl_error'].corr(df['rate']):.3f}")
print(f"   Error vs Replicas: {df['itl_error'].corr(df['numRep']):.3f}")
print(f"   Error vs Rate/Replica: {df['itl_error'].corr(df['rate_per_replica']):.3f}")

# Identify worst predictions
print("\n❌ Top 5 Worst Predictions (Largest Underestimates):")
worst = df.nlargest(5, 'itl_error')[['time_minutes', 'itlAverage', 'itl', 'itl_error',
                                       'rate', 'numRep', 'rate_per_replica']]
display(worst)

print("\n" + "="*70)
print("💡 Calibration Opportunities:")
print("="*70)
print("1. Linear Regression: Fit alpha/beta to observed ITL vs load")
print("2. Query Prometheus: Get last 1h of ITL observations")
print("3. Filter Outliers: Remove anomalous points (> 2.5 std dev)")
print("4. Update Parameters: Use calibrated alpha/beta for predictions")
print(f"5. Expected Improvement: Reduce mean error from {mean_error:.2f}ms")
print("="*70)

In [None]:
# Two stacked panels on a shared time axis: the ITL curves on top and their
# difference (the prediction error) underneath.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 10), sharex=True)

time_axis = df['time_minutes']
observed_itl = df['itlAverage']
predicted_itl = df['itl']
itl_gap = df['itl_error']
mean_gap = itl_gap.mean()

# --- Top panel: observed vs predicted ITL, with the area between them shaded ---
ax1.plot(time_axis, observed_itl, 'o-',
         linewidth=3, color='#dc2626', label='Actual ITL (observed)', zorder=3)
ax1.plot(time_axis, predicted_itl, 's--',
         linewidth=2, color='#2563eb', label='Predicted ITL (model)', zorder=3)
gap_bands = (
    (observed_itl > predicted_itl, 'red', 'Underestimate'),
    (observed_itl <= predicted_itl, 'green', 'Overestimate'),
)
for band_mask, band_color, band_label in gap_bands:
    ax1.fill_between(time_axis, predicted_itl, observed_itl,
                     where=band_mask, alpha=0.3, color=band_color, label=band_label)
ax1.set_ylabel('ITL (ms)', fontsize=12)
ax1.set_title('ITL Prediction Gap Analysis', fontweight='bold', fontsize=16)
ax1.legend(loc='upper left')
ax1.grid(True, alpha=0.3)

# --- Bottom panel: signed error with zero and mean reference lines ---
ax2.plot(time_axis, itl_gap, 'o-',
         linewidth=3, color='#7c3aed', label='Prediction Error')
ax2.axhline(y=0, color='black', linestyle='-', linewidth=1, alpha=0.5)
ax2.axhline(y=mean_gap, color='orange', linestyle='--',
            linewidth=2, label=f'Mean Error: {mean_gap:.2f}ms')
for sign_mask, sign_color in ((itl_gap > 0, 'red'), (itl_gap <= 0, 'green')):
    ax2.fill_between(time_axis, 0, itl_gap,
                     where=sign_mask, alpha=0.3, color=sign_color)
ax2.set_xlabel('Time (minutes)', fontsize=12)
ax2.set_ylabel('Error (Actual - Predicted) ms', fontsize=12)
ax2.legend(loc='upper left')
ax2.grid(True, alpha=0.3)

# Tick spacing comes from the notebook-level X_*_TICK_INTERVAL settings.
ax2.xaxis.set_major_locator(MultipleLocator(X_MAJOR_TICK_INTERVAL))
ax2.xaxis.set_minor_locator(MultipleLocator(X_MINOR_TICK_INTERVAL))
for panel in (ax1, ax2):
    panel.grid(True, which='major', alpha=0.3, linewidth=1)
    panel.grid(True, which='minor', alpha=0.15, linewidth=0.5)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/calibration_gap_analysis.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")

In [None]:
# 2x2 grid: prediction error against three load characteristics, plus its histogram.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

itl_err = df['itl_error']
mean_itl_err = itl_err.mean()

# One row per scatter panel: (grid position, x column, marker colour, x label, title).
# 'rate_per_replica' serves as a proxy for batch size.
scatter_panels = [
    ((0, 0), 'rate', 'purple', 'Arrival Rate (rpm)', 'Error vs Arrival Rate'),
    ((0, 1), 'numRep', 'teal', 'Number of Replicas', 'Error vs Replicas'),
    ((1, 0), 'rate_per_replica', 'orange', 'Rate per Replica (rpm)', 'Error vs Load per Replica'),
]
for grid_pos, x_col, marker_color, x_label, panel_title in scatter_panels:
    panel = axes[grid_pos]
    panel.scatter(df[x_col], itl_err, alpha=0.6, s=100, c=marker_color)
    panel.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    panel.set_xlabel(x_label, fontsize=11)
    panel.set_ylabel('Prediction Error (ms)', fontsize=11)
    panel.set_title(panel_title, fontweight='bold', fontsize=13)
    panel.grid(True, alpha=0.3)

# Bottom-right: distribution of the error, with zero and mean markers.
hist_panel = axes[1, 1]
hist_panel.hist(itl_err, bins=20, color='steelblue', alpha=0.7, edgecolor='black')
hist_panel.axvline(x=0, color='black', linestyle='--', linewidth=2)
hist_panel.axvline(x=mean_itl_err, color='red', linestyle='--',
                   linewidth=2, label=f'Mean: {mean_itl_err:.2f}ms')
hist_panel.set_xlabel('Prediction Error (ms)', fontsize=11)
hist_panel.set_ylabel('Frequency', fontsize=11)
hist_panel.set_title('Error Distribution', fontweight='bold', fontsize=13)
hist_panel.legend()
hist_panel.grid(True, alpha=0.3)

plt.tight_layout()
plot_path = f'{EXPERIMENT_DIR}/plots/calibration_correlation_analysis.png'
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✅ Plot saved as '{plot_path}'")