In [13]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
import scipy.stats as stats
from pathlib import Path

# Set professional styling for scientific papers
# Both calls change process-wide plotting defaults, so they affect every
# figure created after this cell runs.
plt.style.use('seaborn-v0_8-whitegrid')
# Default seaborn colour cycle; bars and lines drawn by BenchmarkAnalyzer pass
# an explicit per-runtime colour instead.
sns.set_palette("husl")

class BenchmarkAnalyzer:
    def __init__(self, results_file: str):
        """
        Set up plot styling and ingest a benchmark results file.

        Args:
            results_file: Path to the JSON results file from Criterion
        """
        # One fixed colour per runtime so every figure uses the same legend.
        palette = {
            'Native': '#2E8B57',      # Sea Green
            'WAMR': '#FF8C00',        # Dark Orange
            'Wasmtime': '#87CEEB'     # Sky Blue
        }

        # The rcParams overrides are grouped by concern and merged in the
        # order listed below.
        fonts = {
            'font.family': 'serif',
            'font.serif': ['Times New Roman', 'DejaVu Serif'],
            'font.size': 10,
        }
        text_sizes = {
            'axes.labelsize': 10,
            'axes.titlesize': 12,
            'xtick.labelsize': 8,
            'ytick.labelsize': 8,
            'legend.fontsize': 8,
            'figure.titlesize': 14,
        }
        strokes = {
            'axes.linewidth': 1.2,
            'grid.linewidth': 0.3,
            'grid.color': '#FFFFFF',
            'lines.linewidth': 1,
        }
        backgrounds = {
            'axes.facecolor': '#EAEAF2',
            'figure.facecolor': '#FFFFFF',
        }

        # Containers filled by _process_benchmark while the file is read:
        # data[group][runtime] -> array of per-iteration times.
        self.data = {}
        self.groups = set()
        self.runtimes = set()

        self.color_map = palette
        self.style_params = {**fonts, **text_sizes, **strokes, **backgrounds}

        # Applied globally: affects every matplotlib figure created afterwards.
        plt.rcParams.update(self.style_params)
        self._load_data(results_file)
        
    def _load_data(self, results_file: str):
        """Load and parse the JSON benchmark results."""
        with open(results_file, 'r') as f:
            for line in f:
                try:
                    result = json.loads(line.strip())
                    if result.get('reason') == 'benchmark-complete':
                        self._process_benchmark(result)
                except json.JSONDecodeError:
                    continue
                    
    def _process_benchmark(self, result: dict):
        """Process a single benchmark result."""
        benchmark_id = result['id']
        parts = benchmark_id.split('/')
        if len(parts) != 2:
            return
            
        group, runtime = parts
        self.groups.add(group)
        self.runtimes.add(runtime)
        
        # Calculate time per iteration for each sample
        iteration_counts = np.array(result['iteration_count'])
        measured_values = np.array(result['measured_values'])
        
        # Convert from nanoseconds to microseconds for better readability
        times_per_iteration = (measured_values / iteration_counts) / 1000
        
        if group not in self.data:
            self.data[group] = {}
        self.data[group][runtime] = times_per_iteration
        
    def _calculate_stats(self, data: np.ndarray) -> Dict:
        """Calculate statistical measures for a dataset."""
        # Remove outliers using IQR method
        q75, q25 = np.percentile(data, [75, 25])
        iqr = q75 - q25
        lower_bound = q25 - 1.5 * iqr
        upper_bound = q75 + 1.5 * iqr
        clean_data = data[(data >= lower_bound) & (data <= upper_bound)]
        
        # Calculate statistics
        mean = np.mean(data)
        median = np.median(data)
        std = np.std(data, ddof=1)
        cv = (std / mean) * 100 if mean != 0 else 0
        
        # Confidence intervals (95%)
        n = len(data)
        sem = std / np.sqrt(n)
        ci_95 = stats.t.interval(0.95, n-1, loc=mean, scale=sem)
        
        return {
            'mean': mean,
            'median': median,
            'std': std,
            'cv': cv,
            'ci_95': ci_95,
            'n_samples': len(data),
            'n_clean': len(clean_data),
            'raw_data': data,
            'clean_data': clean_data
        }
    
    def _filter_data(self, groups: Optional[List[str]] = None, 
                    runtimes: Optional[List[str]] = None) -> Dict:
        """Filter data based on specified groups and runtimes."""
        if groups is None:
            groups = list(self.groups)
        if runtimes is None:
            runtimes = list(self.runtimes)
            
        filtered = {}
        for group in groups:
            if group in self.data:
                filtered[group] = {}
                for runtime in runtimes:
                    if runtime in self.data[group]:
                        filtered[group][runtime] = self.data[group][runtime]
        return filtered
    
    def _get_runtime_color(self, runtime: str) -> str:
        """Get consistent color for a runtime."""
        return self.color_map.get(runtime, '#808080')  # Default gray for unknown runtimes
    
    def plot_comparison(self, groups: Optional[List[str]] = None,
                       runtimes: Optional[List[str]] = None,
                       log_scale: bool = False):
        """Draw a grouped bar chart of mean execution time per group and runtime.

        Each bar shows the mean time with an error bar of one standard error
        of the mean. The figure is saved to
        ``my_new_results/comp_<groups>_<runtimes>.png`` and then shown.

        Args:
            groups: Group names to plot (``None`` = all loaded groups).
            runtimes: Runtime names to plot (``None`` = all loaded runtimes).
            log_scale: Use a logarithmic y axis when True.
        """
        filtered_data = self._filter_data(groups, runtimes)

        fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

        group_names = list(filtered_data)
        runtime_names = sorted(set().union(*(g.keys() for g in filtered_data.values())))

        x = np.arange(len(group_names))
        bar_width = 0.2
        spacing = 0.05

        for i, runtime in enumerate(runtime_names):
            means = []
            errors = []
            for group in group_names:
                if runtime in filtered_data[group]:
                    summary = self._calculate_stats(filtered_data[group][runtime])
                    means.append(summary['mean'])
                    # 'std' is computed over all n_samples values, so the
                    # standard error must divide by that same n (not by the
                    # outlier-filtered count).
                    errors.append(summary['std'] / np.sqrt(summary['n_samples']))
                else:
                    # Missing combination: zero-height bar keeps positions aligned.
                    means.append(0)
                    errors.append(0)

            # Centre the cluster of runtime bars on the group's tick.
            offset = (i - len(runtime_names)/2 + 0.5) * (bar_width + spacing)
            bars = ax.bar(x + offset, means, bar_width, label=runtime,
                          color=self._get_runtime_color(runtime), alpha=0.8,
                          yerr=errors, capsize=4, error_kw={'linewidth': 1.5},
                          edgecolor='black', linewidth=1.2)

            # Add value labels on bars
            for bar, mean_val in zip(bars, means):
                if mean_val > 0:
                    height = bar.get_height()
                    ax.text(bar.get_x() + bar.get_width()/2., height + height*0.02,
                            f'{mean_val:.1f}', ha='center', va='bottom',
                            fontsize=9, fontweight='bold')

        # Shade the horizontal extent of each group as a visual separator.
        group_width = len(runtime_names) * (bar_width + spacing) - spacing
        for left, right in zip(x - group_width/2, x + group_width/2):
            ax.axvspan(left, right, facecolor='#DFDFE6', alpha=0.3, zorder=0)

        ax.set_xlabel('Benchmark Groups', fontweight='bold')
        ax.set_ylabel('Execution Time (μs)', fontweight='bold')
        ax.set_title('Performance Comparison Across Runtime Environments', fontweight='bold', pad=20)
        ax.set_xticks(x)
        ax.set_xticklabels(group_names)

        # Legend styling
        legend = ax.legend(frameon=True, fancybox=False, shadow=False,
                           borderpad=1, columnspacing=1.5)
        legend.get_frame().set_facecolor('white')
        legend.get_frame().set_alpha(0.9)

        if log_scale:
            ax.set_yscale('log')
        else:
            # Add padding at top for labels
            ylim_top = ax.get_ylim()[1]
            ax.set_ylim(top=ylim_top * 1.15)

        plt.xticks(rotation=0)
        plt.tight_layout()

        # savefig does not create missing directories.
        output_dir = Path("my_new_results")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"comp_{'_'.join(group_names)}_{'_'.join(runtime_names)}.png",
                    dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_relative(self, groups: Optional[List[str]] = None,
                     runtimes: Optional[List[str]] = None,
                     log_scale: bool = False):
        """Draw mean execution times labelled with the slowdown vs. the fastest runtime.

        Bar heights are mean times; the label above each bar is that mean
        divided by the smallest mean within the same group. The figure is
        saved to ``my_new_results/rel_<groups>_<runtimes>.png`` and shown.

        Args:
            groups: Group names to plot (``None`` = all loaded groups).
            runtimes: Runtime names to plot (``None`` = all loaded runtimes).
            log_scale: Use a logarithmic y axis when True.
        """
        filtered_data = self._filter_data(groups, runtimes)

        fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

        group_names = list(filtered_data)
        runtime_names = sorted(set().union(*(g.keys() for g in filtered_data.values())))

        # Means depend only on (group, runtime): compute each once up front
        # instead of recomputing the whole group for every runtime.
        group_means = {
            group: {rt: self._calculate_stats(samples)['mean']
                    for rt, samples in per_runtime.items()}
            for group, per_runtime in filtered_data.items()
        }
        fastest = {group: min(means.values())
                   for group, means in group_means.items() if means}

        x = np.arange(len(group_names))
        bar_width = 0.20
        spacing = 0.05

        for i, runtime in enumerate(runtime_names):
            execution_times = []
            relative_labels = []

            for group in group_names:
                if runtime in group_means[group]:
                    current = group_means[group][runtime]
                    execution_times.append(current)
                    relative_labels.append(current / fastest[group])
                else:
                    # Missing combination: zero-height bar, no label.
                    execution_times.append(0)
                    relative_labels.append(0)

            # Centre the cluster of runtime bars on the group's tick.
            offset = (i - len(runtime_names)/2 + 0.5) * (bar_width + spacing)
            bars = ax.bar(x + offset, execution_times, bar_width, label=runtime,
                          color=self._get_runtime_color(runtime), alpha=0.8,
                          edgecolor='black', linewidth=1.2)

            # Add relative performance labels on bars
            for bar, rel_val in zip(bars, relative_labels):
                if rel_val > 0:
                    height = bar.get_height()
                    # The fastest runtime divides its own mean, giving exactly 1.0.
                    if rel_val == 1.0:
                        label_text = "1.0000×\n(fastest)"
                    else:
                        label_text = f"{rel_val:.4f}×\nslower"

                    ax.text(bar.get_x() + bar.get_width()/2., height + height*0.02,
                            label_text, ha='center', va='bottom', fontsize=9,
                            fontweight='bold')

        # Shade the horizontal extent of each group as a visual separator.
        group_width = len(runtime_names) * (bar_width + spacing) - spacing
        for left, right in zip(x - group_width/2, x + group_width/2):
            ax.axvspan(left, right, facecolor='#DFDFE6', alpha=0.3, zorder=0)

        ax.set_xlabel('Benchmark Groups', fontweight='bold')
        ax.set_ylabel('Execution Time (μs)', fontweight='bold')
        ax.set_title('Relative Performance Analysis\n(Labels show slowdown factor vs. fastest)',
                     fontweight='bold', pad=20)
        ax.set_xticks(x)
        ax.set_xticklabels(group_names)

        # Legend styling
        legend = ax.legend(frameon=True, fancybox=False, shadow=False,
                           borderpad=1, columnspacing=1.5)
        legend.get_frame().set_facecolor('white')
        legend.get_frame().set_alpha(0.9)

        if log_scale:
            ax.set_yscale('log')
        else:
            # Two-line labels need more headroom than single-line ones.
            ylim_top = ax.get_ylim()[1]
            ax.set_ylim(top=ylim_top * 1.3)

        plt.xticks(rotation=0)
        plt.tight_layout()

        # savefig does not create missing directories.
        output_dir = Path("my_new_results")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"rel_{'_'.join(group_names)}_{'_'.join(runtime_names)}.png",
                    dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_stability(self, groups: Optional[List[str]] = None,
                      runtimes: Optional[List[str]] = None):
        """Draw a grouped bar chart of the coefficient of variation (CV).

        The CV is the sample standard deviation as a percentage of the mean,
        as returned by ``_calculate_stats``. The figure is saved to
        ``my_new_results/stab_<groups>_<runtimes>.png`` and shown.

        Args:
            groups: Group names to plot (``None`` = all loaded groups).
            runtimes: Runtime names to plot (``None`` = all loaded runtimes).
        """
        filtered_data = self._filter_data(groups, runtimes)

        fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

        group_names = list(filtered_data)
        runtime_names = sorted(set().union(*(g.keys() for g in filtered_data.values())))

        x = np.arange(len(group_names))
        bar_width = 0.20
        spacing = 0.05

        for i, runtime in enumerate(runtime_names):
            # 0 marks a missing (group, runtime) combination.
            cvs = [
                self._calculate_stats(filtered_data[group][runtime])['cv']
                if runtime in filtered_data[group] else 0
                for group in group_names
            ]

            # Centre the cluster of runtime bars on the group's tick.
            offset = (i - len(runtime_names)/2 + 0.5) * (bar_width + spacing)
            bars = ax.bar(x + offset, cvs, bar_width, label=runtime,
                          color=self._get_runtime_color(runtime), alpha=0.8,
                          edgecolor='black', linewidth=1.2)

            # Add CV values on bars
            for bar, cv_val in zip(bars, cvs):
                if cv_val > 0:
                    height = bar.get_height()
                    ax.text(bar.get_x() + bar.get_width()/2., height + height*0.02,
                            f'{cv_val:.4f}%', ha='center', va='bottom',
                            fontsize=9, fontweight='bold')

        # Shade the horizontal extent of each group as a visual separator.
        group_width = len(runtime_names) * (bar_width + spacing) - spacing
        for left, right in zip(x - group_width/2, x + group_width/2):
            ax.axvspan(left, right, facecolor='#DFDFE6', alpha=0.3, zorder=0)

        ax.set_xlabel('Benchmark Groups', fontweight='bold')
        ax.set_ylabel('Coefficient of Variation (%)', fontweight='bold')
        ax.set_title('Performance Stability Analysis\n(Lower values indicate more stable performance)',
                     fontweight='bold', pad=20)
        ax.set_xticks(x)
        ax.set_xticklabels(group_names)

        # Legend styling
        legend = ax.legend(frameon=True, fancybox=False, shadow=False,
                           borderpad=1, columnspacing=1.5)
        legend.get_frame().set_facecolor('white')
        legend.get_frame().set_alpha(0.9)

        # Add padding at top for labels
        ylim_top = ax.get_ylim()[1]
        ax.set_ylim(top=ylim_top * 1.15)

        plt.xticks(rotation=0)
        plt.tight_layout()

        # savefig does not create missing directories.
        output_dir = Path("my_new_results")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"stab_{'_'.join(group_names)}_{'_'.join(runtime_names)}.png",
                    dpi=300, bbox_inches='tight')
        plt.show()
    
    def print_summary(self, groups: Optional[List[str]] = None, 
                     runtimes: Optional[List[str]] = None):
        filtered_data = self._filter_data(groups, runtimes)
        
        print("=" * 80)
        print("BENCHMARK ANALYSIS SUMMARY")
        print("=" * 80)
        
        for group in filtered_data:
            print(f"\n📊 GROUP: {group}")
            print("-" * 60)
            
            for runtime in sorted(filtered_data[group].keys()):
                stats = self._calculate_stats(filtered_data[group][runtime])
                
                print(f"\n🚀 Runtime: {runtime}")
                print(f"  • Mean:         {stats['mean']:.4f} μs")
                print(f"  • Median:       {stats['median']:.4f} μs")
                print(f"  • Std Dev:      {(stats['std']*1000):.4f} ns")
                print(f"  • CV:           {stats['cv']:.4f}%")
                print(f"  • 95% CI:       [{stats['ci_95'][0]:.4f}, {stats['ci_95'][1]:.4f}] μs")
                print(f"  • Samples:      {stats['n_clean']}/{stats['n_samples']} (after outlier removal)")
    
    def plot_boxes(self, groups: Optional[List[str]] = None,
                   runtimes: Optional[List[str]] = None):
        """Draw one box plot panel per group, with one box per runtime.

        Boxes are built from the unfiltered samples. The figure is saved to
        ``my_new_results/box_<groups>_<runtimes>.png`` and shown.

        Args:
            groups: Group names to plot (``None`` = all loaded groups).
            runtimes: Runtime names to plot (``None`` = all loaded runtimes).

        Raises:
            ValueError: If no group matches the selection.
        """
        filtered_data = self._filter_data(groups, runtimes)
        if not filtered_data:
            # plt.subplots(1, 0) would fail with a far less helpful message.
            raise ValueError("No benchmark data matches the requested groups/runtimes")

        # squeeze=False always yields a 2-D axes array, even for one panel.
        fig, axes = plt.subplots(1, len(filtered_data), figsize=(5*len(filtered_data), 6),
                                 dpi=300, squeeze=False)

        # Ordered names (not sets) so the output filename is reproducible.
        group_names = list(filtered_data)
        runtime_names = sorted(set().union(*(g.keys() for g in filtered_data.values())))

        for ax, (group, group_data) in zip(axes[0], filtered_data.items()):
            tick_labels = sorted(group_data)
            data_for_box = [group_data[runtime] for runtime in tick_labels]
            colors = [self._get_runtime_color(runtime) for runtime in tick_labels]

            bp = ax.boxplot(data_for_box, tick_labels=tick_labels, patch_artist=True,
                            boxprops=dict(linewidth=1.5),
                            whiskerprops=dict(linewidth=1.5),
                            capprops=dict(linewidth=1.5),
                            medianprops=dict(linewidth=2, color='red'))

            # Color the boxes
            for patch, color in zip(bp['boxes'], colors):
                patch.set_facecolor(color)
                patch.set_alpha(0.7)

            ax.set_title(f'{group}', fontweight='bold', pad=15)
            ax.set_ylabel('Time per Iteration (μs)', fontweight='bold')
            ax.grid(True, alpha=0.3)

        plt.suptitle('Distribution Analysis via Box Plots', fontsize=16, fontweight='bold', y=1.02)
        plt.tight_layout()

        # savefig does not create missing directories.
        output_dir = Path("my_new_results")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"box_{'_'.join(group_names)}_{'_'.join(runtime_names)}.png",
                    dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_evolution(self, groups: Optional[List[str]] = None,
                      runtimes: Optional[List[str]] = None):
        """
        Create line plots showing evolution of execution time across samples.

        One panel per group, stacked vertically; each runtime is a line of
        per-iteration time against sample index. The figure is saved to
        ``my_new_results/evol_<groups>_<runtimes>.png`` and shown.

        Args:
            groups: Group names to plot (``None`` = all loaded groups).
            runtimes: Runtime names to plot (``None`` = all loaded runtimes).

        Raises:
            ValueError: If no group matches the selection.
        """
        filtered_data = self._filter_data(groups, runtimes)
        if not filtered_data:
            # plt.subplots(0, 1) would fail with a far less helpful message.
            raise ValueError("No benchmark data matches the requested groups/runtimes")

        # squeeze=False always yields a 2-D axes array, even for one panel.
        fig, axes = plt.subplots(len(filtered_data), 1, figsize=(12, 5*len(filtered_data)),
                                 dpi=300, squeeze=False)

        # Ordered names (not sets) so the output filename is reproducible.
        group_names = list(filtered_data)
        runtime_names = sorted(set().union(*(g.keys() for g in filtered_data.values())))

        for ax, (group, group_data) in zip(axes[:, 0], filtered_data.items()):
            for runtime in sorted(group_data):
                data = group_data[runtime]
                # markevery thins the markers to roughly 50 per line.
                ax.plot(range(len(data)), data, label=runtime,
                        color=self._get_runtime_color(runtime), alpha=0.8,
                        linewidth=2, marker='o', markersize=3,
                        markevery=max(1, len(data)//50))

            ax.set_title(f'{group} - Sample Evolution', fontweight='bold', pad=15)
            ax.set_xlabel('Sample Index', fontweight='bold')
            ax.set_ylabel('Time per Iteration (μs)', fontweight='bold')

            # Legend styling
            legend = ax.legend(frameon=True, fancybox=False, shadow=False)
            legend.get_frame().set_facecolor('white')
            legend.get_frame().set_alpha(0.9)

            ax.grid(True, alpha=0.3)

        plt.suptitle('Performance Evolution Across Samples', fontsize=16, fontweight='bold', y=1.02)
        plt.tight_layout()

        # savefig does not create missing directories.
        output_dir = Path("my_new_results")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"evol_{'_'.join(group_names)}_{'_'.join(runtime_names)}.png",
                    dpi=300, bbox_inches='tight')
        plt.show()

    def plot_distribution_flip(self, groups: Optional[List[str]] = None,
                               runtimes: Optional[List[str]] = None):
        """
        Create density plots with one row per group and one column per runtime.

        Each panel shows a Gaussian kernel density estimate of the samples,
        the mean as a dashed line and the 95% confidence interval of the mean
        as a shaded band. The figure is saved to
        ``my_new_results/dist_flip_<groups>_<runtimes>.png`` and shown. The
        ``dist_flip_`` prefix keeps it from overwriting the figure written by
        ``plot_distribution``, which uses ``dist_`` with the same suffix.

        Args:
            groups: Group names to plot (``None`` = all loaded groups).
            runtimes: Runtime names to plot (``None`` = all loaded runtimes).

        Raises:
            ValueError: If the selection contains no data.
        """
        from scipy.stats import gaussian_kde

        filtered_data = self._filter_data(groups, runtimes)

        n_groups = len(filtered_data)
        n_runtimes = max((len(group_data) for group_data in filtered_data.values()), default=0)
        if n_groups == 0 or n_runtimes == 0:
            raise ValueError("No benchmark data matches the requested groups/runtimes")

        # squeeze=False always yields a 2-D axes array, so no reshaping is
        # needed for single-row or single-column grids.
        fig, axes = plt.subplots(n_groups, n_runtimes,
                                 figsize=(5*n_runtimes, 4*n_groups), dpi=300,
                                 squeeze=False)

        # Ordered names (not sets) so the output filename is reproducible.
        group_names = list(filtered_data)
        runtime_names = sorted(set().union(*(g.keys() for g in filtered_data.values())))

        for group_idx, (group, group_data) in enumerate(filtered_data.items()):
            ordered_runtimes = sorted(group_data)
            for runtime_idx, runtime in enumerate(ordered_runtimes):
                ax = axes[group_idx, runtime_idx]
                color = self._get_runtime_color(runtime)

                summary = self._calculate_stats(group_data[runtime])
                data = summary['raw_data']

                # Create density estimate
                kde = gaussian_kde(data)

                # Evaluate the density slightly beyond the observed range so
                # the curve tails are visible.
                x_min, x_max = data.min(), data.max()
                x_range = x_max - x_min
                x_smooth = np.linspace(x_min - 0.1*x_range, x_max + 0.1*x_range, 300)
                density = kde(x_smooth)

                # Create area plot
                ax.fill_between(x_smooth, 0, density, alpha=0.6, color=color,
                                label=f'{runtime} Distribution')
                ax.plot(x_smooth, density, color=color, linewidth=2)

                # Add mean line
                ax.axvline(summary['mean'], color='red', linestyle='--', linewidth=2,
                           label=f"Mean: {summary['mean']:.4f} μs")

                # Add 95% CI shading
                ci_low, ci_high = summary['ci_95']
                ci_mask = (x_smooth >= ci_low) & (x_smooth <= ci_high)
                ax.fill_between(x_smooth, 0, density, where=ci_mask, alpha=0.8,
                                color='#ffb3ba', label='95% CI')

                ax.set_title(f'{group} - {runtime}', fontweight='bold', pad=10)
                ax.set_xlabel('Time per Iteration (μs)', fontweight='bold')
                ax.set_ylabel('Probability Density', fontweight='bold')

                # Legend styling
                legend = ax.legend(frameon=True, fancybox=False, shadow=False, fontsize=9)
                legend.get_frame().set_facecolor('white')
                legend.get_frame().set_alpha(0.9)

                ax.grid(True, alpha=0.3)

            # Hide the panels of this row that have no runtime to show.
            for runtime_idx in range(len(ordered_runtimes), n_runtimes):
                axes[group_idx, runtime_idx].set_visible(False)

        plt.suptitle('Performance Distribution Analysis', fontsize=16, fontweight='bold', y=1.02)
        plt.tight_layout()

        # savefig does not create missing directories.
        output_dir = Path("my_new_results")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"dist_flip_{'_'.join(group_names)}_{'_'.join(runtime_names)}.png",
                    dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_distribution(self, groups: Optional[List[str]] = None,
                         runtimes: Optional[List[str]] = None):
        """
        Create area plots showing distribution with 95% confidence intervals.

        The grid has one row per runtime and one column per group. Each panel
        shows a Gaussian kernel density estimate of the samples, the mean as
        a dashed line and the 95% confidence interval of the mean as a shaded
        band. The figure is saved to
        ``my_new_results/dist_<groups>_<runtimes>.png`` and shown.

        Args:
            groups: Group names to plot (``None`` = all loaded groups).
            runtimes: Runtime names to plot (``None`` = all loaded runtimes).

        Raises:
            ValueError: If the selection contains no data.
        """
        from scipy.stats import gaussian_kde

        filtered_data = self._filter_data(groups, runtimes)

        n_groups = len(filtered_data)
        n_runtimes = max((len(group_data) for group_data in filtered_data.values()), default=0)
        if n_groups == 0 or n_runtimes == 0:
            raise ValueError("No benchmark data matches the requested groups/runtimes")

        # squeeze=False always yields a 2-D axes array, so no reshaping is
        # needed for single-row or single-column grids.
        fig, axes = plt.subplots(n_runtimes, n_groups,
                                 figsize=(5*n_groups, 4*n_runtimes), dpi=300,
                                 squeeze=False)

        # Ordered names (not sets) so the output filename is reproducible.
        group_names = list(filtered_data)
        runtime_names = sorted(set().union(*(g.keys() for g in filtered_data.values())))

        for group_idx, (group, group_data) in enumerate(filtered_data.items()):
            ordered_runtimes = sorted(group_data)
            for runtime_idx, runtime in enumerate(ordered_runtimes):
                ax = axes[runtime_idx, group_idx]
                color = self._get_runtime_color(runtime)

                summary = self._calculate_stats(group_data[runtime])
                data = summary['raw_data']

                # Create density estimate
                kde = gaussian_kde(data)

                # Evaluate the density slightly beyond the observed range so
                # the curve tails are visible.
                x_min, x_max = data.min(), data.max()
                x_range = x_max - x_min
                x_smooth = np.linspace(x_min - 0.1*x_range, x_max + 0.1*x_range, 300)
                density = kde(x_smooth)

                # Create area plot
                ax.fill_between(x_smooth, 0, density, alpha=0.6, color=color,
                                label=f'{runtime} Distribution')
                ax.plot(x_smooth, density, color=color, linewidth=2)

                # Add mean line
                ax.axvline(summary['mean'], color='red', linestyle='--', linewidth=2,
                           label=f"Mean: {summary['mean']:.4f} μs")

                # Add 95% CI shading
                ci_low, ci_high = summary['ci_95']
                ci_mask = (x_smooth >= ci_low) & (x_smooth <= ci_high)
                ax.fill_between(x_smooth, 0, density, where=ci_mask, alpha=0.8,
                                color='#ffb3ba', label='95% CI')

                ax.set_title(f'{group} - {runtime}', fontweight='bold', pad=10)
                ax.set_xlabel('Time per Iteration (μs)', fontweight='bold')
                ax.set_ylabel('Probability Density', fontweight='bold')

                # Legend styling
                legend = ax.legend(frameon=True, fancybox=False, shadow=False, fontsize=9)
                legend.get_frame().set_facecolor('white')
                legend.get_frame().set_alpha(0.9)

                ax.grid(True, alpha=0.3)

            # Hide the panels of this column that have no runtime to show.
            for runtime_idx in range(len(ordered_runtimes), n_runtimes):
                axes[runtime_idx, group_idx].set_visible(False)

        plt.suptitle('Performance Distribution Analysis', fontsize=16, fontweight='bold', y=1.02)
        plt.tight_layout()

        # savefig does not create missing directories.
        output_dir = Path("my_new_results")
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f"dist_{'_'.join(group_names)}_{'_'.join(runtime_names)}.png",
                    dpi=300, bbox_inches='tight')
        plt.show()

In [14]:
# Parse the results log once; constructing the analyzer also applies the
# class's matplotlib rcParams overrides.
analyzer = BenchmarkAnalyzer('results.log')
# Benchmark group names; sliced in the plotting cell to select subsets.
groups = ["Runtime Setup", "Cold Ping Pong Execution", "Hot Ping Pong Execution"]
# Runtime names; these match the keys of the analyzer's colour map.
runtimes = ["Native", "WAMR", "Wasmtime"]

In [28]:
# Driver cell: only the text summary is active. The commented calls below are
# the figure variants; uncomment one to render and save that figure.
# Slices of `groups` / `runtimes` choose which subsets are drawn.
analyzer.print_summary()
# analyzer.plot_comparison(groups=groups[1:], runtimes=runtimes[:], log_scale=False)

# analyzer.plot_relative(groups=groups[2:3], runtimes=runtimes[:], log_scale=False)

# analyzer.plot_distribution(groups=groups[2:3], runtimes=runtimes[1:])
# analyzer.plot_distribution_flip(groups=groups[2:3], runtimes=runtimes[1:])

# analyzer.plot_stability(groups=groups[:], runtimes=runtimes[:])

# analyzer.plot_boxes()

# analyzer.plot_boxes(groups=groups[2:3], runtimes=runtimes[1:])
# analyzer.plot_boxes(groups=groups[0:1], runtimes=runtimes[1:2])
# analyzer.plot_boxes(groups=groups[0:1], runtimes=runtimes[2:3])
# analyzer.plot_boxes(groups=groups[1:2], runtimes=runtimes[0:1])
# analyzer.plot_boxes(groups=groups[1:2], runtimes=runtimes[1:2])
# analyzer.plot_boxes(groups=groups[1:2], runtimes=runtimes[2:3])
# analyzer.plot_boxes(groups=groups[2:3], runtimes=runtimes[0:1])
# analyzer.plot_boxes(groups=groups[2:3], runtimes=runtimes[1:2])
# analyzer.plot_boxes(groups=groups[2:3], runtimes=runtimes[2:3])

# analyzer.plot_evolution(groups=groups[:], runtimes=runtimes[:])

BENCHMARK ANALYSIS SUMMARY

📊 GROUP: Cold Ping Pong Execution
------------------------------------------------------------

🚀 Runtime: Native
  • Mean:         594.5275 μs
  • Median:       594.5636 μs
  • Std Dev:      281.8579 ns
  • CV:           0.0474%
  • 95% CI:       [594.4715, 594.5834] μs
  • Samples:      99/100 (after outlier removal)

🚀 Runtime: WAMR
  • Mean:         1211.0415 μs
  • Median:       1210.9849 μs
  • Std Dev:      5144.4222 ns
  • CV:           0.4248%
  • 95% CI:       [1210.0207, 1212.0622] μs
  • Samples:      100/100 (after outlier removal)

🚀 Runtime: Wasmtime
  • Mean:         1301.8975 μs
  • Median:       1300.5868 μs
  • Std Dev:      4895.5251 ns
  • CV:           0.3760%
  • 95% CI:       [1300.9262, 1302.8689] μs
  • Samples:      86/100 (after outlier removal)

📊 GROUP: Hot Ping Pong Execution
------------------------------------------------------------

🚀 Runtime: Native
  • Mean:         589.0368 μs
  • Median:       589.0307 μs
  • Std Dev:  