In [3]:
# Move the kernel's working directory up one level; the relative CSV and
# output paths used by main() are resolved against that directory.
%cd ..

/Users/giorgiomastrotucci/Desktop/Lavoro/AssegnoDiRicerca/mlir_research


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# Set plot style: apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet,
# then switch seaborn to its "notebook" context with fonts scaled by 1.2.
# Both calls change global plotting state for every figure created afterwards.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context("notebook", font_scale=1.2)

# Define the data processing functions
def _reshape_sparse_metrics(df, implementation):
    """
    Collapse sparse per-metric rows into one row per experiment.

    The raw CSV stores each measurement on its own row: the experiment
    identifiers are repeated and only one metric column is populated.  This
    helper groups rows by (Executable, Sparsity Level, Stride) and keeps the
    first non-null value of every metric inside each group.

    Parameters:
    df (DataFrame): Raw frame as returned by ``pd.read_csv``
    implementation (str): Label stored in the 'Implementation' column

    Returns:
    DataFrame: One row per experiment with the columns 'Executable',
        'Sparsity Level', 'Stride', 'Implementation' followed by one
        '<metric>_value' column per metric.  A metric that was never recorded
        for an experiment is NaN (the column is always present).

    Raises:
    KeyError: If an identifier or metric column is missing from ``df``
    """
    keys = ['Executable', 'Sparsity Level', 'Stride']
    metrics = ['Branch Misses', 'Branches', 'Context Switch', 'CPU Migration',
               'Cycles', 'Instructions', 'Page Faults']

    reshaped = (
        df[keys + metrics]
        # sort=False keeps experiments in first-appearance order;
        # dropna=False keeps experiments whose identifier is missing.
        .groupby(keys, sort=False, dropna=False)
        # GroupBy.first() returns the first non-null entry of each column.
        .first()
        .rename(columns={metric: f'{metric}_value' for metric in metrics})
        .reset_index()
    )
    # Place the label right after the identifier columns.
    reshaped.insert(len(keys), 'Implementation', implementation)
    return reshaped


def load_and_process_data(vector_file, scalar_file):
    """
    Load and process the vector and scalar data from CSV files.

    Each CSV is expected to hold the columns 'Executable', 'Sparsity Level',
    'Stride' and the metric columns 'Branch Misses', 'Branches',
    'Context Switch', 'CPU Migration', 'Cycles', 'Instructions' and
    'Page Faults', in a sparse layout where a row carries a single metric.

    Parameters:
    vector_file (str): Path to the vector CSV file
    scalar_file (str): Path to the scalar CSV file

    Returns:
    tuple: (vector_df, scalar_df, combined_df) where the first two frames hold
        one row per experiment (tagged 'Vector' / 'Scalar' in the
        'Implementation' column) and combined_df is their row-wise
        concatenation with a fresh index.
    """
    vector_df = _reshape_sparse_metrics(pd.read_csv(vector_file), 'Vector')
    scalar_df = _reshape_sparse_metrics(pd.read_csv(scalar_file), 'Scalar')

    combined_df = pd.concat([vector_df, scalar_df], ignore_index=True)

    return vector_df, scalar_df, combined_df

def calculate_speedup_by_stride(vector_df, scalar_df):
    """
    Calculate speedup of vector implementation over scalar, grouped by stride.

    Speedup is defined as scalar / vector for cycles, instructions and
    branches, so values above 1.0 mean the vector build needed fewer of them.

    Several executables can share the same (Sparsity Level, Stride) pair.
    Merging such frames directly pairs every vector row with every scalar row
    of that key (a cross product).  To avoid that, each side is first reduced
    to a single row per key by averaging its metric values; when the keys are
    already unique this leaves the values untouched.

    Parameters:
    vector_df (DataFrame): Vector processed dataframe
    scalar_df (DataFrame): Scalar processed dataframe

    Returns:
    DataFrame: One row per (Sparsity Level, Stride) present in both inputs,
        with the columns 'Sparsity Level', 'Stride', 'cycles_speedup',
        'instructions_speedup' and 'branches_speedup'.

    Raises:
    KeyError: If a key column or one of 'Cycles_value', 'Instructions_value',
        'Branches_value' is missing from either input
    """
    keys = ['Sparsity Level', 'Stride']
    metrics = ['Cycles_value', 'Instructions_value', 'Branches_value']

    def one_row_per_key(df):
        # sort=False keeps first-appearance order, matching an unsorted merge.
        return df.groupby(keys, as_index=False, sort=False, dropna=False)[metrics].mean()

    merged = pd.merge(one_row_per_key(vector_df), one_row_per_key(scalar_df),
                      on=keys,
                      suffixes=('_vector', '_scalar'),
                      validate='one_to_one')

    # Calculate speedup metrics (scalar / vector)
    merged['cycles_speedup'] = merged['Cycles_value_scalar'] / merged['Cycles_value_vector']
    merged['instructions_speedup'] = merged['Instructions_value_scalar'] / merged['Instructions_value_vector']
    merged['branches_speedup'] = merged['Branches_value_scalar'] / merged['Branches_value_vector']

    return merged[['Sparsity Level', 'Stride', 'cycles_speedup',
                   'instructions_speedup', 'branches_speedup']]

def _plot_metric_by_implementation(stride_df, column, title, ylabel, path,
                                   scatter=False, **style):
    """
    Plot one metric against sparsity level for the Vector and Scalar rows.

    Rows are ordered by 'Sparsity Level' before plotting so that line plots
    run left to right regardless of the order of ``stride_df``.  The figure is
    saved to ``path`` as a 300-dpi PNG and then closed.

    Parameters:
    stride_df (DataFrame): Rows of a single stride
    column (str): Name of the metric column plotted on the y axis
    title (str): Figure title
    ylabel (str): Y-axis label
    path (str): Destination image path
    scatter (bool): Draw a scatter plot instead of a line plot
    **style: Extra keyword arguments forwarded to ``plot`` / ``scatter``
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    for implementation, marker, linestyle in (('Vector', 'o', '-'), ('Scalar', 's', '--')):
        subset = (stride_df[stride_df['Implementation'] == implementation]
                  .sort_values('Sparsity Level', kind='mergesort'))
        if scatter:
            ax.scatter(subset['Sparsity Level'], subset[column],
                       marker=marker, label=implementation, **style)
        else:
            ax.plot(subset['Sparsity Level'], subset[column],
                    marker=marker, linestyle=linestyle,
                    label=implementation, **style)

    ax.set_title(title)
    ax.set_xlabel('Sparsity Level (%)')
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)
    ax.legend(loc='best')
    fig.savefig(path, dpi=300, bbox_inches='tight')
    plt.close(fig)


def _plot_instructions_speedup_bars(stride_speedup, stride, path):
    """
    Draw the per-sparsity instructions speedup of one stride as annotated bars.

    A dashed red line marks speedup 1.0 (no difference between scalar and
    vector).  The figure is saved to ``path`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(stride_speedup['Sparsity Level'], stride_speedup['instructions_speedup'],
           color='skyblue', edgecolor='navy')
    ax.axhline(y=1.0, color='red', linestyle='--', alpha=0.7)
    ax.set_title(f'Instructions Speedup by Sparsity Level (Stride {stride})')
    ax.set_xlabel('Sparsity Level (%)')
    ax.set_ylabel('Instructions Speedup (Scalar/Vector)')
    ax.grid(True, alpha=0.3, axis='y')

    # Print each bar's value slightly above its top.
    for sparsity, speedup in zip(stride_speedup['Sparsity Level'],
                                 stride_speedup['instructions_speedup']):
        ax.text(sparsity, speedup + 0.1, f"{speedup:.2f}", ha='center')

    fig.savefig(path, dpi=300, bbox_inches='tight')
    plt.close(fig)


def _write_stride_summary(stride_df, stride_speedup, stride, path):
    """
    Write the plain-text performance summary of one stride to ``path``.

    The summary lists the mean cycles and instructions per implementation and,
    when speedup rows exist, the best and worst instructions speedup together
    with the sparsity level at which they occur.
    """
    is_vector = stride_df['Implementation'] == 'Vector'
    is_scalar = stride_df['Implementation'] == 'Scalar'

    vector_cycles = stride_df.loc[is_vector, 'Cycles_value'].mean()
    scalar_cycles = stride_df.loc[is_scalar, 'Cycles_value'].mean()
    vector_instr = stride_df.loc[is_vector, 'Instructions_value'].mean()
    scalar_instr = stride_df.loc[is_scalar, 'Instructions_value'].mean()

    with open(path, 'w') as f:
        f.write(f"Performance Summary for Stride {stride}\n")
        f.write("===============================\n\n")
        f.write(f"1. Average cycles - Vector: {vector_cycles:.2f}, Scalar: {scalar_cycles:.2f}, Speedup: {scalar_cycles/vector_cycles:.2f}x\n")
        f.write(f"2. Average instructions - Vector: {vector_instr:.2f}, Scalar: {scalar_instr:.2f}, Speedup: {scalar_instr/vector_instr:.2f}x\n")

        # Best and worst speedup for this stride
        if len(stride_speedup) > 0:
            # Values are read column-wise: selecting the whole row with
            # .loc[idx] would upcast the integer sparsity to float ("50.0%").
            speedups = stride_speedup['instructions_speedup']
            sparsities = stride_speedup['Sparsity Level']

            best_idx = speedups.idxmax()
            f.write(f"3. Best instructions speedup: {speedups.loc[best_idx]:.2f}x at Sparsity={sparsities.loc[best_idx]}%\n")

            worst_idx = speedups.idxmin()
            f.write(f"4. Worst instructions speedup: {speedups.loc[worst_idx]:.2f}x at Sparsity={sparsities.loc[worst_idx]}%\n")


def analyze_by_stride(combined_df, speedup_df, output_dir):
    """
    Perform analysis grouped by stride values.

    For every distinct value of combined_df['Stride'] a directory
    ``<output_dir>/stride_analysis/stride_<stride>`` is created and filled with
    four PNG plots (cycles, instructions, instructions speedup, branch
    misses - each against sparsity level), a ``summary.txt`` and CSV dumps of
    the rows belonging to that stride.

    Parameters:
    combined_df (DataFrame): Combined data
    speedup_df (DataFrame): Speedup metrics
    output_dir (str): Directory to save output files
    """
    # Create stride-specific directory (no error if it already exists)
    stride_dir = f"{output_dir}/stride_analysis"
    os.makedirs(stride_dir, exist_ok=True)

    # For each stride (ascending), perform analysis
    for stride in sorted(combined_df['Stride'].unique()):
        # Filter data for this stride
        stride_df = combined_df[combined_df['Stride'] == stride]
        stride_speedup = speedup_df[speedup_df['Stride'] == stride]

        stride_specific_dir = f"{stride_dir}/stride_{stride}"
        os.makedirs(stride_specific_dir, exist_ok=True)

        # Plot 1: Sparsity Impact for this stride
        _plot_metric_by_implementation(
            stride_df, 'Cycles_value',
            title=f'Impact of Sparsity Level on Cycles (Stride {stride})',
            ylabel='Cycles',
            path=f"{stride_specific_dir}/sparsity_impact_on_cycles.png")

        # Plot 2: Instructions Comparison for this stride
        _plot_metric_by_implementation(
            stride_df, 'Instructions_value',
            title=f'Instructions by Sparsity Level (Stride {stride})',
            ylabel='Instructions',
            path=f"{stride_specific_dir}/instructions_by_sparsity.png",
            linewidth=2, markersize=8)

        # Plot 3: Instructions Speedup by Sparsity Level for this stride
        _plot_instructions_speedup_bars(
            stride_speedup, stride,
            f"{stride_specific_dir}/instructions_speedup_by_sparsity.png")

        # Plot 4: Branch Misses vs Sparsity Level for this stride
        _plot_metric_by_implementation(
            stride_df, 'Branch Misses_value',
            title=f'Branch Misses vs Sparsity Level (Stride {stride})',
            ylabel='Branch Misses',
            path=f"{stride_specific_dir}/branch_misses_vs_sparsity.png",
            scatter=True, s=100, alpha=0.7)

        # Save summary for this stride
        _write_stride_summary(stride_df, stride_speedup, stride,
                              f"{stride_specific_dir}/summary.txt")

        # Save stride-specific data
        stride_df.to_csv(f"{stride_specific_dir}/stride_{stride}_data.csv", index=False)
        if len(stride_speedup) > 0:
            stride_speedup.to_csv(f"{stride_specific_dir}/stride_{stride}_speedup.csv", index=False)

def create_comparative_stride_analysis(combined_df, speedup_df, output_dir):
    """
    Create comparative analysis across different strides.

    Writes into ``<output_dir>/stride_comparison``:
      * avg_instructions_speedup_by_stride.png - mean speedup per stride
      * instructions_speedup_heatmap.png - stride x sparsity heatmap
      * instructions_speedup_comparison.png - one speedup line per stride
      * stride_comparison_summary.txt - per-stride means plus the five best
        and five worst (stride, sparsity) combinations

    Parameters:
    combined_df (DataFrame): Combined data
    speedup_df (DataFrame): Speedup metrics
    output_dir (str): Directory to save output files
    """
    stride_dir = f"{output_dir}/stride_comparison"
    os.makedirs(stride_dir, exist_ok=True)

    # Unique stride values in ascending order
    strides = sorted(combined_df['Stride'].unique())

    # 1. Average Instructions Speedup by Stride
    avg_speedup_by_stride = speedup_df.groupby('Stride')['instructions_speedup'].mean().reset_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(avg_speedup_by_stride['Stride'], avg_speedup_by_stride['instructions_speedup'],
           color='skyblue', edgecolor='navy')
    # Reference line: speedup 1.0 means scalar and vector are on par.
    ax.axhline(y=1.0, color='red', linestyle='--', alpha=0.7)
    ax.set_title('Average Instructions Speedup by Stride')
    ax.set_xlabel('Stride')
    ax.set_ylabel('Instructions Speedup (Scalar/Vector)')
    ax.grid(True, alpha=0.3, axis='y')

    for stride, speedup in zip(avg_speedup_by_stride['Stride'],
                               avg_speedup_by_stride['instructions_speedup']):
        ax.text(stride, speedup + 0.1, f"{speedup:.2f}", ha='center')

    fig.savefig(f"{stride_dir}/avg_instructions_speedup_by_stride.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

    # 2. Heatmap of Instructions Speedup by Stride and Sparsity
    pivot_data = speedup_df.pivot_table(index='Stride', columns='Sparsity Level',
                                        values='instructions_speedup', aggfunc='mean')

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.heatmap(pivot_data, annot=True, cmap='YlGnBu', fmt='.2f',
                cbar_kws={'label': 'Instructions Speedup'}, ax=ax)
    ax.set_title('Instructions Speedup Heatmap (Stride vs Sparsity)')
    ax.set_ylabel('Stride')
    ax.set_xlabel('Sparsity Level (%)')
    fig.tight_layout()
    fig.savefig(f"{stride_dir}/instructions_speedup_heatmap.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

    # 3. Comparative line plot for each stride
    fig, ax = plt.subplots(figsize=(14, 8))
    for stride in strides:
        # Order by sparsity so each line runs left to right.
        stride_data = (speedup_df[speedup_df['Stride'] == stride]
                       .sort_values('Sparsity Level', kind='mergesort'))
        ax.plot(stride_data['Sparsity Level'], stride_data['instructions_speedup'],
                marker='o', linestyle='-', linewidth=2,
                label=f"Stride {stride}")

    ax.axhline(y=1.0, color='red', linestyle='--', alpha=0.7)
    ax.set_title('Instructions Speedup by Sparsity Level Across Different Strides')
    ax.set_xlabel('Sparsity Level (%)')
    ax.set_ylabel('Instructions Speedup (Scalar/Vector)')
    ax.grid(True, alpha=0.3)
    ax.legend(loc='best')
    fig.savefig(f"{stride_dir}/instructions_speedup_comparison.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

    def write_ranking(f, ranked):
        # Iterate column-wise: DataFrame.iterrows() would upcast the integer
        # Stride / Sparsity Level values to float and print "1.0" / "50.0%".
        for stride, sparsity, speedup in zip(ranked['Stride'],
                                             ranked['Sparsity Level'],
                                             ranked['instructions_speedup']):
            f.write(f"Stride {stride}, Sparsity {sparsity}% - Speedup: {speedup:.2f}x\n")

    # Save summary of stride comparison
    with open(f"{stride_dir}/stride_comparison_summary.txt", 'w') as f:
        f.write("Stride Comparison Summary\n")
        f.write("========================\n\n")

        for stride in strides:
            stride_speedup = speedup_df[speedup_df['Stride'] == stride]['instructions_speedup'].mean()
            f.write(f"Stride {stride} - Average Instructions Speedup: {stride_speedup:.2f}x\n")

        f.write("\nBest Performing Combinations:\n")
        f.write("----------------------------\n")
        # Five highest instructions speedups
        write_ranking(f, speedup_df.nlargest(5, 'instructions_speedup'))

        f.write("\nWorst Performing Combinations:\n")
        f.write("-----------------------------\n")
        # Five lowest instructions speedups
        write_ranking(f, speedup_df.nsmallest(5, 'instructions_speedup'))

def main(vector_file='all_results_vector.csv',
         scalar_file='all_results_scalar.csv',
         output_dir='stride_grouped_analysis'):
    """
    Main function to execute the analysis with stride-grouped approach.

    Loads both result files, computes the scalar/vector speedups, writes the
    per-stride and cross-stride reports, draws the overall instructions
    speedup chart and finally saves every intermediate frame as CSV.

    Parameters:
    vector_file (str): Path to the vector results CSV
    scalar_file (str): Path to the scalar results CSV
    output_dir (str): Directory that receives all plots, summaries and CSVs;
        created if it does not exist
    """
    # Create output directory for plots if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Load and process data
    print("Loading and processing data...")
    vector_df, scalar_df, combined_df = load_and_process_data(vector_file, scalar_file)

    # Calculate speedup by stride
    print("Calculating speedup metrics...")
    speedup_df = calculate_speedup_by_stride(vector_df, scalar_df)

    # Perform stride-specific analysis
    print("Performing stride-specific analysis...")
    analyze_by_stride(combined_df, speedup_df, output_dir)

    # Create comparative stride analysis
    print("Creating comparative stride analysis...")
    create_comparative_stride_analysis(combined_df, speedup_df, output_dir)

    # Quick look at the processed frames
    print(vector_df.head())
    print(scalar_df.head())
    print(speedup_df.head())

    # Generate overall instructions speedup visualization
    print("Generating overall instructions speedup plot...")
    avg_speedup = speedup_df.groupby('Sparsity Level')['instructions_speedup'].mean()

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(avg_speedup.index, avg_speedup.values, color='skyblue', edgecolor='navy')
    # Reference line: speedup 1.0 means scalar and vector are on par.
    ax.axhline(y=1.0, color='red', linestyle='--', alpha=0.7)
    ax.set_title('Average Instructions Speedup by Sparsity Level (All Strides)')
    ax.set_xlabel('Sparsity Level (%)')
    ax.set_ylabel('Instructions Speedup (Scalar/Vector)')
    ax.grid(True, alpha=0.3, axis='y')

    # Print each bar's value slightly above its top.
    for sparsity, speedup in zip(avg_speedup.index, avg_speedup.values):
        ax.text(sparsity, speedup + 0.1, f"{speedup:.2f}", ha='center')

    fig.savefig(f"{output_dir}/overall_instructions_speedup_by_sparsity.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Save all dataframes
    vector_df.to_csv(f"{output_dir}/processed_vector_data.csv", index=False)
    scalar_df.to_csv(f"{output_dir}/processed_scalar_data.csv", index=False)
    combined_df.to_csv(f"{output_dir}/combined_data.csv", index=False)
    speedup_df.to_csv(f"{output_dir}/speedup_metrics.csv", index=False)

    # Report the directory actually used rather than a hard-coded name.
    print(f"\nAnalysis complete. Results organized by stride in the '{output_dir}' directory.")

# Run the whole analysis when this code is executed directly (as a script or
# as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

Loading and processing data...
Calculating speedup metrics...
Performing stride-specific analysis...
Creating comparative stride analysis...
                       Executable  Sparsity Level  Stride Implementation  \
0       mlir_sparsity_50_stride_1              50       1         Vector   
1       mlir_sparsity_50_stride_2              50       2         Vector   
2       mlir_sparsity_50_stride_3              50       3         Vector   
3       mlir_sparsity_55_stride_1              55       1         Vector   
4  mlir_sparsity_55_stride_1.llvm              55       1         Vector   

   Branch Misses_value  Branches_value  Context Switch_value  \
0              13576.0        189528.0                   0.0   
1              13591.0        189648.0                   0.0   
2              13522.0        190032.0                   0.0   
3              13643.0        181967.0                   0.0   
4              13546.0        182473.0                   0.0   

   CPU Migration_