# Workflow Performance Analysis

This notebook analyzes the performance of Nemesis file processing workflows.

In [None]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from gql import Client, gql
from gql.transport.requests import RequestsHTTPTransport

# Silence all library warnings so the report output stays readable
warnings.filterwarnings("ignore")

# Plot appearance: seaborn-flavoured matplotlib style with the "husl" palette
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

# Connection settings are read from the environment, falling back to defaults
hasura_url = os.getenv("HASURA_GRAPHQL_URL", "http://hasura:8080/v1/graphql")
admin_secret = os.getenv("HASURA_ADMIN_SECRET", "")

# HTTP transport carrying the admin-secret header, wrapped in a GraphQL client
transport = RequestsHTTPTransport(
    url=hasura_url,
    headers={"x-hasura-admin-secret": admin_secret},
)
client = Client(transport=transport, fetch_schema_from_transport=True)

# NOTE(review): no query has been executed at this point, so this message
# reports the configured endpoint rather than a verified connection.
print(f"Connected to Hasura at: {hasura_url}")

## Data Collection

Let's collect all the necessary data for our performance analysis.

In [None]:
# Get all files data with timestamps and sizes.
#
# FILE_COLUMNS mirrors the fields selected in the query below. It is passed to
# the DataFrame constructor so the frame keeps its schema even when the query
# returns no rows; without it an empty result produces a column-less frame and
# the column lookups in this cell (and the following ones) raise KeyError.
FILE_COLUMNS = [
    "object_id",
    "file_name",
    "extension",
    "size",
    "originating_object_id",
    "created_at",
    "updated_at",
]

files_query = gql("""
    query {
        files_enriched {
            object_id
            file_name
            extension
            size
            originating_object_id
            created_at
            updated_at
        }
    }
""")

print("Fetching files data...")
files_result = client.execute(files_query)
files_df = pd.DataFrame(files_result["files_enriched"], columns=FILE_COLUMNS)

# Convert timestamps to datetime objects
files_df["created_at"] = pd.to_datetime(files_df["created_at"])
files_df["updated_at"] = pd.to_datetime(files_df["updated_at"])

# Make sure sizes are numeric before doing arithmetic on them; values that
# cannot be parsed become NaN and are excluded by the later "size > 0" filters.
# NOTE(review): presumably needed only if the server serializes large integers
# as strings - confirm against the Hasura configuration.
files_df["size"] = pd.to_numeric(files_df["size"], errors="coerce")

# Add file size in MB for easier analysis
files_df["size_mb"] = files_df["size"] / (1024 * 1024)

print(f"Fetched {len(files_df)} files")

In [None]:
# Pull every workflow record; the runtimes feed the processing-time analysis
workflows_query = gql("""
    query {
        workflows {
            wf_id
            object_id
            filename
            status
            runtime_seconds
            start_time
        }
    }
""")

print("Fetching workflows data...")
workflows_result = client.execute(workflows_query)
workflows_df = pd.DataFrame(workflows_result["workflows"])

if workflows_df.empty:
    # Nothing came back: downstream cells only check `.empty` on this frame
    print("No workflows found")
    completed_workflows = pd.DataFrame()
else:
    # Parse start times into datetime values
    workflows_df["start_time"] = pd.to_datetime(workflows_df["start_time"])

    # Keep only rows whose status is "completed" (compared case-insensitively);
    # the copy makes the subset independent of workflows_df
    is_completed = workflows_df["status"].str.lower() == "completed"
    completed_workflows = workflows_df[is_completed].copy()

    print(f"Fetched {len(workflows_df)} workflows ({len(completed_workflows)} completed)")

## Basic Statistics

Let's calculate the basic statistics for our performance analysis.

In [None]:
# Calculate basic file statistics.
# A file counts as "submitted" when it has no originating object; every other
# file is counted as derived.
total_files = len(files_df)
submitted_files = int(files_df["originating_object_id"].isna().sum())
derived_files = total_files - submitted_files

print("\n" + "=" * 80)
print("                           BASIC STATISTICS")
print("=" * 80)

# File counts with nice formatting
print("\n📁 FILE COUNTS")
print(f"   Total processed files:     {total_files:>12,}")
print(f"   Submitted files:           {submitted_files:>12,}")
print(f"   Derived files:             {derived_files:>12,}")
if submitted_files > 0:
    print(f"   Derivation ratio:          {derived_files / submitted_files:>12.2f}x")
else:
    print(f"   Derivation ratio:          {'N/A':>12}")

# Calculate file size statistics
if not files_df.empty:
    # Remove files with zero or null size for size analysis
    files_with_size = files_df[files_df["size"] > 0]

    print("\n📊 FILE SIZE STATISTICS")
    print(f"   Files with size data:      {len(files_with_size):>12,}")
    if not files_with_size.empty:
        print(f"   Average file size:         {files_with_size['size_mb'].mean():>12.2f} MB")
        print(f"   Median file size:          {files_with_size['size_mb'].median():>12.2f} MB")
        # quantile(0.9) is the 90th percentile (the threshold of the largest
        # 10%), not an average, so the label says exactly that.
        print(f"   90th percentile size:      {files_with_size['size_mb'].quantile(0.9):>12.2f} MB")
        print(f"   Largest file:              {files_with_size['size_mb'].max():>12.2f} MB")
        total_gb = files_with_size["size_mb"].sum() / 1024
        print(f"   Total data processed:      {total_gb:>12.2f} GB")
    else:
        print("   No files with size data available")

# Per-file processing duration derived from the file's own timestamps
files_df["processing_duration_seconds"] = (files_df["updated_at"] - files_df["created_at"]).dt.total_seconds()

if not completed_workflows.empty:
    # Collapse workflows to a single runtime per file (the longest one) before
    # merging. A plain left merge would repeat a file once per completed
    # workflow, inflating the file count and the summed processing time.
    workflow_runtimes = completed_workflows.groupby("object_id", as_index=False)["runtime_seconds"].max()
    files_with_workflow = files_df.merge(workflow_runtimes, on="object_id", how="left")

    # Use the maximum of file processing duration or workflow runtime. The
    # row-wise max skips NaN, so files without workflow data automatically
    # fall back to the timestamp-based duration.
    files_with_workflow["actual_processing_time"] = files_with_workflow[
        ["processing_duration_seconds", "runtime_seconds"]
    ].max(axis=1, skipna=True)

    # Remove files with zero or negative processing time
    files_with_processing_time = files_with_workflow[files_with_workflow["actual_processing_time"] > 0]

    print("\n⏱️ PROCESSING TIME STATISTICS")
    print(f"   Files with processing data: {len(files_with_processing_time):>11,}")
    if not files_with_processing_time.empty:
        avg_time = files_with_processing_time["actual_processing_time"].mean()
        median_time = files_with_processing_time["actual_processing_time"].median()
        p90_time = files_with_processing_time["actual_processing_time"].quantile(0.9)
        max_time = files_with_processing_time["actual_processing_time"].max()
        total_time = files_with_processing_time["actual_processing_time"].sum()

        print(f"   Average processing time:   {avg_time:>12.2f} seconds")
        print(f"   Median processing time:    {median_time:>12.2f} seconds")
        print(f"   Top 10% processing time:   {p90_time:>12.2f} seconds")
        print(f"   Longest processing time:   {max_time:>12.2f} seconds")
        print(f"   Total processing time:     {total_time / 60:>12.2f} minutes")
        print(f"                              {total_time / 3600:>12.2f} hours")
    else:
        print("   No processing time data available")
else:
    print("\n⏱️ PROCESSING TIME STATISTICS")
    print("   No workflow data available for processing time analysis")
    # Still calculate from file timestamps. The column is assigned BEFORE the
    # filter so the filtered frame carries "actual_processing_time" too; the
    # report and export cells look for that column on files_with_processing_time.
    files_df["actual_processing_time"] = files_df["processing_duration_seconds"]
    files_with_processing_time = files_df[files_df["actual_processing_time"] > 0]
    if not files_with_processing_time.empty:
        print(f"   Files with timestamp data:  {len(files_with_processing_time):>11,}")
        avg_time = files_with_processing_time["processing_duration_seconds"].mean()
        median_time = files_with_processing_time["processing_duration_seconds"].median()
        print(f"   Average processing time:   {avg_time:>12.2f} seconds (from timestamps)")
        print(f"   Median processing time:    {median_time:>12.2f} seconds (from timestamps)")
    else:
        print("   No processing time data available")

## Total Processing Time and Files Per Minute

Calculate the total time from first submission to last completion and files processed per minute.

In [None]:
# Wall-clock span of the whole run: from the earliest creation time of a
# submitted file to the latest update time of any file, plus the throughput
# figures derived from that span.
if files_df.empty:
    print("No files found for processing time analysis")
else:
    # Only submitted files (no originating object) mark the start of the run
    submitted_files_df = files_df[files_df["originating_object_id"].isna()]

    if submitted_files_df.empty:
        print("No submitted files found for processing time analysis")
    else:
        first_submission = submitted_files_df["created_at"].min()
        # The end of the run is taken over all files, derived ones included
        last_completion = files_df["updated_at"].max()

        total_processing_duration = last_completion - first_submission
        total_processing_minutes = total_processing_duration.total_seconds() / 60
        total_processing_hours = total_processing_minutes / 60

        print("\n" + "=" * 80)
        print("                    TOTAL PROCESSING TIME ANALYSIS")
        print("=" * 80)
        print("\n⏰ TIMELINE ANALYSIS")
        print(f"   First file submitted:      {first_submission}")
        print(f"   Last file completed:       {last_completion}")
        print(f"   Total processing duration: {total_processing_duration}")
        print(f"   Total processing time:     {total_processing_minutes:>12.2f} minutes")
        print(f"                              {total_processing_hours:>12.2f} hours")

        # Throughput needs a strictly positive time span
        if total_processing_minutes > 0:
            submitted_files_per_minute = submitted_files / total_processing_minutes
            total_files_per_minute = total_files / total_processing_minutes

            print("\n📈 THROUGHPUT ANALYSIS")
            print(f"   Submitted files per minute: {submitted_files_per_minute:>11.2f}")
            print(f"   Total files per minute:     {total_files_per_minute:>11.2f}")
            print(f"   Files per hour:            {total_files_per_minute * 60:>12.2f}")
            print(f"   Files per day:             {total_files_per_minute * 60 * 24:>12.0f}")

            # Data volume per unit of time, when the size subset exists
            size_data_available = "files_with_size" in locals() and not files_with_size.empty
            if size_data_available:
                total_data_mb = files_with_size["size_mb"].sum()
                data_per_minute = total_data_mb / total_processing_minutes
                print(f"   Data processed per minute: {data_per_minute:>12.2f} MB/min")
                print(f"   Data processed per hour:   {data_per_minute * 60:>12.2f} MB/hour")
                print(f"                              {data_per_minute * 60 / 1024:>12.2f} GB/hour")
        else:
            print("   Cannot calculate throughput - no processing time")

## File Size Analysis and Visualizations

Let's create detailed visualizations of file sizes and processing patterns.

In [None]:
# File size distribution analysis: a 2x2 grid of plots followed by a text
# summary of the largest files. Only files with a positive size are included.
if not files_df.empty:
    files_with_size = files_df[files_df["size"] > 0]

    if not files_with_size.empty:
        # Create comprehensive file size analysis
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # 1. File size distribution histogram (log-scaled counts)
        axes[0, 0].hist(files_with_size["size_mb"], bins=50, alpha=0.7, edgecolor="black")
        axes[0, 0].set_title("File Size Distribution")
        axes[0, 0].set_xlabel("File Size (MB)")
        axes[0, 0].set_ylabel("Frequency")
        axes[0, 0].set_yscale("log")

        # 2. File size box plot. Vertical is the default orientation, so no
        # explicit flag is passed (the `vert` keyword is being phased out in
        # recent Matplotlib releases).
        axes[0, 1].boxplot(files_with_size["size_mb"])
        axes[0, 1].set_title("File Size Box Plot")
        axes[0, 1].set_ylabel("File Size (MB)")
        axes[0, 1].set_yscale("log")

        # 3. Top file extensions by count
        top_extensions = files_with_size["extension"].value_counts().head(10)
        axes[1, 0].bar(range(len(top_extensions)), top_extensions.values)
        axes[1, 0].set_title("Top 10 File Extensions by Count")
        axes[1, 0].set_xlabel("Extension")
        axes[1, 0].set_ylabel("Count")
        axes[1, 0].set_xticks(range(len(top_extensions)))
        axes[1, 0].set_xticklabels(top_extensions.index, rotation=45)

        # 4. File size by extension (top 10 by mean size)
        ext_sizes = files_with_size.groupby("extension")["size_mb"].mean().sort_values(ascending=False).head(10)
        axes[1, 1].bar(range(len(ext_sizes)), ext_sizes.values)
        axes[1, 1].set_title("Average File Size by Extension (Top 10)")
        axes[1, 1].set_xlabel("Extension")
        axes[1, 1].set_ylabel("Average Size (MB)")
        axes[1, 1].set_xticks(range(len(ext_sizes)))
        axes[1, 1].set_xticklabels(ext_sizes.index, rotation=45)

        plt.tight_layout()
        plt.show()

        print("\n" + "=" * 80)
        print("                        FILE SIZE ANALYSIS RESULTS")
        print("=" * 80)

        # Print top 10% file sizes. With fewer than 10 files, 10% truncates to
        # zero rows, which would leave the summary below full of NaN; always
        # keep at least the single largest file.
        top_10_percent_count = max(1, int(len(files_with_size) * 0.1))
        top_10_percent = files_with_size.nlargest(top_10_percent_count, "size_mb")
        print("\n📊 TOP 10% LARGEST FILES ANALYSIS")
        print(f"   Number of files in top 10%: {len(top_10_percent):>11,}")
        print(f"   Average size of top 10%:    {top_10_percent['size_mb'].mean():>12.2f} MB")
        print(
            f"   Size range:                 {top_10_percent['size_mb'].min():>12.2f} - {top_10_percent['size_mb'].max():.2f} MB"
        )

        # Show top 10 largest files
        print("\n🏆 TOP 10 LARGEST FILES")
        largest_files = files_with_size.nlargest(10, "size_mb")[["file_name", "extension", "size_mb"]]
        for i, (_idx, row) in enumerate(largest_files.iterrows(), 1):
            print(f"   {i:>2}. {row['file_name']} ({row['extension']}): {row['size_mb']:.2f} MB")

        print("=" * 80)
    else:
        print("No files with size data found")
else:
    print("No files found for size analysis")

## Processing Time Analysis and Visualizations

Analyze workflow processing times and create visualizations.

In [None]:
# Processing time analysis: runtime histogram and percentile bars for
# completed workflows, followed by a text summary of the slowest ones.
#
# Workflows without a recorded runtime are left out: NaN values make the
# histogram's automatic range detection fail and carry no timing information.
# The guard is needed because the empty placeholder frame has no columns.
if not completed_workflows.empty:
    timed_workflows = completed_workflows.dropna(subset=["runtime_seconds"])
else:
    timed_workflows = completed_workflows

if not timed_workflows.empty:
    # Create comprehensive processing time analysis (two side-by-side panels)
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # 1. Processing time distribution histogram (log-scaled counts)
    axes[0].hist(timed_workflows["runtime_seconds"], bins=50, alpha=0.7, edgecolor="black")
    axes[0].set_title("Processing Time Distribution")
    axes[0].set_xlabel("Processing Time (seconds)")
    axes[0].set_ylabel("Frequency")
    axes[0].set_yscale("log")

    # 2. Processing time percentiles
    percentiles = [50, 75, 90, 95, 99]
    percentile_values = [timed_workflows["runtime_seconds"].quantile(p / 100) for p in percentiles]
    axes[1].bar(range(len(percentiles)), percentile_values)
    axes[1].set_title("Processing Time Percentiles")
    axes[1].set_xlabel("Percentile")
    axes[1].set_ylabel("Processing Time (seconds)")
    axes[1].set_xticks(range(len(percentiles)))
    axes[1].set_xticklabels([f"P{p}" for p in percentiles])

    plt.tight_layout()
    plt.show()

    print("\n" + "=" * 80)
    print("                    PROCESSING TIME ANALYSIS RESULTS")
    print("=" * 80)

    # Print top 10% processing times. With fewer than 10 workflows, 10%
    # truncates to zero rows and the summary would print NaN; always keep at
    # least the single slowest workflow.
    top_10_percent_count = max(1, int(len(timed_workflows) * 0.1))
    top_10_percent_time = timed_workflows.nlargest(top_10_percent_count, "runtime_seconds")
    print("\n⏱️ TOP 10% LONGEST PROCESSING TIMES")
    print(f"   Number of workflows in top 10%: {len(top_10_percent_time):>12,}")
    print(f"   Average processing time:        {top_10_percent_time['runtime_seconds'].mean():>12.2f} seconds")
    print(
        f"   Time range:                     {top_10_percent_time['runtime_seconds'].min():>12.2f} - {top_10_percent_time['runtime_seconds'].max():.2f} seconds"
    )

    # Show top 10 longest processing times
    print("\n🕐 TOP 10 LONGEST PROCESSING TIMES")
    longest_times = timed_workflows.nlargest(10, "runtime_seconds")[["filename", "runtime_seconds", "start_time"]]
    for i, (_idx, row) in enumerate(longest_times.iterrows(), 1):
        print(f"   {i:>2}. {row['filename']}: {row['runtime_seconds']:.2f} seconds (started: {row['start_time']})")

    print("=" * 80)
else:
    print("No completed workflows found for processing time analysis")

## Performance Summary Report

Generate a comprehensive performance summary report.

In [None]:
# Generate comprehensive performance report.
#
# This cell only prints; it reuses values computed by earlier cells. Several of
# those values are defined conditionally, so each section first checks that the
# names it needs exist (via locals()) before using them.
print("\n" + "=" * 80)
print("                 NEMESIS WORKFLOW PERFORMANCE REPORT")
print("=" * 80)

# Overall statistics
print("\n📊 OVERALL STATISTICS")
print("-" * 40)
print(f"Total files processed:     {total_files:>12,}")
print(f"Submitted files:           {submitted_files:>12,}")
print(f"Derived files:             {derived_files:>12,}")
if submitted_files > 0:
    print(f"Derivation ratio:          {derived_files / submitted_files:>12.2f}x")
else:
    print(f"Derivation ratio:          {'N/A':>12}")

# Processing time summary
if not files_df.empty and "submitted_files_df" in locals() and not submitted_files_df.empty:
    print("\n⏱️ PROCESSING TIME SUMMARY")
    print("-" * 40)
    print(f"Total processing time:     {total_processing_hours:>12.2f} hours")
    if total_processing_minutes > 0:
        print(f"Files per minute:          {total_files_per_minute:>12.2f}")
        print(f"Files per hour:            {total_files_per_minute * 60:>12.2f}")
        print(f"Processing rate:           {total_files / total_processing_hours:>12.2f} files/hour")

# File size summary
if not files_df.empty and "files_with_size" in locals() and not files_with_size.empty:
    print("\n📁 FILE SIZE SUMMARY")
    print("-" * 40)
    print(f"Average file size:         {files_with_size['size_mb'].mean():>12.2f} MB")
    print(f"Median file size:          {files_with_size['size_mb'].median():>12.2f} MB")
    # quantile(0.9) is the 90th percentile, not an average of the top 10%
    print(f"90th percentile size:      {files_with_size['size_mb'].quantile(0.9):>12.2f} MB")
    print(f"Largest file:              {files_with_size['size_mb'].max():>12.2f} MB")
    print(f"Total data processed:      {files_with_size['size_mb'].sum() / 1024:>12.2f} GB")

# Workflow performance summary (original workflow data)
if not completed_workflows.empty:
    print("\n🏭 WORKFLOW ENGINE SUMMARY")
    print("-" * 40)
    print(f"Completed workflows:       {len(completed_workflows):>12,}")
    print(f"Average workflow time:     {completed_workflows['runtime_seconds'].mean():>12.2f} seconds")
    print(f"Median workflow time:      {completed_workflows['runtime_seconds'].median():>12.2f} seconds")
    print(f"Top 10% workflow time:     {completed_workflows['runtime_seconds'].quantile(0.9):>12.2f} seconds")
    print(f"Longest workflow time:     {completed_workflows['runtime_seconds'].max():>12.2f} seconds")
    # Guard the ratio: workflows can exist while the files query returned nothing
    if total_files > 0:
        print(f"Workflow coverage:         {len(completed_workflows) / total_files * 100:>12.1f}% of files")
    else:
        print(f"Workflow coverage:         {'N/A':>12}")

# Performance insights
print("\n💡 PERFORMANCE INSIGHTS")
print("-" * 40)

if submitted_files > 0:
    expansion_ratio = derived_files / submitted_files
    print(f"• Container expansion: Each submitted file generates {expansion_ratio:.2f} derived files on average")
    if expansion_ratio > 3:
        print("  → High expansion ratio indicates significant archive/container processing")
    elif expansion_ratio > 1.5:
        print("  → Moderate expansion from container extraction")
    else:
        print("  → Low expansion suggests mostly individual files")

if not files_df.empty and "total_processing_minutes" in locals() and total_processing_minutes > 0:
    print(f"• Processing throughput: {total_files_per_minute:.2f} files/minute overall")
    if "files_with_size" in locals() and not files_with_size.empty:
        data_throughput = files_with_size["size_mb"].sum() / total_processing_minutes
        print(f"• Data throughput: {data_throughput:.2f} MB/minute ({data_throughput * 60 / 1024:.2f} GB/hour)")

# Correlation analysis if we have both size and processing time data
if (
    "completed_workflows" in locals()
    and not completed_workflows.empty
    and "files_with_size" in locals()
    and not files_with_size.empty
):
    merged_data = pd.merge(completed_workflows, files_with_size, on="object_id", how="inner")
    if not merged_data.empty and len(merged_data) > 10:  # Need enough data points
        correlation = merged_data["size_mb"].corr(merged_data["runtime_seconds"])
        if pd.isna(correlation):
            # The coefficient is undefined (e.g. one of the columns is constant
            # or has no usable pairs); say so instead of reporting it as "weak".
            print("• File size vs processing time correlation: n/a (not enough variation in the data)")
        else:
            print(f"• File size vs processing time correlation: {correlation:.3f}")
            if correlation > 0.5:
                print("  → Strong positive correlation: larger files take significantly longer")
            elif correlation > 0.3:
                print("  → Moderate positive correlation: larger files tend to take longer")
            elif correlation < -0.3:
                print("  → Negative correlation: larger files process faster (investigate)")
            else:
                print("  → Weak correlation: processing time not strongly size-dependent")

# Processing efficiency insights
if (
    "files_with_processing_time" in locals()
    and not files_with_processing_time.empty
    and "total_processing_minutes" in locals()
    and total_processing_minutes > 0
):
    # Ratio of summed per-file processing time to the wall-clock span of the run
    if "actual_processing_time" in files_with_processing_time.columns:
        total_cpu_time_minutes = files_with_processing_time["actual_processing_time"].sum() / 60
        efficiency_ratio = total_cpu_time_minutes / total_processing_minutes
        print(f"• Processing efficiency: {efficiency_ratio:.2f} (CPU time / wall time)")
        if efficiency_ratio > 0.8:
            print("  → High efficiency: CPU-bound processing with good parallelization")
        elif efficiency_ratio > 0.4:
            print("  → Moderate efficiency: some parallel processing occurring")
        else:
            print("  → Low efficiency: significant I/O wait or limited parallelization")

print("\n" + "=" * 80)
print(f"Report generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 80)

## Export Results

Export the performance data for further analysis or reporting.

In [None]:
# Export performance data to CSV files.
#
# Writes up to four files into the current working directory:
#   nemesis_files_performance.csv          - one row per file
#   nemesis_workflow_performance.csv       - one row per completed workflow
#   nemesis_performance_summary.csv        - metric/value pairs
#   nemesis_processing_time_comparison.csv - file timings joined to workflows
# Metrics whose inputs are unavailable are written as 0.
print("\n" + "=" * 60)
print("                    EXPORTING RESULTS")
print("=" * 60)

# Earlier cells define several inputs only conditionally, so availability is
# resolved once here instead of being re-tested for every metric.
has_size_data = "files_with_size" in locals() and not files_with_size.empty
has_processing_column = (
    "files_with_processing_time" in locals()
    and "actual_processing_time" in files_with_processing_time.columns
)
has_processing_data = has_processing_column and not files_with_processing_time.empty
has_timeline = "total_processing_minutes" in locals() and total_processing_minutes > 0

if not files_df.empty:
    # Export enhanced files data with processing times
    files_summary = files_df[
        ["object_id", "file_name", "extension", "size_mb", "originating_object_id", "created_at", "updated_at"]
    ].copy()

    # Add processing duration from timestamps
    files_summary["processing_duration_seconds"] = (files_df["updated_at"] - files_df["created_at"]).dt.total_seconds()

    # Add actual processing time if we calculated it
    if has_processing_column:
        # Reduce to one value per object_id (the largest) before merging back.
        # If an object_id occurs more than once in files_with_processing_time,
        # a plain merge would repeat that file's row in the exported CSV.
        processing_time_data = files_with_processing_time.groupby("object_id", as_index=False)[
            "actual_processing_time"
        ].max()
        files_summary = files_summary.merge(processing_time_data, on="object_id", how="left")

    files_summary.to_csv("nemesis_files_performance.csv", index=False)
    print("✓ Files performance data exported to: nemesis_files_performance.csv")

if not completed_workflows.empty:
    # Export workflow data
    workflow_summary = completed_workflows[["wf_id", "object_id", "filename", "runtime_seconds", "start_time"]].copy()
    workflow_summary.to_csv("nemesis_workflow_performance.csv", index=False)
    print("✓ Workflow performance data exported to: nemesis_workflow_performance.csv")

# Create an enhanced summary statistics file: (metric name, value) pairs in
# the order they appear in the exported CSV.
summary_rows = [
    ("total_files", total_files),
    ("submitted_files", submitted_files),
    ("derived_files", derived_files),
    ("derivation_ratio", derived_files / submitted_files if submitted_files > 0 else 0),
    ("avg_file_size_mb", files_with_size["size_mb"].mean() if has_size_data else 0),
    ("median_file_size_mb", files_with_size["size_mb"].median() if has_size_data else 0),
    ("top_10_percent_file_size_mb", files_with_size["size_mb"].quantile(0.9) if has_size_data else 0),
    ("total_data_gb", files_with_size["size_mb"].sum() / 1024 if has_size_data else 0),
    (
        "avg_individual_processing_time_sec",
        files_with_processing_time["actual_processing_time"].mean() if has_processing_data else 0,
    ),
    (
        "median_individual_processing_time_sec",
        files_with_processing_time["actual_processing_time"].median() if has_processing_data else 0,
    ),
    (
        "top_10_percent_processing_time_sec",
        files_with_processing_time["actual_processing_time"].quantile(0.9) if has_processing_data else 0,
    ),
    ("avg_workflow_time_sec", completed_workflows["runtime_seconds"].mean() if not completed_workflows.empty else 0),
    (
        "median_workflow_time_sec",
        completed_workflows["runtime_seconds"].median() if not completed_workflows.empty else 0,
    ),
    ("total_processing_hours", total_processing_hours if "total_processing_hours" in locals() else 0),
    ("files_per_minute", total_files_per_minute if "total_files_per_minute" in locals() else 0),
    ("files_per_hour", total_files_per_minute * 60 if "total_files_per_minute" in locals() else 0),
    (
        "data_throughput_mb_per_minute",
        (files_with_size["size_mb"].sum() / total_processing_minutes) if (has_size_data and has_timeline) else 0,
    ),
]
summary_stats = {
    "metric": [name for name, _value in summary_rows],
    "value": [value for _name, value in summary_rows],
}

summary_df = pd.DataFrame(summary_stats)
summary_df.to_csv("nemesis_performance_summary.csv", index=False)
print("✓ Performance summary exported to: nemesis_performance_summary.csv")

# Export processing time comparison if we have both datasets
if has_processing_data and not completed_workflows.empty:
    # Create comparison dataset
    comparison_data = files_with_processing_time[
        ["object_id", "file_name", "size_mb", "processing_duration_seconds", "actual_processing_time"]
    ].copy()

    # Merge with workflow data (left join: files without a workflow are kept)
    comparison_data = comparison_data.merge(
        completed_workflows[["object_id", "runtime_seconds", "filename"]], on="object_id", how="left"
    )

    comparison_data.to_csv("nemesis_processing_time_comparison.csv", index=False)
    print("✓ Processing time comparison exported to: nemesis_processing_time_comparison.csv")

print("\n📊 EXPORT SUMMARY")
print(f"   Files exported:            {len(files_df) if not files_df.empty else 0:>12,}")
print(f"   Workflows exported:        {len(completed_workflows) if not completed_workflows.empty else 0:>12,}")
print(f"   Summary metrics:           {len(summary_df):>12,}")

print("\n📄 All performance data has been exported for further analysis!")
print("=" * 60)