## Setup and Data Loading

In [8]:
# Install dependencies
!pip install pandas numpy matplotlib seaborn scipy wordcloud pyarrow datasets --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import re
from collections import Counter
from datetime import datetime


# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Plot settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Environment ready!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Environment ready!


In [9]:
# Compatibility shim: some versions of fsspec don't expose url_to_fs at top-level.
# This ensures code that expects fsspec.url_to_fs (used by some IO backends) continues to work.
try:
    import fsspec
    if not hasattr(fsspec, "url_to_fs"):
        try:
            from fsspec.core import url_to_fs as _url_to_fs
        except Exception:
            try:
                import fsspec.core as _core
                _url_to_fs = _core.url_to_fs
            except Exception:
                # Fallback shim: create a minimal url_to_fs that returns a filesystem and the path.
                def _url_to_fs(url, **kwargs):
                    protocol = url.split("://")[0] if "://" in url else "file"
                    fs = fsspec.filesystem(protocol)
                    return fs, url
        fsspec.url_to_fs = _url_to_fs
except Exception:
    # If anything goes wrong, continue without failing here; subsequent IO calls will raise their own errors.
    pass

In [10]:
# Load datasets
print("Loading AIDev datasets...")

# AI Agent PRs
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")
pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_task_type.parquet")
ai_perf_prs = (
    pr_df
    .merge(
        pr_task_type_df[["id", "type", "reason"]],
        on="id",
        how="inner"
    )
    .query("type == 'perf'")
    .copy()
)
ai_perf_prs['classification_reason'] = ai_perf_prs['reason']
ai_perf_prs['author_type'] = 'AI Agent'

# Human PRs
human_pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/human_pull_request.parquet")
human_pr_task_type_df = pd.read_parquet("hf://datasets/hao-li/AIDev/human_pr_task_type.parquet")
human_perf_prs = (
    human_pr_df
    .merge(
        human_pr_task_type_df[["id", "type", "reason"]],
        on="id",
        how="inner"
    )
    .query("type == 'perf'")
    .copy()
)
human_perf_prs['classification_reason'] = human_perf_prs['reason']
human_perf_prs['author_type'] = 'Human'
human_perf_prs['agent'] = 'Human'

# Store original counts
original_ai_count = len(ai_perf_prs)
original_human_count = len(human_perf_prs)

# Repository data for language info
all_repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/all_repository.parquet")

# Get list of performance PR IDs we care about
perf_pr_ids = set(ai_perf_prs['id'].tolist() + human_perf_prs['id'].tolist())
print(f"\n✓ Performance PR IDs to process: {len(perf_pr_ids):,}")

# PR commits details - FILTER FIRST, then aggregate
print("\nProcessing commit details (filtering to performance PRs only)...")
pr_commits_details = pd.read_parquet("hf://datasets/hao-li/AIDev/pr_commit_details.parquet")

# Pr commit details for human PRs
human_pr_commit_details = pd.read_parquet("../.././datasets/human_pr/human_pr_commit_details.parquet")


pr_commits_details = pd.concat(
    [pr_commits_details, human_pr_commit_details],
    ignore_index=True
)

if 'pr_id' in pr_commits_details.columns:
    print(f"  Total commit records in dataset: {len(pr_commits_details):,}")
    
    # FILTER: Keep only commits for performance PRs
    pr_commits_filtered = pr_commits_details[pr_commits_details['pr_id'].isin(perf_pr_ids)].copy()
    print(f"  Filtered to performance PRs: {len(pr_commits_filtered):,} commit records")
    print(f"  Unique performance PRs with commits: {pr_commits_filtered['pr_id'].nunique():,}")
    
    # ADDITIONAL FILTERING: Remove commits with null filename
    if 'filename' in pr_commits_filtered.columns:
        before_filename_filter = len(pr_commits_filtered)
        pr_commits_filtered = pr_commits_filtered[pr_commits_filtered['filename'].notna()].copy()
        print(f"  Filtered out null filenames: {before_filename_filter - len(pr_commits_filtered):,} records removed")
        print(f"  Remaining after filename filter: {len(pr_commits_filtered):,} commit records")
        
    # ADDITIONAL FILTERING: Remove config/metadata-only files
    if 'filename' in pr_commits_filtered.columns:
        before_config_filter = len(pr_commits_filtered)
        
        # Define patterns for non-code files to exclude
        config_patterns = [
            r'^\.mvn/',                          # Maven wrapper configs
            r'^\.gradle/',                       # Gradle configs
            r'^\.idea/',                         # IntelliJ configs
            r'^\.vscode/',                       # VSCode configs
            r'^\.github/workflows/',             # GitHub Actions (unless it's code)
            r'\.properties$',                    # Properties files
            r'\.xml$',                           # XML config files (pom.xml, etc.)
            r'\.yml$',                           # YAML configs
            r'\.yaml$',                          # YAML configs
            r'\.json$',                          # JSON configs (package.json, etc.)
            r'\.md$',                            # Markdown docs
            r'\.txt$',                           # Text files
            r'\.gitignore$',                     # Git configs
            r'\.dockerignore$',                  # Docker ignore files
            r'/Dockerfile$',                     # Dockerfiles (anywhere in path)
            r'^Dockerfile$',                     # Dockerfile at root
            r'/docker-compose',                  # Docker compose (anywhere)
            r'^docker-compose',                  # Docker compose at root
            r'\.lock$',                          # Lock files (package-lock, yarn.lock)
            r'^LICENSE',                         # License files
            r'^README',                          # README files
        ]
        
        config_pattern = '|'.join(config_patterns)
        
        # Mark config files
        pr_commits_filtered['is_config_file'] = pr_commits_filtered['filename'].str.contains(
            config_pattern, case=False, na=False, regex=True
        )
        
        # Keep track of which files are code files per PR
        pr_commits_filtered['is_code_file'] = ~pr_commits_filtered['is_config_file']
        
        # For each PR, check if it has ANY code files
        pr_has_code = pr_commits_filtered.groupby('pr_id')['is_code_file'].any().reset_index()
        pr_has_code.columns = ['pr_id', 'has_code_files']
        
        # Filter to keep only PRs that have at least one code file
        pr_commits_filtered = pr_commits_filtered.merge(pr_has_code, on='pr_id', how='left')
        pr_commits_filtered = pr_commits_filtered[pr_commits_filtered['has_code_files']].copy()
        
        # Clean up temporary columns
        pr_commits_filtered = pr_commits_filtered.drop(columns=['is_config_file', 'is_code_file', 'has_code_files'])
        
        print(f"  Filtered out config-only commits: {before_config_filter - len(pr_commits_filtered):,} records removed")
        print(f"  Remaining after config filter: {len(pr_commits_filtered):,} commit records")
    
    # ADDITIONAL FILTERING: Remove merge commits
    if 'message' in pr_commits_filtered.columns:
        before_merge_filter = len(pr_commits_filtered)
        # Common merge commit patterns
        merge_patterns = [
            r'^Merge\s+branch',
            r'^Merge\s+pull\s+request',
            r'^Merge\s+remote-tracking\s+branch',
            r'^Merge\s+.*\s+into\s+',
            r"^Merged\s+in\s+",
        ]
        merge_pattern = '|'.join(merge_patterns)
        pr_commits_filtered = pr_commits_filtered[
            ~pr_commits_filtered['message'].str.match(merge_pattern, case=False, na=False)
        ].copy()
        print(f"  Filtered out merge commits: {before_merge_filter - len(pr_commits_filtered):,} records removed")
        print(f"  Remaining after merge filter: {len(pr_commits_filtered):,} commit records")
    
    print(f"  Unique performance PRs after all filters: {pr_commits_filtered['pr_id'].nunique():,}")
    
    if len(pr_commits_filtered) > 0:
        # AGGREGATE: Now aggregate only the filtered commits
        commit_aggregated = pr_commits_filtered.groupby('pr_id').agg({
            'additions': 'sum',      # Total lines added across all commits
            'deletions': 'sum',      # Total lines deleted across all commits
            'patch': lambda x: '\n\n'.join([str(p) for p in x if pd.notna(p)])  # Concatenate all patches
        }).reset_index()
        
        # Add derived metrics
        commit_aggregated['num_commits'] = pr_commits_filtered.groupby('pr_id').size().values
        
        # Calculate patch length (for analysis)
        commit_aggregated['patch_length'] = commit_aggregated['patch'].str.len()
        
        print(f"  ✓ Aggregated to {len(commit_aggregated):,} unique performance PRs")
        print(f"  Avg commits per PR: {commit_aggregated['num_commits'].mean():.1f}")
        
        # Merge commit stats into AI Agent PR table
        ai_perf_prs = ai_perf_prs.merge(
            commit_aggregated,
            left_on='id',
            right_on='pr_id',
            how='left'
        )
        if 'pr_id' in ai_perf_prs.columns:
            ai_perf_prs = ai_perf_prs.drop(columns=['pr_id'])
        
        # Filter to keep only PRs with commit data
        ai_before_filter = len(ai_perf_prs)
        ai_with_commits = ai_perf_prs[ai_perf_prs['additions'].notna()].copy()
        print(f"  AI Agent PRs with commit data: {len(ai_with_commits):,} / {ai_before_filter:,} ({len(ai_with_commits)/ai_before_filter*100:.1f}%)")
        
        # Merge commit stats into Human PR table
        human_perf_prs = human_perf_prs.merge(
            commit_aggregated,
            left_on='id',
            right_on='pr_id',
            how='left'
        )
        if 'pr_id' in human_perf_prs.columns:
            human_perf_prs = human_perf_prs.drop(columns=['pr_id'])
        
        # Filter to keep only PRs with commit data
        human_before_filter = len(human_perf_prs)
        human_with_commits = human_perf_prs[human_perf_prs['additions'].notna()].copy()
        print(f"  Human PRs with commit data: {len(human_with_commits):,} / {human_before_filter:,} ({len(human_with_commits)/human_before_filter*100:.1f}%)")
    else:
        print("  ⚠ No commits found for performance PRs after filtering")
        # Create empty dataframes with same structure
        ai_with_commits = ai_perf_prs.iloc[0:0].copy()
        human_with_commits = human_perf_prs.iloc[0:0].copy()
    
else:
    print('⚠ pr_commit_details missing pr_id column; skipping commit merges.')
    # Create empty dataframes
    ai_with_commits = ai_perf_prs.iloc[0:0].copy()
    human_with_commits = human_perf_prs.iloc[0:0].copy()

print(f"\n{'='*80}")
print(f"SUMMARY")
print(f"{'='*80}")
print(f"Original Performance PRs:")
print(f"  AI Agent: {original_ai_count:,}")
print(f"  Human: {original_human_count:,}")
print(f"  Total: {original_ai_count + original_human_count:,}")
print(f"\nAfter Commit Filtering:")
print(f"✓ AI Agent Performance PRs: {len(ai_with_commits):,}")
print(f"✓ Human Performance PRs: {len(human_with_commits):,}")
print(f"✓ Total Performance PRs: {len(ai_with_commits) + len(human_with_commits):,}")

# Distribution by AI agent
if len(ai_with_commits) > 0:
    print(f"\nAI Agent Distribution:")
    for agent, count in ai_with_commits['agent'].value_counts().items():
        pct = count / len(ai_with_commits) * 100
        print(f"  {agent:20s} {count:5,d} ({pct:5.1f}%)")

# Commit statistics summary
if len(ai_with_commits) > 0 or len(human_with_commits) > 0:
    print(f"\n{'='*80}")
    print(f"COMMIT STATISTICS")
    print(f"{'='*80}")
    
    for author_type, df in [('AI Agent', ai_with_commits), ('Human', human_with_commits)]:
        if len(df) > 0:
            print(f"\n{author_type}:")
            print(f"  PRs with commit data: {len(df):,}")
            print(f"  Avg commits per PR: {df['num_commits'].mean():.1f}")
            print(f"  Median commits per PR: {df['num_commits'].median():.1f}")
            print(f"  Avg additions: {df['additions'].mean():.0f} lines")
            print(f"  Median additions: {df['additions'].median():.0f} lines")
            print(f"  Avg deletions: {df['deletions'].mean():.0f} lines")
            print(f"  Median deletions: {df['deletions'].median():.0f} lines")

print(f"\n{'='*80}")

Loading AIDev datasets...

✓ Performance PR IDs to process: 428

Processing commit details (filtering to performance PRs only)...
  Total commit records in dataset: 719,797
  Filtered to performance PRs: 15,284 commit records
  Unique performance PRs with commits: 427
  Filtered out null filenames: 46 records removed
  Remaining after filename filter: 15,238 commit records
  Filtered out config-only commits: 44 records removed
  Remaining after config filter: 15,194 commit records
  Filtered out merge commits: 2,945 records removed
  Remaining after merge filter: 12,249 commit records
  Unique performance PRs after all filters: 409
  ✓ Aggregated to 409 unique performance PRs
  Avg commits per PR: 29.9
  AI Agent PRs with commit data: 326 / 340 (95.9%)
  Human PRs with commit data: 83 / 88 (94.3%)

SUMMARY
Original Performance PRs:
  AI Agent: 340
  Human: 88
  Total: 428

After Commit Filtering:
✓ AI Agent Performance PRs: 326
✓ Human Performance PRs: 83
✓ Total Performance PRs: 409



In [12]:
# Combine AI and Human PRs
perf_prs = pd.concat([ai_with_commits, human_with_commits], ignore_index=True)

print(f"Combined dataset: {len(perf_prs):,} performance PRs")
print(f"  AI Agents: {(perf_prs['author_type'] == 'AI Agent').sum():,}")
print(f"  Humans: {(perf_prs['author_type'] == 'Human').sum():,}")

Combined dataset: 409 performance PRs
  AI Agents: 326
  Humans: 83


# Optimization Pattern Detection

## LLM inference script
Script to map optimization to the performance optimization pattern catalog using LLM, based on PR title, body, and patch

In [None]:
# ============================================================================
# Performance Optimization Pattern Detection with GPT
# ============================================================================
!pip install openai dotenv --quiet
from openai import OpenAI
import os
import time
from tqdm import tqdm
import json
from pydantic import BaseModel 
from pathlib import Path
from dotenv import load_dotenv

def analyze_optimization_with_gpt(title, body, patch):
    """
    Call GPT to analyze performance optimization patterns in a commit.
    
    Parameters:
    - title: PR/commit title
    - body: PR/commit description
    - patch: Git diff/patch content
    
    Returns:
    - dict with analysis results or error info
    """
    
    # Prepare the context
    context_parts = []
    
    if pd.notna(title) and str(title).strip():
        context_parts.append(f"**Title**: {title}")
    
    if pd.notna(body) and str(body).strip():
        context_parts.append(f"**Description**: {body}")
    
    if pd.notna(patch) and str(patch).strip():
        # Truncate very long patches to avoid token limits
        patch_str = str(patch)
        if len(patch_str) > 15000:  # Rough character limit
            patch_str = patch_str[:15000] + "\n\n... [patch truncated for length] ..."
        context_parts.append(f"**Code Changes (Patch)**:\n```diff\n{patch_str}\n```")
    
    if not context_parts:
        return {
            "success": False,
            "error": "No content available",
            "explanation": None,
            "optimization_comparison": None,
            "high_level_pattern": None,
            "sub_pattern": None,
            "tokens_used": 0
        }
    
    context = "\n\n".join(context_parts)
    
    try:
        load_dotenv()
    except Exception:
        # dotenv not installed / .env not loaded; rely on environment variables
        pass

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to your .env or export it.")

    client = OpenAI(api_key=api_key)
    
    # Construct the prompt
    prompt = f"""I have a performance optimization commit with the following information. Please analyze with the following goals:

1. **Code Function Explanation**: Briefly explain what the code is doing—what problem it solves and how it works.

2. **Optimization Comparison**: Compare the original and optimized versions to identify:
   - **Algorithmic changes**: Any differences in logic, algorithm design, or problem-solving approach.
   - **Performance improvements**: Enhancements related to time complexity, space efficiency, or runtime behavior.
   - **Redundant code removal**: Elimination of unnecessary logic, method calls, or control structures.
   - **Other noteworthy changes**: Any structural or stylistic differences that could impact performance or readability.
   
3. **Optimization Pattern Classification**:
   Based on the overall nature of the optimized code, assign the following. Return "No Meaningful Change" if no meaningful change is made.
   - **Exactly one high-level optimization pattern** from the list below  
   - **One most representative sub-pattern** within that high-level category
   
   ### High-Level Optimization Patterns Taxonomy:
   - **Algorithm-Level Optimizations**
        - Select Computationally Efficient Algorithms
        - Select Algorithm Based on Instruction Speed
        - Structure Algorithm to Support instruction level parallelism (ILP)
        - Select Space Efficient Algorithm
        - Inheritance over Delegation for Energy Efficiency
   - **Control-Flow and Branching Optimizations**
        - Make Conditional Branches More Predictable
        - Remove Branches with min/max Instructions
        - Remove Branches by Doing Extra Work
        - Remove Branching with Masking
        - Rearranging Branches
        - Combining Branches
   - **Memory and Data Locality Optimizations**
        - Access Data with Appropriate Type (Prevent Store Forwarding Issues)
        - Increase Cache Efficiency via Locality
        - Arrange Data for Optimal Hardware Prefetching
        - Avoid cache capacity issues by segmenting work
        - Increase Workload to Mitigate Memory Access Latency
        - Use Smaller Data Types
        - Caching
        - Buffering
        - Improve cache locality via data structure
        - Optimize Object Use
        - Reduce memory bloat from RTSJ Immortal Memory
   - **Loop Transformations**
        - Remove Conditional by Loop Unrolling
        - Loop Distribution (Fission)
        - Loop Fusion
        - Loop Peeling
        - Loop Interchanging
        - Loop Invariant Branches
        - Loop Strip-mining
   - **I/O and Synchronization**
        - Selection of I/O Size
        - Polling
        - Non-Blocking I/O
   - **Data Structure Selection and Adaptation**
        - Choose Structure for Energy Efficiency
        - Darwinian Data Structure Selection
        - Choose more energy-efficient data structure across Java Collections Framework, Apache Common Collections, and Eclipse Collections
        - Choose energy-efficient data structure by method calls
   - **Code Smells and Structural Simplification**
        - Remove code bloat by removing optional features
        - Remove Unnecessary Method Calls
        - Remove long method by extracting new method
        - Remove Duplicate code
        - Minimize feature envy by moving methods
        - Minimize occurrences of God Class
        - Type Checking
         
Here are the info:
            
{context}

**Output Structure**:  
Please respond in JSON format with the following structure:
{{
  "explanation": "Brief description of what the code is doing",
  "optimization_comparison": "Detailed comparison highlighting specific optimizations",
  "high_level_pattern": "Single most representative high-level optimization pattern (or 'No Meaningful Change')",
  "sub_pattern": "Most representative sub-pattern within the category (or null if No Meaningful Change)",
}}

Ensure your response is valid JSON that can be parsed.
"""

    class AnalysisResult(BaseModel):  
        explanation: str
        optimization_comparison: str
        high_level_pattern: str
        sub_pattern: str
    
    try:        
        response = client.beta.chat.completions.parse(
                    model = "gpt-5.1",
                    messages = [
                        {
                            "role": "system",
                            "content": "You are an expert software engineer specializing in performance optimization analysis. Analyze code changes and classify optimization patterns accurately."
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    response_format=AnalysisResult,
                    temperature=0,
                )

        # Parse the response
        content = response.choices[0].message.content
        result = json.loads(content)
        
        return {
            "success": True,
            "explanation": result.get("explanation", ""),
            "optimization_comparison": result.get("optimization_comparison", ""),
            "high_level_pattern": result.get("high_level_pattern", ""),
            "sub_pattern": result.get("sub_pattern", ""),
            "tokens_used": response.usage.total_tokens,
            "error": None
        }
        
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "explanation": None,
            "optimization_comparison": None,
            "high_level_pattern": None,
            "sub_pattern": None,
            "tokens_used": 0
        }


def batch_analyze_performance_prs(perf_prs, batch_size=10, delay=1.0,resume=False, checkpoint_prefix='perf_prs_checkpoint', output_file='perf_prs_with_gpt_analysis.csv'):
    """
    Analyze all performance PRs in batches.

    Parameters:
    - perf_prs: DataFrame with performance PRs
    - batch_size: Number of PRs to process before saving checkpoint
    - delay: Delay between API calls in seconds
    - resume: Continue from the last available checkpoint if True
    - checkpoint_prefix: Filename prefix used for checkpoint files
    - output_file: Final CSV filename for the aggregated results

    Returns:
    - DataFrame with analysis results added
    """

    print(f"Starting GPT analysis of {len(perf_prs):,} performance PRs...")

    checkpoint_files = []
    processed_count = 0

    if resume:
        checkpoint_files = sorted(Path('.').glob(f"{checkpoint_prefix}_*.csv"))
        if checkpoint_files:
            def _processed_from_path(path_obj):
                suffix = path_obj.stem.rsplit('_', 1)[-1]
                return int(suffix) if suffix.isdigit() else 0

            latest_checkpoint = max(checkpoint_files, key=_processed_from_path)
            checkpoint_progress = _processed_from_path(latest_checkpoint)
            perf_prs = pd.read_csv(latest_checkpoint)
            processed_count = min(checkpoint_progress, len(perf_prs))
            print(f"↻ Resuming from checkpoint {latest_checkpoint} ({processed_count} PRs processed)...")
        else:
            print("↻ Resume requested but no checkpoint found. Starting from scratch.")

    result_defaults = {
        'gpt_explanation': None,
        'gpt_comparison': None,
        'optimization_pattern': None,
        'optimization_subpattern': None,
        'gpt_success': False,
        'gpt_error': None,
        'gpt_tokens': 0
    }

    for column, default in result_defaults.items():
        if resume and column in perf_prs.columns:
            continue
        perf_prs[column] = default

    start_idx = processed_count if resume else 0
    iterator = range(start_idx, len(perf_prs))
    progress_bar = tqdm(iterator, total=len(perf_prs), desc="Analyzing PRs", initial=start_idx)

    for idx in progress_bar:
        row = perf_prs.iloc[idx]
        result = analyze_optimization_with_gpt(
            title=row.get('title'),
            body=row.get('body'),
            patch=row.get('patch')
        )

        perf_prs.at[idx, 'gpt_success'] = result['success']
        perf_prs.at[idx, 'gpt_tokens'] = result['tokens_used']

        if result['success']:
            perf_prs.at[idx, 'gpt_explanation'] = result['explanation']
            perf_prs.at[idx, 'gpt_comparison'] = result['optimization_comparison']
            perf_prs.at[idx, 'optimization_pattern'] = result['high_level_pattern']
            perf_prs.at[idx, 'optimization_subpattern'] = result['sub_pattern']
            perf_prs.at[idx, 'gpt_error'] = None
        else:
            perf_prs.at[idx, 'gpt_error'] = result['error']

        time.sleep(delay)

        if (idx + 1) % batch_size == 0:
            checkpoint_file = f"{checkpoint_prefix}_{idx+1}.csv"
            perf_prs.to_csv(checkpoint_file, index=False)
            print(f"✓ Checkpoint saved: {checkpoint_file}")

    perf_prs.to_csv(output_file, index=False)
    print(f"✓ Analysis complete! Saved to: {output_file}")

    success_series = perf_prs['gpt_success'].fillna(False)
    success_count = success_series.sum()
    success_rate = (success_count / len(perf_prs) * 100) if len(perf_prs) else 0
    failure_count = success_series.eq(False).sum()
    total_tokens = perf_prs['gpt_tokens'].sum()

    print(f"{'='*80}")
    print("ANALYSIS SUMMARY")
    print(f"{'='*80}")
    print(f"Total PRs analyzed: {len(perf_prs):,}")
    print(f"Successful: {success_count:,} ({success_rate:.1f}%)")
    print(f"Failed: {failure_count:,}")
    print(f"Total tokens used: {total_tokens:,}")

    if success_count > 0:
        print(f"{'='*80}")
        print("OPTIMIZATION PATTERN DISTRIBUTION")
        print(f"{'='*80}")
        pattern_counts = perf_prs[perf_prs['gpt_success'] == True]['optimization_pattern'].value_counts()
        for pattern, count in pattern_counts.items():
            pct = count / success_count * 100
            print(f"  {pattern:50s} {count:4d} ({pct:5.1f}%)")

    return perf_prs


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Usage scripts

In [None]:
# ============================================================================
# Usage
# ============================================================================

# run ai and human pr analysis separately

# ai pr analysis
ai_sample = perf_prs[perf_prs['author_type'] == 'AI Agent']
print(f"Testing GPT analysis on {len(ai_sample)} AI PRs")

# Run the analysis
perf_prs_analyzed = batch_analyze_performance_prs(
    ai_sample,
    batch_size=10,    # Save checkpoint every 10 PRs
    delay=0.5,        # 0.5 second delay between API calls
    resume=True,      # Continue from the last saved checkpoint if available
    checkpoint_prefix='ai_perf_prs_checkpoint',
    output_file='ai_perf_prs_with_gpt_analysis.csv'
)


In [26]:
# human pr analysis
human_sample = perf_prs[perf_prs['author_type'] == 'Human'].copy().reset_index(drop=True)
print(f"Testing GPT analysis on {len(human_sample)} Human PRs")

# Run the analysis
perf_prs_analyzed = batch_analyze_performance_prs(
    human_sample,
    batch_size=10,    # Save checkpoint every 10 PRs
    delay=0.5,        # 0.5 second delay between API calls
    resume=False,      # Continue from the last saved checkpoint if available
    checkpoint_prefix='human_perf_prs_checkpoint',
    output_file='human_perf_prs_with_gpt_analysis.csv'
)

Testing GPT analysis on 83 Human PRs
Starting GPT analysis of 83 performance PRs...


Analyzing PRs:  12%|█▏        | 10/83 [01:56<15:04, 12.40s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_10.csv


Analyzing PRs:  24%|██▍       | 20/83 [04:15<15:11, 14.47s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_20.csv


Analyzing PRs:  36%|███▌      | 30/83 [06:36<12:06, 13.70s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_30.csv


Analyzing PRs:  48%|████▊     | 40/83 [08:59<12:42, 17.73s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_40.csv


Analyzing PRs:  60%|██████    | 50/83 [11:53<08:40, 15.77s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_50.csv


Analyzing PRs:  72%|███████▏  | 60/83 [14:26<06:16, 16.36s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_60.csv


Analyzing PRs:  84%|████████▍ | 70/83 [16:53<03:43, 17.23s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_70.csv


Analyzing PRs:  96%|█████████▋| 80/83 [18:55<00:37, 12.58s/it]

✓ Checkpoint saved: human_perf_prs_checkpoint_80.csv


Analyzing PRs: 100%|██████████| 83/83 [19:58<00:00, 14.44s/it]


✓ Analysis complete! Saved to: human_perf_prs_with_gpt_analysis.csv
ANALYSIS SUMMARY
Total PRs analyzed: 83
Successful: 83 (100.0%)
Failed: 0
Total tokens used: 374,517
OPTIMIZATION PATTERN DISTRIBUTION
  Code Smells and Structural Simplification            39 ( 47.0%)
  Memory and Data Locality Optimizations               17 ( 20.5%)
  Algorithm-Level Optimizations                        14 ( 16.9%)
  No Meaningful Change                                  8 (  9.6%)
  I/O and Synchronization                               3 (  3.6%)
  Data Structure Selection and Adaptation               1 (  1.2%)
  Control-Flow and Branching Optimizations              1 (  1.2%)
