In [222]:
import polars as pl
import pandas as pd
import json
import os
import re

In [223]:
def read_jsonl_file_line_by_line(file_path):
    """Read a JSON Lines file into a list of decoded records.

    Every non-blank line is decoded with ``json.loads``. A line that fails to
    decode is reported on stdout (with its 1-based line number in the file)
    and skipped, so one corrupt record does not discard the rest of the file.

    Args:
        file_path: Path of the JSONL file; it is opened as UTF-8 text.

    Returns:
        A list with one decoded object per valid line, in file order, or
        ``None`` if the file could not be opened or read.
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                # Blank / whitespace-only lines carry no record; skip them
                # instead of reporting them as JSON decode errors.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # The position inside `e` is relative to this single line,
                    # so the file line number is added to make it locatable.
                    print(f"JSON decode error in {file_path} (line {line_number}): {e}")
    except Exception as e:
        # Deliberately broad best-effort: callers treat None as "skip this file".
        print(f"Error opening {file_path}: {e}")
        return None
    return data

In [224]:
# Regex for a `diff --git a/<old> b/<new>` header line; capture group 1 is the
# text that follows ` b/`. Later cells use it to derive `true_patch_file` from
# the `patch` column of the benchmark frames.
pattern = r'diff --git a/.+? b/(.+)'

In [239]:
def process_experiments(experiments, base_dir):
    """Load the predictions of several experiments into one polars DataFrame.

    For each experiment name, ``<base_dir>/<experiment>/all_preds.jsonl`` is
    read, the ``model_name_or_path`` column is set to the experiment name, and
    ``file_changed`` is derived from ``model_patch`` as capture group 1 of the
    first diff-header match.

    Args:
        experiments: Iterable of experiment directory names.
        base_dir: Directory that contains one sub-directory per experiment.

    Returns:
        A DataFrame with the string columns ``model_name_or_path``,
        ``instance_id``, ``file_changed`` and ``model_patch``. Experiments
        whose file is missing, unreadable or empty are skipped; if none could
        be loaded, an empty frame with that schema is returned.
    """
    # Accepts either a `diff --git a/<old> b/<path>` header or a `--- a/<path>`
    # line; group 1 is <path>.
    pattern = r'(?:diff --git a/.+? b/|--- a/)(.+)'
    output_columns = ['model_name_or_path', 'instance_id', 'file_changed', 'model_patch']

    # Keyed by experiment name so a name listed twice contributes only once.
    all_preds = {}

    for exp in experiments:
        file_path = os.path.join(base_dir, exp, 'all_preds.jsonl')
        records = read_jsonl_file_line_by_line(file_path)

        # None means the file could not be read; an empty list would build a
        # column-less frame on which the select below cannot succeed.
        if not records:
            print(f"No predictions loaded for {exp}; skipping")
            continue

        df = pl.DataFrame(records).with_columns(
            pl.lit(exp).alias('model_name_or_path')
        )

        # Without `model_patch` there is nothing to extract from; add a null
        # column so the frame still has the common output schema.
        if 'model_patch' not in df.columns:
            df = df.with_columns(pl.lit(None, dtype=pl.Utf8).alias('model_patch'))

        df = df.with_columns(
            pl.col('model_patch').cast(pl.Utf8).str.extract(pattern, 1).alias('file_changed')
        )

        # Cast to a single string dtype so frames from different experiments
        # (e.g. one whose patches are all null) can be concatenated.
        all_preds[exp] = df.select([pl.col(name).cast(pl.Utf8) for name in output_columns])

    if not all_preds:
        return pl.DataFrame(schema={name: pl.Utf8 for name in output_columns})

    return pl.concat(list(all_preds.values()))


In [240]:
def analyze_bench(bench_df, swe_df_small):
    """Compare the file each prediction touches with the reference patch file.

    The two frames are inner-joined on ``instance_id`` and a boolean column
    ``is_same_file`` marks rows where ``file_changed`` equals
    ``true_patch_file``. Overall counts and a per-model table are printed.

    Args:
        bench_df: Frame with ``instance_id``, ``file_changed`` and
            ``model_name_or_path`` columns.
        swe_df_small: Frame with ``instance_id`` and ``true_patch_file`` columns.

    Returns:
        A tuple ``(merged_df, model_stats)``: the joined frame including
        ``is_same_file``, and the per-model aggregate frame.
    """
    same_file_flag = (pl.col('file_changed') == pl.col('true_patch_file')).alias('is_same_file')
    joined = bench_df.join(swe_df_small, on='instance_id', how='inner').with_columns(same_file_flag)

    hits = joined['is_same_file'].sum()
    row_count = joined.shape[0]
    # Guard the division so an empty join reports 0 instead of failing.
    if row_count > 0:
        hit_rate = (hits / row_count) * 100
    else:
        hit_rate = 0

    print(f"Number of instances where 'file_changed' == 'true_patch_file': {hits}")
    print(f"Total number of instances: {row_count}")
    print(f"Percentage of matches: {hit_rate:.2f}%")

    # Per-model breakdown: row count, matches, and match percentage.
    per_model_aggs = [
        pl.len().alias('total_instances'),
        pl.col('is_same_file').sum().alias('num_same_files'),
        (pl.col('is_same_file').sum() / pl.len() * 100).alias('percentage_matches'),
    ]
    per_model = joined.group_by('model_name_or_path').agg(per_model_aggs)

    print(per_model)
    return joined, per_model

In [241]:
# Read the test-split parquet files of the SWE-bench and SWE-bench_Lite
# datasets through their hf:// paths.
swe_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench/data/test-00000-of-00001.parquet')
swe_bench_lite_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench_Lite/data/test-00000-of-00001.parquet')

# Lite bench

## Experiments (6 best)

- 20240702_codestory_aide_mixed
- 20240820_honeycomb
- 20240627_abanteai_mentatbot_gpt4o
- 20240811_gru
- 20240829_Isoform
- 20240806_SuperCoder2.0

In [242]:
# Experiment directory names; process_experiments reads
# `<lite_dir>/<name>/all_preds.jsonl` for each of them.
lite_experiments = [
    '20240702_codestory_aide_mixed',
    '20240820_honeycomb',
    '20240627_abanteai_mentatbot_gpt4o',
    '20240811_gru',
    '20240829_Isoform',
    '20240806_SuperCoder2.0',
]

lite_dir = './experiments/evaluation/lite/'

# Stack the predictions of all listed experiments into a single frame.
lite_bench_df = process_experiments(lite_experiments, lite_dir)

In [243]:
# Derive `true_patch_file` from the `patch` column as capture group 1 of the
# module-level `pattern`.
# NOTE(review): this is the `diff --git`-only regex, not the broader pattern
# that is local to process_experiments.
swe_bench_lite_df = swe_bench_lite_df.with_columns(
    pl.col('patch').str.extract(pattern, 1).alias('true_patch_file')
)
# Keep only the join key and the reference file for the comparison.
swe_df_small = swe_bench_lite_df.select(['instance_id', 'true_patch_file'])

lite_merged_df, lite_model_stats = analyze_bench(lite_bench_df, swe_df_small)

Number of instances where 'file_changed' == 'true_patch_file': 1196
Total number of instances: 1797
Percentage of matches: 66.56%
shape: (6, 4)
┌─────────────────────────────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path              ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                             ┆ ---             ┆ ---            ┆ ---                │
│ str                             ┆ u32             ┆ u32            ┆ f64                │
╞═════════════════════════════════╪═════════════════╪════════════════╪════════════════════╡
│ 20240829_Isoform                ┆ 300             ┆ 213            ┆ 71.0               │
│ 20240702_codestory_aide_mixed   ┆ 300             ┆ 206            ┆ 68.666667          │
│ 20240811_gru                    ┆ 300             ┆ 213            ┆ 71.0               │
│ 20240627_abanteai_mentatbot_gp… ┆ 300             ┆ 187            ┆ 62.333333          │
│ 20240806_SuperCoder2.0    

# Full bench

## Experiments (5 best)

- 20240820_honeycomb
- 20240509_amazon-q-developer-agent-20240430-dev
- 20240617_factory_code_droid
- 20240628_autocoderover-v20240620
- 20240620_sweagent_claude3.5sonnet

In [244]:
# Experiment directory names; process_experiments reads
# `<full_dir>/<name>/all_preds.jsonl` for each of them.
full_experiments = [
    '20240820_honeycomb',
    '20240509_amazon-q-developer-agent-20240430-dev',
    '20240617_factory_code_droid',
    '20240628_autocoderover-v20240620',
    '20240620_sweagent_claude3.5sonnet',
]

full_dir = './experiments/evaluation/test/'

# Stack the predictions of all listed experiments into a single frame.
full_bench_df = process_experiments(full_experiments, full_dir)

In [245]:
# Derive `true_patch_file` from the `patch` column as capture group 1 of the
# module-level `pattern`.
# NOTE(review): this is the `diff --git`-only regex, not the broader pattern
# that is local to process_experiments.
swe_df = swe_df.with_columns(
    pl.col('patch').str.extract(pattern, 1).alias('true_patch_file')
)
# Rebinds `swe_df_small`, which an earlier cell set from the Lite frame.
swe_df_small = swe_df.select(['instance_id', 'true_patch_file'])

full_merged_df, full_model_stats = analyze_bench(full_bench_df, swe_df_small)

Number of instances where 'file_changed' == 'true_patch_file': 6490
Total number of instances: 11329
Percentage of matches: 57.29%
shape: (5, 4)
┌─────────────────────────────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path              ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                             ┆ ---             ┆ ---            ┆ ---                │
│ str                             ┆ u32             ┆ u32            ┆ f64                │
╞═════════════════════════════════╪═════════════════╪════════════════╪════════════════════╡
│ 20240620_sweagent_claude3.5son… ┆ 2274            ┆ 855            ┆ 37.598945          │
│ 20240820_honeycomb              ┆ 2236            ┆ 1219           ┆ 54.516995          │
│ 20240509_amazon-q-developer-ag… ┆ 2294            ┆ 1453           ┆ 63.339146          │
│ 20240628_autocoderover-v202406… ┆ 2241            ┆ 1435           ┆ 64.033913          │
│ 20240617_factory_code_dro