In [22]:
import polars as pl
import pandas as pd
import json
import os
import re

In [23]:
# Regex used to pull a file path out of diff text: it matches either a
# "diff --git a/<path> b/" header or a "--- a/" prefix and captures the rest
# of that line as group 1. Only the first match in a string is used by the
# `str.extract` calls below.
pattern = r'(?:diff --git a/.+? b/|--- a/)(.+)'

In [24]:
def read_jsonl_file_line_by_line(file_path):
    """Parse a JSON Lines file into a list of decoded records.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being treated as malformed JSON.

    Args:
        file_path: Path of the ``.jsonl`` file to read (opened as UTF-8).

    Returns:
        A list with one decoded object per non-blank line, or ``None`` when
        the file cannot be opened/read or any non-blank line is not valid
        JSON. A message is printed in both failure cases.
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # json.loads('') raises JSONDecodeError, so an empty line
                # would otherwise invalidate the whole file.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"JSON decode error in {file_path}: {e}")
                    return None
        return data
    except Exception as e:
        # Best-effort contract: callers test for None rather than catching.
        print(f"Error opening {file_path}: {e}")
        return None

In [25]:
def process_experiments(experiments, base_dir):
    """Load the predictions of several experiments into one polars DataFrame.

    For every experiment name, ``<base_dir>/<exp>/all_preds.jsonl`` is read
    and ``<base_dir>/<exp>/results/results.json`` supplies the list of
    resolved instance ids (its ``'resolved'`` key).

    Args:
        experiments: Iterable of experiment directory names under ``base_dir``.
        base_dir: Directory that contains one sub-directory per experiment.

    Returns:
        A DataFrame with the columns ``model_name_or_path`` (set to the
        experiment name), ``instance_id``, ``file_changed`` (first path that
        the module-level ``pattern`` extracts from ``model_patch``),
        ``model_patch`` and ``resolved``. If no experiment could be loaded an
        empty DataFrame with those columns is returned.

    Experiments are skipped, with a printed message, when the results file
    does not exist, when the predictions file cannot be parsed, or when it
    holds no usable records. If the results file exists but cannot be read,
    the experiment is kept and every instance is marked as not resolved.
    """
    output_columns = ['model_name_or_path', 'instance_id', 'file_changed', 'model_patch', 'resolved']
    all_preds = {}

    for exp in experiments:
        file_path = os.path.join(base_dir, exp, 'all_preds.jsonl')
        results_file_path = os.path.join(base_dir, exp, 'results', 'results.json')

        try:
            if not os.path.exists(results_file_path):
                print(f"Results file for experiment {exp} does not exist")
                continue

            with open(results_file_path, 'r', encoding='utf-8') as f:
                results_data = json.load(f)
                resolved_list = results_data.get('resolved', [])
        except Exception as e:
            # Deliberate best effort: keep the predictions, but nothing can be
            # marked as resolved without a readable results file.
            print(f"Error reading results file for experiment {exp}: {e}")
            resolved_list = []

        df_data = read_jsonl_file_line_by_line(file_path)

        if df_data is None:
            print(f"Error reading file {file_path} from experiment {exp}")
            continue

        if not df_data:
            # An empty record list would build a column-less frame and make
            # the column expressions below fail.
            print(f"No predictions found in {file_path} from experiment {exp}")
            continue

        # Scan every record when inferring the schema so that a field that is
        # null in the first rows but populated later is typed correctly.
        df = pl.DataFrame(df_data, infer_schema_length=None)

        if 'instance_id' not in df.columns:
            print(f"No 'instance_id' field in {file_path} from experiment {exp}")
            continue

        # Guarantee a string-typed 'model_patch' column: without it neither
        # 'file_changed' nor the final select can be built, and a uniform
        # dtype keeps the per-experiment frames concatenable.
        if 'model_patch' in df.columns:
            df = df.with_columns(pl.col('model_patch').cast(pl.Utf8))
        else:
            df = df.with_columns(pl.lit(None, dtype=pl.Utf8).alias('model_patch'))

        df = df.with_columns(
            pl.lit(exp).alias('model_name_or_path'),
            pl.col('model_patch').str.extract(pattern, 1).alias('file_changed'),
            pl.col('instance_id').is_in(resolved_list).alias('resolved'),
        )

        all_preds[exp] = df.select(output_columns)

    if not all_preds:
        # pl.concat raises on an empty list; return an empty, typed frame.
        return pl.DataFrame(schema={
            'model_name_or_path': pl.Utf8,
            'instance_id': pl.Utf8,
            'file_changed': pl.Utf8,
            'model_patch': pl.Utf8,
            'resolved': pl.Boolean,
        })

    combined_df = pl.concat(list(all_preds.values()))
    return combined_df

In [26]:
def analyze_bench(bench_df, swe_df_small):
    """Compare the file each submission edits with the reference patch file.

    Args:
        bench_df: Predictions frame with at least ``instance_id``,
            ``file_changed``, ``resolved`` and ``model_name_or_path``.
        swe_df_small: Frame with ``instance_id`` and ``true_patch_file``.

    Returns:
        ``(merged_df, model_stats)``: the inner join of both inputs with the
        normalised path columns and the ``is_same_file`` flag added, and the
        per ``(model_name_or_path, resolved)`` match statistics.

    Prints overall, resolved and not-resolved match summaries, followed by
    the per-model statistics of the not-resolved rows sorted by percentage.
    """

    def _normalized_path(source, target):
        # Trim surrounding whitespace and turn backslashes into forward slashes.
        return pl.col(source).str.strip_chars().str.replace_all(r'\\', '/').alias(target)

    def _print_match_summary(frame):
        # Null flags (a path is missing) are ignored by sum() but still
        # counted in the row total.
        matches = frame['is_same_file'].sum()
        rows = frame.shape[0]
        share = (matches / rows) * 100 if rows > 0 else 0
        print(f"Number of instances where 'file_changed' == 'true_patch_file': {matches}")
        print(f"Total number of instances: {rows}")
        print(f"Percentage of matches: {share:.2f}%\n")

    joined = bench_df.join(swe_df_small, on='instance_id', how='inner')
    joined = joined.with_columns(
        _normalized_path('file_changed', 'file_changed_norm'),
        _normalized_path('true_patch_file', 'true_patch_file_norm'),
    )
    merged_df = joined.with_columns(
        (pl.col('file_changed_norm') == pl.col('true_patch_file_norm')).alias('is_same_file')
    )

    print("Overall Statistics:")
    _print_match_summary(merged_df)

    for resolved_status in (True, False):
        if resolved_status:
            status = "Resolved"
        else:
            status = "Not Resolved"
        print(f"{status} Instances:")
        _print_match_summary(merged_df.filter(pl.col('resolved') == resolved_status))

    match_count = pl.col('is_same_file').sum()
    group_size = pl.len()
    model_stats = merged_df.group_by(['model_name_or_path', 'resolved']).agg(
        group_size.alias('total_instances'),
        match_count.alias('num_same_files'),
        (match_count / group_size * 100).alias('percentage_matches'),
    )

    print("Model Statistics by Resolved Status:")
    unresolved_stats = model_stats.filter(pl.col('resolved') == False)
    print(unresolved_stats.sort('percentage_matches'))
    return merged_df, model_stats


In [27]:
# Reference datasets: the `test-00000-of-00001.parquet` file of SWE-bench,
# SWE-bench_Lite and SWE-bench_Verified, read through their hf:// URIs.
# The cells below read the `instance_id` and `patch` columns from these frames.
swe_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench/data/test-00000-of-00001.parquet')
swe_bench_lite_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench_Lite/data/test-00000-of-00001.parquet')
swe_bench_verified_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench_Verified/data/test-00000-of-00001.parquet')

# Lite bench

In [28]:
lite_dir = './experiments/evaluation/lite/'

# os.listdir returns entries in arbitrary order; sort them so the printed
# list and the row order of the combined frame are reproducible.
lite_experiments = sorted(os.listdir(lite_dir))
print(lite_experiments)

lite_bench_df = process_experiments(lite_experiments, lite_dir)
lite_bench_df

['20231010_rag_claude2', '20231010_rag_gpt35', '20231010_rag_swellama13b', '20231010_rag_swellama7b', '20240402_rag_claude3opus', '20240402_rag_gpt4', '20240402_sweagent_claude3opus', '20240402_sweagent_gpt4', '20240509_amazon-q-developer-agent-20240430-dev', '20240523_aider', '20240524_opencsg_starship_gpt4', '20240530_autocoderover-v20240408', '20240604_CodeR', '20240612_IBM_Research_Agent101', '20240612_MASAI_gpt4o', '20240615_appmap-navie_gpt4o', '20240617_factory_code_droid', '20240617_moatless_gpt4o', '20240620_sweagent_claude3.5sonnet', '20240621_autocoderover-v20240620', '20240622_Lingma_Agent', '20240623_moatless_claude35sonnet', '20240627_abanteai_mentatbot_gpt4o', '20240630_agentless_gpt4o', '20240702_codestory_aide_mixed', '20240706_sima_gpt4o', '20240721_amazon-q-developer-agent-20240719-dev', '20240723_marscode-agent-dev', '20240725_opendevin_codeact_v1.8_claude35sonnet', '20240728_sweagent_gpt4o', '20240806_SuperCoder2.0', '20240808_RepoGraph_gpt4o', '20240811_gru', '202

model_name_or_path,instance_id,file_changed,model_patch,resolved
str,str,str,str,bool
"""20231010_rag_claude2""","""matplotlib__matplotlib-24334""","""lib/matplotlib/axis.py""","""--- a/lib/matplotlib/axis.py +…",false
"""20231010_rag_claude2""","""sympy__sympy-18087""","""sympy/simplify/simplify.py""","""--- a/sympy/simplify/simplify.…",false
"""20231010_rag_claude2""","""sympy__sympy-12419""","""sympy/matrices/expressions/mat…",""" Here is a patch file that fix…",false
"""20231010_rag_claude2""","""sympy__sympy-20212""","""sympy/core/power.py""","""--- a/sympy/core/power.py +++ …",false
"""20231010_rag_claude2""","""sympy__sympy-18698""","""sympy/polys/polytools.py""","""--- a/sympy/polys/polytools.py…",false
…,…,…,…,…
"""20241122_devlo""","""sympy__sympy-13471""","""sympy/core/numbers.py""","""diff --git a/sympy/core/number…",true
"""20241122_devlo""","""sympy__sympy-16792""","""sympy/utilities/autowrap.py""","""diff --git a/sympy/utilities/a…",true
"""20241122_devlo""","""sympy__sympy-17655""","""sympy/geometry/point.py""","""diff --git a/sympy/geometry/po…",true
"""20241122_devlo""","""django__django-12470""","""django/db/models/sql/compiler.…","""diff --git a/django/db/models/…",false


In [29]:
# Extract the reference file path from each `patch`, then keep only the join
# key and that path for the comparison with the submissions.
swe_bench_lite_df = swe_bench_lite_df.with_columns(
    pl.col('patch').str.extract(pattern, group_index=1).alias('true_patch_file'),
)
swe_df_small = swe_bench_lite_df.select('instance_id', 'true_patch_file')

lite_merged_df, lite_model_stats = analyze_bench(
    bench_df=lite_bench_df,
    swe_df_small=swe_df_small,
)

Overall Statistics:
Number of instances where 'file_changed' == 'true_patch_file': 8267
Total number of instances: 13641
Percentage of matches: 60.60%

Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 3186
Total number of instances: 3594
Percentage of matches: 88.65%

Not Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 5081
Total number of instances: 10047
Percentage of matches: 50.57%

Model Statistics by Resolved Status:
shape: (46, 5)
┌───────────────────────────────┬──────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path            ┆ resolved ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                           ┆ ---      ┆ ---             ┆ ---            ┆ ---                │
│ str                           ┆ bool     ┆ u32             ┆ u32            ┆ f64                │
╞═══════════════════════════════╪══════════╪═════════════════╪════════════════╪══════

In [30]:
# Unweighted mean, across models, of the match percentage on not-resolved rows.
(
    lite_model_stats
    .filter(~pl.col('resolved'))
    .get_column('percentage_matches')
    .mean()
)

51.944920796965064

# Full bench

In [31]:
full_dir = './experiments/evaluation/test/'

# os.listdir returns entries in arbitrary order; sort them so the row order
# of the combined frame is reproducible.
full_experiments = sorted(os.listdir(full_dir))

full_bench_df = process_experiments(full_experiments, full_dir)

In [32]:
# Extract the reference file path from each `patch`, then keep only the join
# key and that path for the comparison with the submissions.
swe_df = swe_df.with_columns(
    pl.col('patch').str.extract(pattern, group_index=1).alias('true_patch_file'),
)
swe_df_small = swe_df.select('instance_id', 'true_patch_file')

full_merged_df, full_model_stats = analyze_bench(
    bench_df=full_bench_df,
    swe_df_small=swe_df_small,
)

Overall Statistics:
Number of instances where 'file_changed' == 'true_patch_file': 16076
Total number of instances: 36549
Percentage of matches: 43.98%

Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 3293
Total number of instances: 3901
Percentage of matches: 84.41%

Not Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 12783
Total number of instances: 32648
Percentage of matches: 39.15%

Model Statistics by Resolved Status:
shape: (16, 5)
┌───────────────────────────────┬──────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path            ┆ resolved ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                           ┆ ---      ┆ ---             ┆ ---            ┆ ---                │
│ str                           ┆ bool     ┆ u32             ┆ u32            ┆ f64                │
╞═══════════════════════════════╪══════════╪═════════════════╪════════════════╪════

In [33]:
# Unweighted mean, across models, of the match percentage on not-resolved rows.
(
    full_model_stats
    .filter(~pl.col('resolved'))
    .get_column('percentage_matches')
    .mean()
)


40.011310648201544

# Verified bench

In [34]:
verified_dir = './experiments/evaluation/verified/'

# Submissions left out of the analysis. Filtering against a set, instead of
# calling list.remove, keeps this cell from raising ValueError when one of
# these directories is not present.
excluded_verified_experiments = {
    '20241028_solver',  # Removed due to a JSON decode error
    '20240620_sweagent_claude3.5sonnet',  # Removed due to a missing results file
}

# os.listdir returns entries in arbitrary order; sort for reproducible rows.
verified_experiments = [
    exp
    for exp in sorted(os.listdir(verified_dir))
    if exp not in excluded_verified_experiments
]

verified_df = process_experiments(verified_experiments, verified_dir)

In [35]:
# Extract the reference file path from each `patch`, then keep only the join
# key and that path for the comparison with the submissions.
swe_bench_verified_df = swe_bench_verified_df.with_columns(
    pl.col('patch').str.extract(pattern, group_index=1).alias('true_patch_file'),
)
swe_df_verified = swe_bench_verified_df.select('instance_id', 'true_patch_file')

verified_merged_df, verified_model_stats = analyze_bench(
    bench_df=verified_df,
    swe_df_small=swe_df_verified,
)

Overall Statistics:
Number of instances where 'file_changed' == 'true_patch_file': 12609
Total number of instances: 20435
Percentage of matches: 61.70%

Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 5611
Total number of instances: 6401
Percentage of matches: 87.66%

Not Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 6998
Total number of instances: 14034
Percentage of matches: 49.86%

Model Statistics by Resolved Status:
shape: (41, 5)
┌───────────────────────────────┬──────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path            ┆ resolved ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                           ┆ ---      ┆ ---             ┆ ---            ┆ ---                │
│ str                           ┆ bool     ┆ u32             ┆ u32            ┆ f64                │
╞═══════════════════════════════╪══════════╪═════════════════╪════════════════╪═════

In [36]:
# Unweighted mean, across models, of the match percentage on not-resolved rows.
(
    verified_model_stats
    .filter(~pl.col('resolved'))
    .get_column('percentage_matches')
    .mean()
)

51.53731628562242