In [4]:
import polars as pl
import pandas as pd
import json
import os
import re

In [12]:
# Regex used with `str.extract(pattern, 1)` to pull a file path out of a patch:
# capture group 1 is the text that follows either a `diff --git a/<...> b/`
# prefix or a `--- a/` prefix.
# NOTE(review): only one path is extracted per patch (presumably the first
# match), so multi-file patches are compared by a single file — confirm this
# is the intended metric.
pattern = r'(?:diff --git a/.+? b/|--- a/)(.+)'

In [None]:
def read_jsonl_file_line_by_line(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Blank or whitespace-only lines are skipped instead of being treated as
    malformed JSON, and a leading UTF-8 byte-order mark is tolerated.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, or ``None`` if
        the file cannot be opened/read or any non-blank line is not valid
        JSON. An error message is printed in both failure cases.
    """
    data = []
    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged and strips a BOM if present,
        # which would otherwise make json.loads reject the first record.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line_number, line in enumerate(f, start=1):
                # Empty separator lines / trailing blank lines are not records.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # One malformed record invalidates the whole file, so the
                    # caller never works with a silently truncated dataset.
                    print(f"JSON decode error in {file_path} (line {line_number}): {e}")
                    return None
        return data
    except Exception as e:
        print(f"Error opening {file_path}: {e}")
        return None

In [25]:
def process_experiments(experiments, base_dir):
    """Collect the predictions of several experiments into one polars DataFrame.

    For every experiment name, reads ``<base_dir>/<exp>/all_preds.jsonl``
    (predictions) and ``<base_dir>/<exp>/results/results.json`` (whose
    ``'resolved'`` entry lists the resolved instance ids).

    Args:
        experiments: Iterable of experiment directory names.
        base_dir: Directory that contains one sub-directory per experiment.

    Returns:
        A DataFrame with the columns ``model_name_or_path`` (set to the
        experiment name), ``instance_id``, ``file_changed`` (path extracted
        from ``model_patch`` with the module-level ``pattern`` regex),
        ``model_patch`` and ``resolved`` (whether ``instance_id`` is in the
        experiment's resolved list). Experiments whose results file is
        missing, or whose predictions are unreadable or empty, are skipped
        with a printed message. If nothing could be loaded, an empty frame
        with the same columns is returned.
    """
    output_schema = {
        'model_name_or_path': pl.Utf8,
        'instance_id': pl.Utf8,
        'file_changed': pl.Utf8,
        'model_patch': pl.Utf8,
        'resolved': pl.Boolean,
    }
    all_preds = {}

    for exp in experiments:
        file_path = os.path.join(base_dir, exp, 'all_preds.jsonl')
        results_file_path = os.path.join(base_dir, exp, 'results', 'results.json')

        if not os.path.exists(results_file_path):
            print(f"Results file for experiment {exp} does not exist")
            continue

        try:
            with open(results_file_path, 'r', encoding='utf-8') as f:
                results_data = json.load(f)
            resolved_list = results_data.get('resolved', [])
        except Exception as e:
            # Best effort: an unreadable results file leaves every instance
            # of this experiment marked as not resolved.
            print(f"Error reading results file for experiment {exp}: {e}")
            resolved_list = []

        df_data = read_jsonl_file_line_by_line(file_path)

        if df_data is None:
            print(f"Error reading file {file_path} from experiment {exp}")
            continue

        if not df_data:
            # An empty predictions file would produce a column-less frame and
            # break the column expressions below.
            print(f"No predictions found in {file_path} for experiment {exp}")
            continue

        df = pl.DataFrame(df_data).with_columns(
            pl.lit(exp).alias('model_name_or_path')
        )

        if 'model_patch' in df.columns:
            # An all-null column is inferred as Null dtype; cast so that
            # `.str` works and all experiments share one schema at concat.
            df = df.with_columns(pl.col('model_patch').cast(pl.Utf8))
        else:
            # Keep the output schema stable even when patches are absent,
            # instead of failing later in `select`.
            df = df.with_columns(pl.lit(None, dtype=pl.Utf8).alias('model_patch'))

        df = df.with_columns(
            pl.col('model_patch').str.extract(pattern, 1).alias('file_changed'),
            pl.col('instance_id').is_in(resolved_list).alias('resolved'),
        )

        all_preds[exp] = df.select(list(output_schema))

    if not all_preds:
        # pl.concat raises on an empty list; return a typed empty frame.
        return pl.DataFrame(schema=output_schema)

    combined_df = pl.concat(list(all_preds.values()))
    return combined_df

In [26]:
def analyze_bench(bench_df, swe_df_small):
    """Report how often a prediction edits the same file as the reference patch.

    The two frames are inner-joined on ``instance_id``. ``file_changed`` and
    ``true_patch_file`` are normalised (surrounding characters stripped via
    ``str.strip_chars()``, backslashes replaced by ``/``) and compared to
    produce the boolean column ``is_same_file``.

    Match counts and percentages are printed for all rows, for resolved rows
    and for unresolved rows, followed by the per-model table restricted to
    unresolved rows and sorted by ``percentage_matches``.

    Args:
        bench_df: Frame with at least ``instance_id``, ``file_changed``,
            ``resolved`` and ``model_name_or_path``.
        swe_df_small: Frame with ``instance_id`` and ``true_patch_file``.

    Returns:
        ``(merged_df, model_stats)``: the joined frame including the
        normalised and comparison columns, and the statistics grouped by
        ``model_name_or_path`` and ``resolved``.
    """
    def summarize(heading, frame):
        # Print match count, row count and match percentage for one slice.
        hits = frame['is_same_file'].sum()
        rows = frame.shape[0]
        if rows > 0:
            share = (hits / rows) * 100
        else:
            share = 0
        print(heading)
        print(f"Number of instances where 'file_changed' == 'true_patch_file': {hits}")
        print(f"Total number of instances: {rows}")
        print(f"Percentage of matches: {share:.2f}%\n")

    # Normalised path expressions, shared by the stored columns and the comparison.
    predicted_path = pl.col('file_changed').str.strip_chars().str.replace_all(r'\\', '/')
    reference_path = pl.col('true_patch_file').str.strip_chars().str.replace_all(r'\\', '/')

    merged_df = bench_df.join(swe_df_small, on='instance_id', how='inner').with_columns([
        predicted_path.alias('file_changed_norm'),
        reference_path.alias('true_patch_file_norm'),
        (predicted_path == reference_path).alias('is_same_file'),
    ])

    summarize("Overall Statistics:", merged_df)
    for flag, label in ((True, "Resolved"), (False, "Not Resolved")):
        summarize(f"{label} Instances:", merged_df.filter(pl.col('resolved') == flag))

    same_file_count = pl.col('is_same_file').sum()
    model_stats = merged_df.group_by(['model_name_or_path', 'resolved']).agg([
        pl.len().alias('total_instances'),
        same_file_count.alias('num_same_files'),
        (same_file_count / pl.len() * 100).alias('percentage_matches'),
    ])

    print("Model Statistics by Resolved Status:")
    unresolved_stats = model_stats.filter(pl.col('resolved') == False)
    print(unresolved_stats.sort('percentage_matches'))
    return merged_df, model_stats


In [27]:
# Load the `test` parquet shard of each benchmark variant (full SWE-bench,
# SWE-bench Lite, SWE-bench Verified) from `hf://datasets/princeton-nlp/...`.
# These reads need network access to the dataset host.
swe_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench/data/test-00000-of-00001.parquet')
swe_bench_lite_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench_Lite/data/test-00000-of-00001.parquet')
swe_bench_verified_df = pl.read_parquet('hf://datasets/princeton-nlp/SWE-bench_Verified/data/test-00000-of-00001.parquet')

# Lite bench

## Experiments (6 best)

- 20240702_codestory_aide_mixed
- 20240820_honeycomb
- 20240627_abanteai_mentatbot_gpt4o
- 20240811_gru
- 20240829_Isoform
- 20240806_SuperCoder2.0

In [28]:
# Experiment directory names analysed for the Lite benchmark.
lite_experiments = [
    '20240702_codestory_aide_mixed',
    '20240820_honeycomb',
    '20240627_abanteai_mentatbot_gpt4o',
    '20240811_gru',
    '20240829_Isoform',
    '20240806_SuperCoder2.0',
]

# Base directory; process_experiments looks for <lite_dir>/<experiment>/all_preds.jsonl
# and <lite_dir>/<experiment>/results/results.json.
lite_dir = './experiments/evaluation/lite/'

# Combine the predictions of all listed experiments into one frame and display it.
lite_bench_df = process_experiments(lite_experiments, lite_dir)
lite_bench_df

model_name_or_path,instance_id,file_changed,model_patch,resolved
str,str,str,str,bool
"""20240702_codestory_aide_mixed""","""matplotlib__matplotlib-25433""","""lib/matplotlib/widgets.py""","""diff --git a/lib/matplotlib/wi…",false
"""20240702_codestory_aide_mixed""","""django__django-16820""","""django/db/migrations/autodetec…","""diff --git a/django/db/migrati…",false
"""20240702_codestory_aide_mixed""","""django__django-15781""","""django/core/management/base.py""","""diff --git a/django/core/manag…",false
"""20240702_codestory_aide_mixed""","""sympy__sympy-18087""","""sympy/simplify/trigsimp.py""","""diff --git a/sympy/simplify/tr…",false
"""20240702_codestory_aide_mixed""","""pytest-dev__pytest-5495""","""src/_pytest/_code/code.py""","""diff --git a/src/_pytest/_code…",false
…,…,…,…,…
"""20240806_SuperCoder2.0""","""scikit-learn__scikit-learn-255…","""sklearn/compose/_column_transf…","""diff --git a/sklearn/compose/_…",false
"""20240806_SuperCoder2.0""","""sphinx-doc__sphinx-8801""","""sphinx/ext/autodoc/__init__.py""","""diff --git a/sphinx/ext/autodo…",false
"""20240806_SuperCoder2.0""","""sympy__sympy-14396""","""sympy/polys/polyoptions.py""","""diff --git a/sympy/polys/polyo…",false
"""20240806_SuperCoder2.0""","""django__django-11905""","""django/db/models/lookups.py""","""diff --git a/django/db/models/…",false


In [29]:
# Extract a file path from each reference patch with the same `pattern` regex
# that process_experiments applies to `model_patch`.
# NOTE: this rebinds swe_bench_lite_df to the frame with the added column.
swe_bench_lite_df = swe_bench_lite_df.with_columns(
    pl.col('patch').str.extract(pattern, 1).alias('true_patch_file')
)
# Keep only the join key and the reference file path.
swe_df_small = swe_bench_lite_df.select(['instance_id', 'true_patch_file'])

# Join predictions with the reference paths and print the match statistics.
lite_merged_df, lite_model_stats = analyze_bench(lite_bench_df, swe_df_small)

Overall Statistics:
Number of instances where 'file_changed' == 'true_patch_file': 1196
Total number of instances: 1797
Percentage of matches: 66.56%

Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 612
Total number of instances: 672
Percentage of matches: 91.07%

Not Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 584
Total number of instances: 1125
Percentage of matches: 51.91%

Model Statistics by Resolved Status:
shape: (6, 5)
┌───────────────────────────────┬──────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path            ┆ resolved ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                           ┆ ---      ┆ ---             ┆ ---            ┆ ---                │
│ str                           ┆ bool     ┆ u32             ┆ u32            ┆ f64                │
╞═══════════════════════════════╪══════════╪═════════════════╪════════════════╪════════════

# Full bench

## Experiments (5 best)

- 20240820_honeycomb
- 20240509_amazon-q-developer-agent-20240430-dev
- 20240617_factory_code_droid
- 20240628_autocoderover-v20240620
- 20240620_sweagent_claude3.5sonnet

In [30]:
# Experiment directory names analysed for the full benchmark.
full_experiments = [
    '20240820_honeycomb',
    '20240509_amazon-q-developer-agent-20240430-dev',
    '20240617_factory_code_droid',
    '20240628_autocoderover-v20240620',
    '20240620_sweagent_claude3.5sonnet',
]

# Base directory; process_experiments looks for <full_dir>/<experiment>/all_preds.jsonl
# and <full_dir>/<experiment>/results/results.json.
full_dir = './experiments/evaluation/test/'

# Combine the predictions of all listed experiments into one frame.
full_bench_df = process_experiments(full_experiments, full_dir)

In [31]:
# Extract a file path from each reference patch of the full benchmark with the
# `pattern` regex. NOTE: this rebinds swe_df to the frame with the added column.
swe_df = swe_df.with_columns(
    pl.col('patch').str.extract(pattern, 1).alias('true_patch_file')
)
# NOTE: reuses the name swe_df_small, replacing the Lite frame bound to it in
# an earlier cell; re-running that earlier analysis afterwards needs its own
# cell to be re-executed first.
swe_df_small = swe_df.select(['instance_id', 'true_patch_file'])

# Join predictions with the reference paths and print the match statistics.
full_merged_df, full_model_stats = analyze_bench(full_bench_df, swe_df_small)

Overall Statistics:
Number of instances where 'file_changed' == 'true_patch_file': 6490
Total number of instances: 11329
Percentage of matches: 57.29%

Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 1765
Total number of instances: 2113
Percentage of matches: 83.53%

Not Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 4725
Total number of instances: 9216
Percentage of matches: 51.27%

Model Statistics by Resolved Status:
shape: (5, 5)
┌───────────────────────────────┬──────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path            ┆ resolved ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                           ┆ ---      ┆ ---             ┆ ---            ┆ ---                │
│ str                           ┆ bool     ┆ u32             ┆ u32            ┆ f64                │
╞═══════════════════════════════╪══════════╪═════════════════╪════════════════╪════════

# Verified bench
## Experiments (All)

In [36]:
verified_dir = './experiments/evaluation/verified/'

# Experiments left out of the verified analysis. Filtering by membership
# (instead of list.remove) does not raise ValueError when one of these
# directories is absent from the checkout.
excluded_experiments = {'20241028_solver', '20240620_sweagent_claude3.5sonnet'}

# os.listdir returns entries in arbitrary order and also lists plain files;
# keep only experiment directories and sort them so reruns are reproducible.
verified_experiments = sorted(
    name
    for name in os.listdir(verified_dir)
    if name not in excluded_experiments
    and os.path.isdir(os.path.join(verified_dir, name))
)
verified_experiments

['20231010_rag_claude2',
 '20231010_rag_gpt35',
 '20231010_rag_swellama13b',
 '20231010_rag_swellama7b',
 '20240402_rag_claude3opus',
 '20240402_rag_gpt4',
 '20240402_sweagent_claude3opus',
 '20240402_sweagent_gpt4',
 '20240509_amazon-q-developer-agent-20240430-dev',
 '20240615_appmap-navie_gpt4o',
 '20240617_factory_code_droid',
 '20240628_autocoderover-v20240620',
 '20240721_amazon-q-developer-agent-20240719-dev',
 '20240728_sweagent_gpt4o',
 '20240820_epam-ai-run-gpt-4o',
 '20240820_honeycomb',
 '20240824_gru',
 '20240918_lingma-agent_lingma-swe-gpt-72b',
 '20240918_lingma-agent_lingma-swe-gpt-7b',
 '20240920_solver',
 '20240924_solver',
 '20241001_nfactorial',
 '20241002_lingma-agent_lingma-swe-gpt-72b',
 '20241002_lingma-agent_lingma-swe-gpt-7b',
 '20241007_nfactorial',
 '20241016_composio_swekit',
 '20241016_epam-ai-run-gpt-4o',
 '20241022_tools_claude-3-5-haiku',
 '20241022_tools_claude-3-5-sonnet-updated',
 '20241023_emergent',
 '20241025_composio_swekit',
 '20241028_agentless-

In [37]:
# Combine the predictions of every selected verified experiment into one frame and display it.
verified_df = process_experiments(verified_experiments, verified_dir)
verified_df

model_name_or_path,instance_id,file_changed,model_patch,resolved
str,str,str,str,bool
"""20231010_rag_claude2""","""sympy__sympy-12419""","""sympy/matrices/expressions/mat…",""" Here is a patch file that fix…",false
"""20231010_rag_claude2""","""sympy__sympy-18698""","""sympy/polys/polytools.py""","""--- a/sympy/polys/polytools.py…",false
"""20231010_rag_claude2""","""sympy__sympy-15599""","""sympy/core/mod.py""","""--- a/sympy/core/mod.py +++ b/…",false
"""20231010_rag_claude2""","""astropy__astropy-14539""","""astropy/io/fits/diff.py""","""--- a/astropy/io/fits/diff.py …",false
"""20231010_rag_claude2""","""sympy__sympy-16450""","""sympy/assumptions/posify.py""","""--- a/sympy/assumptions/posify…",false
…,…,…,…,…
"""20241120_artemis_agent""","""sympy__sympy-24213""","""sympy/physics/units/unitsystem…","""diff --git a/sympy/physics/uni…",true
"""20241120_artemis_agent""","""sympy__sympy-24443""","""sympy/combinatorics/homomorphi…","""diff --git a/sympy/combinatori…",true
"""20241120_artemis_agent""","""sympy__sympy-24539""","""sympy/polys/rings.py""","""diff --git a/sympy/polys/rings…",true
"""20241120_artemis_agent""","""sympy__sympy-24562""","""sympy/core/numbers.py""","""diff --git a/sympy/core/number…",false


In [38]:
# Extract a file path from each reference patch of the Verified benchmark with
# the `pattern` regex. NOTE: this rebinds swe_bench_verified_df to the frame
# with the added column.
swe_bench_verified_df = swe_bench_verified_df.with_columns(
    pl.col('patch').str.extract(pattern, 1).alias('true_patch_file')
)
# Keep only the join key and the reference file path.
swe_df_verified = swe_bench_verified_df.select(['instance_id', 'true_patch_file'])

# Join predictions with the reference paths and print the match statistics.
verified_merged_df, verified_model_stats = analyze_bench(verified_df, swe_df_verified)

Overall Statistics:
Number of instances where 'file_changed' == 'true_patch_file': 12609
Total number of instances: 20435
Percentage of matches: 61.70%

Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 5611
Total number of instances: 6401
Percentage of matches: 87.66%

Not Resolved Instances:
Number of instances where 'file_changed' == 'true_patch_file': 6998
Total number of instances: 14034
Percentage of matches: 49.86%

Model Statistics by Resolved Status:
shape: (41, 5)
┌───────────────────────────────┬──────────┬─────────────────┬────────────────┬────────────────────┐
│ model_name_or_path            ┆ resolved ┆ total_instances ┆ num_same_files ┆ percentage_matches │
│ ---                           ┆ ---      ┆ ---             ┆ ---            ┆ ---                │
│ str                           ┆ bool     ┆ u32             ┆ u32            ┆ f64                │
╞═══════════════════════════════╪══════════╪═════════════════╪════════════════╪═════

In [None]:
# Raise the polars table-row display setting to 100 so the per-model table
# below is not truncated. This is a global setting and affects later cells too.
pl.Config.set_tbl_rows(100)

polars.config.Config

In [None]:
# Per-model match rate on unresolved instances only, lowest percentage first.
verified_model_stats.filter(pl.col('resolved') == False).sort('percentage_matches')

model_name_or_path,resolved,total_instances,num_same_files,percentage_matches
str,bool,u32,u32,f64
"""20231010_rag_swellama7b""",False,489,70,14.314928
"""20231010_rag_swellama13b""",False,475,68,14.315789
"""20240402_sweagent_claude3opus""",False,493,145,29.411765
"""20231010_rag_gpt35""",False,498,163,32.730924
"""20240402_sweagent_gpt4""",False,390,132,33.846154
"""20240728_sweagent_gpt4o""",False,349,131,37.535817
"""20240615_appmap-navie_gpt4o""",False,368,173,47.01087
"""20240402_rag_gpt4""",False,486,229,47.119342
"""20231010_rag_claude2""",False,478,234,48.953975
"""20240820_honeycomb""",False,290,145,50.0
