In [1]:
import pandas as pd

In [2]:
with open("ImportantNotesFromSweBenchVerifiedReport.md", "r") as f:
    print(f.read())

Total 93 independent annotators
nnotated 1699 testcases from SWE-bench
Labels go from [0, 3], with 0 being no or minor issue, 3 being severe
Difficulty annotated as "How long will a developer take to solve"
Freeform text for "other major issues"

Team of OpenAI engineers handlabeled 50 samples to high degree of confidence.
Each annotator had to pass onboarding test.

In the final dataset, each sample labeled 3 times by separate annotators
Take the highest severeity label among 3 as the final label

Annotation Criteria:
1. Are the tasks well specified
2. How valid are the evaluation criteria: Could the FAIL_TO_PASS tests fail even with a valid solution?
3. (Not used for dataset filtering) How long will a developer take to solve the task?

Final dataset: filter out any sample from the original test set where either task 1 or task 2 have ensemble 
label of 2 or above in severity

Also filter out samples with other major issues flagged

Include as many samples with difficulty 1-4 and >4 ho

In [3]:
df_all_annotations = pd.read_csv('samples_with_3_annotations_public.csv')
df_ensembled = pd.read_csv('ensembled_annotations_public.csv')
df_swe_bench_full_test = pd.read_parquet('SWE-bench_hf_dataset_clone/data/test-00000-of-00001.parquet')
df_swe_bench_full_dev = pd.read_parquet('SWE-bench_hf_dataset_clone/data/dev-00000-of-00001.parquet')
df_swe_bench_full_train = pd.read_parquet('SWE-bench_hf_dataset_clone/data/train-00000-of-00001.parquet')

In [4]:
df_swe_bench_full = pd.concat([df_swe_bench_full_test, df_swe_bench_full_dev, df_swe_bench_full_train])

In [5]:
assert df_all_annotations.shape[0] == df_ensembled.shape[0]*3

In [6]:
assert (df_all_annotations['underspecified_problematic'] != (df_all_annotations['underspecified'] >= 2.0)).sum() == 0

In [7]:
assert (df_all_annotations['false_negative_problematic'] != (df_all_annotations['false_negative'] >= 2.0)).sum() == 0

In [8]:
def f(xdf):
    assert len(xdf) == 3
    instance_id = xdf['instance_id'].iloc[0]
    if xdf['problematic'].sum() > 0:
        if xdf['underspecified_problematic'].sum() == 0 and xdf['false_negative_problematic'].sum() == 0:
            assert xdf['other_major_issues'].sum() > 0
            return instance_id
    return None

instance_ids_to_filter_out = df_all_annotations.groupby('instance_id').apply(f).dropna().unique()

df_all_annotations_filtered = df_all_annotations[~df_all_annotations['instance_id'].isin(instance_ids_to_filter_out)]

  instance_ids_to_filter_out = df_all_annotations.groupby('instance_id').apply(f).dropna().unique()


In [9]:
df_swe_bench_full = df_swe_bench_full[df_swe_bench_full['instance_id'].isin(df_all_annotations_filtered['instance_id'])]

In [10]:
def f(row):
    instance_id = row['instance_id']
    row_swe_full = df_swe_bench_full[df_swe_bench_full['instance_id'] == instance_id].iloc[0]
    return row_swe_full.drop(['instance_id', 'repo'])

df_all_annotations_filtered = pd.concat([df_all_annotations_filtered, df_all_annotations_filtered.apply(f, axis=1)], axis=1)

In [11]:
def f(x):
    xdf = df_swe_bench_full[df_swe_bench_full['instance_id'] == x]
    assert len(xdf) == 1, x
    return xdf.iloc[0]['repo']

df_all_annotations_filtered['repo'] = df_all_annotations_filtered['instance_id'].apply(f)

In [12]:
import re

NON_TEST_EXTS = [
    ".json",
    ".png",
    "csv",
    ".txt",
    ".md",
    ".jpg",
    ".jpeg",
    ".pkl",
    ".yml",
    ".yaml",
    ".toml",
]

def get_test_directives(instance) -> list:
    """
    Get test directives from the test_patch of a task instance

    Args:
        instance (dict): task instance
    Returns:
        directives (list): List of test directives
    """
    # For seq2seq code repos, testing command is fixed
    if instance["repo"] == "swe-bench/humaneval":
        return ["test.py"]

    # Get test directives from test patch and remove non-test files
    diff_pat = r"diff --git a/.* b/(.*)"
    test_patch = instance["test_patch"]
    directives = re.findall(diff_pat, test_patch)
    directives = [
        d for d in directives if not any(d.endswith(ext) for ext in NON_TEST_EXTS)
    ]

    # For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing)
    if instance["repo"] == "django/django":
        directives_transformed = []
        for d in directives:
            d = d[: -len(".py")] if d.endswith(".py") else d
            d = d[len("tests/") :] if d.startswith("tests/") else d
            d = d.replace("/", ".")
            directives_transformed.append(d)
        directives = directives_transformed

    return directives

df_swe_bench_full.apply(get_test_directives, axis=1)

0              [astropy/wcs/wcsapi/tests/test_fitswcs.py]
3                 [astropy/io/fits/tests/test_connect.py]
4       [astropy/table/tests/conftest.py, astropy/tabl...
5       [astropy/io/ascii/tests/test_ecsv.py, astropy/...
6                   [astropy/io/ascii/tests/test_ecsv.py]
                              ...                        
2288                    [sympy/polys/tests/test_rings.py]
2289                   [sympy/core/tests/test_numbers.py]
2291           [sympy/parsing/tests/test_sympy_parser.py]
2292     [sympy/stats/tests/test_matrix_distributions.py]
2293         [sympy/physics/units/tests/test_prefixes.py]
Length: 1689, dtype: object

In [13]:
# DIFF_MODIFIED_FILE_REGEX = r"--- a/(.*)"

# def f(test_patch):
#     test_files = re.findall(DIFF_MODIFIED_FILE_REGEX, test_patch)
#     print(test_files)

# df_swe_bench_full['test_patch'].apply(f)

In [14]:
c = 7
print(df_swe_bench_full['repo'].value_counts().iloc[c:])
df_swe_bench_full['repo'].value_counts().iloc[c:].sum()

repo
astropy/astropy      71
pylint-dev/pylint    37
psf/requests         33
mwaskom/seaborn       9
pallets/flask         1
Name: count, dtype: int64


np.int64(151)

In [15]:
df_all_annotations_filtered['repo'].value_counts()/3

repo
django/django                646.0
sympy/sympy                  298.0
scikit-learn/scikit-learn    165.0
sphinx-doc/sphinx            138.0
matplotlib/matplotlib        125.0
pytest-dev/pytest             88.0
pydata/xarray                 78.0
astropy/astropy               71.0
pylint-dev/pylint             37.0
psf/requests                  33.0
mwaskom/seaborn                9.0
pallets/flask                  1.0
Name: count, dtype: float64

In [16]:
num_for_test_from_each_repo_map = {
    'django/django' : 36,
    'sympy/sympy' : 36,
    'scikit-learn/scikit-learn' : 36,
    'sphinx-doc/sphinx' : 36,
    'matplotlib/matplotlib' : 35,
    'pytest-dev/pytest' : 35,
    'pydata/xarray' : 35,
    'astropy/astropy' : 71,
    'pylint-dev/pylint' : 37,
    'psf/requests' : 33,
    'mwaskom/seaborn' : 9,
    'pallets/flask' : 1
}

In [17]:
train_or_val_instances = []
test_instances = []
for repo_name, num_test_samples in num_for_test_from_each_repo_map.items():
    repo_instances = df_all_annotations_filtered[df_all_annotations_filtered['repo'] == repo_name]['instance_id'].unique()
    test_instance_names = pd.Series(repo_instances).sample(n=num_test_samples, replace=False).tolist()
    test_instances.extend(test_instance_names)
    train_or_val_instances.extend(list(set(repo_instances) - set(test_instance_names)))

In [18]:
len(train_or_val_instances), len(test_instances)

(1289, 400)

In [19]:
len(set(train_or_val_instances)), len(set(test_instances))

(1289, 400)

In [20]:
df_annotation_task_trainval_split = df_all_annotations_filtered[df_all_annotations_filtered['instance_id'].isin(train_or_val_instances)]
df_annotation_task_test_split = df_all_annotations_filtered[df_all_annotations_filtered['instance_id'].isin(test_instances)]

In [21]:
assert len(df_annotation_task_trainval_split) == len(train_or_val_instances)*3
assert len(df_annotation_task_test_split) == len(test_instances)*3

In [22]:
df_annotation_task_trainval_split.to_csv('SweBenchVerifiedAnnotationTaskDataset/trainval_split.csv', index=False)
df_annotation_task_test_split.to_csv('SweBenchVerifiedAnnotationTaskDataset/test_split.csv', index=False)