In [63]:
import json
import os

with open('data/codearena_instances.json') as f:
    tasks = json.load(f)

instance_ids = [t['instance_id'] for t in tasks]

def get_task(instance_id):
    return [t for t in tasks if t['instance_id'] == instance_id][0]

yt_ids = [t['instance_id'] for t in tasks if 'youtube' in t['instance_id']]

def save_ids(id_list, filename):
    with open(filename, 'w') as f:
        for instance_id in id_list:
            f.write(f"{instance_id}\n")

N_PYLINT_MESSAGES_SHOWN_TO_AGENT = 20

In [2]:
IGNORE_IDS = [
    'E0401',  # Unable to import
    'W0511',  # TODO
]
def filter_pylint_output(pylint_output, keep_types=['error', 'warning', 'fatal'], max=None):
    output = []
    n_so_far = 0

    for file_output in pylint_output:
        new_output = file_output.copy()
        new_output['messages'] = [
            i for i in file_output['messages']
            if (i['type'] in keep_types
                and i['message-id'] not in IGNORE_IDS)
        ]
        if max and len(new_output['messages']) - n_so_far > max:
            new_output['messages'] = new_output['messages'][:max - n_so_far]

        n_so_far += len(new_output['messages'])
        # get rid of the counts fields
        to_delete = [k for k in new_output if k.endswith('count')]
        for k in to_delete:
            del new_output[k]
        output.append(new_output)

        if max and n_so_far >= max:
            break

    return output

In [None]:
def create_style_review_task(task, pylint_output_path):
    """
    pylint_output_path: path to the pylint_errors.json file
    """

    task = task.copy()
    with open(pylint_output_path, 'r') as f:
        pylint_output = json.load(f)

    pylint_output = filter_pylint_output(pylint_output, keep_types=['warning', 'error', 'fatal'], max=N_PYLINT_MESSAGES_SHOWN_TO_AGENT)
    problem_statement = """<pylint_output>
{pylint_output}
</pylint_output>"""

    task['problem_statement'] = problem_statement.format(
        pylint_output=json.dumps(pylint_output, indent=2)
    )

    return task


In [4]:
from json.decoder import JSONDecodeError

def create_style_review_dataset():
    sr_tasks = []
    for task in tasks:
        instance_id = task['instance_id']
        pylint_path = f'/Users/simon/Downloads/sr2/sweb-style-review/{instance_id}/run_evaluation/style_check2/gold/{instance_id}_styleReview/pylint_errors.json'

        if not os.path.exists(pylint_path):
            continue

        try:
            sr_task = create_style_review_task(task, pylint_path)
        except JSONDecodeError as e:
            print(f'JSONDecodeError for {instance_id}')
            continue
        sr_tasks.append(sr_task)

    print(len(sr_tasks), "style review tasks created")
    # write to file
    with open('data/sweagent_style_review_instances.json', 'w') as f:
        json.dump(sr_tasks, f, indent=2)


In [123]:
def calculate_sweagent_score(old_pylint_report, new_pylint_report, test_report=None, max_fixable=None):
    """
    Score is calculated as 1(tests_passed) * min(1, n_fixed / max_fixable))
    where tests_passed is 1 if the new patch passed tests, and n_fixed is the number of pylint messages resolved.
    max_fixable is the maximum number of messages that can be fixed (the number given to the model to fix)

    Returns old_n, new_n, score where old_n and new_n are the number of messages in the old and new pylint files respectively.
    """
    def n_messages(report):
        return sum(len(file['messages']) for file in report)

    old_n = n_messages(old_pylint_report)
    new_n = n_messages(new_pylint_report)

    tests_passed = test_report['resolved'] if test_report else 1
    max_fixable = max_fixable if max_fixable else old_n

    n_resolved = old_n - new_n
    score = tests_passed * (n_resolved / max_fixable)
    score = max(0., min(1., score))

    return old_n, new_n, score

In [42]:
def import_sr_results_gold(path, run_id):
    sr_results = {}
    for folder in os.listdir(path):
        if folder not in instance_ids:
            continue

        instance_id = folder
        results_path = f'{path}/{folder}/run_evaluation/{run_id}/gold/{instance_id}_styleReview/pylint_errors.json'
        try:
            with open(results_path, 'r') as f:
                pylint_output = json.load(f)
        except FileNotFoundError:
            # print(f"File not found: {results_path}")
            continue
        except json.JSONDecodeError:
            # print(f"Error decoding JSON in file: {results_path}")
            continue

        sr_results[instance_id] = pylint_output

    return sr_results


In [43]:
def import_sr_results_nongold(path, run_id):
    sr_results = {}
    for folder in os.listdir(path):
        if folder not in instance_ids:
            continue

        instance_id = folder
        results_path = f'{path}/{folder}/run_evaluation/{run_id}/logs/{instance_id}_styleReview/pylint_errors.json'
        try:
            with open(results_path, 'r') as f:
                pylint_output = json.load(f)
        except FileNotFoundError:
            # print(f"File not found: {results_path}")
            continue
        except json.JSONDecodeError:
            # print(f"Error decoding JSON in file: {results_path}")
            continue

        sr_results[instance_id] = pylint_output

    return sr_results


In [58]:
def import_sweagent_bugfixing_results(path, run_id):
    """
    Import the results from the style review.
    """
    sr_results = {}
    for folder in os.listdir(path):
        if folder not in instance_ids:
            continue

        instance_id = folder
        results_path = f'{path}/{folder}/run_evaluation/{run_id}/logs/{instance_id}/report.json'
        try:
            with open(results_path, 'r') as f:
                report = json.load(f)
        except FileNotFoundError:
            # print(f"File not found: {results_path}")
            continue
        except json.JSONDecodeError:
            # print(f"Error decoding JSON in file: {results_path}")
            continue

        sr_results[instance_id] = report[instance_id]

    return sr_results

In [17]:
def load_sweagent_results(path):
    results = {}
    # look at sweagent results
    for instance_id in os.listdir(path):
        if instance_id not in instance_ids:
            continue

        all_preds_path = f'{path}/{instance_id}/all_preds.jsonl'
        if not os.path.exists(all_preds_path) or (os.stat(all_preds_path).st_size == 0):
            all_preds_path = f'{path}/{instance_id}/logs/all_preds.jsonl'

            if not os.path.exists(all_preds_path) or (os.stat(all_preds_path).st_size == 0):
                continue

        try:
            with open(all_preds_path, 'r') as f:
                preds = json.load(f)
        except json.JSONDecodeError:
            print(f"Error decoding JSON in file: {all_preds_path}")
            continue

        # not sure why this is the format lol but whatever
        assert type(preds['model_patch']) == dict
        preds = preds['model_patch']

        results[instance_id] = preds

    return results

In [45]:
sr_results = import_sr_results_gold('gc_results/sweb-style-review', run_id='style_check2')

In [46]:
good_ids = list(sr_results.keys())
print(len(good_ids))
# save_ids(good_ids, 'good_ids.txt')

610


In [113]:
sweagent_sr_check_results = import_sr_results_nongold('gc_results/sweb-sweagent-sr-check', run_id='sweagent_sr_check')
len(sweagent_sr_check_results)

602

In [116]:
sweagent_sr_bf_check_results = import_sweagent_bugfixing_results('gc_results/sweb-sweagent-sr-bf-check', run_id='sweagent_sr_bf_check')
len(sweagent_sr_bf_check_results)

548

In [117]:
sweagent_results = load_sweagent_results('gc_results/sweb-sweagent-sr')

In [90]:
still_need_sr_agent_ids = [id for id in good_ids if id not in sweagent_results]
sweagent_sr_done_ids = [id for id in good_ids if id in sweagent_results]

print(len(still_need_sr_agent_ids), len(sweagent_sr_done_ids))
save_ids(still_need_sr_agent_ids, 'still_need_sr_agent_ids.txt')
save_ids(sweagent_sr_done_ids, 'sweagent_sr_done_ids.txt')

7 603


In [80]:
with_pull_error = []
for id in still_need_sr_agent_ids:
    # load the logs for each one
    log_path = f'gc_results/sweb-sweagent-sr/{id}/logs/instance_{id}.log'
    with open(log_path, 'r') as f:
        log = f.read()

    if 'DockerPullError' in log:
        # print(f"{id} has DockerPullError")
        with_pull_error.append(id)
        # pass
    else:
        print(f"{id} does not have DockerPullError")
        # print(log)

django__django-10097 does not have DockerPullError
camel-ai__camel-1395 does not have DockerPullError
camel-ai__camel-1368 does not have DockerPullError
camel-ai__camel-1276 does not have DockerPullError
camel-ai__camel-1259 does not have DockerPullError
camel-ai__camel-1478 does not have DockerPullError
django__django-7530 does not have DockerPullError


In [20]:
DOCKER_PAT = os.environ.get('DOCKER_PAT')

import time, requests

def docker_image_exists(tag, retries=5, delay=1, backoff=2):
    auth = ('sca63', DOCKER_PAT)
    url  = f"https://registry.hub.docker.com/v2/repositories/sca63/codearena/tags/{tag}"

    for attempt in range(retries + 1):
        r = requests.get(url, auth=auth)
        if r.status_code == 200:
            return True
        if r.status_code == 404:
            return False
        if r.status_code == 429:
            if attempt < retries:
                print(f"Rate limit exceeded for {tag}, retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= backoff
        else:
            print(f" Unexpected {r.status_code}")


    print(f"Exhausted retries for {tag}")
    return False

In [91]:
still_need_sweagent_sr_check_ids = [id for id in good_ids if id not in sweagent_sr_check_results and id not in still_need_sr_agent_ids]
print(len(still_need_sweagent_sr_check_ids))
save_ids(still_need_sweagent_sr_check_ids, 'still_need_sweagent_sr_check_ids.txt')

3


In [92]:
still_need_sweagent_sr_bf_check_ids = [id for id in good_ids if id not in sweagent_sr_bf_check_results and id not in still_need_sr_agent_ids]
print(len(still_need_sweagent_sr_bf_check_ids))
save_ids(still_need_sweagent_sr_bf_check_ids, 'still_need_sweagent_sr_bf_check_ids.txt')

61


In [83]:
# create one big all_preds.jsonl file with the results
with open('sweagent_sr_all_preds.jsonl', 'w') as f:
    for instance_id in sweagent_results:
        pred = sweagent_results[instance_id]
        f.write(json.dumps(pred) + "\n")

In [134]:
# saving the overall results
# instance_id: original_style_review, sweagent_patch, new_style_review, report, score
overall_results = {}
have_all_ids = [id for id in good_ids if all([id in r for r in [sweagent_sr_check_results, sweagent_sr_bf_check_results, sweagent_results]])]
# have_all_ids = [id for id in good_ids if all([id in r for r in [sweagent_sr_check_results, sweagent_results]])]
for instance_id in have_all_ids:
    original_style_review_report = sr_results[instance_id]
    sweagent_patch = sweagent_results[instance_id]
    new_style_review_report = sweagent_sr_check_results[instance_id]
    test_report = sweagent_sr_bf_check_results[instance_id] if instance_id in sweagent_sr_bf_check_results else None
    max_fixable = N_PYLINT_MESSAGES_SHOWN_TO_AGENT
    old_n, new_n, score = calculate_sweagent_score(original_style_review_report, new_style_review_report, test_report, max_fixable=max_fixable)
    old_n, new_n, score_ignore_resolved = calculate_sweagent_score(original_style_review_report, new_style_review_report, test_report=None, max_fixable=max_fixable)

    overall_results[instance_id] = {
        'original_style_review_report': original_style_review_report,
        'sweagent_patch': sweagent_patch,
        'new_style_review_report': new_style_review_report,
        'test_report': test_report,
        'old_n_messages': old_n,
        'new_n_messages': new_n,
        'max_fixable': max_fixable,
        # 'resolved': test_report['resolved'],
        'resolved': "N/A",
        'score': score,
        'score_ignore_resolved': score_ignore_resolved,
        'score_formula': f"1(resolved) * min(1, ((old_n_messages - new_n_messages) / max_fixable))",
    }

print(len(overall_results))

547


In [135]:
with open('sweagent_style_review_overall_results.json', 'w') as f:
    json.dump(overall_results, f, indent=2)

In [137]:
overall_score = sum([r['score'] for r in overall_results.values()]) / len(overall_results)
overall_score_ignore_resolved = sum([r['score_ignore_resolved'] for r in overall_results.values()]) / len(overall_results)
print(f"Overall score: {overall_score:.3f}")
print(f"Overall score ignoring whether resolved: {overall_score_ignore_resolved:.3f}")

Overall score: 0.089
Overall score ignoring whether resolved: 0.294


In [104]:
# doing some analytics

deltas = [r['old_n_messages'] - r['new_n_messages'] for r in overall_results.values()]
resolveds = [r['resolved'] for r in overall_results.values()]

from collections import Counter
delta_counts = Counter(deltas)
print("Delta counts:")
for k in sorted(delta_counts.keys()):
    nr = sum([1 for d, r in zip(deltas, resolveds) if d == k and r])
    nur = sum([1 for d, r in zip(deltas, resolveds) if d == k and not r])
    # print(f"{k}: {delta_counts[k]} ({nr}, {nur})")

print('Num resolved:', sum([r['resolved'] for r in overall_results.values()]))

# print id with -105
ds = [-105, -10, 10, 20, 30, 103, 905]
for d in ds:
    l = [(id, r['old_n_messages'], r['new_n_messages'], r['resolved']) for id, r in overall_results.items() if r['old_n_messages'] - r['new_n_messages'] == d]
    print(d, l)


Delta counts:
Num resolved: 282
-105 [('camel-ai__camel-1806', 25, 130, True)]
-10 [('django__django-12209', 276, 286, True)]
10 [('scikit-learn__scikit-learn-14629', 68, 58, False), ('sympy__sympy-16886', 106, 96, False), ('scikit-learn__scikit-learn-14983', 101, 91, False), ('django__django-15569', 27, 17, True), ('scikit-learn__scikit-learn-26323', 38, 28, False), ('django__django-11728', 18, 8, False), ('scikit-learn__scikit-learn-15100', 89, 79, False), ('scikit-learn__scikit-learn-13439', 83, 73, False), ('keras-team__keras-19844', 47, 37, False), ('keras-team__keras-19931', 226, 216, False), ('keras-team__keras-19955', 158, 148, False)]
20 [('django__django-14725', 176, 156, True), ('django__django-13933', 159, 139, True), ('django__django-14915', 174, 154, True), ('psf__requests-1724', 43, 23, False), ('keras-team__keras-20541', 96, 76, False), ('django__django-12273', 278, 258, True), ('matplotlib__matplotlib-24870', 134, 114, True), ('django__django-12143', 299, 279, True), (

In [109]:
overall_results['django__django-12209']['new_style_review_report']

[{'file': 'django/db/models/base.py',
  'messages': [{'type': 'convention',
    'module': 'django.db.models.base',
    'obj': '',
    'line': 129,
    'column': 0,
    'endLine': None,
    'endColumn': None,
    'path': 'django/db/models/base.py',
    'symbol': 'line-too-long',
    'message': 'Line too long (103/100)',
    'message-id': 'C0301'},
   {'type': 'convention',
    'module': 'django.db.models.base',
    'obj': '',
    'line': 138,
    'column': 0,
    'endLine': None,
    'endColumn': None,
    'path': 'django/db/models/base.py',
    'symbol': 'line-too-long',
    'message': 'Line too long (114/100)',
    'message-id': 'C0301'},
   {'type': 'convention',
    'module': 'django.db.models.base',
    'obj': '',
    'line': 186,
    'column': 0,
    'endLine': None,
    'endColumn': None,
    'path': 'django/db/models/base.py',
    'symbol': 'line-too-long',
    'message': 'Line too long (111/100)',
    'message-id': 'C0301'},
   {'type': 'convention',
    'module': 'django.db.mo

In [None]:
with open('sweagent_style_review_results.json', 'w') as f:
    json.dump(overall_results, f, indent=2)

In [1]:
with open('sweagent_style_review_results.json', 'r') as f:
    overall_results = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'sweagent_style_review_results.json'

In [None]:
all_messages = []

for result in sr_results.values():
    for file in result:
        all_messages.extend(file['messages'])


all_messages = [m for m in all_messages if m['type'] in ['error', 'fatal', 'warning']]

In [21]:
# get number of warnings, refactors, errors, and conventions for each sr_results
sr_stats = {}
i = 0
for id, pylint_output in sr_results.items():
    i += 1
    if i == 5: break
    stats = {
        'warning': 0,
        'refactor': 0,
        'error': 0,
        'convention': 0,
        'info': 0,
    }
    for file_output in pylint_output:
        for message in file_output['messages']:
            stats[message['type']] += 1
            if message['type'] == 'error':
                print(message)

    sr_stats[id] = stats

{'type': 'error', 'module': 'django.db.migrations.serializer', 'obj': 'EnumSerializer.serialize', 'line': 137, 'column': 29, 'endLine': 137, 'endColumn': 44, 'path': 'django/db/migrations/serializer.py', 'symbol': 'no-member', 'message': "Module 'enum' has no '_decompose' member", 'message-id': 'E1101'}
{'type': 'error', 'module': 'astropy.io.fits.header', 'obj': '', 'line': 12, 'column': 0, 'endLine': 12, 'endColumn': 32, 'path': 'astropy/io/fits/header.py', 'symbol': 'no-name-in-module', 'message': "No name 'parse_header' in module 'astropy.io.fits._utils'", 'message-id': 'E0611'}
{'type': 'error', 'module': 'astropy.io.fits.header', 'obj': '_CardAccessor._setslice', 'line': 2139, 'column': 16, 'endLine': 2139, 'endColumn': 20, 'path': 'astropy/io/fits/header.py', 'symbol': 'unsupported-assignment-operation', 'message': "'self' does not support item assignment", 'message-id': 'E1137'}
{'type': 'error', 'module': 'sklearn.linear_model.least_angle', 'obj': '', 'line': 19, 'column': 0, 

In [68]:
# get the average stats for each type
avg_stats = {
    'warning': 0,
    'refactor': 0,
    'error': 0,
    'convention': 0,
    'info': 0,
}

for id, stats in sr_stats.items():
    for key, value in stats.items():
        avg_stats[key] += value

avg_stats = {k: v / len(sr_stats) for k, v in avg_stats.items()}
print(avg_stats)
for k, v in avg_stats.items():
    print(f"{k}: {v:.2f}")
print(sum(avg_stats.values()))

refactor: 21.56
error: 9.87
convention: 56.03
info: 0.07
122.95081967213115


In [15]:
import re
from typing import Dict, Set, List

# ---------- 1. from patch → {file: set(lines)} ---------- #
def diff_to_modified_lines(patch: str) -> Dict[str, Set[int]]:
    """
    Return a map {file_path: {modified_line_numbers}} for a unified diff.
    Works for plain `git diff` / `patch` format.
    """
    modified: Dict[str, Set[int]] = {}
    cur_file = None

    for ln in patch.splitlines():
        if ln.startswith("diff --git"):
            # take the RHS (after "b/") so paths match pylint’s “path”
            try:
                cur_file = ln.split(" b/")[1]
            except IndexError:
                cur_file = None
        elif ln.startswith("@@") and cur_file:
            # @@ -orig_start,orig_len +new_start,new_len @@
            m = re.match(r"@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", ln)
            if not m:
                continue
            start = int(m.group(1))
            length = int(m.group(2) or "1")
            modified.setdefault(cur_file, set()).update(
                range(start, start + length)
            )
    return modified


# ---------- 2. filter pylint JSON list ---------- #
def filter_pylint_output(
    pylint_output: List[dict], modified: Dict[str, Set[int]]
) -> List[dict]:
    """
    Keep only pylint messages whose (path, line) is in the diff.
    """
    new_pylint_output: List[dict] = []
    for file_output in pylint_output:
        new_file_output = file_output.copy()
        filename = file_output["file"]
        new_messages = []
        for msg in file_output["messages"]:
            if filename in modified and msg["line"] in modified[filename]:
                new_messages.append(msg)

        new_file_output["messages"] = new_messages
        if new_messages:
            new_pylint_output.append(new_file_output)
    return new_pylint_output


# ---------- 3. example ---------- #
# patch_str = pred["model_patch"]
# mod = diff_to_modified_lines(patch_str)
# pylint_filtered = filter_pylint_messages(json.load(open("pylint_errors.json")), mod)

In [16]:
sr_filtered_output = {}
for id, pylint_output in sr_results.items():
    task = get_task(id)
    patch = task['patch']
    modified = diff_to_modified_lines(patch)
    filtered_output = filter_pylint_output(pylint_output, modified)
    sr_filtered_output[id] = filtered_output

In [17]:
sr_filtered_stats = {}
for id, pylint_output in sr_filtered_output.items():
    stats = {
        'warning': 0,
        'refactor': 0,
        'error': 0,
        'convention': 0,
        'info': 0,
    }
    for file_output in pylint_output:
        for message in file_output['messages']:
            stats[message['type']] += 1

    sr_filtered_stats[id] = stats

In [18]:
# avg stats
avg_filtered_stats = {
    'warning': 0,
    'refactor': 0,
    'error': 0,
    'convention': 0,
    'info': 0,
}

for id, stats in sr_filtered_stats.items():
    for key, value in stats.items():
        avg_filtered_stats[key] += value

avg_filtered_stats = {k: v / len(sr_filtered_stats) for k, v in avg_filtered_stats.items()}
for key, value in avg_filtered_stats.items():
    print(f"{key}: {value:.2f}")

refactor: 0.56
error: 0.28
convention: 1.61
info: 0.00
