In [4]:
import json
# Name of the task split; used as the stem of the JSON file loaded below.
swe_bench_tasks = "test"
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so the load behaves the same regardless of the local locale.
with open(f"{swe_bench_tasks}.json", "r", encoding="utf-8") as f:
    tasks = json.load(f)

In [5]:
# Order the task instances from newest to oldest by their "created_at" field.
task_instances = list(tasks)
task_instances.sort(key=lambda item: item["created_at"], reverse=True)

# Build a two-level mapping: repo -> version -> list of instances.
# Instances without a "version" key are collected under the key None.
task_instances_grouped = {}
for task in task_instances:
    versions_for_repo = task_instances_grouped.setdefault(task["repo"], {})
    versions_for_repo.setdefault(task.get("version"), []).append(task)

In [13]:
import os
from loguru import logger

# Root directory under which one testbed per (repo, version) pair is placed.
testbed = "./testbed"

# Flatten the repo -> version -> instances mapping into a list of task sets.
distributed_tasks = []
for repo_name, instances_by_version in task_instances_grouped.items():
    # "owner/name" becomes "owner__name" so it can be used in a path segment.
    prefix = "__".join(repo_name.split("/"))
    for ver, members in instances_by_version.items():
        # The same name is used for the testbed subdirectory and the venv.
        name = f"{prefix}__{ver}"
        distributed_tasks.append(
            dict(
                task_instances=members,
                testbed=os.path.join(testbed, name),
                venv=name,
                version=ver,
            )
        )
logger.info(f"{len(tasks)} tasks distributed to {len(distributed_tasks)} testbeds")

[32m2024-03-25 10:17:38.587[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1m2294 tasks distributed to 126 testbeds[0m


In [14]:
from dotenv import load_dotenv
from git import Repo


load_dotenv()
def clone_repo(repo_name: str, path: str, token: str = None) -> bool:
    """
    Clone a repository mirror from the swe-bench GitHub organization.

    The clone URL is built as
    ``https://<token>@github.com/swe-bench/<owner>__<name>.git``, i.e. the
    "/" in ``repo_name`` is replaced by "__".

    Args:
        repo_name (str): Name of repo to clone, in "owner/name" form
        path (str): Path to clone repo to
        token (str): GitHub token to use for cloning. When None, the
            TESTING_GITHUB_TOKEN environment variable is used, falling back
            to the placeholder "git" if it is not set.
    Returns:
        success (bool): True if repo cloned successfully, False otherwise
    """
    if token is None:
        token = os.environ.get("TESTING_GITHUB_TOKEN", "git")
    repo_url = f"https://{token}@github.com/swe-bench/{repo_name.replace('/', '__')}.git"
    try:
        Repo.clone_from(repo_url, path)
        return True
    except Exception as e:
        # The error text may contain the clone URL, which embeds the token.
        # Redact it so the secret never ends up in saved notebook output.
        # The "git" placeholder is not a secret, and replacing it would
        # mangle unrelated text such as "github.com", so it is left alone.
        message = str(e)
        if token and token != "git":
            message = message.replace(token, "***")
        logger.error(f"Failed to clone {repo_name} to {path}: {message}")
        return False
    
# Clone the repository for every testbed that does not already have one.
# Values are read straight from each task set rather than being assigned to
# `testbed` / `task_instances`, so the notebook-level variables of the same
# names defined in earlier cells are not overwritten by this loop.
for task_set in distributed_tasks:
    # All instances in a task set share one repo, so the first is enough.
    repo = task_set["task_instances"][0]["repo"]
    repo_path = os.path.join(task_set["testbed"], "repo")
    if os.path.exists(repo_path):
        # Existing checkout from a previous run: leave it untouched.
        continue
    # clone_repo signals failure through its return value, so only report
    # success when the clone actually succeeded.
    if clone_repo(repo, repo_path):
        logger.info(f"[Testbed] Cloned {repo} to {repo_path}")
    else:
        logger.error(f"[Testbed] Failed to clone {repo} to {repo_path}")

[32m2024-03-25 10:24:44.562[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1m[Testbed] Cloned matplotlib/matplotlib to ./testbed/matplotlib__matplotlib__3.7/repo[0m
[32m2024-03-25 10:24:53.811[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1m[Testbed] Cloned matplotlib/matplotlib to ./testbed/matplotlib__matplotlib__3.6/repo[0m
[32m2024-03-25 10:25:43.226[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1m[Testbed] Cloned matplotlib/matplotlib to ./testbed/matplotlib__matplotlib__3.5/repo[0m
[32m2024-03-25 10:26:17.492[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1m[Testbed] Cloned matplotlib/matplotlib to ./testbed/matplotlib__matplotlib__3.4/repo[0m
[32m2024-03-25 10:26:35.923[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1m[Testbed] Cloned matplotlib/matplotlib to ./testbed/matplotlib__matplotlib__3.3/repo[0m
[32m2024-03-25

In [15]:
# Persist the task sets next to the input file as "distributed_<split>.json",
# pretty-printed with a four-space indent.
serialized_tasks = json.dumps(distributed_tasks, indent=4)
with open(f"distributed_{swe_bench_tasks}.json", "w") as out_file:
    out_file.write(serialized_tasks)