In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Make the background of ipywidget output cells transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget color and font-size custom properties at the
   VS Code editor's values. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
from dotenv import load_dotenv
import nest_asyncio
import os

# SWE-agent directory settings, all relative to the notebook's working
# directory. They are set before `sweagent` is imported in a later cell
# (presumably it reads them at import time -- TODO confirm).
os.environ.update(
    {
        "SWE_AGENT_CONFIG_DIR": ".",
        "SWE_AGENT_TOOLS_DIR": "tools",
        "SWE_AGENT_TRAJECTORY_DIR": "trajectories",
    }
)

# Create the output directories up front; existing ones are left alone.
for output_dir in ("replays", "trajectories"):
    os.makedirs(output_dir, exist_ok=True)

# Load variables from a .env file, then patch asyncio so that event loops
# can be nested.
load_dotenv()
nest_asyncio.apply()

In [4]:
import asyncio
from sweagent.environment.swe_env import SWEEnv
from swerex.deployment.modal import ModalDeployment

from eval import eval_instance, EvalResult
from instances import Instance


async def verify(instance: Instance) -> EvalResult:
    """Evaluate one instance inside a freshly started Modal-backed environment.

    Builds an ``SWEEnv`` on a ``ModalDeployment`` created from
    ``instance["image_name"]``, starts it, hands the deployment's runtime to
    ``eval_instance``, and closes the environment afterwards -- also when the
    evaluation raises or the task is cancelled, so a started deployment is
    not left running.

    Args:
        instance: The instance to evaluate. Its ``"image_name"`` entry selects
            the image used for the deployment.

    Returns:
        The result produced by ``eval_instance`` for this instance.

    Raises:
        Exception: Anything raised while starting the environment or by
            ``eval_instance`` is propagated to the caller.
    """
    env = SWEEnv(
        deployment=ModalDeployment(
            image=instance["image_name"], startup_timeout=180.0, runtime_timeout=600.0
        ),
        repo=None,
        post_startup_commands=[],
    )
    # start/close are invoked through asyncio.to_thread so they run in a
    # worker thread instead of blocking the event loop.
    await asyncio.to_thread(env.start)
    try:
        result = await eval_instance(instance, env.deployment.runtime)
    finally:
        # Always tear the environment down once it has been started.
        await asyncio.to_thread(env.close)
    return result

In [5]:
from instances import as_instances_iter, get_filtered_swe_smith_instances_df

tasks = (
    get_filtered_swe_smith_instances_df()
    .sample(fraction=1.0, shuffle=True, seed=42)
    .head(1)
    .pipe(as_instances_iter)
)

eval_results = await asyncio.gather(*(verify(instance) for instance in tasks))

In [6]:
# Display the collected evaluation results, one entry per verified instance.
eval_results

[{'num_failed_f2p': 0,
  'num_passed_f2p': 4,
  'num_failed_p2p': 0,
  'num_passed_p2p': 19}]