In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
from dotenv import load_dotenv
import nest_asyncio
import os

# Point the SWE_AGENT_* directory variables at paths relative to the notebook's
# working directory.
# NOTE(review): presumably these must be set before any `sweagent` import reads
# them — confirm against the SWE-agent version in use.
os.environ["SWE_AGENT_CONFIG_DIR"] = "."
os.environ["SWE_AGENT_TOOLS_DIR"] = "tools"
os.environ["SWE_AGENT_TRAJECTORY_DIR"] = "trajectories"

# Create the output directories up front; `exist_ok=True` makes re-running this
# cell harmless.
os.makedirs("replays", exist_ok=True)
os.makedirs("trajectories", exist_ok=True)

# Load variables from a local .env file into the environment (later cells read
# e.g. OPENROUTER_API_KEY from os.environ).
load_dotenv()
# Patch asyncio so nested event loops work; later cells call `asyncio.run(...)`
# from inside the notebook.
nest_asyncio.apply()

In [20]:
from instances import (
    as_instances_iter,
    get_filtered_swe_smith_instances_df,
    get_swe_bench_verified_instances_df,
)

# Pick the working `instance` used by the rollout / evaluation cells below.
# Swap the two loaders to switch between SWE-smith and SWE-bench Verified.
# instances_df = get_filtered_swe_smith_instances_df()
instances_df = get_swe_bench_verified_instances_df()
instances_iter = as_instances_iter(instances_df)
# Stop at the first instance whose repo is NOT one of the three listed below.
# If every instance is filtered out, `instance` ends up as the last one yielded;
# if the iterator is empty, `instance` is never bound and the final line raises
# NameError.
for instance in instances_iter:
    # break
    if (
        # instance["difficulty"] == "<15 min fix"
        instance["repo"] not in ("astropy/astropy", "django/django", "matplotlib/matplotlib")
        # and instance["instance_id"] != "django__django-10097"
    ):
        break
# Display the selected instance as the cell output.
instance

{'repo': 'mwaskom/seaborn',
 'instance_id': 'mwaskom__seaborn-3069',
 'base_commit': '54cab15bdacfaa05a88fbc5502a5b322d99f148e',
 'patch': 'diff --git a/seaborn/_core/plot.py b/seaborn/_core/plot.py\n--- a/seaborn/_core/plot.py\n+++ b/seaborn/_core/plot.py\n@@ -25,7 +25,7 @@\n from seaborn._stats.base import Stat\n from seaborn._core.data import PlotData\n from seaborn._core.moves import Move\n-from seaborn._core.scales import Scale\n+from seaborn._core.scales import Scale, Nominal\n from seaborn._core.subplots import Subplots\n from seaborn._core.groupby import GroupBy\n from seaborn._core.properties import PROPERTIES, Property\n@@ -1238,7 +1238,6 @@ def _setup_scales(\n             # This only affects us when sharing *paired* axes. This is a novel/niche\n             # behavior, so we will raise rather than hack together a workaround.\n             if axis is not None and Version(mpl.__version__) < Version("3.4.0"):\n-                from seaborn._core.scales import Nominal\n        

In [12]:
import polars as pl


# Sanity check: list the instances whose `environment_setup_commit` differs
# between SWE-bench Verified and the full SWE-bench test split. An empty result
# means the two datasets agree for every Verified instance.
setup_commit_columns = ("instance_id", "environment_setup_commit")

verified_commits = pl.scan_parquet(
    "hf://datasets/SWE-bench/SWE-bench_Verified/data/test-00000-of-00001.parquet"
).select(*setup_commit_columns)

full_commits = pl.scan_parquet(
    "hf://datasets/SWE-bench/SWE-bench/data/test-00000-of-00001.parquet"
).select(*setup_commit_columns)

# Left join keeps every Verified instance; the full-dataset value arrives in the
# `_right`-suffixed column.
joined_commits = verified_commits.join(
    full_commits, on="instance_id", how="left"
).collect()

joined_commits.filter(
    pl.col("environment_setup_commit") != pl.col("environment_setup_commit_right")
)

instance_id,environment_setup_commit,environment_setup_commit_right
str,str,str


In [None]:
import requests
import time
import random

def fetch_with_retry(url, max_retries=5, backoff_factor=1):
    """
    Fetch a URL, retrying while the server answers HTTP 429 (rate limited).

    The wait before each retry is the ``Retry-After`` header when it is a
    usable number of seconds; otherwise (header missing, given as an HTTP-date,
    negative or non-finite) it is exponential backoff,
    ``backoff_factor * 2 ** attempt``, plus up to one second of random jitter.

    Args:
        url: URL to GET.
        max_retries: Number of retries after the initial request, so at most
            ``max_retries + 1`` requests are made. Must be >= 0.
        backoff_factor: Base, in seconds, of the exponential backoff.

    Returns:
        The first non-429 response, or the last 429 response once the retries
        are exhausted (callers can inspect ``status_code`` to tell the two
        apart).

    Raises:
        ValueError: If ``max_retries`` is negative.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    for attempt in range(max_retries + 1):
        resp = requests.get(url)
        if resp.status_code != 429:
            return resp
        if attempt == max_retries:
            # Retries exhausted: sleeping again would only delay the caller.
            break

        # Prefer the server-provided delay when it is a plain number of seconds.
        wait = None
        retry_after = resp.headers.get("Retry-After")
        if retry_after:
            try:
                wait = float(retry_after)
            except ValueError:
                # Retry-After may also be an HTTP-date; fall back to backoff.
                wait = None
        # Reject negative / NaN / infinite values, which time.sleep cannot use.
        if wait is None or not (0 <= wait < float("inf")):
            wait = backoff_factor * (2 ** attempt) + random.uniform(0, 1)

        print(f"Rate limited (429). Retrying after {wait:.1f}s (attempt {attempt+1}/{max_retries})")
        time.sleep(wait)
    return resp  # still 429 after all retries

# Check that every instance's Docker image (and tag, if any) exists on Docker
# Hub, collecting the ones that could not be confirmed.
missing_images = []

# The loop variable is deliberately NOT named `instance`: the rollout and
# evaluation cells use the notebook-wide `instance` chosen earlier, and looping
# with that name would silently replace it with the last dataset row.
for candidate in as_instances_iter(instances_df):
    image = candidate["image_name"]
    # Parse out any tag (after the last ':' if it comes after the last '/');
    # a ':' before the last '/' belongs to a registry host:port, not a tag.
    name = image
    tag = None
    last_slash = name.rfind("/")
    last_colon = name.rfind(":")
    if last_colon > last_slash:
        tag = name[last_colon + 1:]
        name = name[:last_colon]
    # Split into namespace and repository (handle extra registry prefixes)
    parts = name.split("/")
    if len(parts) == 1:
        # Bare names such as "ubuntu" live under the "library" namespace.
        namespace, repo = "library", parts[0]
    elif len(parts) == 2:
        namespace, repo = parts
    else:
        # Drop registry/extra segments, keep last two
        namespace, repo = parts[-2], parts[-1]
    # Construct the appropriate Docker Hub API URL
    if tag:
        url = f"https://hub.docker.com/v2/repositories/{namespace}/{repo}/tags/{tag}/"
    else:
        url = f"https://hub.docker.com/v2/repositories/{namespace}/{repo}/"
    # Fetch, retrying on 429
    resp = fetch_with_retry(url)
    if resp.status_code == 200:
        continue
    elif resp.status_code == 429:
        print(f"Failed to fetch '{image}' due to rate limiting after retries.")
    else:
        print(f"Docker image '{image}'{' with tag ' + tag if tag else ''} NOT found on Docker Hub (status {resp.status_code})")
    # Anything not confirmed with a 200 is recorded, including images that were
    # still rate limited after all retries.
    missing_images.append(image)

missing_images

In [None]:
len(instances_df)

In [None]:
len(missing_images)

In [5]:
import art
from rollout import ModelConfig, rollout

# Model handle passed to the rollout; switch the `name` line to try another
# OpenRouter-hosted model.
model = art.Model(
    # name="openrouter/google/gemini-2.5-flash-preview-05-20",
    name="openrouter/anthropic/claude-3.7-sonnet",
    project="sweagent",
    config=ModelConfig(),
)

# Run one rollout on the selected `instance`. `rollout` is the function imported
# from the local `rollout` module; it returns the trajectory together with the
# run object (`run_single`) that later cells use to inspect the environment.
trajectory, run_single = await rollout(
    model,
    instance,
    # Optional: pass a saved .traj file instead of querying the model
    # (NOTE(review): presumably replays the recorded actions — confirm in rollout.py).
    # replay_trajectory_path=Path("trajectories/<run>/<id>/<id>.traj"),
)



In [None]:
trajectory.messages_and_choices

In [None]:
from pathlib import Path

import json

# Take the first *.pred file found anywhere under the run's output directory;
# the `else` branch only runs when the search yields nothing.
for pred_file in run_single.output_dir.rglob("*.pred"):
    break
else:
    raise FileNotFoundError(
        f"No .pred file found in {run_single.output_dir} or its subdirectories"
    )

# Parse the prediction file (UTF-8 JSON) and show it as the cell output
pred_data = json.loads(pred_file.read_text(encoding="utf-8"))

pred_data

In [21]:
from swebench.harness.modal_eval.run_evaluation_modal import app, run_instance_modal
from swebench.harness.test_spec.test_spec import make_test_spec

# Evaluate a patch for the selected `instance` with the SWE-bench Modal harness.
# NOTE: as written this submits the dataset's reference patch
# (`instance["patch"]`), not the agent's submission — swap the two
# "model_patch" lines to score the agent's output instead.
async with app.run():
    test_output = await run_instance_modal.remote.aio(
        test_spec=make_test_spec(instance),
        pred={
            "model_name_or_path": model.name,
            # "model_patch": run_single.agent.info["submission"],
            "model_patch": instance["patch"],
            "instance_id": instance["instance_id"],
        },
        run_id="my-eval",
        timeout=1000,
    )
# Display the harness result as the cell output.
test_output



In [None]:
print(test_output.test_output)

In [None]:
print(run_single.agent.info["submission"])

In [None]:
print(instance["patch"])

In [19]:
print(test_output.report_json_str)

{
    "matplotlib__matplotlib-13989": {
        "patch_is_None": false,
        "patch_exists": true,
        "patch_successfully_applied": true,
        "resolved": false,
        "tests_status": {
            "FAIL_TO_PASS": {
                "success": [],
                "failure": [
                    "lib/matplotlib/tests/test_axes.py::test_hist_range_and_density"
                ]
            },
            "PASS_TO_PASS": {
                "success": [],
                "failure": [
                    "lib/matplotlib/tests/test_axes.py::test_get_labels",
                    "lib/matplotlib/tests/test_axes.py::test_spy_invalid_kwargs",
                    "lib/matplotlib/tests/test_axes.py::test_twinx_cla",
                    "lib/matplotlib/tests/test_axes.py::test_twinx_axis_scales[png]",
                    "lib/matplotlib/tests/test_axes.py::test_twin_inherit_autoscale_setting",
                    "lib/matplotlib/tests/test_axes.py::test_inverted_cla",
                  

In [None]:
print(test_output.test_output)

In [None]:
make_test_spec(instance)

In [None]:
pred_data

In [None]:
print(test_output.run_instance_log)

In [None]:
import os
import json
import secrets

# Bundle what identifies this prediction: which instance, which model, and the
# patch the agent submitted
data = dict(
    instance_id=instance["instance_id"],
    model=model.name,
    prediction=run_single.agent.info["submission"],
)

# Make sure the target directory is there before writing into it
predictions_dir = "./predictions"
os.makedirs(predictions_dir, exist_ok=True)

# Random 8-hex-character file name so repeated runs do not overwrite each other
filename = secrets.token_hex(4) + ".json"
filepath = os.path.join(predictions_dir, filename)

# Serialize the prediction as JSON
with open(filepath, "w") as prediction_file:
    prediction_file.write(json.dumps(data))

print(f"Saved prediction to {filepath}")

In [6]:
run_single.env.start()

In [None]:
run_single.hooks[-1]._apply_test_patch(instance["test_patch"])

In [None]:
print(instance["test_patch"])

In [None]:
instance["PASS_TO_PASS"]

In [None]:
run_single.hooks[-1]._get_test_results(instance["PASS_TO_PASS"])

In [None]:
instance["base_commit"]

In [None]:
from swerex.runtime.abstract import BashAction

await run_single.env.deployment.runtime.run_in_session(
    # Print the commit currently checked out in the sandbox's /testbed repo,
    # for comparison with instance["base_commit"].
    BashAction(command="cd /testbed && git rev-parse HEAD", check="raise")
)

In [None]:
from swerex.runtime.abstract import BashAction
import shlex

# Build and apply the test patch, raising on failure to surface any errors
cmd = f"cd /testbed && echo '{instance['test_patch']}' | git apply --reject --whitespace=fix -"
try:
    observation = await run_single.env.deployment.runtime.run_in_session(
        BashAction(command=cmd, check="raise")
    )
    print("Patch applied successfully:\n", observation.output)
except Exception as e:
    print("Failed to apply patch:", e)

In [None]:
from swerex.runtime.abstract import BashAction

# Delimiter for the shell here-document below; it must not occur on a line of
# its own inside the patch text.
HEREDOC_DELIMITER = "EOF_114329324912"

# Feed the instance's test patch to `git apply` on stdin via a here-document.
# The delimiter is quoted ('...') so the shell performs no expansion on the
# patch body.
# NOTE(review): the working directory is hard-coded to an astropy path, so this
# cell only fits astropy instances — confirm before running on another repo.
observation = await run_single.env.deployment.runtime.run_in_session(
    BashAction(
        command=f"cd /testbed/astropy/modeling/tests && git apply -v - <<'{HEREDOC_DELIMITER}'\n{instance['test_patch']}\n{HEREDOC_DELIMITER}",
        check="raise",
    )
)
print(observation.output)

In [None]:
from swerex.runtime.abstract import BashAction

observation = await run_single.env.deployment.runtime.run_in_session(
    BashAction(
        command=f"cd /testbed && python -m pytest ./astropy/modeling/tests/test_separable.py",
        check="silent",
    )
)
print(observation.output)

In [6]:
run_single.env.start()

In [8]:
run_single.env.reset()

In [None]:
print(run_single.env.communicate("ls"))

In [None]:
trajectory.reward

In [6]:
run_single.env.start()

In [None]:
import polars as pl

# Login using e.g. `huggingface-cli login` to access this dataset
df = pl.read_parquet(
    "hf://datasets/princeton-nlp/SWE-bench_Verified/data/test-00000-of-00001.parquet"
)
df

In [None]:
df.with_columns(
    created_at=pl.col("created_at").str.strptime(pl.Datetime),
    image_name="huyouare/swebench-verified:sweb.eval.x86_64." + pl.col("instance_id"),
)

In [None]:
import polars as pl

instances = pl.read_parquet(
    "hf://datasets/bradhiltonendercorp/SWE-smith-filtered/instances.parquet"
)
instances

In [None]:
# Show the field names of one row. The row is bound to its own name so this
# peek does not overwrite the notebook-wide `instance` that other cells use.
first_row = next(instances.iter_rows(named=True), None)
if first_row is not None:
    print(first_row.keys())

In [None]:
import polars as pl

# Login using e.g. `huggingface-cli login` to access this dataset
df = (
    pl.scan_parquet("hf://datasets/SWE-bench/SWE-smith/data/train-*.parquet")
    .filter(pl.col("problem_statement").ne(""))
    .collect()
)
df

In [None]:
df.select(pl.col(col).n_unique() for col in df.columns)

In [None]:
df.with_columns(pl.col("created_at").str.strptime(pl.Datetime))

In [None]:
# Re-encode the dataset compactly and write it to instances.parquet:
#   * created_at parsed from string to Datetime
#   * repo / image_name stored as Categorical
#   * FAIL_TO_PASS / PASS_TO_PASS stored as lists of Categorical
# The casts run under one StringCache so all categorical columns share a
# string-to-code mapping.
# `write_parquet` returns None, so the result is not passed to `display`
# (that only rendered "None"); read the file back to inspect what was written.
with pl.StringCache():
    (
        df.with_columns(pl.col("created_at").str.strptime(pl.Datetime))
        .with_columns(
            pl.col(col).cast(pl.Categorical) for col in ["repo", "image_name"]
        )
        .with_columns(
            pl.col(col).cast(pl.List(pl.Categorical))
            for col in ["FAIL_TO_PASS", "PASS_TO_PASS"]
        )
        .write_parquet("instances.parquet", compression_level=22)
    )

In [None]:
pl.read_parquet("instances.parquet")

In [None]:
df.estimated_size("mb")

In [None]:
df["PASS_TO_PASS"].cast(pl.List(pl.Categorical)).estimated_size("mb")

In [None]:
for repo in df["repo"].unique().to_list():
    print(repo.split("/")[-1].replace("__", "/").split(".")[0])

In [None]:
from datasets import load_dataset
from dotenv import load_dotenv
import nest_asyncio
import os
from pathlib import Path
from pydantic import SecretStr

os.environ["SWE_AGENT_CONFIG_DIR"] = "config"
os.environ["SWE_AGENT_TOOLS_DIR"] = "tools"
os.environ["SWE_AGENT_TRAJECTORY_DIR"] = "trajectories"
from sweagent.agent.agents import DefaultAgent, DefaultAgentConfig, TemplateConfig
from sweagent.agent.models import GenericAPIModelConfig
from sweagent.agent.problem_statement import TextProblemStatement
from sweagent.environment.repo import PreExistingRepoConfig
from sweagent.environment.swe_env import EnvironmentConfig
from sweagent.run.hooks.abstract import RunHook
from sweagent.run.run_replay import RunReplay
from sweagent.run.run_single import RunSingle, RunSingleConfig
from sweagent.tools.bundle import Bundle
from sweagent.tools.parsing import XMLFunctionCallingParser
from sweagent.tools.tools import ToolConfig
from swerex.deployment.config import ModalDeploymentConfig
from typing import cast

load_dotenv()
nest_asyncio.apply()

SYSTEM_TEMPLATE = """
You are a helpful assistant that can interact with a computer to solve tasks.
""".strip()

LONG_SYSTEM_TEMPLATE = """
You are a helpful assistant that can interact with a computer to solve tasks.
<IMPORTANT>
* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
</IMPORTANT>

You have access to the following functions:

---- BEGIN FUNCTION #1: bash ----
Description: Execute a bash command in the terminal.

Parameters:
  (1) command (string, required): The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.
---- END FUNCTION #1 ----

---- BEGIN FUNCTION #2: submit ----
Description: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.
No parameters are required for this function.
---- END FUNCTION #2 ----

---- BEGIN FUNCTION #3: str_replace_editor ----
Description: Custom editing tool for viewing, creating and editing files
* State is persistent across command calls and discussions with the user
* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
* The `create` command cannot be used if the specified `path` already exists as a file
* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
* The `undo_edit` command will revert the last edit made to the file at `path`

Notes for using the `str_replace` command:
* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
* The `new_str` parameter should contain the edited lines that should replace the `old_str`

Parameters:
  (1) command (string, required): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
Allowed values: [`view`, `create`, `str_replace`, `insert`, `undo_edit`]
  (2) path (string, required): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
  (3) file_text (string, optional): Required parameter of `create` command, with the content of the file to be created.
  (4) old_str (string, optional): Required parameter of `str_replace` command containing the string in `path` to replace.
  (5) new_str (string, optional): Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
  (6) insert_line (integer, optional): Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
  (7) view_range (array, optional): Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.
---- END FUNCTION #3 ----


If you choose to call a function ONLY reply in the following format with NO suffix:

Provide any reasoning for the function call here.
<function=example_function_name>
<parameter=example_parameter_1>value_1</parameter>
<parameter=example_parameter_2>
This is the value for the second parameter
that can span
multiple lines
</parameter>
</function>

<IMPORTANT>
Reminder:
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Always provide reasoning for your function call in natural language BEFORE the function call (not after)
</IMPORTANT>
""".strip()

INSTANCE_TEMPLATE = """
<uploaded_files>
{{working_dir}}
</uploaded_files>
I've uploaded a python code repository in the directory {{working_dir}}. Consider the following PR description:

<pr_description>
{{problem_statement}}
</pr_description>

Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?
I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
Your task is to make the minimal changes to non-tests files in the {{working_dir}} directory to ensure the <pr_description> is satisfied.
Follow these steps to resolve the issue:
1. As a first step, it might be a good idea to find and read code relevant to the <pr_description>
2. Create a script to reproduce the error and execute it with `python <filename.py>` using the bash tool, to confirm the error
3. Edit the sourcecode of the repo to resolve the issue
4. Rerun your reproduce script and confirm that the error is fixed!
5. Think about edgecases and make sure your fix handles them as well
Your thinking should be thorough and so it's fine if it's very long.
""".strip()

NEXT_STEP_TEMPLATE = """
OBSERVATION:
{{observation}}
""".strip()

NEXT_STEP_NO_OUTPUT_TEMPLATE = """
Your command ran successfully and did not produce any output.
""".strip()

# Message shown to the agent when it first calls `submit`; it is passed to the
# tool registry as SUBMIT_REVIEW_MESSAGES and `{{diff}}` is filled in with the
# agent's changes. Step 1 previously contained stray keyboard input in the
# middle of the word "running", which was sent to the model verbatim.
SUBMIT_REVIEW_MESSAGE = """
Thank you for your work on this issue. Please carefully follow the steps below to help review your changes.

1. If you made any changes to your code after running the reproduction script, please run the reproduction script again.
If the reproduction script is failing, please revisit your changes and make sure they are correct.
If you have already removed your reproduction script, please ignore this step.
2. Remove your reproduction script (if you haven't done so already).
3. If you have modified any TEST files, please revert them to the state they had before you started fixing the issue.
You can do this with `git checkout -- /path/to/test/file.py`. Use below <diff> to find the files you need to revert.
4. Run the submit command again to confirm.

Here is a list of all of your changes:

<diff>
{{diff}}
</diff>
""".strip()

# Stream SWE-smith instead of downloading it, and stop at the first training
# example with a non-empty problem statement.
# `.iter(1)` yields batches of size 1, so every field of `instance` is a
# one-element list — hence the `[0]` indexing here and in the cells below.
# NOTE: this rebinds the notebook-wide `instance` to a batch dict.
ds = load_dataset("SWE-bench/SWE-smith", streaming=True)
for instance in ds["train"].iter(1):
    if instance["problem_statement"][0]:
        break

# Assemble a single SWE-agent run for the streamed SWE-smith `instance`
# (every field of `instance` is a one-element list, hence the `[0]`s).
run_single = RunSingle.from_config(
    RunSingleConfig(
        env=EnvironmentConfig(
            # Image name is derived from the dataset's `image_name` by replacing
            # "__" with "_1776_" under the "jyangballin/" namespace.
            deployment=ModalDeploymentConfig(
                image="jyangballin/" + instance["image_name"][0].replace("__", "_1776_")
            ),
            # NOTE(review): the instance id is passed as `base_commit` —
            # presumably the image exposes each instance as a git ref of that
            # name; confirm.
            repo=PreExistingRepoConfig(
                repo_name="testbed", base_commit=instance["instance_id"][0]
            ),
        ),
        agent=DefaultAgentConfig(
            # Uses the short SYSTEM_TEMPLATE; LONG_SYSTEM_TEMPLATE is defined
            # above but not referenced here.
            templates=TemplateConfig(
                system_template=SYSTEM_TEMPLATE,
                instance_template=INSTANCE_TEMPLATE,
                next_step_template=NEXT_STEP_TEMPLATE,
                next_step_no_output_template=NEXT_STEP_NO_OUTPUT_TEMPLATE,
            ),
            tools=ToolConfig(
                # Tool bundles are resolved to absolute paths from the
                # notebook's working directory.
                bundles=[
                    Bundle(path=Path("tools/registry").absolute()),
                    Bundle(path=Path("tools/edit_anthropic").absolute()),
                    Bundle(path=Path("tools/review_on_submit_m").absolute()),
                ],
                # enable_bash_tool=True,
                # parse_function=XMLFunctionCallingParser(),
                registry_variables={
                    "USE_FILEMAP": "true",
                    "SUBMIT_REVIEW_MESSAGES": [SUBMIT_REVIEW_MESSAGE],
                },
            ),
            model=GenericAPIModelConfig(
                # name="claude-3-7-sonnet-20250219",
                # name="openrouter/anthropic/claude-3.7-sonnet",
                name="openrouter/google/gemini-2.5-flash-preview-05-20",
                max_output_tokens=64000,
                # temperature=0.0,
                # completion_kwargs={"stop": "</function>"},
                # api_key=SecretStr(os.environ["ANTHROPIC_API_KEY"]),
                # Key comes from the environment (.env); raises KeyError if unset.
                api_key=SecretStr(os.environ["OPENROUTER_API_KEY"]),
                # per_instance_cost_limit=2.0,
                # NOTE(review): presumably 0.0 disables the cost cap so only the
                # call limit below applies — confirm.
                per_instance_cost_limit=0.0,
                per_instance_call_limit=75,
            ),
        ),
        problem_statement=TextProblemStatement(text=instance["problem_statement"][0]),
    )
)


# Set up a replay of a previously recorded trajectory, reusing the deployment
# configured for `run_single` and writing results under ./replay.
# NOTE(review): the trajectory path points at one specific earlier run; the
# file must exist under the local trajectories directory.
run_replay = RunReplay(
    traj_path=Path(
        "trajectories/brad/no_config__openrouter/anthropic/claude-3.7-sonnet__t-0.00__p-1.00__c-2.00___65215a/65215a/65215a.traj"
    ),
    deployment=run_single.env.deployment,
    output_dir=Path("replay"),
)

In [None]:
run_replay.main()

In [None]:
import asyncio
import re
from swerex.runtime.abstract import BashAction

import shlex

# Run the instance's PASS_TO_PASS tests in the sandbox and count outcomes from
# pytest's summary line.
# Test ids are shell-quoted one by one: parametrized ids contain characters such
# as `[`, `]` or spaces that the shell would otherwise glob or split on.
pass_to_pass_args = " ".join(shlex.quote(test_id) for test_id in instance["PASS_TO_PASS"][0])
observation = asyncio.run(
    run_single.env.deployment.runtime.run_in_session(
        BashAction(
            command=f"cd /testbed && python -m pytest {pass_to_pass_args}",
            check="silent",
        )
    )
)
# An empty output has no last line; treat it as an empty summary instead of
# raising IndexError.
output_lines = observation.output.splitlines()
summary_line = output_lines[-1] if output_lines else ""
print(f"Summary line: {summary_line}")

# The summary looks like "== 2 failed, 10 passed in 1.23s ==".
failed_match = re.search(r"(\d+)\s+failed", summary_line)
passed_match = re.search(r"(\d+)\s+passed", summary_line)

num_failed = int(failed_match.group(1)) if failed_match else 0
num_passed = int(passed_match.group(1)) if passed_match else 0

print(f"Number of failed tests: {num_failed}")
print(f"Number of passed tests: {num_passed}")

In [None]:
import asyncio
from sweagent.run.hooks.abstract import RunHook
from sweagent.types import AgentRunResult


class RewardRunHook(RunHook):
    """Run hook that runs the instance's FAIL_TO_PASS tests after the agent finishes.

    It reads the notebook-level ``run_single`` and ``instance`` variables and
    only prints the pass/fail counts; nothing is stored or returned.
    """

    def on_instance_completed(self, *, result: AgentRunResult) -> None:
        """Run the FAIL_TO_PASS tests in /testbed and print pytest's pass/fail counts.

        Args:
            result: The agent's run result (unused here).
        """
        import re
        import shlex
        from swerex.runtime.abstract import BashAction

        # Quote every test id: parametrized ids contain characters such as
        # `[`, `]` or spaces that the shell would otherwise glob or split on.
        test_args = " ".join(
            shlex.quote(test_id) for test_id in instance["FAIL_TO_PASS"][0]
        )
        observation = asyncio.run(
            run_single.env.deployment.runtime.run_in_session(
                BashAction(
                    command=f"cd /testbed && python -m pytest {test_args}",
                    check="silent",
                )
            )
        )
        # An empty output has no last line; treat it as an empty summary
        # instead of raising IndexError.
        output_lines = observation.output.splitlines()
        summary_line = output_lines[-1] if output_lines else ""
        print(f"Summary line: {summary_line}")

        # The summary looks like "== 2 failed, 10 passed in 1.23s ==".
        failed_match = re.search(r"(\d+)\s+failed", summary_line)
        passed_match = re.search(r"(\d+)\s+passed", summary_line)

        num_failed = int(failed_match.group(1)) if failed_match else 0
        num_passed = int(passed_match.group(1)) if passed_match else 0

        print(f"Number of failed tests: {num_failed}")
        print(f"Number of passed tests: {num_passed}")


run_single.add_hook(RewardRunHook())
run_single.run()

In [None]:
run_single.env.start()

In [None]:
import re
from swerex.runtime.abstract import BashAction

observation = await run_single.env.deployment.runtime.run_in_session(
    BashAction(
        command=f"cd /testbed && python -m pytest {' '.join(instance['PASS_TO_PASS'][0])}",
        check="silent",
    )
)
summary_line = observation.output.splitlines()[-1]
print(f"Summary line: {summary_line}")

failed_match = re.search(r"(\d+)\s+failed", summary_line)
passed_match = re.search(r"(\d+)\s+passed", summary_line)

num_failed = int(failed_match.group(1)) if failed_match else 0
num_passed = int(passed_match.group(1)) if passed_match else 0

print(f"Number of failed tests: {num_failed}")
print(f"Number of passed tests: {num_passed}")

In [None]:
from install import install_swe_agent

install_swe_agent()

In [None]:
import polars as pl
from run_instance import run_swe_instance

# Load the dataset
df = pl.read_parquet("hf://datasets/SWE-bench/SWE-smith/data/train-*.parquet")

# Select an instance (for example, the first one)
instance = df[0].to_dicts()[0]

# Run with Gemini 1.5 Flash
result_code, stdout, stderr = run_swe_instance(
    instance,
    model_name="gemini-1.5-flash-001",  # Use "gemini-1.5-pro-001" for Pro model
    temperature=0.0,
    output_dir="trajectories/my_test_run",
    verbose=True,
)

print(f"Run completed with exit code: {result_code}")

In [None]:
import polars as pl

# Login using e.g. `huggingface-cli login` to access this dataset
df = pl.read_parquet(
    "hf://datasets/SWE-bench/SWE-smith-trajectories/data/train-00000-of-00001.parquet"
)
df

In [None]:
df["messages"][4].to_list()

In [None]:
import polars as pl

# Login using e.g. `huggingface-cli login` to access this dataset
df = pl.read_parquet("hf://datasets/SWE-bench/SWE-smith/data/train-*.parquet")
df

In [None]:
df["image_name"].first().replace("__", "_1776_")

In [None]:
from swerex.deployment.modal import ModalDeployment
from typing import cast

deployment = ModalDeployment(
    image="jyangballin/"
    + cast(str, df["image_name"].first()).replace("__", "_1776_")
    + ":latest"
)

In [None]:
await deployment.start()

In [None]:
from swerex.runtime.abstract import Command

await deployment.runtime.execute(Command(command=["echo", "Hello, world!"]))

In [None]:
import art
from art.local import LocalBackend
from dotenv import load_dotenv
import openai


load_dotenv()

backend = LocalBackend()
model = art.TrainableModel(
    name="001",
    project="sweagent",
    base_model="Qwen/Qwen2.5-7B-Instruct",
)
await model.register(backend)


async def rollout(client: openai.AsyncOpenAI, prompt: str) -> art.Trajectory:
    """Send ``prompt`` to the model once and score the reply.

    The reply is rewarded only on an exact match: "yes" earns 0.5, "no" 0.75,
    "maybe" 1.0, and anything else 0.0.

    Note: in this notebook the definition replaces the ``rollout`` imported
    from the ``rollout`` module in an earlier cell.

    Args:
        client: OpenAI-compatible async client used for the completion.
        prompt: Text sent as the single user message.

    Returns:
        A trajectory holding the user message, the model's choice and the reward.
    """
    reward_by_reply = {"yes": 0.5, "no": 0.75, "maybe": 1.0}

    user_message = {"role": "user", "content": prompt}
    messages: art.Messages = [user_message]

    completion = await client.chat.completions.create(
        messages=messages, model=model.name, max_tokens=100, timeout=100
    )
    choice = completion.choices[0]

    content = choice.message.content
    assert isinstance(content, str)

    # Unlisted replies (including near misses such as "Yes" or "yes.") score 0.
    reward = reward_by_reply.get(content, 0.0)
    return art.Trajectory(messages_and_choices=[*messages, choice], reward=reward)


def with_quotes(w):
    """Return ``w`` wrapped in single quotes."""
    return f"'{w}'"


def _describe_options(words, use_quotes):
    """Render the answer options for one prompt.

    Three-word lists are comma-separated and optionally quoted; shorter lists
    are rendered as "a or b" (or just "a") and are never quoted.
    """
    if len(words) == 3:
        shown = [with_quotes(word) if use_quotes else word for word in words]
        return ", ".join(shown)
    options = f"{words[0]}"
    if len(words) > 1:
        options += f" or {words[1]}"
    return options


# Orderings of the answer words to present to the model (the final entry
# repeats ["yes", "no"], as in the original list).
_WORD_LISTS = [
    ["yes", "no", "maybe"],
    ["maybe", "yes", "no"],
    ["no", "yes", "maybe"],
    ["yes", "maybe", "no"],
    ["yes", "no"],
    ["maybe", "no"],
    ["no", "maybe"],
    ["no", "yes"],
    ["yes", "no"],
]

# One prompt per (prefix, quoting, word list) combination: 2 * 2 * 9 = 36.
prompts = [
    f"{prefix} with {_describe_options(words, use_quotes)}"
    for prefix in ["respond", "just respond"]
    for use_quotes in [True, False]
    for words in _WORD_LISTS
]

# Client used by `rollout` to query the model being trained.
openai_client = model.openai_client()
# Resume from the model's current step and train up to step 1,000.
for _ in range(await model.get_step(), 1_000):
    # One trajectory group per prompt, each made of 32 rollouts of that prompt.
    # NOTE(review): presumably rewards are compared within a group during
    # training — confirm against the `art` documentation.
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(rollout(openai_client, prompt) for _ in range(32))
            for prompt in prompts
        ),
        pbar_desc="gather",
    )
    # One training step on the gathered groups.
    await model.train(
        train_groups,
        config=art.TrainConfig(learning_rate=1e-4),
    )