In [5]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before
# each cell executes so edits to local project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
%%html
<style>
/* Remove the opaque background drawn behind ipywidget output
   (presumably so widgets blend with the editor theme - confirm in VS Code). */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Map the Jupyter widget text color and font size onto the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [7]:
import art
from dotenv import load_dotenv
import random
from openpipe.client import OpenPipe

# Load variables from a local .env file into the process environment
# (presumably the API credentials the clients below read - confirm).
load_dotenv()

# Shared OpenPipe client; rollout() uses it to report every chat completion.
op_client = OpenPipe()
print("OpenPipe client initialized")

# Seed Python's global RNG so symbol assignment and the random opponent are repeatable.
random.seed(42)


# Local ART API handle and the model that rollout() queries and the training loop tunes.
# NOTE(review): get_or_create_model presumably resumes an existing "tic-tac-toe-001"
# run if one exists - confirm against the ART documentation.
api = art.LocalAPI(wandb_project="agent-reinforcement-training")
model = await api.get_or_create_model(
    name="tic-tac-toe-001", base_model="NousResearch/Hermes-2-Theta-Llama-3-8B"
)

OpenPipe client initialized


In [8]:
import art
import asyncio
from dotenv import load_dotenv
import json
import openai
import random
from typing import TypedDict
from openpipe.client import OpenPipe
import time
from typing import Literal
from pydantic import BaseModel

load_dotenv()


class TicTacToeGame(TypedDict, total=True):
    """State of a single tic-tac-toe game.

    Keys:
        board: Square grid of cells stored row by row; an empty string marks a free cell.
        agent_symbol: Symbol played by the model being trained.
        opponent_symbol: Symbol played by the opponent.
    """

    board: list[list[str]]
    agent_symbol: Literal["x", "o"]
    opponent_symbol: Literal["x", "o"]

def generate_game(board_length: int = 3) -> TicTacToeGame:
    """Create a fresh game with an empty board and randomly assigned symbols.

    Args:
        board_length: Number of rows and columns of the square board.

    Returns:
        A game whose cells are all empty strings, with the agent's symbol drawn
        from the global ``random`` generator and the opponent given the other one.
    """
    # Draw the agent's side first; the opponent simply gets whichever symbol is left.
    agent_symbol = random.choice(["x", "o"])
    if agent_symbol == "o":
        opponent_symbol = "x"
    else:
        opponent_symbol = "o"

    # One independent list per row so that writing a cell never aliases another row.
    empty_board = []
    for _ in range(board_length):
        empty_board.append([""] * board_length)

    return {
        "board": empty_board,
        "agent_symbol": agent_symbol,
        "opponent_symbol": opponent_symbol,
    }

def render_board(game: TicTacToeGame) -> str:
    """Render the board as a text grid suitable for a prompt.

    Example output for a 3x3 board::

           1   2   3
        A  _ | X | X
        B  O | _ | _
        C  _ | O | _

    Empty cells are shown as ``_`` and symbols are upper-cased. Rows are labelled
    with letters starting at ``A`` and columns with numbers starting at ``1``.

    Rendering is read-only: ``game["board"]`` is not modified. Writing the display
    characters back into the board would break every consumer that looks for ``""``
    (empty cell) or the lower-case symbols, such as move selection and win detection.

    Args:
        game: The game whose board should be rendered.

    Returns:
        The rendered grid, one line per row, each line terminated by a newline.
    """
    board = game["board"]
    board_length = len(board)

    # Column numbers are spaced to sit directly above the " | "-separated cells.
    header = "   " + "   ".join(str(col + 1) for col in range(board_length))
    lines = [header]
    for row_index, row in enumerate(board):
        # Build display values in a new list so the game state stays untouched.
        cells = ["_" if cell == "" else cell.upper() for cell in row]
        lines.append(f"{chr(65 + row_index)}  " + " | ".join(cells))
    return "\n".join(lines) + "\n"

def get_opponent_move(game: TicTacToeGame) -> tuple[int, int]:
    """Pick a uniformly random empty cell for the opponent.

    The search covers the board's actual dimensions rather than assuming 3x3, so it
    works for any board size produced by ``generate_game``.

    Args:
        game: The game whose board is searched for empty (``""``) cells.

    Returns:
        ``(row, column)`` zero-based indices of the chosen cell.

    Raises:
        IndexError: If the board has no empty cell (raised by ``random.choice``).
    """
    # Row-major order keeps the candidate list, and therefore seeded results on a
    # 3x3 board, identical to scanning range(3) x range(3).
    empty_cells = [
        (row_index, col_index)
        for row_index, row in enumerate(game["board"])
        for col_index, cell in enumerate(row)
        if cell == ""
    ]
    return random.choice(empty_cells)

# Structured reply expected from the agent on each turn.
# NOTE: documented with `#` comments on purpose. pydantic copies a model's docstring
# into the generated JSON schema as "description", so adding a docstring here would
# change agent_move_json_schema.
class AgentMove(BaseModel):
    # Presumably the model's explanation for its choice; nothing in the visible code reads it.
    reason: str
    # Target square; parse_agent_move reads it as a row letter followed by a column number, e.g. "B2".
    square: str

# JSON schema for AgentMove; rollout() sends it as the "guided_json" value of its chat completion request.
agent_move_json_schema = AgentMove.model_json_schema()

def parse_agent_move(move: str) -> tuple[int, int]:
    """Parse the agent's JSON reply into zero-based board coordinates.

    The reply must be a JSON object with a ``"square"`` entry made of a row letter
    followed by a 1-based column number, e.g. ``"A1"``, ``"b2"`` or ``"C3"``.
    Surrounding whitespace and letter case are ignored.

    Args:
        move: Raw JSON text returned by the model.

    Returns:
        ``(row, column)`` zero-based indices, e.g. ``"B3"`` -> ``(1, 2)``.

    Raises:
        ValueError: If ``move`` is not valid JSON, or ``square`` is not a string of
            the form ``<letter><positive integer>``.
        KeyError: If the JSON object has no ``"square"`` entry.
        TypeError: If the JSON value is not an object.
    """
    json_move = json.loads(move)
    square = json_move["square"]
    if not isinstance(square, str):
        raise ValueError(f"square must be a string like 'A1', got {square!r}")

    # Normalise casing/spacing: 'a1' would otherwise map to row 32 instead of row 0.
    normalized = square.strip().upper()
    row_label, column_label = normalized[:1], normalized[1:]

    if not ("A" <= row_label <= "Z"):
        raise ValueError(f"square must start with a row letter, got {square!r}")
    # Reject column 0 explicitly: it would become index -1 and silently wrap around
    # to the last column when used to index the board.
    if not (column_label.isascii() and column_label.isdigit()) or int(column_label) < 1:
        raise ValueError(f"square must end with a column number >= 1, got {square!r}")

    return (ord(row_label) - 65, int(column_label) - 1)

def check_winner(board: list[list[str]]) -> Literal["x", "o", "draw", None]:
    """Determine the outcome of a square board.

    A symbol wins when it fills an entire row, column, or one of the two main
    diagonals. Rows are checked first, then columns, then diagonals.

    Args:
        board: Square grid of cells; ``""`` marks an empty cell.

    Returns:
        The winning symbol, ``"draw"`` if every cell is filled without a winner,
        or ``None`` while the game is still in progress.
    """
    board_length = len(board)

    lines: list[list[str]] = [list(row) for row in board]
    lines.extend(
        [board[row][col] for row in range(board_length)] for col in range(board_length)
    )
    # Each diagonal is collected in full. Comparing only neighbouring pairs against
    # the corner cell reports a win as soon as the first two diagonal cells match.
    lines.append([board[i][i] for i in range(board_length)])
    lines.append([board[i][board_length - 1 - i] for i in range(board_length)])

    for line in lines:
        if line and line[0] != "" and all(cell == line[0] for cell in line):
            return line[0]

    # No winner: a completely filled board is a draw.
    if all(cell != "" for row in board for cell in row):
        return "draw"
    return None

def get_trajectory_messages(trajectory: art.Trajectory) -> art.Messages:
    """Flatten a trajectory into the plain chat messages to send to the model.

    ``trajectory.messages_and_choices`` may hold three kinds of items:

    * plain message dicts (``{"role": ..., "content": ...}``), used as they are;
    * dicts wrapping a message under a ``"message"`` key, unwrapped to that message;
    * completion choice objects exposing a ``.message`` attribute (as returned by the
      OpenAI client), converted to an assistant message carrying the same content.

    Args:
        trajectory: Trajectory whose ``messages_and_choices`` should be flattened.

    Returns:
        The messages in their original order.
    """
    messages: art.Messages = []
    for item in trajectory.messages_and_choices:
        if isinstance(item, dict):
            messages.append(item["message"] if "message" in item else item)
        elif hasattr(item, "message"):
            # `"message" in obj` does not work for choice objects, so convert via attribute access.
            messages.append({"role": "assistant", "content": item.message.content})
        else:
            messages.append(item)
    return messages


async def rollout(
    client: openai.AsyncOpenAI, iteration: int, is_validation: bool
) -> art.Trajectory:
    """Play one game against the random opponent and return the scored trajectory.

    The agent is prompted with the rendered board before each of its moves. Every
    model reply is appended to the trajectory as a choice, so the returned trajectory
    contains the assistant turns as well as the prompts. Each completion is also
    reported through the notebook-level ``op_client``; the model name comes from the
    notebook-level ``model``.

    Rewards: 1 for a win, 0.5 for a draw, 0 for a loss. A reply that cannot be
    parsed, or that names a square that is off the board or already taken, forfeits
    the game with reward 0.

    Args:
        client: OpenAI-compatible async client used to query the model.
        iteration: Training iteration, recorded in the reported metadata.
        is_validation: Whether this is a validation rollout, recorded in the metadata.

    Returns:
        The trajectory with its ``reward`` set.
    """
    game = generate_game()

    trajectory = art.Trajectory(
        messages_and_choices=[{"role": "system", "content": f"You are a tic-tac-toe player. You are playing against an opponent. Always choose the move most likely to lead to an eventual win. Return the move in the format 'A1', 'B2', 'C3', etc. You are the {game['agent_symbol']} symbol."}], reward=0, metrics={"test": "something"}
    )

    # The opponent opens when the agent plays "o".
    if game["agent_symbol"] == "o":
        starting_opponent_move = get_opponent_move(game)
        game["board"][starting_opponent_move[0]][starting_opponent_move[1]] = game["opponent_symbol"]

    agent_move_number = 0
    while check_winner(game["board"]) is None:
        agent_move_number += 1
        trajectory.messages_and_choices.append({"role": "user", "content": render_board(game)})

        requested_at = int(time.time() * 1000)
        messages = get_trajectory_messages(trajectory)
        chat_completion = await client.chat.completions.create(
            messages=messages,
            model=model.name,
            # guided_json is a vLLM-specific request field, so it is sent via
            # extra_body; response_format only accepts the OpenAI-defined shapes.
            extra_body={"guided_json": agent_move_json_schema},
        )

        op_client.report(
            requested_at=requested_at,
            received_at=int(time.time() * 1000),
            req_payload={
                "model": model.name,
                "messages": messages,
                "metadata": {
                    "notebook-id": "tic-tac-toe",
                    "iteration": str(iteration),
                    "validation": str(is_validation),
                    # 1-based count of the agent's moves in this game.
                    "move_number": str(agent_move_number),
                },
            },
            resp_payload=chat_completion,
            status_code=200,
        )

        choice = chat_completion.choices[0]
        content = choice.message.content
        assert isinstance(content, str)
        # Record the model's own turn; otherwise the trajectory holds only prompts.
        trajectory.messages_and_choices.append(choice)

        board = game["board"]
        try:
            row, col = parse_agent_move(content)
        except (ValueError, KeyError, IndexError, TypeError):
            # Unparseable reply: forfeit instead of crashing the whole batch.
            trajectory.reward = 0
            return trajectory
        if not (0 <= row < len(board) and 0 <= col < len(board[row])) or board[row][col] != "":
            # Off-board or occupied square: forfeit instead of overwriting a piece.
            trajectory.reward = 0
            return trajectory
        board[row][col] = game["agent_symbol"]

        if check_winner(board) is not None:
            break

        opponent_move = get_opponent_move(game)
        board[opponent_move[0]][opponent_move[1]] = game["opponent_symbol"]

    winner = check_winner(game["board"])

    if winner == game["agent_symbol"]:
        trajectory.reward = 1
    elif winner == game["opponent_symbol"]:
        trajectory.reward = 0
    elif winner == "draw":
        trajectory.reward = 0.5

    return trajectory


# NOTE(review): `stride` is not referenced anywhere else in the visible code -
# confirm whether a later cell needs it before removing.
stride = 32
# Start at the iteration the model reports and continue up to (not including) 10.
for i in range(await model.get_iteration(), 10):
    async with model.openai_client(
        estimated_completion_tokens=350, verbosity=2
    ) as openai_client:
        # Gather 2 validation rollouts and 50 training rollouts concurrently,
        # all through the same client.
        val_groups, train_groups = await asyncio.gather(
            art.gather_trajectories(
                (
                    (rollout(openai_client, i, is_validation=True) for _ in range(2))
                ),
                pbar_desc="val",
                stream_chat_completions=8,
            ),
            art.gather_trajectories(
                (
                    (rollout(openai_client, i, is_validation=False) for _ in range(50))
                ),
                pbar_desc="train",
            ),
        )
    # The validation trajectories are only logged; the training trajectories drive the tuning step.
    await model.log(val_groups)
    # NOTE(review): what clear_iterations() discards is not visible here - confirm
    # it is safe to call before tune() on every pass.
    await model.clear_iterations()
    await model.tune(
        train_groups, config=art.TuneConfig(plot_tensors=True, verbosity=2)
    )

$ vllm serve NousResearch/Hermes-2-Theta-Llama-3-8B --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.95 --max-num-seqs=2048 --max-num-batched-tokens=16384 --num-scheduler-steps=8 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=80 --tensor-parallel-size=1 --tool-call-parser=hermes --served-model-name=tic-tac-toe-001 --port=8001 --api-key=default


Process SpawnProcess-1:
Traceback (most recent call last):
  File "/root/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/root/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/root/sky_workdir/.venv/lib/python3.12/site-packages/vllm/engine/multiprocessing/engine.py", line 402, in run_mp_engine
    raise e
  File "/root/sky_workdir/.venv/lib/python3.12/site-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
    engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/sky_workdir/.venv/lib/python3.12/site-packages/vllm/engine/multiprocessing/engine.py", line 124, in from_engine_args
    return cls(ipc_path=ipc_path,
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/roo

CancelledError: 