In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Render the ipywidget output background as transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Take the widget text color and font size from the VS Code editor variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
import random
from typing import TypedDict
from typing import Literal
import xml.etree.ElementTree as ET


# State of a single tic-tac-toe game, stored as a plain dict.
#   board           -- rows of cells; a cell holds "x", "o", or "_" when empty
#   agent_symbol    -- the mark played by the agent
#   opponent_symbol -- the mark played by the opponent
TicTacToeGame = TypedDict(
    "TicTacToeGame",
    {
        "board": list[list[str]],
        "agent_symbol": Literal["x", "o"],
        "opponent_symbol": Literal["x", "o"],
    },
)


def generate_game(board_length: int = 3) -> TicTacToeGame:
    """Create a new game with an empty square board and random symbol assignment.

    Args:
        board_length: Number of rows (and columns) on the board.

    Returns:
        A game dict whose board is filled with "_" and whose agent and
        opponent symbols are the two different marks "x" and "o".
    """
    empty_board = [["_"] * board_length for _ in range(board_length)]

    # The agent's mark is drawn at random; the opponent gets the other one.
    agent_mark = random.choice(["x", "o"])
    opponent_mark = "o" if agent_mark == "x" else "x"

    return {
        "board": empty_board,
        "agent_symbol": agent_mark,
        "opponent_symbol": opponent_mark,
    }


def render_board(game: TicTacToeGame) -> str:
    """Render the board as text with numbered columns and lettered rows.

    For a 3x3 board the result looks like this ("_" is an empty cell):

           1   2   3
        A  _ | x | x
        B  o | _ | _
        C  _ | o | _

    Every cell of every row is rendered, so the board does not have to be 3x3.

    Args:
        game: Game whose ``board`` is rendered.

    Returns:
        The rendered board; every line, including the last, ends with a newline.
    """
    board = game["board"]

    # Column numbers are 1-based; rows are lettered starting at "A" (chr(65)).
    header = "   " + "   ".join(str(col + 1) for col in range(len(board)))
    lines = [header]
    for row_index, row in enumerate(board):
        lines.append(f"{chr(65 + row_index)}  " + " | ".join(row))
    return "\n".join(lines) + "\n"


def get_opponent_move(game: TicTacToeGame) -> tuple[int, int]:
    """Choose a random empty cell for the opponent.

    The whole board is scanned, whatever its size.

    Args:
        game: Game whose ``board`` is searched for empty ("_") cells.

    Returns:
        The ``(row_index, col_index)`` of the chosen empty cell, 0-based.

    Raises:
        IndexError: If the board has no empty cell (raised by ``random.choice``).
    """
    empty_cells = [
        (row_index, col_index)
        for row_index, row in enumerate(game["board"])
        for col_index, cell in enumerate(row)
        if cell == "_"
    ]
    return random.choice(empty_cells)


def apply_agent_move(game: TicTacToeGame, move: str) -> None:
    """Parse the agent's XML-encoded move and place its symbol on the board.

    ``move`` must be a single XML element whose text names a square as a row
    letter followed by a 1-based column number, e.g. ``<move>A1</move>``.
    The element's tag name is not checked. Whitespace around the square is
    ignored, and the whole remainder after the row letter must be digits, so
    "A10" means column 10 rather than column 1.

    Args:
        game: Game state; ``game["board"]`` is modified in place on success
            and left untouched when the move is rejected.
        move: XML string carrying the square.

    Raises:
        ValueError: If ``move`` cannot be parsed as XML, the square cannot be
            parsed, the square lies outside the board, or the square is
            already occupied.
    """
    board = game["board"]
    board_length = len(board)

    try:
        root = ET.fromstring(move)
    except Exception as e:
        raise ValueError("Invalid xml") from e

    # An empty element has text None; treat it like an empty square name.
    square = (root.text or "").strip()
    column_text = square[1:]
    # Reject a missing or non-numeric column instead of reading only one character.
    if not (column_text.isascii() and column_text.isdigit()):
        raise ValueError("Unable to parse square")

    row_index = ord(square[0]) - 65  # "A" -> 0, "B" -> 1, ...
    col_index = int(column_text) - 1  # columns are 1-based in the move text

    if not (0 <= row_index < board_length and 0 <= col_index < board_length):
        raise ValueError(
            f"Invalid move, row or column out of bounds: {row_index}, {col_index}"
        )

    # Only empty squares may be claimed.
    if board[row_index][col_index] != "_":
        raise ValueError("Square already occupied")

    board[row_index][col_index] = game["agent_symbol"]


def check_winner(board: list[list[str]]) -> Literal["x", "o", "draw", None]:
    """Report the outcome of the game on ``board``.

    Lines are inspected in a fixed order: rows from top to bottom, columns
    from left to right, the diagonal from top right to bottom left, then the
    diagonal from top left to bottom right. The first line filled with a
    single non-empty symbol decides the winner.

    Args:
        board: Square grid of cells holding "x", "o", or "_" for empty.

    Returns:
        The winning symbol, "draw" when there is no winner and no empty cell
        is left, or None while the game is still open.
    """
    size = len(board)

    def candidate_lines():
        # Produced lazily so each line is only built when it is examined.
        yield from board
        for col in range(size):
            yield [board[row][col] for row in range(size)]
        yield [board[i][size - i - 1] for i in range(size)]
        yield [board[i][i] for i in range(size)]

    for line in candidate_lines():
        first = line[0]
        if line.count(first) == size and first != "_":
            return first

    # No winner: the game is a draw only once every cell is taken.
    has_empty_cell = any(cell == "_" for row in board for cell in row)
    return None if has_empty_cell else "draw"


In [5]:
import art
from dotenv import load_dotenv
import random
from pydantic import BaseModel

from openpipe.client import OpenPipe

# Load variables from a .env file into the environment
# (presumably the credentials the clients below need — confirm).
load_dotenv()

op_client = OpenPipe()
print("OpenPipe client initialized")

# Seed Python's global RNG; generate_game and get_opponent_move both draw from it.
random.seed(42)

# API handle bound to the SkyPilot cluster named "art"; the trainable model is
# registered against it in a later cell.
api = art.SkypilotAPI(cluster_name="art")

OpenPipe client initialized
Cluster art exists, using it...


Output()

[0m⚙︎ Job submitted, ID: 1



[36mJob ID: [1m1[0m
📋 Useful Commands
[2m├── [0mTo cancel the job:		[1msky cancel art 1[0m
[2m├── [0mTo stream job logs:		[1msky logs art 1[0m
[2m└── [0mTo view job queue:		[1msky queue art[0m

Cluster name: art
[2m├── [0mTo log into the head VM:	[1mssh art[0m
[2m├── [0mTo submit a job:		[1msky exec art yaml_file[0m
[2m├── [0mTo stop the cluster:	[1msky stop art[0m
[2m└── [0mTo teardown the cluster:	[1msky down art[0m

[?25hTask launched, waiting for it to start...


Output()

Art server task started


Using base_url: http://216.81.245.26:30375


In [9]:
class CustomConfig(BaseModel):
    """Model config that names the LiteLLM model to call.

    ``rollout`` uses ``litellm_model_name`` as the completion model whenever a
    model's ``config`` is a ``CustomConfig``.
    """
    litellm_model_name: str | None = None

# The trainable model that the training loop below rolls out and trains.
model = art.TrainableModel(
    name="005", project="tic-tac-toe", base_model="Qwen/Qwen2.5-3B-Instruct"
)
# Register the model with the API handle created in the setup cell.
await model.register(api)

# gpt_4o_mini = art.Model(
#     name="gpt-4o-mini",
#     project="tic-tac-toe",
#     config=CustomConfig(
#         litellm_model_name="openai/gpt-4o-mini",
#     ),
# )
# await gpt_4o_mini.register(api)

# gpt_4o = art.Model(
#     name="gpt-4o",
#     project="tic-tac-toe",
#     config=CustomConfig(
#         litellm_model_name="openai/gpt-4o",
#     ),
# )
# await gpt_4o.register(api)

Registering model with server
To view logs, run: 'uv run sky logs art'


Output()

In [10]:
import art
import openai
import time
from litellm import acompletion
import math

from art.utils.litellm import convert_litellm_choice_to_openai


@art.retry(exceptions=(openai.LengthFinishReasonError,))
async def rollout(
    model: art.Model, iteration: int, is_validation: bool
) -> art.Trajectory:
    """Play one tic-tac-toe game with ``model`` as the agent and return its trajectory.

    The agent alternates with a randomly moving opponent; the opponent moves
    first when the agent plays "o". Each completion is reported to OpenPipe,
    and the final reward is written to the log entry of the last completion.

    Rewards:
        1 for an agent win, 0.5 for a draw and 0 for a loss. A move that
        cannot be applied ends the game with
        ``-1 + log(move_number + 1) / log(100)``, so a mistake made after more
        successful moves is penalised slightly less.

    Args:
        model: Model that produces the agent's moves.
        iteration: Training iteration, recorded in the OpenPipe metadata.
        is_validation: Recorded in the OpenPipe metadata.

    Returns:
        The trajectory holding the conversation and the final reward.

    Raises:
        openai.LengthFinishReasonError: Propagated unchanged (the ``art.retry``
            decorator is configured for this exception type).
        Exception: Any other completion error is printed and re-raised after
            the partial trajectory is stored in the global
            ``failing_trajectory``.
    """
    # Debug hook: holds the trajectory of the most recent failed completion.
    global failing_trajectory

    game = generate_game()

    trajectory = art.Trajectory(
        messages_and_choices=[
            {
                "role": "system",
                "content": f"You are a tic-tac-toe player. You are playing against an opponent. Always choose the move most likely to lead to an eventual win. Return your move as an XML object with a single property 'move', like so: <move>A1</move>. Optional moves are 'A1', 'B3', 'C2', etc. You are the {game['agent_symbol']} symbol.",
            }
        ],
        reward=0,
    )

    # The completion model does not change during a game, so resolve it once.
    model_id = (
        model.config.litellm_model_name
        if isinstance(model.config, CustomConfig)
        else f"hosted_vllm/{model.name}"
    )

    move_number = 0
    # Remains None only if the loop below never obtains a completion.
    last_completion = None

    if game["agent_symbol"] == "o":
        starting_opponent_move = get_opponent_move(game)
        game["board"][starting_opponent_move[0]][starting_opponent_move[1]] = game[
            "opponent_symbol"
        ]

    while check_winner(game["board"]) is None:
        trajectory.messages_and_choices.append(
            {"role": "user", "content": render_board(game)}
        )

        requested_at = int(time.time() * 1000)
        messages = trajectory.messages()

        try:
            chat_completion = await acompletion(
                base_url=model.base_url,
                api_key=model.api_key,
                model=model_id,
                messages=messages,
                max_completion_tokens=128,
            )
            last_completion = chat_completion
        except openai.LengthFinishReasonError:
            raise
        except Exception as e:
            print("caught exception generating chat completion")
            print(e)
            failing_trajectory = trajectory
            raise

        # Reporting is best-effort: a failure is printed and the game goes on.
        try:
            op_client.report(
                requested_at=requested_at,
                received_at=int(time.time() * 1000),
                req_payload={
                    "model": model.name,
                    "messages": messages,
                    "metadata": {
                        "notebook-id": "tic-tac-toe",
                        "iteration": str(iteration),
                        "validation": str(is_validation),
                        "move_number": str(move_number),
                    },
                },
                resp_payload=chat_completion,
                status_code=200,
            )
        except Exception as e:
            print(f"Error reporting to OpenPipe: {e}")

        choice = convert_litellm_choice_to_openai(chat_completion.choices[0])
        content = choice.message.content
        assert isinstance(content, str)
        trajectory.messages_and_choices.append(choice)

        try:
            apply_agent_move(game, content)
        except ValueError:
            # A rejected move ends the game with a negative reward.
            trajectory.reward = -1 + (math.log(move_number + 1) / math.log(100))
            break

        if check_winner(game["board"]) is not None:
            break
        move_number += 1

        opponent_move = get_opponent_move(game)
        game["board"][opponent_move[0]][opponent_move[1]] = game["opponent_symbol"]

    winner = check_winner(game["board"])

    # After a rejected move the winner is None, so the penalty set above is kept.
    if winner == game["agent_symbol"]:
        trajectory.reward = 1
    elif winner == game["opponent_symbol"]:
        trajectory.reward = 0
    elif winner == "draw":
        trajectory.reward = 0.5

    if last_completion is not None:
        try:
            op_client.update_log_metadata(
                filters=[
                    {
                        "field": "completionId",
                        "equals": last_completion.id,
                    }
                ],
                metadata={
                    "reward": str(trajectory.reward),
                    "reward_assigned": "true",
                },
            )
        except Exception as e:
            print(f"Error updating log metadata: {e}")

            # Also show the reward that could not be recorded.
            print(trajectory.reward)

    return trajectory


In [None]:
# Start at the step reported by model.get_step() and run up to step 300.
for i in range(await model.get_step(), 300):
    # Gather a single trajectory group made of 48 training rollouts.
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(
                rollout(model, i, is_validation=False) for _ in range(48)
            )
            for _ in range(1)
        ),
        pbar_desc="gather",
    )
    # NOTE(review): presumably prunes stored checkpoints before the next
    # training step — confirm against the art API.
    await model.delete_checkpoints()
    await model.train(train_groups, config=art.TrainConfig(learning_rate=1e-4))

gather:   0%|          | 0/48 [00:00<?, ?it/s]

train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/48 [00:00<?, ?it/s]

In [7]:
import asyncio

async def log_comparison_model(comparison_model: art.Model):
    """Play validation games with ``comparison_model`` and log them to the "val" split.

    A single trajectory group of 12 validation rollouts (iteration 0) is
    gathered and the result is passed to ``comparison_model.log``.
    """
    validation_groups = (
        art.TrajectoryGroup(
            rollout(comparison_model, 0, is_validation=True) for _ in range(12)
        )
        for _ in range(1)
    )
    gathered = await art.gather_trajectory_groups(
        validation_groups,
        pbar_desc=f"gather {comparison_model.name}",
        max_exceptions=1,
    )

    await comparison_model.log(gathered, split="val")

# One logging coroutine per comparison model; all are awaited together below.
promises = []

# NOTE(review): gpt_4o_mini and gpt_4o are only defined in commented-out code
# in the model-registration cell, so this loop raises NameError on a fresh
# kernel — confirm whether those definitions should be restored.
for comparison_model in [gpt_4o_mini, gpt_4o]:
    promises.append(log_comparison_model(comparison_model))

await asyncio.gather(*promises)

gather gpt-4o-mini:   0%|          | 0/12 [00:00<?, ?it/s]

gather gpt-4o:   0%|          | 0/12 [00:00<?, ?it/s]

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
train/completion_tokens,▁
train/exception_rate,▁
train/reward,▁
train/reward_std_dev,▁
train/test,▁

0,1
train/completion_tokens,3.1
train/exception_rate,0.0
train/reward,-1.0
train/reward_std_dev,0.0
train/test,5.0




0,1
val/exception_rate,0
val/reward,-1
val/reward_std_dev,0
val/test,5


ERROR:asyncio:Task was destroyed but it is pending!
task: <Task pending name='Task-854' coro=<Event.wait() running at /root/miniconda3/lib/python3.10/asyncio/locks.py:214> wait_for=<Future cancelled>>


[None, None]

