In [1]:
import art
from art.local import LocalAPI
from dotenv import load_dotenv
from openpipe.client import AsyncOpenPipe
import random

from rollout import rollout

# Fix the seed of Python's `random` module so runs are repeatable.
random.seed(42)

# Load variables from a .env file into the environment before anything
# below reads them.
load_dotenv()

# Initialize the server (local ART API that the models register with).
api = LocalAPI()

In [None]:
# Declare the model
model = art.TrainableModel(
    name="013",
    project="2048",
    base_model="Qwen/Qwen2.5-3B-Instruct",
    # To run on a T4, we need to override some config defaults.
    _internal_config=art.dev.InternalModelConfig(
        init_args=art.dev.InitArgs(
            max_seq_length=8192,
        ),
        engine_args=art.dev.EngineArgs(
            enforce_eager=True,
            gpu_memory_utilization=0.8,
            num_scheduler_steps=1,
        ),
    ),
)
await api._experimental_pull_from_s3(
    model,
    verbose=True,
)
await model.register(api)

for i in range(await model.get_step(), 50):
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(
                rollout(model, i, is_validation=False) for _ in range(4)
            )
            for _ in range(1)
        ),
        pbar_desc="gather",
        max_exceptions=1,
    )
    await model.delete_checkpoints()
    await api._experimental_push_to_s3(
        model,
    )
    await model.train(
        train_groups,
        config=art.TrainConfig(learning_rate=3e-5),
        # Lowering the logprob_calculation_chunk_size is a memory saving measure
        # to allow longer sequences (up to 4096 tokens) to be processed on a T4.
        _config={"logprob_calculation_chunk_size": 8},
    )


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth  # type: ignore


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-30 01:31:12 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.1. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 78.34%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.1 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Seq

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.38it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.38it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.28it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.28it/s]



INFO 04-30 01:31:29 model_runner.py:1115] Loading model weights took 2.2265 GB
INFO 04-30 01:31:29 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-30 01:31:31 worker.py:267] Memory profiling takes 2.53 seconds
INFO 04-30 01:31:31 worker.py:267] the current vLLM instance can use total_gpu_memory (79.10GiB) x gpu_memory_utilization (0.78) = 61.97GiB
INFO 04-30 01:31:31 worker.py:267] model weights take 2.23GiB; non_torch_memory takes 0.14GiB; PyTorch activation peak memory takes 2.71GiB; the rest of the memory reserved for KV Cache is 56.89GiB.
INFO 04-30 01:31:32 executor_base.py:111] # cuda blocks: 103556, # CPU blocks: 10922
INFO 04-30 01:31:32 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 50.56x
INFO 04-30 01:31:39 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory err

Capturing CUDA graph shapes: 100%|██████████| 49/49 [00:42<00:00,  1.15it/s]


INFO 04-30 01:32:22 model_runner.py:1562] Graph capturing finished in 43 secs, took 1.30 GiB
INFO 04-30 01:32:22 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 53.16 seconds


Unsloth 2025.3.19 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


gather:   0%|          | 0/4 [00:00<?, ?it/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mopenpipe[0m ([33mopenpipe-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Packed 4 trajectories into 2 sequences of length 6144


train:   0%|          | 0/2 [00:00<?, ?it/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000,000 | Num Epochs = 3 | Total steps = 30,000,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 14,966,784/3,000,000,000 (0.50% trained)


Unsloth: Will smartly offload gradients to save VRAM!


gather:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
import asyncio
import os


async def log_comparison_model(comparison_model: art.Model):
    """Gather validation rollouts for `comparison_model` and log them.

    A single trajectory group of 12 `rollout(comparison_model, 0,
    is_validation=True)` calls is gathered (with `max_exceptions=1`), and the
    gathered result is passed to `comparison_model.log` under the "val" split.

    Args:
        comparison_model: The model to roll out and to log the results on.
    """

    def _validation_groups():
        # Yield the one group lazily, so it is only constructed when
        # `gather_trajectory_groups` iterates over its input.
        yield art.TrajectoryGroup(
            rollout(comparison_model, 0, is_validation=True) for _ in range(12)
        )

    gathered_groups = await art.gather_trajectory_groups(
        _validation_groups(),
        pbar_desc=f"gather {comparison_model.name}",
        max_exceptions=1,
    )

    await comparison_model.log(gathered_groups, split="val")

gpt_4o_mini = art.Model(
    name="gpt-4o-mini",
    project="2048",
    inference_model_name="gpt-4o-mini",
    inference_base_url="https://api.openai.com/v1",
    inference_api_key=os.getenv("OPENAI_API_KEY"),
)
await gpt_4o_mini.register(api)

gpt_4o = art.Model(
    name="gpt-4o",
    project="2048",
    inference_model_name="gpt-4o",
    inference_base_url="https://api.openai.com/v1",
    inference_api_key=os.getenv("OPENAI_API_KEY"),
)
await gpt_4o.register(api)

gpt_4_1 = art.Model(
    name="gpt-4.1",
    project="2048",
    inference_model_name="gpt-4.1",
    inference_base_url="https://api.openai.com/v1",
    inference_api_key=os.getenv("OPENAI_API_KEY"),
)
await gpt_4_1.register(api)

await api._experimental_push_to_s3(
    gpt_4o_mini,
)
await api._experimental_push_to_s3(
    gpt_4o,
)
await api._experimental_push_to_s3(
    gpt_4_1,
)


# Optional logging client
op_client = AsyncOpenPipe()

promises = []

for comparison_model in [gpt_4o_mini, gpt_4o, gpt_4_1]:
    promises.append(log_comparison_model(comparison_model))

await asyncio.gather(*promises)


gather gpt-4o-mini:   0%|          | 0/12 [00:00<?, ?it/s]

gather gpt-4o:   0%|          | 0/12 [00:00<?, ?it/s]

gather gpt-4.1:   0%|          | 0/12 [00:00<?, ?it/s]

[None, None, None]