In [1]:
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell runs so edits to imported source files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# NOTE(review): peft is imported before unsloth here, and unsloth reports this
# in the cell output ("Please restructure your imports with 'import unsloth' at
# the top of your file."). The ordering may be deliberate (this cell also turns
# on `enable_sleep_mode`) -- confirm before reordering these imports.
import peft # type: ignore
import unsloth # type: ignore
from art.config.model import get_model_config
from art.unsloth.state import ModelState

# Build the model configuration for the base model, writing artifacts under
# ./.art/models/test and overriding only the `init_args.enable_sleep_mode` flag.
config = get_model_config(
    "Qwen/Qwen2.5-14B-Instruct",
    output_dir="./.art/models/test",
    config={
        "init_args": {
            "enable_sleep_mode": True,
        }
    },
)
# Construct the shared state object; later cells read `state.vllm`,
# `state.tokenizer`, `state.trainer` and `state.inputs_queue` from it.
state = ModelState(config)


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth # type: ignore


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-05 04:24:55 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-14b-instruct-unsloth-bnb-4bit with actual GPU utilization = 79.39%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.11 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Sequences = 368.
Unsloth: vLLM's KV Cache can use up 

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 04-05 04:25:11 model_runner.py:1115] Loading model weights took 10.6011 GB
INFO 04-05 04:25:11 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-05 04:25:15 worker.py:267] Memory profiling takes 3.19 seconds
INFO 04-05 04:25:15 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.79) = 62.81GiB
INFO 04-05 04:25:15 worker.py:267] model weights take 10.60GiB; non_torch_memory takes 0.15GiB; PyTorch activation peak memory takes 4.26GiB; the rest of the memory reserved for KV Cache is 47.80GiB.
INFO 04-05 04:25:15 executor_base.py:111] # cuda blocks: 16314, # CPU blocks: 2048
INFO 04-05 04:25:15 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 7.97x
INFO 04-05 04:25:17 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 49/49 [00:39<00:00,  1.25it/s]

INFO 04-05 04:25:56 model_runner.py:1562] Graph capturing finished in 39 secs, took 9.48 GiB
INFO 04-05 04:25:56 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 44.99 seconds



Unsloth 2025.3.19 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [3]:
import asyncio
from art.unsloth.vllm import set_vllm_log_file
import random
import torch
import vllm


# Presumably redirects vLLM's logging to this file so it does not flood the
# notebook output -- confirm against art.unsloth.vllm.
set_vllm_log_file("./vllm.log")

# Scale for warmup generation lengths: warmup() draws each request's
# `max_tokens` at random with an upper bound of 2 * num_tokens.
num_tokens = 1024


async def warmup(request_id: str) -> None:
    """Send one throwaway generation request of random tokens to the vLLM engine.

    The generation length is drawn at random (upper bound ``2 * num_tokens``)
    and the prompt is sized so that prompt + generation roughly fills this
    request's share of the engine's GPU cache blocks. All generated outputs are
    discarded; the request exists only to exercise the engine.

    Args:
        request_id: Identifier passed through to the engine for this request.
    """
    # vLLM's SamplingParams rejects max_tokens < 1, so draw from 1 upwards; a
    # single 0 would raise and fail the whole gathered warmup. The inner max()
    # keeps the range non-empty even if num_tokens is set to 0.
    max_tokens = random.randint(1, max(num_tokens * 2, 1))

    async_engine = state.vllm.async_engine
    engine = async_engine.engine
    # Per-request budget: (GPU blocks * block size) divided evenly across the
    # maximum number of concurrent sequences.
    tokens_per_seq = (
        engine.cache_config.num_gpu_blocks  # type: ignore
        * engine.cache_config.block_size
    ) // engine.scheduler_config.max_num_seqs
    # Leave 16 tokens of slack plus room for the generated tokens, but always
    # send at least one prompt token. (The reason for the 16-token margin is
    # not stated in this notebook.)
    prompt_len = max(tokens_per_seq - 16 - max_tokens, 1)
    prompt_token_ids = torch.randint(
        0,
        int(state.tokenizer.vocab_size),  # type: ignore
        (prompt_len,),
    ).tolist()

    # Drain the output stream; the generated content is irrelevant here.
    async for _ in async_engine.generate(
        prompt={"prompt_token_ids": prompt_token_ids},
        sampling_params=vllm.SamplingParams(max_tokens=max_tokens),
        request_id=request_id,
    ):
        pass


# Launch one warmup request per scheduler slot (max_num_seqs) and keep the
# aggregate future so a later cell can cancel or await the whole batch.
# Request ids are the slot indices rendered as strings.
warmup_future = asyncio.gather(
    *[
        warmup(str(slot))
        for slot in range(
            state.vllm.async_engine.engine.scheduler_config.max_num_seqs
        )
    ]
)

In [6]:
warmup_future.cancel()
await asyncio.sleep(0.01)
try:
    warmup_future.result()
except asyncio.CancelledError:
    pass

In [None]:
import asyncio
from art.unsloth.train import train

# Queue handed to train(); a later cell reads one item from it per submitted
# batch via `await results_queue.get()`.
results_queue = asyncio.Queue()
# Schedule train() as a background task on the notebook's running event loop.
# The task handle is kept in `train_task`; this cell does not await it.
train_task = asyncio.create_task(train(state.trainer, config, results_queue))

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 3 | Total steps = 300,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 34,406,400/14,000,000,000 (0.25% trained)


In [10]:
from art.config.model import SequenceLengthTuneArgs
from art.types import TuneConfig
from art.unsloth.service import TuneInputs
import torch
from typing import cast

seq_len_tune_args = config["seq_len_tune_args"] = {
    8192: SequenceLengthTuneArgs(batch_size=8, logprob_calculation_chunk_size=8192),
    16384: SequenceLengthTuneArgs(batch_size=2, logprob_calculation_chunk_size=2048),
    32768: SequenceLengthTuneArgs(batch_size=2, logprob_calculation_chunk_size=8192),
}

seq_len = 32768
batch_size = seq_len_tune_args[seq_len]["batch_size"]
shape = (batch_size, seq_len)

async with state.vllm.train_mode():
    for _ in range(2):
        state.inputs_queue.put_nowait(
            TuneInputs(
                tokens=torch.randint(0, cast(int, state.tokenizer.vocab_size), shape),
                group_ids=torch.randint(0, 10, shape),
                parent_ids=torch.randint(0, 10, shape),
                input_pos=torch.tensor([list(range(shape[1]))]),
                assistant_mask=torch.ones(shape, dtype=torch.bool),
                logprobs=torch.zeros(shape),
                advantages=torch.zeros(shape),
                weights=torch.ones(shape),
                config=TuneConfig(lr=1e-7, kl_coef=0.01),
            )
        )
        display(await results_queue.get())

{'loss': 0.0,
 'grad_norm': 5.24341594427824e-05,
 'lr': 1e-07,
 'policy_loss': 0.0,
 'kl_divergence': 0.00104522705078125}

{'loss': 0.0,
 'grad_norm': 2.993707312270999e-05,
 'lr': 1e-07,
 'policy_loss': 0.0,
 'kl_divergence': 0.001007080078125}