In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell executes.
%autoreload 2

In [2]:
import os
# Debugging aid: request synchronous CUDA kernel launches so a CUDA error is
# reported at the call that caused it. This is set before the cell that loads
# the model.
# NOTE(review): synchronous launches typically lower throughput, so the
# tokens/s figure measured later in this notebook may be pessimistic - confirm
# whether this should stay enabled for benchmarking.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
# Unsloth has to be imported before peft so its patches are applied first;
# importing peft ahead of it produced the "Please restructure your imports
# with 'import unsloth' at the top of your file" warning in earlier runs.
import unsloth  # type: ignore
import peft  # type: ignore
from art.config.model import get_model_config
from art.unsloth.state import ModelState

# Build the model configuration for the base model and create the shared
# ModelState. Later cells read `state.model`, `state.tokenizer`, `state.vllm`,
# `state.trainer` and `state.inputs_queue` from it.
config = get_model_config(
    "Qwen/Qwen2.5-7B-Instruct",
    output_dir="./.art/models/test",
    config={
        "init_args": {
            # Needed by the later `state.vllm.train_mode()` benchmark cell
            # (presumably lets vLLM release GPU memory while training - confirm).
            "enable_sleep_mode": True
        },
        "peft_args": {
            # Experiment knob, currently left at the library default:
            # "use_gradient_checkpointing": False,
        },
    },
)
state = ModelState(config)


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth # type: ignore


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-06 02:54:32 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit with actual GPU utilization = 79.48%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.11 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Sequences = 368.
Unsloth: vLLM's KV Cache can use up t

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 04-06 02:54:46 model_runner.py:1115] Loading model weights took 6.6961 GB
INFO 04-06 02:54:46 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-06 02:54:49 worker.py:267] Memory profiling takes 2.17 seconds
INFO 04-06 02:54:49 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.79) = 62.88GiB
INFO 04-06 02:54:49 worker.py:267] model weights take 6.70GiB; non_torch_memory takes 0.15GiB; PyTorch activation peak memory takes 4.72GiB; the rest of the memory reserved for KV Cache is 51.31GiB.
INFO 04-06 02:54:49 executor_base.py:111] # cuda blocks: 60044, # CPU blocks: 7021
INFO 04-06 02:54:49 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 29.32x
INFO 04-06 02:54:51 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 49/49 [00:30<00:00,  1.61it/s]

INFO 04-06 02:55:22 model_runner.py:1562] Graph capturing finished in 30 secs, took 5.63 GiB
INFO 04-06 02:55:22 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 35.45 seconds



Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [7]:
# Square root of the number of decoder layers (~6.93 for the 28 patched layers).
# NOTE(review): presumably explored as a candidate number of gradient
# checkpoints - confirm, or drop this scratch cell.
len(state.model.model.layers) ** 0.5

6.928203230275509

In [3]:
from unsloth_zoo.gradient_checkpointing import prepare_n_gradient_checkpoints

# Configure gradient checkpointing on the inner model, passing half the layer
# count (integer division) as the checkpoint count.
# NOTE(review): exact semantics of the second argument are defined by
# unsloth_zoo - verify against its implementation.
prepare_n_gradient_checkpoints(state.model.model, len(state.model.model.layers) // 2)

In [3]:
import asyncio
from art.unsloth.vllm import set_vllm_log_file
import random
import torch
import vllm


# Point vLLM logging at a local file (presumably to keep engine logs out of
# the notebook output - confirm against art.unsloth.vllm).
set_vllm_log_file("./vllm.log")

# Base generation length for the warmup requests; warmup() draws each
# request's max_tokens at random with an upper bound of 2 * num_tokens.
num_tokens = 1024


async def warmup(request_id: str) -> None:
    """Push one randomly sized generation request through the vLLM engine.

    The prompt consists of random token ids. Its length is chosen so that the
    prompt plus the requested completion roughly fills this request's share
    of the KV cache (total cache tokens divided by ``max_num_seqs``), minus a
    16-token margin. The generated outputs are discarded.

    Args:
        request_id: Identifier passed to the engine for this request.
    """
    engine = state.vllm.async_engine.engine

    # vLLM's SamplingParams rejects max_tokens < 1, so the lower bound must be
    # 1; a draw of 0 would raise and fail the surrounding asyncio.gather().
    max_tokens = random.randint(1, num_tokens * 2)

    # Total token capacity of the GPU KV cache, split evenly across the
    # maximum number of concurrently scheduled sequences.
    cache_tokens = (
        engine.cache_config.num_gpu_blocks  # type: ignore
        * engine.cache_config.block_size
    )
    tokens_per_seq = cache_tokens // engine.scheduler_config.max_num_seqs

    # Leave room for the completion plus a 16-token margin (presumably a
    # safety buffer - confirm); never go below a single prompt token.
    prompt_len = max(tokens_per_seq - 16 - max_tokens, 1)

    prompt_token_ids = torch.randint(
        0,
        int(state.tokenizer.vocab_size),  # type: ignore
        (prompt_len,),
    ).tolist()

    # Drain the stream; only the side effect of running the request matters.
    async for _ in state.vllm.async_engine.generate(
        prompt={"prompt_token_ids": prompt_token_ids},
        sampling_params=vllm.SamplingParams(max_tokens=max_tokens),
        request_id=request_id,
    ):
        pass


# Launch one warmup request per schedulable sequence slot. The request ids are
# the stringified indices "0", "1", ...; the gathered future is kept so a later
# cell can cancel or inspect it.
warmup_future = asyncio.gather(
    *map(
        warmup,
        map(str, range(state.vllm.async_engine.engine.scheduler_config.max_num_seqs)),
    )
)

In [6]:
warmup_future.cancel()
await asyncio.sleep(0.01)
try:
    warmup_future.result()
except asyncio.CancelledError:
    pass

In [4]:
import asyncio
from art.unsloth.train import train

# Queue passed to train(); the benchmark cell reads one result from it per
# submitted batch.
results_queue = asyncio.Queue()
# Run the training loop as a background task on the notebook's event loop.
# The handle is kept so other cells can wait on it or cancel it.
train_task = asyncio.create_task(train(state.trainer, config, results_queue))

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 3 | Total steps = 300,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 20,185,088/7,000,000,000 (0.29% trained)


In [5]:
# Request cancellation of the background training task.
# NOTE(review): the benchmark cell further down still waits on `train_task`;
# if this cell runs first, that wait observes a cancelled task. Confirm the
# intended execution order.
train_task.cancel()

True

In [11]:
from art.config.model import SequenceLengthTuneArgs
from art.types import TuneConfig
from art.unsloth.service import TuneInputs
import time
import torch
from typing import cast

seq_len_tune_args = config["seq_len_tune_args"] = {
    8192: SequenceLengthTuneArgs(batch_size=6, logprob_calculation_chunk_size=8192),
    16384: SequenceLengthTuneArgs(
        batch_size=1, logprob_calculation_chunk_size=16384 // 8
    ),
    32768: SequenceLengthTuneArgs(
        batch_size=1, logprob_calculation_chunk_size=32768 // 2
    ),
}

seq_len = 32768
batch_size = seq_len_tune_args[seq_len]["batch_size"]
shape = (batch_size, seq_len)
num_steps = 1
async with state.vllm.train_mode():
    start_time = time.time()
    for _ in range(num_steps):
        state.inputs_queue.put_nowait(
            TuneInputs(
                tokens=torch.randint(0, cast(int, state.tokenizer.vocab_size), shape),
                group_ids=torch.randint(0, 10, shape),
                parent_ids=torch.randint(0, 10, shape),
                input_pos=torch.tensor([list(range(shape[1]))]),
                assistant_mask=torch.ones(shape, dtype=torch.bool),
                logprobs=torch.zeros(shape),
                advantages=torch.zeros(shape),
                weights=torch.ones(shape),
                config=TuneConfig(lr=1e-7, kl_coef=0.01),
                _config={
                    "compile_calculate_logprobs": True,
                },
            )
        )
        done, _ = await asyncio.wait(
            [asyncio.create_task(results_queue.get()), train_task],
            return_when=asyncio.FIRST_COMPLETED,
        )
        for task in done:
            result = task.result()
            # If `result` is `None`, the training task finished somehow.
            assert result is not None, "The training task should never finish."
            results_queue.task_done()
            display(result)
    total_tokens = num_steps * batch_size * seq_len
    elapsed_time = time.time() - start_time
    tokens_per_second = total_tokens / elapsed_time
    print(f"Tokens per second: {tokens_per_second:.2f} tokens/s")
    print(f"Total time: {elapsed_time:.2f} seconds")

CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62


RuntimeError: CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62

In [8]:
from art.unsloth.train import _calculate_logprobs

# Scratch check of what kind of object `_calculate_logprobs` is; the recorded
# output is `function`.
# NOTE(review): presumably used to see whether a compiled wrapper had replaced
# it - confirm, or remove this debugging cell.
type(_calculate_logprobs)


function

In [7]:
from art.unsloth.train import free_memory
# Call the project's memory-release helper.
# NOTE(review): presumably run to recover after the out-of-memory failure
# recorded in this notebook's output - exact behavior is defined in
# art.unsloth.train.
free_memory()

In [6]:
# Request cancellation of the training task again. The recorded output is
# False, which Task.cancel() returns when the task is already done.
train_task.cancel()

False