In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
import peft # type: ignore
import unsloth # type: ignore
from art.config.model import get_model_config
from art.unsloth.state import ModelState

config = get_model_config(
    "Qwen/Qwen2.5-72B-Instruct",
    output_dir="./.art/models/test",
    config={
        "init_args": {
            "enable_sleep_mode": True,
            "enforce_eager": True,
            "gpu_memory_utilization": 0.9,
        },
        "peft_args": {
            # "use_gradient_checkpointing": False,
        },
    },
)
state = ModelState(config)


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth # type: ignore


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


KeyboardInterrupt: 

In [7]:
len(state.model.model.layers) ** 0.5

6.928203230275509

In [3]:
from unsloth_zoo.gradient_checkpointing import prepare_n_gradient_checkpoints

prepare_n_gradient_checkpoints(state.model.model, len(state.model.model.layers) // 2)

In [3]:
import asyncio
from art.unsloth.vllm import set_vllm_log_file
import random
import torch
import vllm


set_vllm_log_file("./vllm.log")

num_tokens = 16384


async def warmup(request_id: str) -> None:
    max_tokens = random.randint(0, num_tokens * 2)
    async for _ in state.vllm.async_engine.generate(
        prompt={
            "prompt_token_ids": torch.randint(
                0,
                int(state.tokenizer.vocab_size),  # type: ignore
                (
                    max(
                        (
                            (
                                state.vllm.async_engine.engine.cache_config.num_gpu_blocks  # type: ignore
                                * state.vllm.async_engine.engine.cache_config.block_size
                            )
                            // state.vllm.async_engine.engine.scheduler_config.max_num_seqs
                        )
                        - 16
                        - max_tokens,
                        1,
                    ),
                ),
            ).tolist(),
        },
        sampling_params=vllm.SamplingParams(max_tokens=max_tokens),
        request_id=request_id,
    ):
        pass


warmup_future = asyncio.gather(
    *(
        warmup(f"{i}")
        for i in range(state.vllm.async_engine.engine.scheduler_config.max_num_seqs)
    )
)

In [6]:
warmup_future.cancel()
await asyncio.sleep(0.01)
try:
    warmup_future.result()
except asyncio.CancelledError:
    pass

In [None]:
import asyncio
from art.unsloth.train import train

results_queue = asyncio.Queue()
train_task = asyncio.create_task(train(state.trainer, config, results_queue))

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 3 | Total steps = 300,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 105,267,200/72,000,000,000 (0.15% trained)


In [5]:
train_task.cancel()

True

In [None]:
from art.config.model import SequenceLengthTuneArgs
from art.types import TuneConfig
from art.unsloth.service import TuneInputs
import time
import torch
from typing import cast

seq_len_tune_args = config["seq_len_tune_args"] = {
    1024: SequenceLengthTuneArgs(batch_size=1, logprob_calculation_chunk_size=1024),
    8192: SequenceLengthTuneArgs(batch_size=6, logprob_calculation_chunk_size=8192 // 8),
    16384: SequenceLengthTuneArgs(
        batch_size=1, logprob_calculation_chunk_size=16384 // 16
    ),
    32768: SequenceLengthTuneArgs(
        batch_size=1, logprob_calculation_chunk_size=32768 // 32
    ),
}

seq_len = 32768
batch_size = seq_len_tune_args[seq_len]["batch_size"]
shape = (batch_size, seq_len)
num_steps = 1
async with state.vllm.train_mode():
    start_time = time.time()
    for _ in range(num_steps):
        state.inputs_queue.put_nowait(
            TuneInputs(
                tokens=torch.randint(0, cast(int, state.tokenizer.vocab_size), shape),
                group_ids=torch.randint(0, 10, shape),
                parent_ids=torch.randint(0, 10, shape),
                input_pos=torch.tensor([list(range(shape[1]))]),
                assistant_mask=torch.ones(shape, dtype=torch.bool),
                logprobs=torch.zeros(shape),
                advantages=torch.zeros(shape),
                weights=torch.ones(shape),
                config=TuneConfig(lr=1e-7, kl_coef=0.01),
                _config={
                    "compile_calculate_logprobs": True,
                },
            )
        )
        done, _ = await asyncio.wait(
            [asyncio.create_task(results_queue.get()), train_task],
            return_when=asyncio.FIRST_COMPLETED,
        )
        for task in done:
            result = task.result()
            # If `result` is `None`, the training task finished somehow.
            assert result is not None, "The training task should never finish."
            results_queue.task_done()
            display(result)
    total_tokens = num_steps * batch_size * seq_len
    elapsed_time = time.time() - start_time
    tokens_per_second = total_tokens / elapsed_time
    print(f"Tokens per second: {tokens_per_second:.2f} tokens/s")
    print(f"Total time: {elapsed_time:.2f} seconds")

{'loss': 0.0,
 'grad_norm': 5.741737913922407e-05,
 'lr': 1e-07,
 'policy_loss': 0.0,
 'kl_div': 0.000751495361328125}

Tokens per second: 535.58 tokens/s
Total time: 61.18 seconds


: 

In [8]:
from art.unsloth.train import _calculate_logprobs

type(_calculate_logprobs)


function

In [7]:
from art.unsloth.train import free_memory
free_memory()

In [6]:
train_task.cancel()

False