In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Take the widget font size and text colour from the VS Code editor theme variables. */
:root {
    --jp-widgets-font-size: var(--vscode-editor-font-size);
    --jp-widgets-color: var(--vscode-editor-foreground);
}
/* Remove the opaque backdrop behind ipywidget output. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
</style>

In [None]:
import peft
import unsloth
import art
from dotenv import load_dotenv
import openai

load_dotenv()


api = art.LocalAPI(wandb_project="agent-reinforcement-training")
model = await api.get_or_create_model(
    name="yes-or-no-unsloth-001", base_model="Qwen/Qwen2.5-7B-Instruct"
)


async def rollout(client: openai.AsyncOpenAI, prompt: str) -> art.Trajectory:
    """Run one single-turn episode and score the model's reply.

    Sends ``prompt`` as the only user message to ``model.name`` (at most 100
    completion tokens) and rewards the first choice by exact string match:
    ``"maybe"`` -> 1.0, ``"no"`` -> 0.75, ``"yes"`` -> 0.5, anything else -> 0.0.

    Args:
        client: OpenAI-compatible async client used for the chat completion.
        prompt: Text of the user message.

    Returns:
        A trajectory holding the user message followed by the sampled choice,
        together with the reward.

    Raises:
        AssertionError: If the first choice has no string content.
    """
    conversation: art.Messages = [{"role": "user", "content": prompt}]
    response = await client.chat.completions.create(
        messages=conversation, model=model.name, max_tokens=100
    )
    first_choice = response.choices[0]
    reply = first_choice.message.content
    assert isinstance(reply, str)
    # The match is exact: different casing, whitespace or punctuation scores 0.0.
    reward_by_reply = {"yes": 0.5, "no": 0.75, "maybe": 1.0}
    score = reward_by_reply.get(reply, 0.0)
    return art.Trajectory(
        messages_and_choices=[*conversation, first_choice], reward=score
    )


prompts = [
    f"{prefix} with {', '.join([f"'{w}'" if use_quotes else w for w in words]) if len(words) == 3 else f'{words[0]}' + (f' or {words[1]}' if len(words) > 1 else '')}"
    for prefix in ["respond", "just respond"]
    for use_quotes in [True, False]
    for words in [
        ["yes", "no", "maybe"],
        ["maybe", "yes", "no"],
        ["no", "yes", "maybe"],
        ["yes", "maybe", "no"],
        ["yes", "no"],
        ["maybe", "no"],
        ["no", "maybe"],
        ["no", "yes"],
        ["yes", "no"],
    ]
]

openai_client = await model.openai_client()
for i in range(await model.get_iteration(), 1_000):
    train_groups = await art.gather_trajectories(
        ((rollout(openai_client, prompt) for _ in range(64)) for prompt in prompts),
        pbar_desc="train",
        stream_chat_completions=8,
        return_exceptions=False,
    )
    await model.tune(
        train_groups,
        config=art.TuneConfig(lr=1e-4),
    )

In [7]:
f"{api._get_output_dir(model.name)}/0005"

'./.art/models/yes-or-no-unsloth-001/0005'

In [None]:
# Debugging aid (private attributes): show what the vLLM engine behind this model's
# service returns from list_loras(). Keyed by model.name rather than a repeated
# literal so it keeps working if the model is renamed.
api._services[model.name].state.vllm.async_engine.engine.list_loras()

set()

In [23]:
# Debugging aid (private attributes): call remove_lora(1) on the vLLM engine
# (1 is presumably a LoRA integer id -- confirm). Keyed by model.name rather than
# a repeated literal so it keeps working if the model is renamed.
api._services[model.name].state.vllm.async_engine.engine.remove_lora(1)

True

In [13]:
# Debugging aid (private attributes): load the LoRA stored in the "0005" output
# subdirectory through the service's PEFT model, passing load_tensors=True.
# The service is looked up via model.name, matching the path argument, instead of
# a second hard-coded copy of the model name.
api._services[model.name].state.peft_model.load_lora(
    f"{api._get_output_dir(model.name)}/0005", load_tensors=True
)

LoRARequest(lora_name='6', lora_int_id=6, lora_path='', lora_tensors={'base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight': tensor([[-0.0079,  0.0112, -0.0142,  ..., -0.0132,  0.0130, -0.0099],
        [ 0.0087, -0.0007,  0.0017,  ..., -0.0151,  0.0121,  0.0053],
        [ 0.0163, -0.0090, -0.0106,  ...,  0.0055, -0.0130,  0.0085],
        ...,
        [-0.0077, -0.0077,  0.0107,  ..., -0.0166,  0.0066, -0.0090],
        [ 0.0070,  0.0146,  0.0005,  ..., -0.0020, -0.0092,  0.0002],
        [-0.0084, -0.0010,  0.0085,  ...,  0.0034, -0.0016, -0.0025]],
       device='cuda:0'), 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight': tensor([[-3.7452e-05, -1.5438e-05, -5.5295e-06,  ...,  2.2900e-05,
         -4.0993e-05, -1.7901e-05],
        [-2.5805e-05,  1.6765e-05, -2.5650e-05,  ...,  3.4787e-05,
         -2.8503e-06,  3.3868e-05],
        [ 3.7658e-06,  2.6915e-05, -6.9747e-06,  ...,  1.5141e-05,
          2.5434e-05,  3.9321e-05],
        ...,
        [ 1.7496e