In [1]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from reward_model import AnswerReward, FormatReward

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 4096 # Can increase for longer reasoning traces
lora_rank = 64

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

INFO 04-03 16:13:52 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.14: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.676 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 48.03%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.68 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 4096. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 8.95 GB. Also swap space = 5 GB.
INFO 04-03 16:14:09 config.py:549] This model supports multiple tasks: {'generate', 'classify', 'score



INFO 04-03 16:14:13 weight_utils.py:254] Using model weights format ['*.safetensors']
INFO 04-03 16:14:14 weight_utils.py:270] Time spent downloading weights for unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit: 1.202680 seconds


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-03 16:14:15 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 04-03 16:14:15 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-03 16:14:17 worker.py:267] Memory profiling takes 1.73 seconds
INFO 04-03 16:14:17 worker.py:267] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.48) = 11.37GiB
INFO 04-03 16:14:17 worker.py:267] model weights take 2.22GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.24GiB; the rest of the memory reserved for KV Cache is 7.86GiB.
INFO 04-03 16:14:17 executor_base.py:111] # cuda blocks: 14300, # CPU blocks: 9102
INFO 04-03 16:14:17 executor_base.py:116] Maximum concurrency for 4096 tokens per request: 55.86x
INFO 04-03 16:14:20 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:17<00:00,  1.79it/s]

INFO 04-03 16:14:37 model_runner.py:1562] Graph capturing finished in 17 secs, took 0.67 GiB
INFO 04-03 16:14:37 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 21.82 seconds



Unsloth 2025.3.14 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [5]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
SYSTEM_PROMPT = """
You are given a PDDL (Planning Domain Definition Language) domain file and a problem file. Your task is to solve the planning problem described using the rules and actions defined in the domain file.
Respond in the following format:
<reasoning>
...
</reasoning>

<answer>
...
</answer>

Only use actions defined in the domain file. Do not invent new actions or predicates. If the problem is already solved, return empty in answer.
"""

def get_pddl_probelms() -> Dataset:
    data = load_dataset("csv", data_files="problems/train.csv")["train"] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': f'''Domain File: \n{x["domain_content"]} \n Problem File: \n{x["problem_content"]} \n'''}
        ]
    }) # type: ignore
    return data

dataset = get_pddl_probelms()

In [6]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 3e-5,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = 400,
    max_completion_length = 400,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 200,
    save_steps = 200,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8


In [7]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        FormatReward.strict_format_reward_func,
        FormatReward.soft_format_reward_func,
        FormatReward.xmlcount_reward_func,
        AnswerReward.plan_reward_func
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 2 | Total steps = 200
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 119,734,272/3,000,000,000 (3.99% trained)


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / strict_format_reward_func,rewards / soft_format_reward_func,rewards / xmlcount_reward_func,rewards / plan_reward_func
1,0.0,-0.294625,0.746721,297.25,0.0,0.0,0.0,-0.482125,0.1875
2,0.0,-0.139875,0.348638,228.25,0.0,0.0,0.0,-0.389875,0.25
3,0.0001,0.36425,0.448702,241.75,0.001495,0.0,0.0,0.11425,0.25
4,0.0001,0.1375,0.110092,261.25,0.001675,0.0,0.0,0.075,0.0625
5,0.0,-0.255,0.43184,340.375,0.000954,0.0,0.0,-0.3175,0.0625
6,0.0001,0.012,0.434283,200.625,0.00222,0.0,0.0,-0.3005,0.3125
7,0.0001,-0.01475,0.515238,261.25,0.002059,0.0,0.0,-0.13975,0.125
8,0.0001,-0.01225,0.633857,297.875,0.002129,0.0,0.0,-0.26225,0.25
9,0.0001,-0.20675,0.759347,294.5,0.002324,0.0,0.0,-0.39425,0.1875
10,0.0005,0.13075,0.506275,201.75,0.012038,0.0,0.0,-0.11925,0.25


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=200, training_loss=0.022840627827139484, metrics={'train_runtime': 2151.9456, 'train_samples_per_second': 0.744, 'train_steps_per_second': 0.093, 'total_flos': 0.0, 'train_loss': 0.022840627827139484})

In [8]:
model.save_lora("grpo_saved_lora")

In [23]:
domain_file = "problems/domain.pddl"
problem_file = "problems/test/5/problem_test_5_scenario_2/problem.pddl"

with open(domain_file, "r") as f:
    domain_content = f.read()

with open(problem_file, "r") as f:
    problem_content = f.read()

content = f"Domain File: \n{domain_content} \n Problem File: \n{problem_content} \n"

content = f"What actions are defind in this domains? {domain_content}. What each action do?"

text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : content},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora"),
)[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it, est. speed input: 144.79 toks/s, output: 93.65 toks/s]

<reasoning>
The actions defined in the domain are:
- **pick-up**: Picks up a block and puts it in the robot arm's holding position if the block is on the table and the arm is empty.
- **put-down**: Places a block that is being held back down on the table if the arm is empty.
- **stack**: Stacks a block that is being held on top of an empty position if the arm is empty and the top block is not already occupied.
- **unstack**: Unstacks a block that is being held off the table and on top of another block, making the top block clear and placing the held block on the table again.
</reasoning>

<answer>
The actions in the block-world domain perform the following functions:
- **pick-up**: Picks up a block from the table and puts it in the arm's holding position, ensuring the arm is empty and the block is on the table.
- **put-down**: Places a block that was being held back down on the table, with the arm becoming empty and the block no longer being held.
- **stack**: Stacks a block that is be




In [6]:
domain_file = "problems/domain.pddl"
problem_file = "problems/test/3/problem_test_3_scenario_0/problem.pddl"

text = '''
(stack orange yellow)
(stack yellow red)
(put-down yellow)
(pick-up red)
(put-down red)'''

In [1]:
import re

In [2]:
output = '''
<reasoning>
The actions defined in the domain are:
- **pick-up**: Picks up a block and puts it in the robot arm's holding position if the block is on the table and the arm is empty.
- **put-down**: Places a block that is being held back down on the table if the arm is empty.
- **stack**: Stacks a block that is being held on top of an empty position if the arm is empty and the top block is not already occupied.
- **unstack**: Unstacks a block that is being held off the table and on top of another block, making the top block clear and placing the held block on the table again.
</reasoning>

<answer>
The actions in the block-world domain perform the following functions:
- **pick-up**: Picks up a block from the table and puts it in the arm's holding position, ensuring the arm is empty and the block is on the table.
- **put-down**: Places a block that was being held back down on the table, with the arm becoming empty and the block no longer being held.
- **stack**: Stacks a block that is being held on top of an empty position on the table, while the arm becomes empty and the block is placed on top of the empty position.
- **unstack**: Unstacks a block that is being held off the table and on top of another block, making the top block clear and placing the held block on the table.
</answer>
'''

In [3]:
pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"

In [5]:
if re.search(r"<reasoning>.*?</reasoning>.*?<answer>.*?</answer>", output, re.DOTALL):
    print(1)

1
