In [None]:
# =============================================================================
# COLAB 4: GRPO (Group Relative Policy Optimization) — REASONING MODEL
# - Uses TRL's GRPOTrainer with a real reward_fn (math accuracy)
# - Hard token budgets (prompt+completion) to avoid >max_seq issues
# - LoRA r=32 on attention+MLP; 4-bit loading; W&B off by default
# =============================================================================

# Cell 1: Install deps
# -----------------------------------------------------------------------------
print("üì¶ Installing Unsloth + TRL (GRPO) ...")
# Shell installs via IPython/Colab `!` magics (these lines are not plain Python).
# NOTE(review): the GRPOConfig built later in this file passes loss_type="dapo",
# scale_rewards="batch" and mask_truncated_completions=True; confirm that the
# minimum TRL version pinned below actually supports all three options.
!pip install -q unsloth bitsandbytes accelerate datasets transformers
!pip install -q "trl>=0.15.0"   # GRPO is available in recent TRL

print("‚úÖ Installation complete!")

# Cell 2: Imports & env
# -----------------------------------------------------------------------------
import os, re, math, statistics, torch
from datasets import Dataset
from unsloth import FastLanguageModel
from transformers import set_seed
from trl import GRPOConfig, GRPOTrainer

# Disable Weights & Biases by default
# NOTE: the captured run output shows transformers warning that WANDB_DISABLED
# is deprecated in favour of the `report_to` setting; the variable is still
# read when the GRPOConfig's report_to value is computed.
os.environ["WANDB_DISABLED"] = "true"
# If you hit odd TorchDynamo traces, uncomment:
# os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Echo the runtime: PyTorch version, CUDA availability, and the name of GPU 0
# (the device name is only queried when CUDA is available).
print(f"üî• PyTorch: {torch.__version__}")
print(f"üéÆ CUDA: {torch.cuda.is_available()} | GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Cell 3: Config
# -----------------------------------------------------------------------------
# Overall context
# Token budgets: prompt + completion must fit inside the model context window.
max_seq_length       = 1024     # total context window for model
max_prompt_length    = 512      # GRPO will enforce budgets
max_completion_length= 384      # 512+384=896 <= 1024 (headroom for specials)

# Model load
dtype         = None            # passed through to the loader unchanged
load_in_4bit  = True

# LoRA
lora_r        = 32
lora_alpha    = 32
# NOTE: the run log shows Unsloth warning that only dropout = 0 gets its fast
# LoRA patching; a non-zero value trades speed for regularisation.
lora_dropout  = 0.05

# GRPO params
num_generations = 4     # completions sampled per prompt
temperature     = 0.8
beta_kl         = 0.0   # KL off by default; set >0 to enable
loss_type       = "dapo"  # length-bias resistant (also try "dr_grpo")

# Train params
# The per-device batch size is tied to num_generations: the run log shows
# Unsloth expecting it to be a multiple of num_generations and silently
# replacing a value of 1 with 4. Setting it explicitly keeps the echoed
# configuration and the effective batch size in agreement.
batch_size   = num_generations
grad_accum   = 8
num_epochs   = 1
lr           = 5e-6
max_steps    = 50       # handed to GRPOConfig together with num_epochs
seed         = 3407
set_seed(seed)

# Echo the chosen hyperparameters so the run log is self-describing.
# The values are read from the configuration variables defined above.
print(f"""
üîß Config:
 ‚Ä¢ LoRA r/Œ±/drop: {lora_r}/{lora_alpha}/{lora_dropout}
 ‚Ä¢ GRPO: G={num_generations}, T={temperature}, KL Œ≤={beta_kl}, loss='{loss_type}'
 ‚Ä¢ Budgets: prompt‚â§{max_prompt_length}, completion‚â§{max_completion_length}, ctx‚â§{max_seq_length}
 ‚Ä¢ Train: bs={batch_size}, accum={grad_accum}, steps={max_steps}, lr={lr}
""")

# Cell 4: Load base model (Unsloth)
# -----------------------------------------------------------------------------
print("üì• Loading base model...")
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# Load the base model and its tokenizer through Unsloth with the context
# length, dtype and 4-bit setting chosen in the config section.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Ensure tokenizer/pad/eos/truncation are explicit
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if getattr(model.config, "pad_token_id", None) is None:
    # Mirror the tokenizer's pad token rather than EOS: the loader may already
    # have assigned a pad token that differs from EOS (the run log shows it
    # picking one for this checkpoint), and the model config should agree with
    # whatever the tokenizer actually pads with. After the guard above,
    # tokenizer.pad_token is guaranteed to be set.
    model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.model_max_length = max_seq_length
tokenizer.truncation_side = "right"
tokenizer.padding_side = "right"

print("‚úÖ Model loaded.")

# Cell 5: Apply LoRA
# -----------------------------------------------------------------------------
print("üîß Applying LoRA adapters...")
# Attach LoRA adapters to every attention and MLP projection of the base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_r,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",     # attention
        "gate_proj", "up_proj", "down_proj",        # MLP
    ],
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=seed,
    max_seq_length=max_seq_length,
)

# Report trainable vs. total parameters.
# With load_in_4bit=True a plain sum of p.numel() under-reports the total: the
# captured run printed 91.20M (10.71%) here while Unsloth itself reported
# 144,283,968 parameters (6.77%) — presumably because packed 4-bit tensors
# store more than one weight per element. PEFT's own counter is used when the
# model exposes it; the raw sum remains as a fallback.
if hasattr(model, "get_nb_trainable_parameters"):
    trainable_params, total_params = model.get_nb_trainable_parameters()
else:
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params     = sum(p.numel() for p in model.parameters())
print(f"‚úÖ Trainable: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M "
      f"({100*trainable_params/total_params:.2f}%)")

# Cell 6: Build small math dataset (prompt + answer)
# -----------------------------------------------------------------------------
print("üìö Building toy math dataset (for demo)...")
# Five hand-written word problems, each with its gold answer stored as a
# string of digits. The list is repeated 100 times, so the resulting 500
# entries contain only five distinct problems.
math_problems = [
    {
        "problem": "If a store has 45 apples and sells 17, then buys 23 more, how many apples does it have?",
        "answer": "51",
    },
    {
        "problem": "A rectangle has length 8 cm and width 5 cm. What is its perimeter?",
        "answer": "26",
    },
    {
        "problem": "If 3x + 5 = 20, what is x?",
        "answer": "5",
    },
    {
        "problem": "A train travels 120 km in 2 hours. What is its average speed?",
        "answer": "60",
    },
    {
        "problem": "If you have 3 bags with 4 apples each, and 2 bags with 6 apples each, how many apples total?",
        "answer": "24",
    },
] * 100  # 500 problems

def format_problem(p):
    """Wrap a problem statement in the step-by-step solving prompt.

    The prompt consists of an instruction line, the problem itself and a
    lead-in for the model's reasoning, separated by blank lines.
    """
    segments = [
        "Solve this problem step by step.",
        f"Problem: {p}",
        "Let me think through this:",
    ]
    return "\n\n".join(segments)

# One row per problem: the formatted prompt plus the gold answer, which the
# reward function reads back through its keyword arguments.
dataset = Dataset.from_list(
    [
        dict(prompt=format_problem(entry["problem"]), answer=entry["answer"])
        for entry in math_problems
    ]
)

print(f"‚úÖ Dataset size: {len(dataset)}")
print("üìù Sample prompt:\n", dataset[0]["prompt"][:200], "...")

# Cell 7: Reward function (math accuracy)
# -----------------------------------------------------------------------------
def extract_answer(text: str):
    """Heuristically extract the final numeric answer from a completion.

    The text is lower-cased and checked against a prioritised list of
    phrasings ("answer is N", "answer: N", "= N", "equals N", "total is N").
    For the first phrasing that occurs at all, the LAST occurrence is used:
    step-by-step reasoning usually states intermediate results before the
    final one (e.g. "45 - 17 = 28, 28 + 23 = 51"), so taking the first match
    would score a correct solution against an intermediate value.

    If none of the phrasings occur, the last run of digits in the text is
    returned.

    Args:
        text: Model completion to inspect.

    Returns:
        The answer as a string of digits, or None when the text is empty or
        contains no digits.
    """
    if not text:
        return None
    patterns = (
        r'answer is (\d+)',
        r'answer:\s*(\d+)',
        r'=\s*(\d+)',
        r'equals\s+(\d+)',
        r'total is (\d+)',
    )
    lowered = text.lower()
    for pat in patterns:
        # findall with a single group yields the captured digits of every match.
        found = re.findall(pat, lowered)
        if found:
            return found[-1]
    nums = re.findall(r'\d+', lowered)
    return nums[-1] if nums else None

def math_accuracy_reward(completions, **kwargs):
    """Score each completion 1.0 when its extracted answer equals the gold answer.

    Args:
        completions: Sequence of completions. An item that is a non-empty list
            is read as ``item[0]["content"]``; any other item is converted
            with ``str()``.
        **kwargs: May carry ``answer``. When it is a list, the entry at the
            completion's index is the gold answer; otherwise the same value is
            used for every completion.

    Returns:
        list[float]: 1.0 where prediction and gold answer are both present and
        their string forms are equal, else 0.0.
    """
    gold_answers = kwargs.get("answer", None)
    per_sample = isinstance(gold_answers, list)
    scores = []
    for idx, completion in enumerate(completions):
        # Chat-style completions arrive wrapped as [{"content": "..."}].
        if isinstance(completion, list) and completion:
            generated = completion[0]["content"]
        else:
            generated = str(completion)
        predicted = extract_answer(generated)
        expected = gold_answers[idx] if per_sample else gold_answers
        is_match = (
            predicted is not None
            and expected is not None
            and str(predicted) == str(expected)
        )
        scores.append(1.0 if is_match else 0.0)
    return scores

print("‚úÖ Reward function ready (exact numeric match).")

# Cell 8: GRPO config
# -----------------------------------------------------------------------------
print("‚öôÔ∏è Building GRPOConfig...")
# Training configuration handed to GRPOTrainer. Generic trainer settings come
# first, GRPO-specific ones last; all values are taken from the config section.
# NOTE(review): the run log shows Unsloth mentioning that the DAPO paper
# recommends epsilon_high = 0.28; that option is not set here — confirm whether
# the default is intended.
train_args = GRPOConfig(
    output_dir="./grpo_reasoning_smollm2",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    warmup_steps=5,
    max_steps=max_steps,                       # takes precedence over epochs
    num_train_epochs=num_epochs,
    learning_rate=lr,
    # Exactly one of fp16/bf16 is enabled, chosen by whether the GPU reports
    # bf16 support.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=5,
    optim="adamw_bnb_8bit",                    # 4-bit friendly
    lr_scheduler_type="linear",
    seed=seed,
    save_strategy="steps",
    save_steps=25,
    # No reporting integrations unless WANDB_DISABLED is set to something
    # other than "true" (a missing variable counts as "true").
    report_to=[] if os.environ.get("WANDB_DISABLED","true").lower()=="true" else ["wandb"],

    # GRPO-specific
    num_generations=num_generations,
    temperature=temperature,
    max_prompt_length=max_prompt_length,
    max_completion_length=max_completion_length,
    beta=beta_kl,                              # KL term weight (0.0 by default)
    loss_type=loss_type,                       # "dapo" recommended; try "dr_grpo"
    scale_rewards="batch",                     # robust reward scaling
    mask_truncated_completions=True,           # ignore clipped completions in loss
)
print("‚úÖ GRPOConfig ready.")

# Cell 9: Trainer
# -----------------------------------------------------------------------------
print("üèãÔ∏è Initializing GRPOTrainer...")
# Wire the LoRA-wrapped model, tokenizer, reward function and dataset together.
# A single reward callable is supplied; it reads the gold answers from its
# keyword arguments, so the dataset's 'answer' column must reach it that way.
trainer = GRPOTrainer(
    model=model,                    # Unsloth model object works fine
    processing_class=tokenizer,     # tokenizer (aka processing_class)
    reward_funcs=math_accuracy_reward,
    args=train_args,
    train_dataset=dataset,          # expects 'prompt' column + extra cols (e.g., 'answer')
    # eval_dataset=...,             # (optional) add held-out split if you have one
)
print("‚úÖ Trainer ready!")

# Cell 10: Train
# -----------------------------------------------------------------------------
print("üöÄ Starting GRPO training ...")
print("="*60)
# Run training and summarise the outcome.
result = trainer.train()
metrics = result.metrics or {}
print("="*60)
print("‚úÖ Training complete!")
print("üìä Stats:")
# The metrics dict carried neither "train_steps" nor "global_step" in the
# captured run (it printed N/A), so fall back to the step counter and loss
# exposed directly on the training result before giving up.
print("   ‚Ä¢ Steps:", metrics.get("train_steps", metrics.get("global_step", getattr(result, "global_step", "N/A"))))
print("   ‚Ä¢ Train loss:", metrics.get("train_loss", getattr(result, "training_loss", "N/A")))
print("   ‚Ä¢ Time (s):", metrics.get("train_runtime", "N/A"))

# Cell 11: Save (adapters + optional merged)
# -----------------------------------------------------------------------------
print("üíæ Saving GRPO-trained adapters...")
# Write the model (adapter) files and the tokenizer files into the same
# directory so it can be loaded on its own later.
model.save_pretrained("smollm2_grpo_adapters")
tokenizer.save_pretrained("smollm2_grpo_adapters")
print("‚úÖ Adapters ‚Üí ./smollm2_grpo_adapters")

print("\nüîß Saving merged model (optional)...")
# Best-effort export of a merged model. Failure is deliberately non-fatal:
# merged_ok only records whether either attempt succeeded, and both except
# clauses just report the error.
merged_ok = False
try:
    # If available in your Unsloth version:
    model.save_pretrained_merged(
        "smollm2_grpo_merged",
        tokenizer,
        save_method="merged_16bit",
    )
    merged_ok = True
except Exception as e:
    print("   save_pretrained_merged unavailable, trying manual merge:", repr(e))
    try:
        # NOTE(review): confirm that FastLanguageModel actually exposes
        # merge_lora_weights; if it does not, this fallback always ends in the
        # except clause below and no merged model is written.
        from unsloth import FastLanguageModel as _FLM
        _FLM.merge_lora_weights(model)
        model.save_pretrained("smollm2_grpo_merged")
        tokenizer.save_pretrained("smollm2_grpo_merged")
        merged_ok = True
    except Exception as e2:
        print("   Manual merge failed (not critical):", repr(e2))

print("‚úÖ Merged ‚Üí ./smollm2_grpo_merged" if merged_ok else "‚ÑπÔ∏è Skipping merge; adapters saved and usable.")

# Cell 12: Inference (reasoning demo)
# -----------------------------------------------------------------------------
print("\nüß™ Reasoning demo\n")
# inference_mode is used as a context manager around generation in solve().
from torch import inference_mode
# Switch the trained model into Unsloth's inference configuration before
# generating.
FastLanguageModel.for_inference(model)

def solve(problem_text, temp=0.3, max_new=256):
    """Generate a step-by-step solution for one problem with the trained model.

    The prompt is built with format_problem(), i.e. the same template the
    training dataset uses, so inference cannot drift from training.

    Args:
        problem_text: The word problem to solve.
        temp: Sampling temperature. A value of None or <= 0 switches to greedy
            decoding instead of passing an invalid temperature to generate().
        max_new: Maximum number of newly generated tokens.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    prompt = format_problem(problem_text)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    gen_kwargs = dict(
        max_new_tokens=max_new,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if temp is not None and temp > 0:
        gen_kwargs.update(do_sample=True, temperature=temp, top_p=0.9)
    else:
        gen_kwargs.update(do_sample=False)

    with inference_mode():
        out = model.generate(**inputs, **gen_kwargs)

    # Drop the prompt by token count rather than by string prefix: decoding
    # does not always reproduce the prompt text exactly, in which case a
    # prefix check would hand back prompt and completion together.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

# Held-out demo questions; none of them appear in the training problems.
tests = [
    "A bakery made 96 cupcakes. They sold 37 in the morning and 28 in the afternoon. How many are left?",
    "If a book costs $12 and you buy 3 books, how much do you spend?",
    "A triangle has sides of length 5 cm, 12 cm, and 13 cm. Is it a right triangle?",
]

# Banner, then one block per question: header, problem, generated solution,
# and a dashed separator.
print("=" * 60, "REASONING TEST RESULTS", "=" * 60, sep="\n")
for number, question in enumerate(tests, start=1):
    print(f"\n[Test {number}]")
    print("Problem:", question)
    generated_solution = solve(question)
    print("Solution:\n", generated_solution)
    print("-" * 60)

# Cell 13: Summary
# -----------------------------------------------------------------------------
# Static end-of-run summary.
# NOTE: every value below (base model, loss type, KL weight, LoRA rank, dataset
# size, token budgets, output paths) is hard-coded text, not read from the
# configuration variables — update it by hand if the config changes.
print("""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë            GRPO REASONING TRAINING ‚Äî SUMMARY               ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
‚Ä¢ Base: HuggingFaceTB/SmolLM2-135M-Instruct
‚Ä¢ Method: GRPO (loss='dapo', Œ≤_KL=0.0), LoRA r=32 (attn+MLP)
‚Ä¢ Data: 500 toy math problems (prompt+answer)
‚Ä¢ Safety: budgets prompt‚â§512, completion‚â§384, total‚â§1024
‚Ä¢ Saved:
    - Adapters: ./smollm2_grpo_adapters
    - Merged (optional): ./smollm2_grpo_merged
‚Ä¢ Tips:
    - Try loss_type='dr_grpo' and different scale_rewards ('batch'|'none')
    - Increase num_generations (e.g., 8) for stronger exploration
    - Train longer and with real datasets (GSM8K, MATH) for better gains
""")


üì¶ Installing Unsloth + TRL (GRPO) ...
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.5/61.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m348.7/348.7 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m506.8/506.8 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m47.7/47.7 MB[0m



ü¶• Unsloth Zoo will now patch everything to make training faster!
üî• PyTorch: 2.8.0+cu126
üéÆ CUDA: True | GPU: Tesla T4

üîß Config:
 ‚Ä¢ LoRA r/Œ±/drop: 32/32/0.05
 ‚Ä¢ GRPO: G=4, T=0.8, KL Œ≤=0.0, loss='dapo'
 ‚Ä¢ Budgets: prompt‚â§512, completion‚â§384, ctx‚â§1024
 ‚Ä¢ Train: bs=1, accum=8, steps=50, lr=5e-06

üì• Loading base model...
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

HuggingFaceTB/SmolLM2-135M-Instruct does not have a padding token! Will use pad_token = <|endoftext|>.


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


‚úÖ Model loaded.
üîß Applying LoRA adapters...


Unsloth 2025.10.12 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
The model is already on multiple devices. Skipping the move to device specified in `args`.


‚úÖ Trainable: 9.77M / 91.20M (10.71%)
üìö Building toy math dataset (for demo)...
‚úÖ Dataset size: 500
üìù Sample prompt:
 Solve this problem step by step.

Problem: If a store has 45 apples and sells 17, then buys 23 more, how many apples does it have?

Let me think through this: ...
‚úÖ Reward function ready (exact numeric match).
‚öôÔ∏è Building GRPOConfig...
Unsloth: The DAPO paper recommends `epsilon_high = 0.28`
Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4
‚úÖ GRPOConfig ready.
üèãÔ∏è Initializing GRPOTrainer...
‚úÖ Trainer ready!
üöÄ Starting GRPO training ...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 9,768,960 of 144,283,968 (6.77% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / math_accuracy_reward / mean,rewards / math_accuracy_reward / std
5,-0.0,0.0375,0.143771,245.975,1.0,384.0,0.475,121.66292,1.0,335.8,0,0,0,0,0,0.0,0.0375,0.143771
10,-0.0,0.03125,0.155253,245.25625,1.0,384.0,0.4125,146.96351,1.0,312.6,No Log,No Log,No Log,No Log,No Log,0.0,0.03125,0.155253
15,0.0,0.0125,0.049187,232.4375,2.6,384.0,0.40625,128.270284,2.6,343.8,No Log,No Log,No Log,No Log,No Log,0.0,0.0125,0.049187
20,0.0,0.01875,0.084542,243.15,1.0,384.0,0.45625,120.906668,1.0,326.6,No Log,No Log,No Log,No Log,No Log,0.0,0.01875,0.084542
25,0.0,0.025,0.141421,229.33125,1.0,384.0,0.4,124.250952,1.0,309.2,No Log,No Log,No Log,No Log,No Log,0.0,0.025,0.141421
30,-0.0,0.04375,0.182916,217.875,1.0,384.0,0.3625,124.132523,1.0,355.0,No Log,No Log,No Log,No Log,No Log,0.0,0.04375,0.182916
35,-0.0,0.03125,0.108416,248.0125,1.0,384.0,0.46875,126.987895,1.0,345.4,No Log,No Log,No Log,No Log,No Log,0.0,0.03125,0.108416
40,-0.0,0.025,0.141421,245.29375,8.4,384.0,0.44375,139.028317,8.4,337.4,No Log,No Log,No Log,No Log,No Log,0.0,0.025,0.141421
45,-0.0,0.03125,0.155253,252.56875,1.0,384.0,0.44375,148.777831,1.0,368.6,No Log,No Log,No Log,No Log,No Log,0.0,0.03125,0.155253
50,-0.0,0.04375,0.157603,261.00625,7.4,384.0,0.475,149.616037,7.4,363.4,No Log,No Log,No Log,No Log,No Log,0.0,0.04375,0.157603


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


‚úÖ Training complete!
üìä Stats:
   ‚Ä¢ Steps: N/A
   ‚Ä¢ Train loss: -9.685754776000977e-10
   ‚Ä¢ Time (s): 1833.6566
üíæ Saving GRPO-trained adapters...
‚úÖ Adapters ‚Üí ./smollm2_grpo_adapters

üîß Saving merged model (optional)...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `smollm2_grpo_merged`: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.42s/it]


Successfully copied all 1 files from cache to `smollm2_grpo_merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 12483.05it/s]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.06s/it]


Unsloth: Merge process complete. Saved to `/content/smollm2_grpo_merged`
‚úÖ Merged ‚Üí ./smollm2_grpo_merged

üß™ Reasoning demo

REASONING TEST RESULTS

[Test 1]
Problem: A bakery made 96 cupcakes. They sold 37 in the morning and 28 in the afternoon. How many are left?
Solution:
 First, I'll start by calculating the number of cupcakes sold in the morning. I'll find the number of cupcakes sold in the morning by subtracting the number of cups sold in the afternoon from the number of cups sold in the morning.

So, I'll find the number of cupcakes sold in the morning by subtracting the number of cups sold in the afternoon from the number of cups sold in the morning.

Now, I'll find the number of cupcakes sold in the afternoon by subtracting the number of cups sold in the afternoon from the number of cups sold in the morning.

So, I'll find the number of cupcakes sold in the afternoon by subtracting the number of cups sold in the afternoon from the number of cups sold in the morning.

Ne