In [None]:
# Remove older and potentially conflicting packages (it is fine if some are not installed;
# the trailing `|| true` keeps the cell from failing in that case).
!pip uninstall -y transformers trl peft accelerate datasets pyarrow fsspec gcsfs || true
!pip uninstall -y cudf-cu12 pylibcudf-cu12 dask-cudf-cu12 cuml-cu12 cugraph-cu12 bigframes || true

# Purge the pip cache so a corrupted cached wheel is not picked up again.
!pip cache purge

In [None]:
!pip install -U --no-cache-dir \
  "pyarrow==21.0.0" "fsspec==2024.5.0" "gcsfs==2024.5.0" \
  "transformers==4.57.1" "trl==0.23.0" "peft==0.13.2" "accelerate==1.4.0" "datasets==4.3.0"

# 若上面跑完仍抓不到 transformers，就強制重裝一次：
!pip install --no-cache-dir --force-reinstall "transformers==4.57.1"

In [None]:
import transformers, datasets, peft, accelerate, trl
# Report the installed version of every pinned library. The trailing comments
# record the version each line is expected to show after the install step.
_version_report = (
    ("transformers:", transformers),   # expected 4.57.1
    ("trl:", trl),                     # expected 0.23.0
    ("peft:", peft),                   # expected 0.13.x
    ("accelerate:", accelerate),       # expected 1.4.0
    ("datasets:", datasets),           # expected 4.3.0
)
for _label, _module in _version_report:
    print(_label, _module.__version__)

In [None]:
import os, json, random, re
# Make sure the data directory exists before anything is written into it.
os.makedirs("/content/data", exist_ok=True)

# Toy arithmetic word problems, each paired with its integer answer.
qa = [
    {"question": question, "answer": answer}
    for question, answer in (
        ("If Anna has 3 apples and buys 2 more, then gives 1 to Bob, how many apples does she have?", 4),
        ("Tom had 12 candies. He ate 5 and then bought 4 more. How many does he have now?", 11),
        ("A box has 8 pencils. Sarah adds 7, then loses 3. How many pencils are in the box?", 12),
        ("There are 15 birds on a tree. 6 fly away, then 4 come back. How many now?", 13),
        ("John read 9 pages on Monday and 7 on Tuesday. He reread 3 pages. Total new pages read?", 13),
    )
]

# Persist as JSON Lines: one JSON object per line.
with open("/content/data/grpo_math.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in qa)
print("題目數：", len(qa))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"

# trust_remote_code is deliberately not enabled: it lets the hub repository execute
# arbitrary Python at load time. NOTE(review): this checkpoint is expected to load with
# the model/tokenizer classes built into transformers — re-enable only if loading fails.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    # Padding needs a token id; fall back to EOS when the tokenizer defines no pad token.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float32,       # `dtype` replaces the deprecated `torch_dtype` keyword
    device_map={"": "cpu"},    # place the whole model on CPU
)
# Keep the model config consistent with the tokenizer's special tokens.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.eval()
print("Loaded.")

In [None]:
import json, math, random, re

def format_prompt(q):
    """Wrap question ``q`` in the Instruction / Input / Response template used for sampling.

    Returns the full prompt string, ending right after the ``### Response:`` header
    so the model continues with its answer.
    """
    sections = [
        "### Instruction:\nYou are a helpful reasoning assistant. Solve step by step, then end with the final number.\n\n",
        f"### Input:\n{q}\n\n",
        "### Response:\n",
    ]
    return "".join(sections)

def safe_generate(prompt, max_new_tokens=96, sample=True, temperature=0.8, top_p=0.95):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        prompt: Full prompt string fed to the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.
        sample: If True, sample with ``temperature`` / ``top_p``; if False, decode
            greedily (useful if sampling runs into NaN probabilities).
        temperature: Sampling temperature, only used when ``sample`` is True.
        top_p: Nucleus-sampling threshold, only used when ``sample`` is True.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=sample,
            # Sampling knobs are cleared for greedy decoding.
            temperature=temperature if sample else None,
            top_p=top_p if sample else None,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Slice off the prompt tokens instead of splitting the decoded text on
    # "### Response:"; a text split silently drops part of the completion whenever
    # the model itself emits that marker again.
    completion_ids = out[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# A number token: optional sign, digits, optional thousands groups, optional decimals.
# The lookbehind stops a match from starting in the middle of a number and keeps the
# "-" in expressions such as "12-8" from being read as a negative sign.
_NUMBER_RE = re.compile(r"(?<![\d.])-?\d+(?:,\d{3})*(?:\.\d+)?")


def extract_last_int(text):
    """Return the last number in ``text`` as an int, or None.

    Thousands separators are accepted ("1,000" -> 1000) and a decimal with an
    all-zero fractional part counts as an integer ("4.0" -> 4). None is returned
    when ``text`` contains no number or when the last number is not integral
    (for example "4.5").
    """
    tokens = _NUMBER_RE.findall(text)
    if not tokens:
        return None
    whole, _, fraction = tokens[-1].replace(",", "").partition(".")
    if fraction.strip("0"):
        # A genuinely fractional final answer is not an integer guess.
        return None
    return int(whole)

# 讀題 → 生成候選 → 打分 → 組成偏好對
pairs = []
with open("/content/data/grpo_math.jsonl") as f:
    rows = [json.loads(x) for x in f]

K = 3  # 每題生成 3 個候選
for r in rows:
    q, ans = r["question"], int(r["answer"])
    cand = []
    for i in range(K):
        txt = safe_generate(format_prompt(q), sample=True, temperature=0.9 - 0.2*i)  # 稍微改溫度做多樣性
        guess = extract_last_int(txt)
        reward = 1 if guess == ans else 0
        cand.append({"text": txt, "guess": guess, "reward": reward})
    # 排序，最高分當 preferred；如同分，取第一個
    cand_sorted = sorted(cand, key=lambda x: x["reward"], reverse=True)
    preferred = cand_sorted[0]["text"]
    # 從剩下挑一個當 rejected；若全部都一樣（極端情況），就複製一份讓 DPO 也能跑
    rejected = random.choice(cand_sorted[1:])["text"] if len(cand_sorted) > 1 else cand_sorted[0]["text"]
    pairs.append({"prompt": q, "chosen": preferred, "rejected": rejected})

# 存成 DPO 可吃的偏好資料
with open("/content/data/grpo_pref_pairs.jsonl","w") as f:
    for p in pairs:
        f.write(json.dumps(p, ensure_ascii=False)+"\n")

print("產生偏好對數量：", len(pairs))
!head -n 2 /content/data/grpo_pref_pairs.jsonl

In [None]:
from datasets import load_dataset

# Load the preference-pair JSON Lines file and expose it as a single "train" split.
pref_data_files = {"train": "/content/data/grpo_pref_pairs.jsonl"}
ds = load_dataset("json", data_files=pref_data_files)
train_ds = ds["train"]

# Instruction text placed at the top of every DPO prompt.
# NOTE(review): this differs from the instruction inside format_prompt(), which the
# candidates were sampled with — confirm the mismatch is intentional.
INSTR = "You are a helpful assistant. Answer briefly but correctly with the final number at the end."

def format_for_dpo(batch):
    """Wrap each raw question of a batch in the Instruction / Input / Response template.

    ``batch`` is a mapping of column name to list with keys "prompt", "chosen" and
    "rejected". The "chosen" and "rejected" columns are passed through unchanged.
    """
    wrapped = []
    for question in batch["prompt"]:
        wrapped.append(f"### Instruction:\n{INSTR}\n\n### Input:\n{question}\n\n### Response:\n")
    return dict(prompt=wrapped, chosen=batch["chosen"], rejected=batch["rejected"])

# Apply the prompt template batch-wise; the original columns are dropped and
# replaced by the three columns returned from format_for_dpo.
train_dpo = train_ds.map(
    format_for_dpo,
    batched=True,
    remove_columns=train_ds.column_names,
)
train_dpo[0]  # preview the first formatted example

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# Names of the projection modules that receive LoRA adapters.
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=LORA_TARGET_MODULES,
)

# Rebinds `model` to its PEFT-wrapped version, so this cell is meant to run once
# per freshly loaded model.
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

In [None]:
import trl
from trl import DPOTrainer, DPOConfig
print("TRL version:", trl.__version__)

MAX_LEN = 256
dpo_args = DPOConfig(
    output_dir="/content/grpo_dpo_out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    max_steps=120,                 # short demo run; raise (e.g. to 200) for better results
    learning_rate=5e-5,
    warmup_ratio=0.05,
    weight_decay=0.0,
    logging_steps=10,
    save_steps=60,
    save_total_limit=1,
    report_to=[],

    # Mixed precision must stay off when training on CPU.
    bf16=False, fp16=False, bf16_full_eval=False, fp16_full_eval=False,

    # Sequence-length limits and padding.
    max_length=MAX_LEN,
    max_prompt_length=MAX_LEN//2,
    padding_value=int(tokenizer.pad_token_id),

    optim="adamw_torch",
    remove_unused_columns=False,
    dataloader_pin_memory=False,

    # model_init_kwargs is intentionally left unset: it is only meaningful when the
    # model is given as a name/path string, and an instantiated model is passed below.
)

trainer = DPOTrainer(
    model=model,
    # No separate reference model is supplied. NOTE(review): for a PEFT-wrapped policy
    # TRL is expected to use the base model with adapters disabled as the reference
    # rather than building a second copy — confirm against the TRL documentation.
    ref_model=None,
    args=dpo_args,
    train_dataset=train_dpo,     # chosen/rejected pairs produced by the formatting step
    processing_class=tokenizer,  # this TRL version takes the tokenizer as processing_class
)

trainer.train()

In [None]:
# Write the model and the tokenizer side by side into one output directory.
save_dir = "/content/grpo_lora_smolm2_cpu"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("Saved to:", save_dir)

def chat(prompt, max_new_tokens=64):
    """Answer ``prompt`` with the current model using greedy decoding.

    Args:
        prompt: The user question; it is wrapped in the Instruction / Input /
            Response template with ``INSTR`` as the instruction.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion (prompt excluded) with special tokens removed and
        surrounding whitespace stripped.
    """
    text = f"### Instruction:\n{INSTR}\n\n### Input:\n{prompt}\n\n### Response:\n"
    # Training may have left the model in train mode, where LoRA dropout would make
    # even greedy decoding noisy; switch to eval mode before generating.
    model.eval()
    # Follow the model's device: the trainer may have moved it off the CPU.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding, avoids NaN sampling probabilities
            temperature=None,
            top_p=None,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Decode only the generated tokens; splitting the decoded text on "### Response:"
    # would truncate the answer if the model repeats that marker.
    completion_ids = out[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Smoke test: ask the tuned model two of the training questions and print its replies.
tests = [
    "If Anna has 3 apples and buys 2 more, then gives 1 to Bob, how many apples does she have?",
    "There are 15 birds on a tree. 6 fly away, then 4 come back. How many now?"
]
for question in tests:
    print("Q:", question)
    reply = chat(question)
    print("A:", reply)
    print("---")