### 1. Environment set up

#### Pytorch and Cuda installation

In [1]:
# !nvcc -V

In [2]:
# !pip3 install torch --index-url https://download.pytorch.org/whl/cu128

In [1]:
import torch
print(torch.__version__)

2.7.0+cu128


In [2]:
print(torch.cuda.is_available())

True


In [5]:
# !nvidia-smi

#### Import

In [None]:
# %pip install  --upgrade \
#   "transformers==4.48.1" \
#   "datasets==3.1.0" \
#   "accelerate==1.3.0" \
#   "hf-transfer==0.1.9" \
#   "trl==0.14.0" \
#   "wandb"
# !pip install -U peft

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import json
import os
import re
from datetime import datetime
from functools import partial

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from huggingface_hub import login

login(token="", add_to_git_credential=True) # ADD YOUR TOKEN HERE

### 2. Model performance test without finetuning

Test DeepSeek-R1-Distill-Qwen-1.5B and Qwen2.5-1.5B-Instruct performance on commonsense_qa dataset

In [19]:
def extract_answer_tag(output_text):
    match = re.search(r"<answer>\s*([A-E])\s*</answer>", output_text)
    if match:
        return match.group(1)
    return "X"

def run_eval_experiment(
    model_id: str,
    dataset_id: str,
    prompt_builder_fn,
    max_new_tokens: int = 1024,
    num_examples: int = 100,
    seed: int = 42,
    save_dir: str = "./experiment_logs"
):
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )

    # Load and process dataset
    dataset = load_dataset(dataset_id, split="validation")
    dataset = dataset.shuffle(seed=seed).select(range(num_examples))
    dataset = dataset.map(lambda x: {"prompt": prompt_builder_fn(x)})

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "pad_token_id": tokenizer.eos_token_id
    }

    results = []
    correct = 0

    for i in tqdm(range(num_examples), desc=f"Evaluating {model_id}"):
        prompt = dataset[i]["prompt"]
        label = dataset[i]["answerKey"]

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output_ids = model.generate(**inputs, **generation_kwargs)[0]
        output_text = tokenizer.decode(output_ids, skip_special_tokens=True)

        pred = extract_answer_tag(output_text[len(prompt):])
        is_correct = pred == label
        correct += is_correct

        results.append({
            "index": i,
            "prompt": prompt,
            "label": label,
            "pred": pred,
            "is_correct": is_correct,
            "output_text": output_text[len(prompt):].strip()
        })

    accuracy = correct / num_examples
    print(f"\n📊 Accuracy: {accuracy:.2%} on {num_examples} samples")

    # Save results
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(save_dir, f"eval_{model_id.replace('/', '_')}_{timestamp}_{num_examples}.jsonl")

    with open(log_file, "w", encoding="utf-8") as f:
        for item in results:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    return accuracy, results, log_file


In [13]:
dataset[0]

NameError: name 'dataset' is not defined

In [20]:
def build_r1_prompt(example, tokenizer):
    question = example["question"]
    choices = example["choices"]["text"]
    labels = example["choices"]["label"]
    choices_str = "\n".join([f"{label}: {text}" for label, text in zip(labels, choices)])
    
    r1_prefix = [
        {"role": "system", "content": "You are a helpful assistant. You first think through the reasoning, then provide the correct answer clearly in an <answer>...</answer> tag."},
        {"role": "user", "content": f"Given question: {question}, choose from these options {choices_str}. Show your thoughts in <think> </think> tags. And return the final answer option in <answer> </answer> tags, for example: <answer>A</answer>."},
        {"role": "assistant", "content": "Let me solve this question.\n<think>"}
    ]
    return tokenizer.apply_chat_template(r1_prefix, tokenize=False, continue_final_message=True)

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
dataset_id="tau/commonsense_qa"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

prompt_fn = partial(build_r1_prompt, tokenizer=tokenizer)

acc, results, logfile = run_eval_experiment(
    model_id=model_id,
    dataset_id=dataset_id,
    prompt_builder_fn=prompt_fn,
    max_new_tokens=1024,
    num_examples=100,
    seed=42
)

Map: 100%|██████████| 100/100 [00:00<00:00, 4887.90 examples/s]
Evaluating deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B: 100%|██████████| 100/100 [16:31<00:00,  9.91s/it]


📊 Accuracy: 28.00% on 100 samples





In [21]:
def build_r1_prompt(example):
    question = example["question"]
    choices = example["choices"]["text"]
    labels = example["choices"]["label"]
    choices_str = "\n".join([f"{label}: {text}" for label, text in zip(labels, choices)])
    r1_prefix = [
        {"role": "system", "content": "You are a helpful assistant. You first think through the reasoning, then provide the correct answer clearly in an <answer>...</answer> tag."},
        {"role": "user", "content": f"Given question: {question}, choose from these options {choices_str}. Show your thoughts in <think> </think> tags. And return the final answer option in <answer> </answer> tags, for example: <answer>A</answer>."},
        {"role": "assistant", "content": "Let me solve this question.\n<think>"}
    ]
    return tokenizer.apply_chat_template(r1_prefix, tokenize=False, continue_final_message=True)

acc, results, logfile = run_eval_experiment(
    model_id="Qwen/Qwen2.5-1.5B-Instruct",
    dataset_id="tau/commonsense_qa",
    prompt_builder_fn=build_r1_prompt,
    max_new_tokens=1024,
    num_examples=100,
    seed=42
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 100/100 [00:00<00:00, 4647.12 examples/s]
Evaluating Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [06:40<00:00,  4.01s/it]


📊 Accuracy: 60.00% on 100 samples





In [22]:
def build_r1_prompt(example, tokenizer):
    question = example["question"]
    choices = example["choices"]["text"]
    labels = example["choices"]["label"]
    choices_str = "\n".join([f"{label}: {text}" for label, text in zip(labels, choices)])
    
    r1_prefix = [
        {"role": "system", "content": "You are a helpful assistant. You first think through the reasoning, then provide the correct answer clearly in an <answer>...</answer> tag."},
        {"role": "user", "content": f"Given question: {question}, choose from these options {choices_str}. Show your thoughts in <think> </think> tags. And return the final answer option in <answer> </answer> tags, for example: <answer>A</answer>."},
        {"role": "assistant", "content": "Let me solve this question.\n<think>"}
    ]
    return tokenizer.apply_chat_template(r1_prefix, tokenize=False, continue_final_message=True)

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
dataset_id="tau/commonsense_qa"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

prompt_fn = partial(build_r1_prompt, tokenizer=tokenizer)

acc, results, logfile = run_eval_experiment(
    model_id=model_id,
    dataset_id=dataset_id,
    prompt_builder_fn=prompt_fn,
    max_new_tokens=2048,
    num_examples=100,
    seed=42
)

Evaluating deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B: 100%|██████████| 100/100 [21:10<00:00, 12.70s/it]


📊 Accuracy: 28.00% on 100 samples





### 3. Train model with rule-based reward

In [7]:
def format_reward_func(completions, **kwargs):
    """
    Format: <think>...</think><answer>...</answer>
    Args:
        completions (list[str]): Generated outputs
        target (list[str]): Expected answers
      
      Returns:
          list[float]: Reward scores
    """
    rewards = []

    for completion in completions:

      try:
        # add synthetic <think> as its already part of the prompt and prefilled for the assistant to more easily match the regex
        completion = "<think>" + completion        
        # Check if the format is correct
        regex = r"^<think>([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>\n<answer>([\s\S]*?)<\/answer>$"

        match = re.search(regex, completion, re.DOTALL) 
        # if the format is not correct, reward is 0
        if match is None or len(match.groups()) != 2:
            rewards.append(0.0)
        else:
            rewards.append(1.0)
      except Exception:
        rewards.append(0.0)
    return rewards

def accuracy_reward_func(completions, targets, **kwargs):
    import re
    rewards = []
    for c, gt in zip(completions, targets):
        match = re.search(r"<answer>\s*([A-E])\s*</answer>", c)
        if match and match.group(1).strip().upper() == gt.strip().upper():
            rewards.append(1.0)
        else:
            rewards.append(0.0)
    return rewards


In [None]:
!python run_r1.py --config grpo-rule_based-Qwen-2.5-1.5B-Instruct-commonsense_qa_test.yaml

{'train_runtime': 101.8214, 'train_samples_per_second': 0.02, 'train_steps_per_second': 0.01, 'train_loss': -7.450580596923828e-09, 'completion_length': 104.125, 'rewards/format_reward_func': 0.5, 'rewards/accuracy_reward_func': 0.875, 'reward': 1.375, 'reward_std': 0.6208146214485168, 'kl': 0.0, 'epoch': 0.02}
***** train metrics *****
  total_flos               =        0GF
  train_loss               =       -0.0
  train_runtime            = 0:01:41.82
  train_samples            =         90
  train_samples_per_second =       0.02
  train_steps_per_second   =       0.01


2025-04-26 21:05:06,212 - __main__ - INFO - Model parameters: ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='float16', trust_remote_code=False, attn_implementation=None, use_peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=True, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
INFO:__main__:Model parameters: ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='float16', trust_remote_code=False, attn_implementation=None, use_peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=True, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
2025-04-26 21:05:06,212 - __main__ - INFO - Training parameters: GRPOConfig(
_n_gpu=

In [None]:
!python run_r1.py --config grpo-rule_based-Qwen-2.5-1.5B-Instruct-commonsense_qa.yaml

2025-04-26 21:33:51,548 - __main__ - INFO - Model parameters: ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='float16', trust_remote_code=False, attn_implementation=None, use_peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=True, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
INFO:__main__:Model parameters: ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='float16', trust_remote_code=False, attn_implementation=None, use_peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=True, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
2025-04-26 21:33:51,548 - __main__ - INFO - Training parameters: GRPOConfig(
_n_gpu=

In [None]:
!tensorboard --logdir runs/grpo-rule_based-Qwen-2.5-1.5B-Instruct-commonsense_qa

^C
