In [63]:
# train_grpo.py
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig, TaskType, get_peft_model
import multiprocessing
import psutil
import time
import builtins
import io
import sys
import re
import os
import json
import sys
from tqdm import trange

# This is to avoid the error of too many digits in the input_output
sys.set_int_max_str_digits(0)

if (os.path.exists('./taco_train')):
    TACO_train = load_from_disk('./taco_train')
else:
    TACO_train = load_dataset("BAAI/TACO", split="train")
    TACO_train.save_to_disk('./taco_train')

if (os.path.exists('./taco_valid')):
    TACO_valid = load_from_disk('./taco_valid')
else:
    TACO_valid = load_dataset("BAAI/TACO", split="test")
    TACO_valid.save_to_disk('./taco_valid')

TACO_train = TACO_train \
    .rename_column('question', 'prompt') \
    .rename_column('solutions', 'completion')

TACO_valid = TACO_valid \
    .rename_column('question', 'prompt') \
    .rename_column('solutions', 'completion')
#prompt_to_completion_valid = {TACO_valid['prompt'][i]: TACO_valid['completion'][i] for i in range(len(TACO_valid))}


prompt_to_completion_train = {TACO_train[i]['prompt']: json.loads(TACO_train[i]['input_output']) for i in trange(len(TACO_train))}

prompt_to_time = {TACO_train[i]['prompt']: TACO_train[i]['time_limit'] for i in trange(len(TACO_train))}

#

100%|██████████| 25443/25443 [00:32<00:00, 787.82it/s] 
100%|██████████| 25443/25443 [00:20<00:00, 1253.39it/s]


In [88]:
test = TACO_train.map(lambda x: {
    "prompt" : [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": x["prompt"]},
    ]
})
test[0]['prompt']

Map:   0%|          | 0/25443 [00:00<?, ? examples/s]

Map: 100%|██████████| 25443/25443 [00:19<00:00, 1323.55 examples/s]


[{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <think> and </think>.\nThen, provide your solution between <solution></solution>',
  'role': 'system'},
 {'content': 'This is an interactive problem.\n\nIn good old times dwarves tried to develop extrasensory abilities:\n\n  * Exactly n dwarves entered completely dark cave. \n  * Each dwarf received a hat — white or black. While in cave, none of the dwarves was able to see either his own hat or hats of other Dwarves. \n  * Dwarves went out of the cave to the meadow and sat at an arbitrary place one after the other. When a dwarf leaves the cave, he sees the colors of all hats of all dwarves that are seating on the meadow (i.e. left the cave before him). However, he is not able to see the color of his own hat and none of the dwarves can give him this information. \n  * The task for dwarves was to got diverged into two parts — one with dwarves with white hats and one with black 

In [85]:
for i in range(1000):
    if TACO_train[i]['time_limit'] != None:
        print(re.compile("(.*) second").findall(TACO_train[i]['time_limit']))

TACO_train[100]['time_limit'][:-8]

['2.0']
['1.0']
['1.0']
['1']
['1.0']
['2.0']
['1.0']
['0.5']
['1']
['2.0']
['2']
['3.0']
['2.0']
['5.0']
['1.0']
['2.0']
['2.0']
['1.5']
['1.0']
['3.0']
['3.0']
['2.0']
['2.0']
['3.0']
['1.5']
['1']
['1.0']
['1']
['1.0']
['1']
['8.0']
['2']
['3.0']
['2']
['1.0']
['1']
['2']
['2.0']
['4.0']
['2.0']
['1.0']
['4']
['6.0']
['1']
['1']
['2']
['3.0']
['1']
['1.0']
['2.0']
['6.0']
['6.0']
['0.5']
['8.0']
['1']
['1']
['1']
['3.0']
['2']
['4.0']
['0.35122']
['2']
['1.0']
['1']
['1.0']
['2']
['2.0']
['2.0']
['1']
['1']
['1.0']
['1.0']
['2.5']
['1']
['2.0']
['1']
['1.0']
['1.0']
['0.235294']
['3.0']
['2.0']
['2.0']
['2.0']
['1']
['0.5']
['8.0']
['1']
['1.0']
['1.0']
['1.0']
['2.0']
['2']
['8.0']
['2']
['1.5']
['1.0']
['1.0']
['1']
['0.5']
['1.0']
['2.0']
['1.0']
['2.0']
['2.0']
['1.0']
['4.0']
['2.0']
['1.0']
['1.0']
['2.0']
['2.0']
['1.0']
['1.0']
['3.0']
['1']
['3.0']
['1']
['8.0']
['3.0']
['2']
['2.0']
['3.0']
['1']
['5.0']
['2.0']
['1']
['8.0']
['1']
['0.5']
['8.0']
['2.0']
['1']
['2.0']
['2

'6.0'

In [17]:
#this is the code that will be used to test the code
def test_code(code, cases, ex_out, time_limit):
    score = 0
    correct_cases = 0

    def run_code(code, case_input, output_queue):
        # Mock input
        builtins.input = lambda: case_input

        # Capture output
        buf = io.StringIO()
        sys.stdout = buf
        try:
            exec(code)
        except Exception as e:
            output_queue.put(("ERROR\n", 0))
            return
        finally:
            sys.stdout = sys.__stdout__

        output_queue.put((buf.getvalue(), 0))  # memory will be added later

    for i in range(len(cases)):
        output_queue = multiprocessing.Queue()
        p = multiprocessing.Process(target=run_code, args=(code, cases[i], output_queue))
        p.start()
        proc = psutil.Process(p.pid)

        max_mem = 0
        start_time = time.time()
        first_mem = proc.memory_info().rss / 1024
        while p.is_alive() and time.time() - start_time < time_limit:
            try:
                mem = proc.memory_info().rss / 1024  # in KB
                max_mem = max(max_mem, mem)
            except psutil.NoSuchProcess:
                break
            time.sleep(0.05)

        if p.is_alive():
            p.terminate()
            out = "TIME LIMIT EXCEEDED\n"
            score -= 1 / len(cases) * 10
        else:
            try:
                out, _ = output_queue.get(timeout=1)
            except:
                out = "ERROR\n"
                score -= 1 / len(cases) * 50

        #print(f"Case {i+1}: Memory used ≈ {int(max_mem)} KB")

        if out == ex_out[i]:
            correct_cases += 1
    score += correct_cases / len(cases) * 100
    if correct_cases == len(cases):
        score += 25
    return score

def extract_tags(completion, tag_start, tag_end):
    return re.compile(f"{tag_start}(.*?){tag_end}").find(completion)

def reward_check(completions, **kwargs):
    #completions is the code???
    rewards=[]
    for completion in completions:
        code = extract_tags(completion, "<solution>", "</solution>")
        time_limit = prompt_to_time[kwargs['prompt']]
        cases = prompt_to_completion_train[kwargs['prompt']]['inputs']
        ex_out = prompt_to_completion_train[kwargs['prompt']]['outputs']
        reward = test_code(code, cases, ex_out, time_limit)
        rewards.append(reward)
    return rewards

In [18]:
reasoning_start = "<think>" 
reasoning_end   = "</think>"
solution_start  = "<solution>"
solution_end    = "</solution>"

system_prompt = \
f"""You are given a problem.
Think about the problem and provide your working out.
Place it between {reasoning_start} and {reasoning_end}.
Then, provide your solution between {solution_start}{solution_end}"""

In [19]:
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO", logging_steps=10, per_device_train_batch_size=24)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
model = get_peft_model(model, peft_config)

In [20]:
chat_template = \
    "{% if messages[0]['role'] == 'system' %}"\
        "{{ messages[0]['content'] + eos_token }}"\
        "{% set loop_messages = messages[1:] %}"\
    "{% else %}"\
        "{{ '{system_prompt}' + eos_token }}"\
        "{% set loop_messages = messages %}"\
    "{% endif %}"\
    "{% for message in loop_messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ message['content'] }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ message['content'] + eos_token }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}{{ '{reasoning_start}' }}"\
    "{% endif %}"

# Replace with out specific template:
chat_template = chat_template\
    .replace("'{system_prompt}'",   f"'{system_prompt}'")\
    .replace("'{reasoning_start}'", f"'{reasoning_start}'")
tokenizer.chat_template = chat_template

In [24]:
tokenizer.apply_chat_template([
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing well, thank you!"},
    {"role": "user", "content": "What is the capital of France?"},
], tokenize = False, add_generation_prompt = True)

"You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <think> and </think>.\nThen, provide your solution between <solution></solution><|im_end|>Hello, how are you?I'm doing well, thank you!<|im_end|>What is the capital of France?<think>"

KeyError: 0

KeyboardInterrupt: 

In [66]:
test = TACO_train.map(lambda x: {
    "prompt" : [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": x["prompt"]},
    ],
    "answer": str(json.loads(x["input_output"])['outputs'])
})
test[0]

Map: 100%|██████████| 25443/25443 [00:35<00:00, 723.01 examples/s] 


{'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <think> and </think>.\nThen, provide your solution between <solution></solution>',
   'role': 'system'},
  {'content': 'This is an interactive problem.\n\nIn good old times dwarves tried to develop extrasensory abilities:\n\n  * Exactly n dwarves entered completely dark cave. \n  * Each dwarf received a hat — white or black. While in cave, none of the dwarves was able to see either his own hat or hats of other Dwarves. \n  * Dwarves went out of the cave to the meadow and sat at an arbitrary place one after the other. When a dwarf leaves the cave, he sees the colors of all hats of all dwarves that are seating on the meadow (i.e. left the cave before him). However, he is not able to see the color of his own hat and none of the dwarves can give him this information. \n  * The task for dwarves was to got diverged into two parts — one with dwarves with white hats and on

In [22]:
trainer = GRPOTrainer(
    model=model,
    tokenizer=tokenizer,
    reward_funcs=reward_check,
    args=training_args,
    train_dataset=[TACO_train[0]],
    eval_dataset=[TACO_valid[0]]
)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TypeError: argument of type 'NoneType' is not iterable