In [77]:
%load_ext autoreload
%autoreload 2
import re 
import os 
os.environ["TOKENIZERS_PARALLELIS"] = "true"
import torch 
from torch.utils.data import DataLoader 
import sympy 

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

import datasets 
from datasets import load_dataset 

from curious.data import ReasoningGymDataset, GSM8KDataset
from curious.utils import tokenize_questions
from curious.grpo import rollout
from curious.reward import GSM8KRewardModel

import reasoning_gym 

from math_verify import verify, parse
from pprint import pprint

EACH_DATASET_SIZE = 100
SEED = 42
MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [94]:
gsm8k = GSM8KDataset()
gsm8k_rm = GSM8KRewardModel(use_format_reward=False)

In [95]:
idx = 8
item = gsm8k[idx]
pprint(item)

{'answer': 'He has 36 eggs because 3 x 12 = <<3*12=36>>36\n'
           'He can make 9 omelets because 36 / 4 = <<36/4=9>>9\n'
           'Each person gets 3 omelets because 9 / 3 = <<9/3=3>>3\n'
           '#### 3',
 'oracle_answer': '3',
 'question': 'Pauly is making omelets for his family. There are three dozen '
             'eggs, and he plans to use them all. Each omelet requires 4 eggs. '
             'Including himself, there are 3 people. How many omelets does '
             'each person get?'}


In [99]:
from tqdm import tqdm
for i in tqdm(range(len(gsm8k))):
    item = gsm8k[i]
    completion = item["answer"]
    completion = completion.replace("####", "<answer>").strip()
    completion += "</answer>"
    #print(completion)

    parsed_answer, outcome_reward, outcome_info = gsm8k_rm.outcome_reward(completion, item["oracle_answer"])
    #print(parsed_answer, reward, outcome_info)


    parsed_reasoning, format_reward, format_info = gsm8k_rm.format_reward(completion)
    #print(parsed_reasoning, reward, format_info)
    assert format_reward == 0.0, print(format_info)


    reward, info = gsm8k_rm.instance_reward(completion, item["oracle_answer"])
    #print(reward, info)
    assert reward == 1.0, print(info)

100%|██████████| 7473/7473 [00:08<00:00, 908.17it/s]


In [75]:
print(type(parsed_answer[0]))

<class 'sympy.core.numbers.Integer'>


In [78]:
isinstance(parsed_answer[0], sympy.Expr)

True

In [65]:
item["oracle_answer"]

['3', '55']

In [64]:
verify(parsed_answer, item["oracle_answer"])

False

In [40]:
text = """<think> 
because the number is too big, we need to use scientific notation
</think>
"""
think_match = re.search(r"<think>(.*?)</think>", text, flags=re.DOTALL)
print(think_match.group(1))
parsed_answer = parse(text, fallback_mode="first_match", extraction_mode="first_match")
print(parsed_answer)
print(type(parsed_answer[0]), type(parsed_answer[1]))

 
because the number is too big, we need to use scientific notation

[]


IndexError: list index out of range

# Load the dataset
*** 

In [91]:
datasets = [
    "complex_arithmetic",
    "intermediate_integration",
    "polynomial_equations",
    "simple_equations",
]

dataset = ReasoningGymDataset(
    datasets_name=datasets,
    size=EACH_DATASET_SIZE,
    seed=SEED
)

print(len(dataset))


400


In [92]:
dataset[0]

{'question': 'Add the complex numbers: (-10.0 - 2.0i) + (-3.0 - 3.0i)',
 'answer': '-13.0 - 5.0i',
 'dataset_name': 'complex_arithmetic',
 'dataset_idx': 0,
 'item_idx': 0,
 'global_idx': 0}

In [93]:
dataset[9]

{'question': 'Subtract the complex numbers: (6.0 + 7.0i) - (-5.0 - 3.0i)',
 'answer': '11.0 + 10.0i',
 'dataset_name': 'complex_arithmetic',
 'dataset_idx': 0,
 'item_idx': 9,
 'global_idx': 9}

In [94]:
data_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0
)
print(len(data_loader))    


13


In [53]:
for batch in data_loader:
    print(len(batch["question"]))
    print(batch["dataset_name"])
    print('*'*100)

32
['complex_arithmetic', 'intermediate_integration', 'polynomial_equations', 'polynomial_equations', 'intermediate_integration', 'simple_equations', 'polynomial_equations', 'polynomial_equations', 'simple_equations', 'simple_equations', 'simple_equations', 'polynomial_equations', 'complex_arithmetic', 'polynomial_equations', 'polynomial_equations', 'simple_equations', 'simple_equations', 'simple_equations', 'polynomial_equations', 'simple_equations', 'intermediate_integration', 'complex_arithmetic', 'intermediate_integration', 'simple_equations', 'intermediate_integration', 'polynomial_equations', 'polynomial_equations', 'simple_equations', 'polynomial_equations', 'complex_arithmetic', 'intermediate_integration', 'intermediate_integration']
****************************************************************************************************
32
['intermediate_integration', 'complex_arithmetic', 'complex_arithmetic', 'polynomial_equations', 'simple_equations', 'simple_equations', 'polyno

# Load the model 
*** 

In [54]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [55]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, 
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    #attn_implementation="flash_attention_2",
)

# Tokenization 
*** 

In [56]:
samples = next(iter(data_loader))

In [83]:
encodings = tokenize_questions(
    tokenizer,
    questions= samples["question"],
    max_length= 1024,
)

Adjusting padding side from right to left for training


In [84]:
encodings["attention_mask"].shape 

torch.Size([32, 1024])

In [85]:
tokenizer.padding_side

'left'

In [86]:
print(
    tokenizer.decode(encodings["input_ids"][0])
)


<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|

In [76]:
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}

In [79]:
tokenizer.all_special_ids

[151645, 151643, 151644]

In [81]:
tokenizer.convert_ids_to_tokens(tokenizer.all_special_ids)

['<|im_end|>', '<|endoftext|>', '<|im_start|>']

# Policy Rollout 
*** 

In [95]:
generation_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    temperature=0.7,
    num_return_sequences=4,
)

In [99]:
seq_ids, returns, action_mask, completions = rollout(
    model,
    tokenizer,
    batch_inputs=encodings,
    oracle_answers=samples["answer"],
    generation_config=generation_config,
)



RuntimeError: Placeholder storage has not been allocated on MPS device!