# RFT - GSM8K
- Candidate: Eric Liu 

# Loading

In [1]:
%load_ext autoreload
%autoreload 2

import os  
import torch 
import numpy as np 

from tqdm import tqdm 
from textwrap import dedent  

import utils 
import prompt 
from utils import GSM8KParser, GMS8KEvaluator
from datasets import load_dataset
from main import GSM8KDataset, Phi3LightningModule 

from sympy.parsing.sympy_parser import parse_expr 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from torch.utils.data import DataLoader

import pytorch_lightning as pl 
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger
from lightning.pytorch.callbacks import RichProgressBar 

import wandb 

# Set the Hugging Face tokenizers parallelism flag before the tokenizer is created below.
# NOTE(review): a DataLoader with num_workers=16 is built later in this notebook;
# presumably "true" triggers the tokenizers fork warning there — confirm whether "false" is intended.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Hub identifier of the base model; used for the tokenizer and for Phi3LightningModule below.
MODEL_NAME = "microsoft/Phi-3.5-mini-instruct"

In [2]:
# Load the tokenizer that belongs to MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Display the special tokens; the recorded output shows pad_token and eos_token
# are both "<|endoftext|>".
tokenizer.special_tokens_map_extended

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

{'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'pad_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)}

# Dataset Exploration 

## Inspect text

In [4]:
# Load the GSM8K "main" configuration once and take both splits from the same
# DatasetDict instead of calling load_dataset twice.
gsm8k = load_dataset('gsm8k', 'main')
train_dataset = gsm8k['train']
# The 'test' split is used as the validation set throughout this notebook.
val_dataset = gsm8k['test']
print(f"Num Training instances: {len(train_dataset)}")
print(f"Num Validation instances: {len(val_dataset)}")
print(type(train_dataset))

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Num Training instances: 7473
Num Validation instances: 1319
<class 'datasets.arrow_dataset.Dataset'>


In [5]:
# Display the training split's feature names and row count.
train_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

In [6]:
# Print a few randomly chosen training instances for manual inspection.
# A dedicated, seeded generator makes the selection identical after every
# Restart & Run All, so the printed examples stay stable between runs.
INSPECT_SEED = 0
NUM_INSPECTED = 5
inspect_rng = np.random.default_rng(INSPECT_SEED)

for _ in range(NUM_INSPECTED):
    # Row index in [0, len(train_dataset)); cast to a plain Python int so the
    # dataset is indexed with the same type np.random.randint used to return.
    idx = int(inspect_rng.integers(0, len(train_dataset)))
    print("*"*100)
    print(f"Checking instance {idx}:")
    utils.inspect_instance(train_dataset, idx)

****************************************************************************************************
Checking instance 3559:
question
Tim buys 3 loaves of bread.  Each loaf of bread has 20 slices.  He pays for the 3 loaves of bread with 2 $20 bills.  He gets $16 change.  How much does each slice cost, in cents?
answer
He gave 2*20=$<<2*20=40>>40.
So the total cost was 40-16=$<<40-16=24>>24.
That means each slice costs 24/3=$<<24/3=8>>8.
That means it costs 8*100=<<8*100=800>>800 cents.
So each slice is 800/20=<<800/20=40>>40 cents.
#### 40
**************************************************
****************************************************************************************************
Checking instance 1948:
question
Tonya has opened a lemonade stand selling lemonade in small, medium, and large cups at $1, $2 and $3 respectively. At the close of business she ended up making $50.  As she went back through her inventory she noticed that she sold $11 worth of small lemonades and $24 w

## Extract statistics 

### Calculate Num Tokens

For now we only look at the training set, to extract statistics that will be
used later during inference:

- Maximum length (num_tokens) of question: 239
- Maximum length (num_tokens) of answer: 475 

In [7]:
# Annotate every training row with the length of its question and of its answer
# (columns `question_length` / `answer_length`), then report the maxima.
def _with_question_length(row):
    return GSM8KParser.get_question_length(row['question'], tokenizer)


def _with_answer_length(row):
    return GSM8KParser.get_answer_length(row['answer'], tokenizer)


train_dataset = train_dataset.map(_with_question_length)
train_dataset = train_dataset.map(_with_answer_length)

longest_answer = max(train_dataset['answer_length'])
longest_question = max(train_dataset['question_length'])
print(f"Maximum answer num_tokens: {longest_answer}")
print(f"Maximum question num_tokens: {longest_question}")

Maximum answer num_tokens: 475
Maximum question num_tokens: 239


### Extract answers

In [8]:
# infer number of hops 
train_dataset = train_dataset.map(
    lambda x: GSM8KParser.get_num_hops(x['answer'])
)

# infer answes using ground truth parser 
train_dataset = train_dataset.map(
    lambda x: GSM8KParser.get_answer_from_gt(x['answer'])
)

In [9]:
# Optional cell: sanity-check that parsing a ground-truth solution with the
# *prediction* parser yields the same final answer as the ground-truth parser.
def _digit_via_pred_parser(solution_text):
    return GSM8KParser.get_answer_from_pred(solution_text)['answer_str_digit']


# Answers inferred with the prediction parser, one per training row.
answer_str_inf = list(map(_digit_via_pred_parser, train_dataset['answer']))
assert answer_str_inf == train_dataset['answer_str_digit']

### Instance Generation 
We select one of the longest training instances (ranked by answer token count) to try generation on.

In [None]:
# Token budget for the exploratory sampling in this section.
# NOTE(review): the original cell referenced `MAX_NEW_TOKNES_SAMPE`, which no
# visible cell defines (NameError on a fresh kernel). 512 is an assumed value,
# chosen to cover the longest training answer reported above (475 tokens) —
# confirm the intended budget.
MAX_NEW_TOKENS_SAMPLE = 512

# Sampling settings forwarded to utils.sample_answers via **generation_config.
generation_config = {
    "max_new_tokens": MAX_NEW_TOKENS_SAMPLE,
    "temperature": 0.7,
    "num_return_sequences": 1,
    "top_p": 0.9,
    "eos_token_id": tokenizer.eos_token_id,  # Specify the EOS token
    "pad_token_id": tokenizer.eos_token_id,  # pad with the EOS id as well
    "do_sample": True,
    "output_scores": True,
    "return_dict_in_generate": True,
}

In [None]:
# Pick a long training instance to try generation on: rank rows by answer
# length (descending) and take the entry at index 50 of that ranking.
instance = sorted(
    train_dataset,
    key=lambda x: x['answer_length'],
    reverse=True
)[50]

# One conversation: system instructions followed by the formatted question.
chat = [
    {
        "role": "system",
        "content": prompt.EvalTemplate.system
    },
    {
        "role": "user",
        "content": prompt.EvalTemplate.user.format(
            question=instance['question'],
            eos_token=tokenizer.eos_token,
        )
    }
]

# Render (do not tokenize) a batch containing that one conversation.
# `return_tensors` was dropped: it has no effect when tokenize=False.
prompts = tokenizer.apply_chat_template(
    [chat],
    add_generation_prompt=True,
    tokenize=False,
)

print(len(prompts))
print(prompts[0])

In [None]:
# Show the reference solution next to the final answer parsed from it.
print(instance["answer"], instance["answer_str_digit"])

In [None]:
# NOTE(review): `model` is not assigned in any cell visible in this notebook —
# it only exists as leftover kernel state; confirm where it should be loaded.
model.eval()
# Sample completions for the rendered prompts with the settings in generation_config.
outs = utils.sample_answers(
    tokenizer,
    model,
    prompts,
    **generation_config,
)

In [None]:
# Print the first element returned by utils.sample_answers.
print(outs[0])

In [None]:
def _parsed_digit(completion):
    # Final answer string extracted from one sampled completion.
    return GSM8KParser.get_answer_from_pred(completion)["answer_str_digit"]


preds = list(map(_parsed_digit, outs))
print(preds)

evaluator = GMS8KEvaluator()
# Single-element reference list: the parsed ground-truth answer of `instance`.
refs = [instance["answer_str_digit"]]
print(refs)

# Score predictions against references pairwise; like zip, map over two
# sequences stops at the shorter one.
maj_accs = list(map(evaluator.get_maj_at_k, preds, refs))

print(maj_accs)

In [None]:
# NOTE(review): `probs` is not assigned in any visible cell; this only works
# against leftover kernel state — define it or remove this cell.
probs.shape 

In [None]:
# NOTE(review): `out` is not assigned at notebook level in any visible cell
# (the `out` inside the earlier list comprehension is local to it);
# presumably `outs` was meant — confirm.
print(out[1])

In [None]:
# NOTE(review): relies on a notebook-level `out` that no visible cell assigns —
# confirm the intended variable.
print(
    GSM8KParser.get_answer_from_pred(out[1])["answer_str_digit"]
)

# Reference solution text of the selected instance, for comparison.
print(instance["answer"])


In [None]:
# Re-creates the evaluator; an identical assignment already appears in an earlier cell.
evaluator = GMS8KEvaluator()

# Base Model Eval 
***

**Before we start, let's get a good sense of the base model's performance.**

In [7]:
# Wrap the validation split for evaluation and batch it in a fixed order.
valData = GSM8KDataset(val_dataset, tokenizer)
val_dataloader = DataLoader(valData, batch_size=4, shuffle=False, num_workers=16)

# Sampling settings handed to the Lightning module; the generation budget is
# taken from the dataset wrapper (`inf_seq_length`).
generation_config = dict(
    max_new_tokens=valData.inf_seq_length,
    temperature=0.7,
    num_return_sequences=1,
    top_p=0.9,
    eos_token_id=tokenizer.eos_token_id,  # stop at the EOS token
    pad_token_id=tokenizer.eos_token_id,  # pad with the EOS id as well
    do_sample=True,
    output_scores=True,
    return_dict_in_generate=True,
)
print(f"Maximum num_tokens for inference: {valData.inf_seq_length}")

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Maximum answer num_tokens: 430
Maximum question num_tokens: 289
Maximum sequence num_tokens: 719
Maximum new tokens in generation: 1024


Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Setup Completed dataset:
Dataset({
    features: ['question', 'answer', 'answer_str_digit', 'question_length', 'answer_length', 'question_input_ids', 'question_attention_mask', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1319
})
Maximum num_tokens for inference: 1024


In [8]:
# Build the Lightning wrapper around MODEL_NAME, passing it the sampling
# settings defined in the previous cell.
module = Phi3LightningModule(
    MODEL_NAME, 
    generation_config=generation_config
)

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [9]:
# Weights & Biases logger used by the Trainer below.
wandb_logger = WandbLogger(
    project="phi3-gsm8k-training", 
    log_model="all"
)

# NOTE(review): `pbar` is never passed to the Trainer (the callbacks line is
# commented out). RichProgressBar is imported from `lightning.pytorch`, while the
# Trainer comes from `pytorch_lightning` — confirm the two are compatible before
# wiring it in.
pbar = RichProgressBar()
# Trainer used in this notebook only for `trainer.test(...)`.
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="auto",
    devices=-1,
    logger=wandb_logger,
    #strategy='DDP',
    #callbacks=[RichProgressBar()]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [10]:
# Run the test loop of `module` over the validation dataloader.
# The recorded output below ends in a KeyboardInterrupt, so this run captured no metrics.
trainer.test(
    module,  
    dataloaders=val_dataloader, 
)

You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 4. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.

Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

# Rejection Sampling 

In [None]:
idx = 0 
# NOTE(review): `sorted_data` is not assigned in any visible cell; presumably the
# training set sorted by some length field — confirm and define it before this cell.
instance = sorted_data[idx]

# System + user messages for one question. This cell uses prompt.Template, whereas
# the earlier exploration cell uses prompt.EvalTemplate (whose user template is
# also given an eos_token).
chat = [
    {
        "role": "system",
        "content": prompt.Template.system
    },
    {
        "role": "user",
        "content": prompt.Template.user.format(question=instance['question'])
    }
]


In [None]:
# Render the single `chat` built in the previous cell into prompt text.
# (To sample several questions at once, build one conversation per instance and
# pass the whole list to apply_chat_template instead of `[chat]`.)
# tokenize=False returns strings, so `return_tensors` had no effect and was dropped.
chats = tokenizer.apply_chat_template(
    [chat],
    add_generation_prompt=True,
    tokenize=False,
)
# Report the container type and the number of rendered prompts, then show the first.
print(type(chats), len(chats))
print(chats[0])

In [None]:
#tokenizer.batch_encode_plus(chats, return_tensors='pt', padding='longest')["input_ids"]

In [None]:
# Ad-hoc environment check: display whether transformers reports FlashAttention-2
# as available. NOTE(review): this import sits outside the top import cell.
from transformers.utils import is_flash_attn_2_available 
is_flash_attn_2_available()

In [None]:
# NOTE(review): `model` is not assigned in any visible cell — hidden kernel state.
model.eval()
# NOTE(review): this call passes `chats=` and `num_samples=`, while the earlier call
# to utils.sample_answers passes the prompts positionally plus `num_return_sequences`;
# confirm which signature utils.sample_answers currently has.
samples = utils.sample_answers(
    tokenizer=tokenizer, 
    model=model, 
    chats = chats,
    max_new_tokens=256, 
    temperature=0.5,
    num_samples=10,
    top_p= 0.85,
)

In [None]:
# Number of completions returned by the sampling call.
print(len(samples))

In [None]:
# Concatenate all sampled completions into one report: each sample is followed
# by a newline and a 50-asterisk separator line. A single join avoids repeated
# string re-allocation in a loop.
separator = "*" * 50 + '\n'
rand_samples_base = ''.join(sample + '\n' + separator for sample in samples)

print(rand_samples_base)
# The context manager closes the file on exit, so no explicit close() is needed.
# An explicit encoding keeps non-ASCII model output from depending on the
# platform default.
with open("long_hop.txt", 'w', encoding="utf-8") as f:
    f.write(rand_samples_base)