In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
import numpy as np
import datasets

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import sys
sys.path.append('..')
from utils.reasoning import make_segment, split_cot
from torch.nn.utils.rnn import pad_sequence

In [16]:
# Device the model (and every batch fed to it) lives on.
device = 'cuda'
model_name = "HuggingFaceTB/SmolLM2-135M"

# Start from the pretrained base model/tokenizer; fine-tuned weights are overlaid below.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# NOTE(review): absolute, machine-specific path; consider making it configurable.
checkpoint_path = "/home/user33/kashurin/TR_SmolLM2-135M/cot/checkpoint-1500/pytorch_model.bin"
# Deserialize onto the CPU: the model is still on the CPU at this point, so
# restoring tensors onto the device they were saved from would only create a
# throw-away copy there (or fail if that device is not visible).
state_dict = torch.load(checkpoint_path, map_location="cpu")
# NOTE(review): strict=False silently skips missing/unexpected keys, so a
# key-name mismatch would leave the base weights in place without any error.
model.load_state_dict(state_dict, strict=False)

model.to(device)
print(':)')

:)


In [17]:
# Padding id; falls back to the EOS id when the tokenizer defines no pad token.
pad = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
# Single-element lists so they can be concatenated with token-id lists.
bos = [tokenizer.bos_token_id]
eos = [tokenizer.eos_token_id]
# Delimiter ids: `think` opens the chain-of-thought, `ans` opens the final answer.
# Encoded with add_special_tokens=False (as collate_fn does for its fields) so a
# tokenizer that prepends/appends special tokens cannot slip extra ids into them.
think = tokenizer.encode("<issue_start>", add_special_tokens=False)
ans = tokenizer.encode("<issue_closed>", add_special_tokens=False)

In [18]:
def collate_fn(batch):
    input_ids, labels, labels_mask, attention_mask = [], [], [], []
    for sample in batch:
        task, lab, cot = sample['task'], sample['labels'], sample['cot']
        task_tokens = tokenizer.encode(task, add_special_tokens=False)
        labels_tokens = tokenizer.encode(lab, add_special_tokens=False)
        cot_tokens = tokenizer.encode(cot, add_special_tokens=False)

        inp_ids = torch.tensor(task_tokens + think)
        input_ids.append(inp_ids)

        full_input = task_tokens + think + cot_tokens + ans + labels_tokens + eos
        lab = torch.tensor(full_input)
        lab[:inp_ids.shape[0]] = -100
        labels.append(lab)

        lab_mask = torch.ones_like(lab)
        lab_mask[:inp_ids.shape[0]] = 0
        labels_mask.append(lab_mask)
        attention_mask.append(torch.ones_like(inp_ids))

    input_ids = pad_sequence(input_ids, padding_value=pad, batch_first=True, padding_side='left')
    attention_mask = pad_sequence(attention_mask, padding_value=0, batch_first=True, padding_side='left')
    labels = pad_sequence(labels, padding_value=-100, batch_first=True, padding_side='left')
    labels_mask = pad_sequence(labels_mask, padding_value=0, batch_first=True, padding_side='left')

    collated = {'input_ids': input_ids,
                'labels': labels,
                'attention_mask': attention_mask,
                }
    return collated

In [5]:
# Dataset identifier passed to `datasets.load_dataset`; the 'train' and
# 'valid' splits are loaded separately, in that order.
dataset = 'booydar/gsm8k'
train_dataset, valid_dataset = (
    datasets.load_dataset(dataset, split=split_name)
    for split_name in ('train', 'valid')
)

In [6]:
class Holder:
    """Bare attribute container; settings are attached to an instance after creation."""


# Evaluation settings read by later cells.
args = Holder()
# args.use_cot = False
vars(args).update(
    max_new_tokens=200,   # generation budget per sample
    task_name='gsm8k',
)

In [7]:
# tokenizer.padding_side = "left"

In [8]:
# Free-form prompts that tokenize to different lengths; used below to inspect
# how the tokenizer pads a batch.
prompts = [
    "The future of AI is",
    "In a galaxy far far away",
    "To solve this problem we need",
    "Hello"
]

In [12]:
tokenizer.padding_side

'right'

In [9]:
# Use the EOS token as the padding token (the padded encodings displayed in
# this notebook are filled with id 0, which the generation log reports as the EOS id).
tokenizer.pad_token = tokenizer.eos_token

In [86]:
# Encode the prompts by calling the tokenizer directly (`batch_encode_plus` is
# deprecated in favour of `__call__`). No `padding=` is requested, so the
# rows keep their own lengths and `padding_side` has nothing to act on.
tokenizer(prompts, padding_side='right')

{'input_ids': [[504, 1774, 282, 5646, 314], [788, 253, 13247, 1869, 1869, 2025], [2068, 5482, 451, 1732, 392, 737], [19556]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1]]}

In [10]:
# Batch-encode the prompts into padded PyTorch tensors (input_ids + attention_mask).
# NOTE(review): the encoding displayed below is right-padded; batched generation
# with a decoder-only model usually wants left padding — confirm before relying
# on the generations produced from `inputs`.
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

In [11]:
inputs

{'input_ids': tensor([[  504,  1774,   282,  5646,   314,     0],
        [  788,   253, 13247,  1869,  1869,  2025],
        [ 2068,  5482,   451,  1732,   392,   737],
        [19556,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 0, 0, 0, 0, 0]])}

In [72]:
tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=False)

'<|endoftext|><|endoftext|>The future of AI is'

In [None]:
# The tokenizer returned CPU tensors while the model was moved to `device`;
# place the inputs on the model's device so generation does not hit a
# device-mismatch error.
outputs = model.generate(
    input_ids=inputs["input_ids"].to(model.device),
    attention_mask=inputs["attention_mask"].to(model.device),
    max_new_tokens=50
)

In [30]:
def _first_index(tokens, token_id):
    """Return the first position of `token_id` in the list `tokens`, or None if absent."""
    try:
        return tokens.index(token_id)
    except ValueError:
        return None


def _last_index(tokens, token_id):
    """Return the last position of `token_id` in the list `tokens`, or None if absent."""
    for pos in range(len(tokens) - 1, -1, -1):
        if tokens[pos] == token_id:
            return pos
    return None


# Per-sample token lists: full continuation, chain-of-thought part, answer part.
all_preds, all_labels = [], []
all_preds_cot, all_labels_cot = [], []
all_preds_ans, all_labels_ans = [], []

batch = valid_dataset.select(range(4))
collated = collate_fn(batch)
task = {k: v.to(device) for k, v in collated.items()}

# Prompts are left-padded, so every generated continuation starts at this column.
task_length = collated['input_ids'].shape[1]

with torch.no_grad():
    # Pass only the generation inputs: `task` also carries 'labels', which is
    # wider than 'input_ids' and is not an input to generation.
    preds_full = model.generate(
        input_ids=task['input_ids'],
        attention_mask=task['attention_mask'],
        max_new_tokens=args.max_new_tokens,
    )

labels = collated['labels']
for lab_tokens, pred_tokens in zip(labels, preds_full):
    # Drop the -100 positions (prompt + padding) to keep only the reference continuation.
    lab_tokens = lab_tokens[lab_tokens != -100].tolist()
    # Drop the prompt columns to keep only what the model generated.
    pred_tokens = pred_tokens[task_length:].tolist()

    # Reference boundaries: last answer marker and first EOS.
    ans_start_index_l = _last_index(lab_tokens, ans[0])
    ans_end_index_l = _first_index(lab_tokens, eos[0])
    if ans_start_index_l is None or ans_end_index_l is None:
        raise ValueError("reference sequence is missing the answer marker or EOS")

    # Prediction boundaries; when the model never emitted a marker, fall back
    # to the reference positions so the slices below stay well defined.
    ans_start_index_p = _last_index(pred_tokens, ans[0])
    if ans_start_index_p is None:
        ans_start_index_p = ans_start_index_l

    ans_end_index_p = _first_index(pred_tokens, eos[0])
    if ans_end_index_p is None:
        ans_end_index_p = ans_end_index_l

    # Chain-of-thought: everything before the answer marker.
    all_preds_cot.append(pred_tokens[:ans_start_index_p])
    all_labels_cot.append(lab_tokens[:ans_start_index_l])

    # Answer: tokens strictly between the answer marker and EOS.
    all_preds_ans.append(pred_tokens[ans_start_index_p + 1:ans_end_index_p])
    all_labels_ans.append(lab_tokens[ans_start_index_l + 1:ans_end_index_l])

    all_preds.append(pred_tokens)
    all_labels.append(lab_tokens)

# Exact token-level match per sample.
cot_correct = [p == l for p, l in zip(all_preds_cot, all_labels_cot)]
ans_correct = [p == l for p, l in zip(all_preds_ans, all_labels_ans)]

res = {'accuracy_cot': np.mean(cot_correct), 'accuracy_ans': np.mean(ans_correct)}
data = {"all_preds_cot": all_preds_cot,
        "all_labels_cot": all_labels_cot,
        "all_preds_ans": all_preds_ans,
        "all_labels_ans": all_labels_ans,
        "all_preds": all_preds,
        "all_labels": all_labels}

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [31]:
tokenizer.decode(preds_full[1])

'Hannah has three dogs. The first dog eats 1.5 cups of dog food a day. The second dog eats twice as much while the third dog eats 2.5 cups more than the second dog. How many cups of dog food should Hannah prepare in a day for her three dogs?<issue_start><<1.5*2=3>> <<3+3=6>> <<1.5+3+6=10.5>><issue_closed>10.5<|endoftext|><|endoftext|><|endoftext|>'

In [32]:
task_length

62

In [33]:
# `pred_tokens` is the variable left over from the final iteration of the
# evaluation loop, i.e. the generated continuation of the last sample in the batch.
tokenizer.decode(pred_tokens)

'<<21/7=3>> <<3*5=15>><issue_closed>15<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [34]:
res

{'accuracy_cot': np.float64(0.25), 'accuracy_ans': np.float64(0.75)}

In [35]:
tokenizer.decode(data["all_labels_cot"][0])

'<<4-2=2>> <<2/.5=4>> <<12/4=3>> <<100*3=300>>'

In [39]:
tokenizer.decode(data["all_labels_ans"][1])

'10'

In [40]:
tokenizer.decode(data["all_preds_ans"][1])

'10.5'

In [46]:
print("Pred", tokenizer.decode(data["all_preds"][3]))
print("Lab", tokenizer.decode(data["all_labels"][3]))

Pred <<21/7=3>> <<3*5=15>><issue_closed>15<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
Lab <<21/7=3>> <<5*3=15>><issue_closed>15<|endoftext|>


In [26]:
all_preds_ans

[[], [33, 32, 30, 37], [], []]

In [87]:
all_labels_ans

[[35, 32, 32]]

In [60]:
# Recompute the per-sample exact-match flags for the answer part.
# zip stops at the shorter list, so if the two lists differ in length the
# surplus entries are silently ignored.
ans_correct = [p == l for p, l in zip(all_preds_ans, all_labels_ans)]

In [61]:
ans_correct

[False]