In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# NOTE(review): kegg_utils is kept first on purpose — this cell's output shows
# Unsloth patching at import time, which presumably comes from kegg_utils and
# should happen before other ML libraries are imported. Confirm before reordering.
from kegg_utils import load_model_and_tokenizer, load_kegg_dataset, parse_llm_responses
import torch
from tqdm import tqdm
import re
import json
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-26 22:11:22 [__init__.py:239] Automatically detected platform cuda.


In [None]:
# Load the model/tokenizer pair through the project helper.
# NOTE(review): max_seq_length=4000 presumably caps the context length used by
# the helper — confirm against kegg_utils.load_model_and_tokenizer.
model, tokenizer = load_model_and_tokenizer(max_seq_length=4000)

In [None]:
# Load the 'test' split via the project helper and display its size.
# Later cells index it both by integer (ds[i]['answer']) and by slice
# (ds[i:j]['text']), so it must support both access patterns.
ds = load_kegg_dataset(split='test')
len(ds)

In [4]:
def get_responses(tokenizer, model, prompts, num_times_to_repeat: int = 1):
    """Sample chat completions for a batch of user prompts.

    Each prompt is wrapped as a single-turn ``user`` message with the
    tokenizer's chat template, repeated ``num_times_to_repeat`` times, and the
    whole batch is sent through one ``model.generate`` call.

    Args:
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode`` (Hugging Face tokenizer interface).
        model: Object exposing ``generate``. Inputs are moved to
            ``model.device`` when that attribute exists, otherwise to
            ``"cuda"``.
        prompts: Iterable of prompt strings.
        num_times_to_repeat: Number of independent samples drawn per prompt;
            must be at least 1.

    Returns:
        A list with one entry per prompt, in input order. Each entry is a list
        of ``num_times_to_repeat`` decoded completions with the prompt tokens
        stripped. An empty ``prompts`` yields ``[]``.

    Raises:
        ValueError: If ``num_times_to_repeat`` is smaller than 1.
    """
    if num_times_to_repeat < 1:
        raise ValueError("num_times_to_repeat must be >= 1")

    prompts = list(prompts)
    if not prompts:
        # Nothing to generate; tokenizing an empty batch would fail.
        return []

    generation_kwargs = {
        "max_new_tokens": 600,
        "use_cache": True,
        "temperature": 0.9,
        "top_k": None,
        "do_sample": True,
    }

    # Create all formatted prompts at once; repeats of a prompt are adjacent so
    # the flat output can be regrouped by simple slicing below.
    all_formatted_prompts = []
    for prompt in prompts:
        formatted_prompt = tokenizer.apply_chat_template(
            [{'role': 'user', 'content': prompt}],
            tokenize=False, add_generation_prompt=True)
        all_formatted_prompts.extend([formatted_prompt] * num_times_to_repeat)

    # Follow the model's device instead of assuming CUDA.
    device = getattr(model, "device", "cuda")

    # Batched generation with a decoder-only model needs LEFT padding: with
    # right padding the model would continue from pad tokens. Force it for this
    # call only and restore the tokenizer's own setting afterwards.
    previous_side = getattr(tokenizer, "padding_side", None)
    if previous_side is not None:
        tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(
            all_formatted_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
    finally:
        if previous_side is not None:
            tokenizer.padding_side = previous_side

    with torch.no_grad():  # Disable gradient computation for inference
        outputs = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + completion; keep only the newly generated part.
    prompt_len = inputs["input_ids"].shape[1]
    decoded_outputs = tokenizer.batch_decode(
        outputs[:, prompt_len:], skip_special_tokens=True
    )

    # Reshape the flat list back to one list of samples per prompt.
    return [
        decoded_outputs[start:start + num_times_to_repeat]
        for start in range(0, len(prompts) * num_times_to_repeat, num_times_to_repeat)
    ]

In [5]:
# Run generation over the whole dataset in fixed-size batches, collecting one
# list of sampled completions per example in `all_outputs`.
batch_size = 4
all_outputs = []
for i in tqdm(range(0, len(ds), batch_size), desc="Processing batches"):
    batch_end = min(i + batch_size, len(ds))
    batch = ds[i:batch_end]

    qs = batch['text']
    # A slice is expected to yield a list of prompts even when it holds a
    # single row, so wrapping it unconditionally would nest the list and send
    # a list (not a string) as the prompt. Only wrap when a bare string comes
    # back. NOTE(review): assumes `ds` slices like a Hugging Face Dataset — confirm.
    if isinstance(qs, str):
        qs = [qs]

    batch_outputs = get_responses(tokenizer, model, qs, num_times_to_repeat=1)
    all_outputs.extend(batch_outputs)

Processing batches: 100%|██████████| 37/37 [21:26<00:00, 34.77s/it]


In [24]:
# Collapse each one-element sample list to its single completion string.
# Entries that are already strings pass through unchanged, so re-running this
# cell after `all_outputs` has been flattened is a no-op instead of an
# AssertionError.
new_outputs = []
for a in all_outputs:
    if isinstance(a, list):
        assert len(a) == 1
        a = a[0]
    assert isinstance(a, str)
    new_outputs.append(a)
all_outputs = new_outputs

In [25]:
# Pair every generated response with its reference answer and persist both.
# Answers are taken for the first len(all_outputs) rows so a partial run still
# lines up with its responses.
answers = [ds[i]['answer'] for i in range(len(all_outputs))]
obj_to_dump = {'responses': all_outputs, 'answer': answers}
# Create the target directory first so a long generation run is not lost to a
# FileNotFoundError on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
with open('outputs/kegg_test.json', 'w') as f:
    json.dump(obj_to_dump, f)

In [35]:
# Run the project parser over every raw completion (it may yield None — the
# scoring cell counts those separately), then show both lengths as a sanity
# check that predictions and answers line up.
predictions = list(map(parse_llm_responses, all_outputs))
len(predictions), len(answers)

(146, 146)

In [40]:
# Tally each prediction into exactly one bucket:
#   ct_none  - parser returned None
#   ct_exact - prediction equals the reference answer
#   ct_in    - reference answer is contained in the prediction
#   ct_wrong - none of the above
ct_exact = ct_in = ct_none = ct_wrong = 0
for pred, ans in zip(predictions, answers):
    if pred is None:
        ct_none += 1
        continue
    if pred == ans:
        ct_exact += 1
        continue
    if ans in pred:
        ct_in += 1
    else:
        ct_wrong += 1
print(f'Exact: {ct_exact}, Has Answer: {ct_in}, Error Process: {ct_none}, Wrong: {ct_wrong}, Total: {len(predictions)}')

Exact: 49, Has Answer: 17, Error Process: 22, Wrong: 58, Total: 146


In [43]:
# Lenient accuracy: exact matches plus predictions that contain the answer,
# computed from the live counters so it cannot go stale after a re-run.
(ct_exact + ct_in) / len(predictions)

0.4520547945205479