In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from kegg_utils import load_test_model_and_tokenizer, load_kegg_dataset, parse_llm_responses
import torch
from tqdm import tqdm
import re
import json

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-27 17:10:01 [__init__.py:239] Automatically detected platform cuda.


In [None]:
model, tokenizer = load_test_model_and_tokenizer(adapter_path='/home/ubuntu/code/grpo_difficulty/kegg/models/kegg_correctness/lora',
                                            max_seq_length=4000)

==((====))==  Unsloth 2025.6.1: Fast Llama patching. Transformers: 4.52.4. vLLM: 0.8.2.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Meta-Llama-3.1-8B-Instruct with actual GPU utilization = 88.82%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 22.07 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 4000. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 4.57 GB. Also swap space = 0 GB.
INFO 06-27 17:10:14 [config.py:585] This model supports multiple tasks: {'score', 'classify', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 06-27 17:10:14 [arg_utils.py:1865] LORA is e

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


In [None]:
ds = load_kegg_dataset(split='test')
len(ds)

In [4]:
def get_responses(tokenizer, model,prompts, num_times_to_repeat: int = 1):
    generation_kwargs = {
        "max_new_tokens": 600,
        "use_cache": True,
        "temperature": 0.9,
        "top_k": None,
        "do_sample": True,
    }

     # Create all formatted prompts at once
    all_formatted_prompts = []
    for prompt in prompts:
        formatted_prompt = tokenizer.apply_chat_template(
            [{'role': 'user', 'content': prompt}],
            tokenize=False, add_generation_prompt=True)
        all_formatted_prompts.extend([formatted_prompt] * num_times_to_repeat)

    # Tokenize in larger batches
    inputs = tokenizer(all_formatted_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")
    
    with torch.no_grad():  # Disable gradient computation for inference
        outputs = model.generate(**inputs, **generation_kwargs)
    
    outputs = outputs[:, inputs.input_ids.shape[1]:]
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    # Reshape outputs back to per-prompt format
    result = []
    for i in range(len(prompts)):
        start_idx = i * num_times_to_repeat
        end_idx = start_idx + num_times_to_repeat
        output = decoded_outputs[start_idx:end_idx]
        if num_times_to_repeat == 1:
            output = output[0]
        result.append(output)
    return result

In [5]:
batch_size = 4
all_outputs = []
for i in tqdm(range(0, len(ds), batch_size), desc="Processing batches"):

    batch_end = min(i + batch_size, len(ds))
    batch = ds[i:batch_end]
    
    
    if batch_size == 1 or batch_end - i == 1:
        qs = [batch['text']]
        answers = [batch['answer']]
    else:
        qs = batch['text']
        answers = batch['answer']
    batch_outputs = get_responses(tokenizer, model, qs, num_times_to_repeat=1)
    all_outputs.extend(batch_outputs)

Processing batches: 100%|██████████| 37/37 [21:26<00:00, 34.77s/it]


In [25]:
answers = [ds[i]['answer'] for i in range(len(all_outputs))]
obj_to_dump = {'responses' : all_outputs, 'answer' : answers}
with open('outputs/grpo_correctness_test_1000.json', 'w') as f:
    json.dump(obj_to_dump, f, )

In [35]:
predictions = [parse_llm_responses(x) for x in all_outputs]
len(predictions), len(answers)

(146, 146)

In [40]:
ct_exact, ct_in, ct_none, ct_wrong = 0,0,0, 0
for pred, ans in zip(predictions, answers):
    if pred is None:
        ct_none += 1
    else:
        if pred == ans:
            ct_exact += 1
        elif ans in pred:
            ct_in += 1
        else:
            ct_wrong += 1
print(f'Exact: {ct_exact}, Has Answer: {ct_in}, Error Process: {ct_none}, Wrong: {ct_wrong}, Total: {len(predictions)}')

Exact: 49, Has Answer: 17, Error Process: 22, Wrong: 58, Total: 146


In [43]:
model.to('cpu')
del model
del tokenizer
torch.cuda.empty_cache()
model, tokenizer = load_test_model_and_tokenizer(adapter_path='/home/ubuntu/code/grpo_difficulty/kegg/runs/checkpoint-400',
                                            max_seq_length=4000)

all_outputs = []
for i in tqdm(range(0, len(ds), batch_size), desc="Processing batches"):

    batch_end = min(i + batch_size, len(ds))
    batch = ds[i:batch_end]
    
    
    if batch_size == 1 or batch_end - i == 1:
        qs = [batch['text']]
        answers = [batch['answer']]
    else:
        qs = batch['text']
        answers = batch['answer']
    batch_outputs = get_responses(tokenizer, model, qs, num_times_to_repeat=1)
    all_outputs.extend(batch_outputs)

answers = [ds[i]['answer'] for i in range(len(all_outputs))]
obj_to_dump = {'responses' : all_outputs, 'answer' : answers}
with open('outputs/grpo_correctness_test_400.json', 'w') as f:
    json.dump(obj_to_dump, f, )

predictions = [parse_llm_responses(x) for x in all_outputs]
len(predictions), len(answers)

ct_exact, ct_in, ct_none, ct_wrong = 0,0,0, 0
for pred, ans in zip(predictions, answers):
    if pred is None:
        ct_none += 1
    else:
        if pred == ans:
            ct_exact += 1
        elif ans in pred:
            ct_in += 1
        else:
            ct_wrong += 1
print(f'Exact: {ct_exact}, Has Answer: {ct_in}, Error Process: {ct_none}, Wrong: {ct_wrong}, Total: {len(predictions)}')

0.4520547945205479