In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from datasets import load_dataset

from peft import AutoPeftModelForCausalLM

def load_io_dataset(split, uf=40):
    """Load one TTE input/output split from JSON and add a formatted ``text`` field.

    Args:
        split: Split name interpolated into the file name
            (the notebook calls this with "train", "test" and "eval").
        uf: Value interpolated into the ``TTE_uf{uf}`` directory name.

    Returns:
        The dataset read from ``TTE_uf{uf}/TTE_with_IO_{split}.json`` with an
        extra ``text`` field built from each example's ``input`` and ``output``.
    """
    # Build the path once; the original duplicated the f-string and also
    # filled a list that was never used.
    data_file = f"TTE_uf{uf}/TTE_with_IO_{split}.json"
    dataset = load_dataset("json", data_files=data_file)

    def format_example(example):
        """Join prompt and response into a single training-style string."""
        return {
            "text": f"Prompt: {example['input']}\nResponse: {example['output']}\n"
        }

    # The loaded file is accessed through the "train" key regardless of `split`.
    return dataset["train"].map(format_example)

# Load the three splits through the same helper (default uf=40).
train_dataset = load_io_dataset("train")
test_dataset = load_io_dataset("test")
eval_dataset = load_io_dataset("eval")
# Print each dataset summary (features and row count) as a quick sanity check.
# NOTE(review): the recorded output shows 4430 rows for every split — confirm
# that the three JSON files really hold distinct data.
print(train_dataset)
print(test_dataset)
print(eval_dataset)



# Alternative model locations tried earlier (kept for reference):
#model_name = "./fine_tuned_qwen3-4b/checkpoint-277"
#model_name = "./fine_tuned_adapters_qwen3-4b"
# Path handed to AutoPeftModelForCausalLM below.
model_name = "./LA-Framework/results/quantized_model/FT_Qwen3-4B"

# load the tokenizer and the model

# The model is placed on the CUDA device; the dtype override is left disabled.
inference_model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    #torch_dtype=torch.bfloat16
)

# NOTE: `model_name` is reassigned here, so from this point on it refers to the
# path used for the tokenizer, not to the path the model was loaded from.
model_name = "../Qwen/models--Qwen--Qwen3-4B/snapshots/1cfa9a7208912126459214e8b04321603b3df60c"

# The tokenizer is loaded from the path assigned just above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

#print(tokenizer)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4430 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4430 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4430 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'text'],
    num_rows: 4430
})
Dataset({
    features: ['input', 'output', 'text'],
    num_rows: 4430
})
Dataset({
    features: ['input', 'output', 'text'],
    num_rows: 4430
})


Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]

In [3]:
# Smoke test: generate for a single eval example and print the decoded text.
prompt = eval_dataset[1]['input']
# Tokenize to PyTorch tensors and move them to the GPU.
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
# Generate at most 600 new tokens.
output = inference_model.generate(**inputs, max_new_tokens=600)
# Decode the first (only) returned sequence, dropping special tokens.
print(tokenizer.decode(output[0], skip_special_tokens=True))

Based on the user's watching history and the film's rating, order the 50 candidate films. Order the films where the first one of the list is the most likely to be watched by the user. The output should only contain the ordered list of the recommended films bounded by the  special strings '%% START RECOMMENDED LIST %%' and '%% END LIST %%'. Here follows the user's history and the list of candidate films.
%% START HISTORY %%
Movie name: Payback (1999)	Rating:3
Movie name: Under Siege (1992)	Rating:3
Movie name: Independence Day (ID4) (1996)	Rating:3
Movie name: Double Jeopardy (1999)	Rating:3
Movie name: Maverick (1994)	Rating:4
Movie name: Man in the Iron Mask, The (1998)	Rating:3
Movie name: Die Hard: With a Vengeance (1995)	Rating:3
Movie name: Getaway, The (1994)	Rating:3
Movie name: Demolition Man (1993)	Rating:3
Movie name: Conspiracy Theory (1997)	Rating:3
%% END HISTORY %%
%% START CANDIDATES %%
Movie name: Them! (1954)
Movie name: Penitentiary II (1982)
Movie name: Life and Time

In [39]:
import torch
from tqdm.auto import tqdm   # optional, for nicer progress


def split_list(text, start_token, end_token):
    """Extract the tab-separated rows found between two marker lines.

    The rows are taken from the text that follows the first occurrence of
    ``start_token`` + newline, up to the first newline + ``end_token`` (or to
    the end of that section when the end marker is missing).

    Args:
        text: Full text to search.
        start_token: Marker that must be followed by a newline.
        end_token: Marker that must be preceded by a newline.

    Returns:
        A list with one entry per line, each entry being the list of
        tab-separated fields of that line. Returns an empty list when the
        start marker is not present (the original raised ``IndexError``).
    """
    sections = text.split(start_token + "\n")
    if len(sections) < 2:
        # Start marker never appeared: nothing to extract.
        return []
    body = sections[1].split("\n" + end_token)[0]
    return [line.split("\t") for line in body.split("\n")]


def perform_and_return_comparison(sample, max_attempts=10):
    """Generate a recommended list for one sample and parse it into rows.

    Generation is retried until the decoded text contains the start marker
    followed by a newline, but at most ``max_attempts`` times. The original
    loop was unbounded and would never return if the model kept producing
    output without the marker.

    Args:
        sample: Mapping with an ``'input'`` prompt string.
        max_attempts: Upper bound on generation attempts.

    Returns:
        ``{'input': <prompt>, 'rating': <rows>}`` where ``rows`` is the result
        of ``split_list`` on the decoded text, or an empty list when no
        attempt produced the start marker.
    """
    import gc

    start_marker = "%% START RECOMMENDED LIST %%"
    end_marker = "%% END LIST %%"

    #prompt = sample['text']
    inputs = tokenizer(sample['input'], return_tensors="pt").to("cuda")

    rating = []  # fallback when every attempt lacks the start marker
    for _ in range(max_attempts):
        output = inference_model.generate(**inputs, max_new_tokens=600)
        decoded = tokenizer.decode(output[0], skip_special_tokens=True)
        if start_marker + "\n" in decoded:
            rating = split_list(decoded, start_marker, end_marker)
            break

    # Release Python garbage and cached CUDA blocks before the next sample.
    gc.collect()
    torch.cuda.empty_cache()
    return {'input': sample['input'],
            'rating': rating}

In [None]:
####   Parallel evaluation code:

In [5]:
import torch
from tqdm.auto import tqdm   # optional, for nicer progress

# Parallel LLM generation of evaluation's results

def perform_and_return_comparison_batched(batch):
    """Generate and parse recommended lists for a batch of prompts.

    Args:
        batch: Dict of column lists as passed by a batched ``map``; only
            ``batch['input']`` (list of prompt strings) is read.

    Returns:
        ``{'results': [...]}`` with one ``{'input', 'rating'}`` dict per
        prompt, in input order. ``rating`` is the parsed row list, or an
        empty list when the generation did not contain the start marker
        (previously that case raised ``IndexError`` inside ``split_list``
        and aborted the whole run).
    """
    start_marker = "%% START RECOMMENDED LIST %%"
    end_marker = "%% END LIST %%"

    # batch is a dict with lists: {'input': [...], 'text': [...]}
    inputs = tokenizer(
        batch['input'],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,  # prompts longer than this are truncated; raise it if prompts get cut
        padding_side='left'  # pad on the left so generated tokens follow the prompt directly
    ).to("cuda")

    with torch.inference_mode():           # saves a bit of memory
        outputs = inference_model.generate(
            **inputs,
            max_new_tokens=600
        )

    decoded = tokenizer.batch_decode(
        outputs,
        skip_special_tokens=True
    )

    results = []
    for prompt, rating_full in zip(batch['input'], decoded):
        if start_marker + "\n" in rating_full:
            rating = split_list(rating_full, start_marker, end_marker)
        else:
            # Malformed generation: record an empty rating instead of crashing.
            rating = []
        results.append({
            'input': prompt,
            'rating': rating
        })

    # One dict per input example, returned under a single 'results' column.
    return {'results': results}

# Run the batched generation over the whole eval split.
# This is usually enough parallelism for one GPU
eval_output = eval_dataset.map(
    perform_and_return_comparison_batched,
    batched=True,
    batch_size=8,          # ← tune this: 4–16 most common sweet spot
                           # larger = better GPU util, but risk of OOM
    desc="Evaluating",
    remove_columns=eval_dataset.column_names,  # optional — keep only new fields
)

# Persist the mapped dataset as JSON in the current working directory.
eval_output.to_json("peft_quantized_eval_processed_parallel.json")

Evaluating:   0%|          | 0/4430 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

19514488