In [None]:
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import *

In [None]:
# Run configuration.
# `split` names the data split being processed.
split = 'test'
# Semicolon-separated CSV holding the prompt / generated-reply pairs to rank.
data_path_pairs = "../../data/results/prompt_reply_pairs_5_generated_test_t5-sl-small.csv"
# Number of consecutive rows that form one group of candidate replies.
step = 5

In [None]:
# Load the semicolon-separated table and keep only the two text columns that
# are needed for scoring, wrapped as a `datasets.Dataset`.
pair_columns = ["prompt", "generated"]
data = pd.read_csv(data_path_pairs, sep=";")
eval_data = Dataset.from_pandas(data[pair_columns])

In [None]:
# Tokenizer and sequence-classification model are loaded from the same checkpoint.
checkpoint = "vh-student/sloberta-si-rrhf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
def convert_to_features(examples):
    """Build tokenized classifier inputs for a batch of prompt/reply pairs.

    Every prompt is prefixed with "UPORABNIK: " and every reply with
    "ASISTENT: "; the two are joined with a single space into one text, which
    is tokenized with padding and truncation to 512 tokens.

    Args:
        examples: Batch mapping with "prompt" and "generated" entries, each a
            list of strings. A missing value (``None``) is treated as an
            empty string instead of raising on concatenation.

    Returns:
        The tokenizer output, extended with the prefixed "prompt" and
        "generated" lists and the combined "PROMPT" text, so the text columns
        are part of the returned mapping rather than only being written into
        the input batch.
    """
    prefix_in = "UPORABNIK: "
    prefix_out = "ASISTENT: "

    prompts = [prefix_in + (prompt or "") for prompt in examples["prompt"]]
    replies = [prefix_out + (reply or "") for reply in examples["generated"]]
    combined = [prompt + " " + reply for prompt, reply in zip(prompts, replies)]

    # padding="max_length" is the supported replacement for the deprecated
    # pad_to_max_length=True; every sequence is padded/truncated to 512 tokens.
    model_inputs = tokenizer(combined, padding="max_length", max_length=512, truncation=True, return_tensors='pt')

    # Return the text columns explicitly alongside the tensors.
    model_inputs["prompt"] = prompts
    model_inputs["generated"] = replies
    model_inputs["PROMPT"] = combined

    return model_inputs

In [None]:
# Tokenize the whole dataset batch-wise, then expose the text columns and the
# model inputs through the torch format.
# NOTE(review): this cell rebinds `eval_data` to its own mapped result, so running
# it twice would apply the prefixes twice — re-run the loading cell first.
formatted_columns = ["prompt", "generated", "PROMPT", "input_ids", "attention_mask"]
eval_data = eval_data.map(
    convert_to_features,
    batched=True,
    load_from_cache_file=False,
)
eval_data.set_format(type="torch", columns=formatted_columns)

In [None]:
# Row offsets at which each group of `step` candidate replies starts, followed by
# the total row count as a closing sentinel: consecutive entries delimit one group.
indices = np.concatenate([np.arange(0, eval_data.num_rows, step), [eval_data.num_rows]])

outputs = []
for i in tqdm(range(len(indices) - 1)):
    group_start, group_end = int(indices[i]), int(indices[i + 1])
    scores = []
    # Walk only the rows that exist in this group; the last group is shorter than
    # `step` when the row count is not a multiple of `step`.
    for row in range(group_start, group_end):
        example = eval_data[row]  # fetch the row once instead of once per field
        with torch.no_grad():
            # unsqueeze(dim=0) adds the batch axis in front, so the model sees one
            # sequence of tokens (dim=1 would produce a batch of one-token sequences).
            outputs_proba = model(input_ids=example["input_ids"].unsqueeze(dim=0),
                                  attention_mask=example["attention_mask"].unsqueeze(dim=0))
        probas = torch.nn.functional.softmax(outputs_proba.logits, dim=1).detach().numpy()[0]
        # Score combines the first five class probabilities, class k raised to the
        # power k + 1.
        # NOTE(review): assumes the classifier has at least five output classes, and
        # raising probabilities (< 1) to higher powers shrinks them — confirm this
        # weighting is intended rather than e.g. probas[k] * (k + 1).
        p = probas[0] ** 1 + probas[1] ** 2 + probas[2] ** 3 + probas[3] ** 4 + probas[4] ** 5
        scores.append(p)
    P = np.argmax(np.array(scores))
    # The winner's row is the group's first row plus its position inside the group
    # (not the group counter `i` plus that position).
    best = eval_data[group_start + int(P)]
    outputs.append([best["prompt"], best["generated"]])

In [None]:
# Collect the selected prompt/reply pairs and write them next to the input file
# as "<input name without extension>_best.csv".
outputs = pd.DataFrame(outputs, columns=["prompt", "generated"])
# os.path.splitext strips only the final extension. Splitting the path on "." and
# taking the first piece yields an empty string for paths that begin with "../",
# which would write "_best.csv" into the current working directory.
base_path, _ = os.path.splitext(data_path_pairs)
outputs.to_csv(f"{base_path}_best.csv", sep=";", index=False)