In [46]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [47]:
# Run configuration: which generated-reply file to rank and how many candidate
# replies were generated per prompt.
split = "test"
data_path_pairs = '../../data/results/prompt_reply_pairs_5_generated_test_t5-sl-small.csv'
# Number of consecutive rows in the CSV that belong to the same prompt.
step = 5

# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at `model.to(DEVICE)`.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [48]:
# Load the ';'-separated prompt/reply pairs.
data = pd.read_csv(data_path_pairs, sep=";")
# pandas parses empty CSV fields as NaN, which would reach the feature
# conversion step as a null and break the string concatenation there.
# Normalise both text columns to plain strings before building the Dataset.
eval_data = Dataset.from_pandas(data[["prompt", "generated"]].fillna("").astype(str))

In [49]:
# Tokenizer and sequence-classification model are loaded from the same checkpoint.
checkpoint = "vh-student/sloberta-si-rrhf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Switch to inference mode (dropout off). As the last expression of the cell
# this also displays the module tree.
model.eval()

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

In [50]:
def convert_to_features(examples):
    """Build and tokenize the dialogue text for a batch of prompt/reply pairs.

    Each prompt is prefixed with "UPORABNIK: " and each generated reply with
    "ASISTENT: "; the two are joined with a single space into a new "PROMPT"
    column, which is then tokenized to a fixed length of 512 tokens.

    Args:
        examples: A batch mapping with "prompt" and "generated" columns, each
            a list of strings (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the joined texts, padded/truncated to 512.

    Note:
        ``examples`` is modified in place ("prompt" and "generated" gain their
        prefixes and "PROMPT" is added). Later cells read those prefixed
        columns from the mapped dataset, so the in-place update is relied upon.
    """
    prefix_in = "UPORABNIK: "
    prefix_out = "ASISTENT: "

    examples["prompt"] = [prefix_in + prompt for prompt in examples["prompt"]]
    examples["generated"] = [prefix_out + reply for reply in examples["generated"]]
    examples["PROMPT"] = [prompt + " " + reply for prompt, reply in zip(examples["prompt"], examples["generated"])]

    # `padding="max_length"` is the supported spelling of the deprecated
    # `pad_to_max_length=True`; every sequence comes out exactly 512 tokens long.
    model_inputs = tokenizer(examples['PROMPT'], padding="max_length", max_length=512, truncation=True, return_tensors='pt')

    return model_inputs

In [51]:
# Tokenize every prompt/reply pair. The on-disk cache is bypassed so the
# features are always rebuilt.
# NOTE(review): this cell rebinds `eval_data`; running it twice on the same
# kernel would prepend the speaker prefixes a second time.
eval_data = eval_data.map(
    convert_to_features,
    batched=True,
    load_from_cache_file=False,
)
# Expose the text columns and the tokenizer outputs through the torch formatter.
eval_data.set_format(
    type="torch",
    columns=["prompt", "generated", "PROMPT", "input_ids", "attention_mask"],
)

                                                                   

In [52]:
# Row offsets delimiting each group of `step` consecutive candidate replies;
# the final entry (`num_rows`) closes the last group, which may be shorter
# than `step` when the row count is not a multiple of it.
indices = np.concatenate([np.arange(0, eval_data.num_rows, step), [eval_data.num_rows]])

outputs = []
model = model.to(DEVICE)
with torch.no_grad():
    for i in tqdm(range(len(indices[:-1]))):
        start, end = int(indices[i]), int(indices[i + 1])
        # Fetch each candidate row once and reuse it for scoring and output.
        rows = [eval_data[k] for k in range(start, end)]

        # Stack candidates along dim 0 so the model sees a batch of shape
        # (n_candidates, seq_len). Adding the new axis at dim 1 instead would
        # feed every token as its own one-token sequence.
        input_ids = torch.stack([row["input_ids"] for row in rows]).to(DEVICE)
        attention_mask = torch.stack([row["attention_mask"] for row in rows]).to(DEVICE)
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probas = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()

        # Score = probability-weighted class value, with class k weighted by
        # k + 1 (weights 1..5 for the five classes).
        # NOTE(review): assumes the classes are ordered from lowest to highest
        # rating - confirm against the model's label mapping.
        weights = np.arange(1, probas.shape[1] + 1)
        scores = probas @ weights

        # Offset of the best candidate inside this group; the row is taken
        # from the group itself, i.e. relative to `start`, not to `i`.
        P = int(np.argmax(scores))
        best = rows[P]
        outputs.append([best["prompt"], best["generated"].replace("ASISTENT: ", "")])

100%|██████████| 4875/4875 [05:28<00:00, 14.85it/s]


In [53]:
# Collect the selected (prompt, reply) pairs into a table.
outputs = pd.DataFrame(outputs, columns=["prompt", "generated"])
# Derive "<input path without extension>_best.csv". `os.path.splitext` strips
# only the final extension; splitting the whole path on "." would stop at the
# leading "../" and yield an empty prefix.
best_path = f"{os.path.splitext(data_path_pairs)[0]}_best.csv"
outputs.to_csv(best_path, sep=";", index=False)