In [1]:
import torch
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

wandb_logger = WandbLogger(project="RRHF")

ModuleNotFoundError: No module named 'pytorch_lightning'

In [3]:
# Set the number of categories
num_categories = 5

# Load the base model configuration
config = AutoConfig.from_pretrained("cjvt/sloberta-si-nli")
config.num_labels = num_categories

tokenizer = AutoTokenizer.from_pretrained("cjvt/sloberta-si-nli")
model = AutoModelForSequenceClassification.from_pretrained("cjvt/sloberta-si-nli", config=config, ignore_mismatched_sizes=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/800k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cjvt/sloberta-si-nli and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
data = pd.read_csv('/kaggle/input/oasst1-sl/context_reply_pairs.csv', sep=';')

In [18]:
data.head()

Unnamed: 0,prompt,reply,rank,split
0,UPORABNIK: V mojo hišo bodo prispeli 3 prijate...,ASISTENT: Kaj pa igranje lažnivca? Za to igro ...,0,train
1,UPORABNIK: V mojo hišo bodo prispeli 3 prijate...,ASISTENT: Uno: Je klasična igra za otroke in o...,1,train
2,UPORABNIK: V mojo hišo bodo prispeli 3 prijate...,ASISTENT: Igrate lahko:\r\n - 8 noro\r\n - cel...,2,train
3,UPORABNIK: Napišite hvalnico šolskemu taboru v...,ASISTENT: Spomnim se vas prijatelji\r\nIn spet...,0,train
4,UPORABNIK: Ali kaj bolj veselega?,ASISTENT: ♪ Spakirajte in se pridružite zabavi...,0,train


In [19]:
data.loc[data['rank'] > 4, 'rank'] = 4
data['combined'] = "UPORABNIK: " + data['prompt'] + " ASISTENT: " + data['reply']

In [20]:
train_data = data[data.split == 'train']
val_data   = data[data.split == 'val']

train_texts = train_data['combined'].tolist()
train_labels = train_data['rank'].tolist()

val_texts = val_data['combined'].tolist()
val_labels = val_data['rank'].tolist()

In [21]:
# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert labels to numerical format
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

In [22]:
# Create a custom dataset class
class ReplyClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = ReplyClassificationDataset(train_encodings, train_labels)
val_dataset = ReplyClassificationDataset(val_encodings, val_labels)

In [10]:
class ReplyClassificationModel(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = self.forward(input_ids=input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.cross_entropy(outputs.logits, labels)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = self.forward(input_ids=input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.cross_entropy(outputs.logits, labels)

        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)



    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-5)

In [11]:
# Initialize the PyTorch Lightning model
pl_model = ReplyClassificationModel(model)

checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath="/kaggle/working",
    filename='best-model',
    save_top_k=2,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

# Set up the PyTorch Lightning trainer
trainer = pl.Trainer(accelerator="auto", 
                     devices=2, 
                     max_epochs=2,
                     callbacks = checkpoint,
                     logger=wandb_logger)

In [None]:
# Train the model
trainer.fit(pl_model, DataLoader(train_dataset, batch_size=8, shuffle=True), DataLoader(val_dataset, batch_size=16))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



In [23]:
k = 422
train_data.iloc[k].combined

'UPORABNIK: UPORABNIK: Mi lahko poveste, katera odločitev vrhovnega sodišča je dala ljudem v Združenih državah individualno pravico do nošenja strelnega orožja? Ali lahko povzamete primer in tudi odločitev? ASISTENT: ASISTENT: Odločitev vrhovnega sodišča, ki je ljudem v Združenih državah dala individualno pravico do nošenja strelnega orožja, je bila District of Columbia proti Hellerju (2008). Primer je vložil Dick Heller, prebivalec okrožja Columbia, ki je izpodbijal prepoved posedovanja pištol v okrožju. Vrhovno sodišče je razsodilo, da drugi amandma ščiti posameznikovo pravico do posedovanja strelnega orožja, ki ni povezano s službo v milici, in da okrožna prepoved posedovanja pištole krši to pravico.'

In [None]:
model_name = 'sloberta-si-rrhf-context'
pl_model.model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

In [25]:
tokenizer = AutoTokenizer.from_pretrained("vh-student/sloberta-si-rrhf")
model = AutoModelForSequenceClassification.from_pretrained("vh-student/sloberta-si-rrhf")

# Define your custom reply
custom_reply = '''UPORABNIK: Kako se imenujejo očetje domovine Dominikanske republike? ASISTENT: V Dominikanski republiki se za očete države štejejo Juan Pablo Duarte, Francisco del Rosario Sánchez in Matías Ramón Mella. Ti trije možje so vodili boj za neodvisnost Dominikanske republike od Haitija in februarja 1844 ustanovili republiko kot neodvisno državo.\n\nJuan Pablo Duarte velja za ustanovitelja Dominikanske republike in je v državi zelo cenjen narodni heroj. Francisco del Rosario Sánchez in Matías Ramón Mella prav tako veljata za pomembna narodna junaka zaradi njune vloge v boju za neodvisnost.\n\nTi trije možje so sestavni del zgodovine in kulture Dominikanske republike ter se jih spominjajo in častijo zaradi njihove žrtve in odločnosti v boju za neodvisnost in svobodo svoje države.'''
# Tokenize the custom reply
inputs = tokenizer.encode_plus(custom_reply, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted label
predicted_label = torch.argmax(outputs.logits).item()

# Print the predicted label
print("Predicted label:", predicted_label)
probas = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
s  = probas[0]^1 + probas[1]^2 + probas[2]^3 + probas[3]^4 + probas[4]^5
print("Predictes probability: " + s)

Predicted label: 1
Probabilities: tensor([[0.2470, 0.4846, 0.2554, 0.0101, 0.0028]])


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import *

In [2]:
split = "test"
data_path_pairs = '../../data/results/prompt_reply_pairs_5_generated_test_t5-sl-small.csv'
num_return_sequences = 5

In [3]:
data = pd.read_csv(data_path_pairs, sep=";")
eval_data = Dataset.from_pandas(data[["prompt", "generated"]])

In [4]:
tokenizer = AutoTokenizer.from_pretrained("vh-student/sloberta-si-rrhf")
model = AutoModelForSequenceClassification.from_pretrained("vh-student/sloberta-si-rrhf")

In [5]:
def convert_to_features(examples):
    prefix_in = "Uporabnik: "
    examples["prompt"] = [prefix_in + prompt for prompt in examples["prompt"]]
    prefix_out = "Asistent: "
    examples["generated"] = [prefix_out + reply for reply in examples["generated"]]
    
    examples["PROMPT"] = [prompt + " " + reply for prompt, reply in zip(examples["prompt"], examples["generated"])]
    model_inputs = tokenizer(examples['PROMPT'], pad_to_max_length=True, max_length=512, truncation=True, return_tensors='pt')

    return model_inputs

In [8]:
eval_data = eval_data.map(convert_to_features, batched=True, load_from_cache_file=False)
eval_data.set_format(type="torch", columns=["PROMPT", "input_ids", "attention_mask"])
display(eval_data[0])

Dataset({
    features: ['prompt', 'generated', 'PROMPT', 'input_ids', 'attention_mask'],
    num_rows: 24375
})

Map:   0%|          | 0/24375 [00:00<?, ? examples/s]

{'PROMPT': 'Uporabnik: Vzemimo naključno matriko $n\\krat n$ z realnimi elementi, od katerih je vsak element izbran neodvisno od drugih iz standardne normalne porazdelitve. Kako oceniti verjetnost, da imajo vse lastne vrednosti te matrike pozitiven realni del? Kakšno je asimptotično obnašanje te verjetnosti, ko $n$ teži k neskončnosti? Asistent: ݄',
 'input_ids': tensor([    5, 23231, 31825, 23231, 31825, 27679, 17440, 16551,  9235,    20,
         11791, 31778, 31895,   377,  1512, 31890,    19,   131, 12103,  6448,
         31791,    87,  1113,    38,  1154,  6724, 10565, 11913,    87,   989,
            76, 22063, 21425, 19932, 31795,  2406, 12829,  7891, 31791,    67,
           777,   197,  4777,  1417,    71, 27617,  9510, 21932,   691, 31846,
         20026,    38,  7347,  9041,  6513, 10944,    71, 11351, 31791,    68,
         11791, 31778, 31890, 12367,    30, 10681,  2123, 31846,   180, 10482,
          2202, 31825,   180, 10482,  2202, 31825, 31773,     3,     6,     1,
   

In [7]:
# split so it fits on a 12gb gpu - couldn't find a better way (model.generate() options that would do this automatically...)
# depends on the model
step = 5 # makeshift batch size - idk how to do this better
indices = np.concatenate([np.arange(0, eval_data.num_rows, step), [eval_data.num_rows]])

outputs = []
for i in tqdm(range(len(indices[:-1]))):
    with torch.no_grad():
        outputs_proba = model(eval_data[indices[i]:indices[i+1]]["PROMPT"])
    probas = torch.nn.functional.softmax(outputs_proba.logits, dim=1).detach().numpy()[0]
    print(probas)
    outputs.append(s)

  0%|          | 0/4875 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'size'