In [1]:
!pip install datasets peft



In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "google-t5/t5-large"

In [4]:
from datasets import load_dataset

dataset = load_dataset("lenatr99/Slovene_SuperGLUE_CB")

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)



In [6]:
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    trust_remote_code=True
)
model.config.use_cache = False

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
CONTEXT_COL = "premise"
HYPOTHESIS_COL = "hypothesis"

def preprocess_function(examples):
    """
    The preprocessing function prepares examples for processing by the model.
    It concatenates premise and hypothesis for each example to form a single input string.
    """
    inputs = [f"{premise} {hypothesis}" for premise, hypothesis in zip(examples[CONTEXT_COL], examples[HYPOTHESIS_COL])]
    tokenized_examples = tokenizer(inputs, truncation=True)
    if "label" in examples:
        tokenized_examples["labels"] = [label2id[label] for label in examples["label"]]
    return tokenized_examples

In [8]:
tokenized_dataset = dataset.map(preprocess_function,
                                remove_columns=['idx', 'premise', 'hypothesis', 'label'], batched=True)

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 110
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 22
    })
})

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
import numpy as np

from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    label_ids = pred.label_ids
    # Check if predictions is a tuple and take the first element if it is
    preds = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = preds.argmax(-1)
    f1 = f1_score(label_ids, preds, average="weighted")
    acc = accuracy_score(label_ids, preds)
    return {
        "accuracy": acc,
        "f1": f1,
    }


In [12]:
from transformers import set_seed

set_seed(42)

In [13]:
from transformers import TrainingArguments

new_model_name = "fine_tuned_cb_t5"

training_args = TrainingArguments(
    output_dir=new_model_name,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy='steps',
    max_steps=400,
    use_cpu=False,
    load_best_model_at_end=True
)

In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
max_steps is given, it will override any value given in num_train_epochs


In [15]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
50,0.7918,1.480896,0.318182,0.153605
100,0.6056,1.56711,0.318182,0.153605
150,0.496,1.60202,0.318182,0.153605
200,0.3563,1.862158,0.272727,0.195105
250,0.254,2.177458,0.318182,0.263636
300,0.1867,2.200944,0.363636,0.320158
350,0.1237,2.344317,0.363636,0.320158
400,0.1154,2.401106,0.363636,0.320158


TrainOutput(global_step=400, training_loss=0.3661857092380524, metrics={'train_runtime': 20379.8831, 'train_samples_per_second': 0.157, 'train_steps_per_second': 0.02, 'total_flos': 3431067706525200.0, 'train_loss': 0.3661857092380524, 'epoch': 28.571428571428573})

In [16]:
trainer.evaluate()

{'eval_loss': 2.4011058807373047,
 'eval_accuracy': 0.36363636363636365,
 'eval_f1': 0.3201581027667984,
 'eval_runtime': 62.2217,
 'eval_samples_per_second': 0.354,
 'eval_steps_per_second': 0.048,
 'epoch': 28.571428571428573}

In [17]:
trainer.push_to_hub(new_model_name)

events.out.tfevents.1716151661.5ae1dec5b007.18690.0:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

events.out.tfevents.1716151951.5ae1dec5b007.20008.0:   0%|          | 0.00/6.26k [00:00<?, ?B/s]

events.out.tfevents.1716147574.5ae1dec5b007.1926.0:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

events.out.tfevents.1716152300.5ae1dec5b007.21594.0:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Upload 8 LFS files:   0%|          | 0/8 [00:00<?, ?it/s]

events.out.tfevents.1716172742.5ae1dec5b007.21594.1:   0%|          | 0.00/457 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lenatr99/fine_tuned_cb_t5/commit/f52391ad2ae9b95ee73ef672fc92ef2e7b94b2c9', commit_message='fine_tuned_cb_t5', commit_description='', oid='f52391ad2ae9b95ee73ef672fc92ef2e7b94b2c9', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Example
hypothesis ="Valence je pomagal"
premise = "Valence praznoglavi, Valence krepostni kreten. Zakaj si ga tip raje ni zataknil v ustrezen del lastne titanske anatomije? Je morda mislil, da mi pomaga?"

In [None]:
# We need to set the seed, otherwise some weights of the model are initialized differently every time, and consequently the result can be different each time as well
set_seed(42)
import torch

adapter_name = "lenatr99/" + new_model_name

tokenizer = AutoTokenizer.from_pretrained(adapter_name)
inputs = tokenizer(f"{premise} {hypothesis}", return_tensors="pt")
label = torch.tensor([1]).unsqueeze(0)  # Batch size 1

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    trust_remote_code=True
)
outputs = model(**inputs, labels=label)
logits = outputs.logits

In [None]:
# Print prediction
logits.argmax().item()