In [1]:
!pip install datasets peft

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m327.7/542.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "EMBEDDIA/sloberta"

In [4]:
from datasets import load_dataset

dataset = load_dataset("lenatr99/Slovene_SuperGLUE_RTE")

Downloading readme:   0%|          | 0.00/470 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/232 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/29 [00:00<?, ? examples/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/800k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

In [6]:
id2label = {0: "not_entailment", 1: "entailment"}
label2id = {"not_entailment": 0, "entailment": 1}

In [7]:
CONTEXT_COL = "premise"
HYPOTHESIS_COL = "hypothesis"
LABEL_COL = "label"

def preprocess_function(examples):
    context_inputs = [f"{CONTEXT_COL} : {x} " for x in examples[CONTEXT_COL]]
    hypothesis_inputs = [f"{HYPOTHESIS_COL} : {x} " for x in examples[HYPOTHESIS_COL]]
    inputs = [context_input + hypothesis_input for context_input, hypothesis_input in zip(context_inputs, hypothesis_inputs)]

    tokenized_examples = tokenizer(inputs, truncation=True)
    tokenized_examples[LABEL_COL] = [label2id[label] for label in examples[LABEL_COL]]

    return tokenized_examples

In [8]:
tokenized_dataset = dataset.map(preprocess_function, remove_columns=['idx', 'hypothesis', 'premise'], batched=True)

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

In [9]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    label_ids = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(label_ids, preds, average="weighted")
    acc = accuracy_score(label_ids, preds)
    return {"accuracy": acc, "f1": f1}

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
from transformers import set_seed

set_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
)
model.config.use_cache = False

config.json:   0%|          | 0.00/520 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from peft import LoHaConfig, get_peft_model, TaskType

loha_alpha = 32
loha_rank_dropout = 0.1
loha_module_dropout = 0.0
loha_r = 16

peft_config = LoHaConfig(
    task_type=TaskType.SEQ_CLS,
    r=loha_r,
    alpha=loha_alpha,
    target_modules=['query', 'value'],
    base_model_name_or_path=model_name
)

In [13]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,771,778 || all params: 112,395,268 || trainable%: 1.5764


In [14]:
from transformers import TrainingArguments

new_model_name = "loha_fine_tuned_rte_sloberta"

training_args = TrainingArguments(
    output_dir=new_model_name,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy='steps',
    max_steps=400,
    use_cpu=False,
    load_best_model_at_end=True
)



In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

max_steps is given, it will override any value given in num_train_epochs


In [16]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
50,0.6991,0.686959,0.586207,0.433283
100,0.6913,0.686481,0.586207,0.433283
150,0.6957,0.691286,0.689655,0.668714
200,0.6913,0.693739,0.448276,0.414469
250,0.6909,0.693108,0.448276,0.444335
300,0.6946,0.692856,0.413793,0.413793
350,0.6925,0.692357,0.551724,0.553898
400,0.6891,0.692494,0.517241,0.520723


TrainOutput(global_step=400, training_loss=0.6930549144744873, metrics={'train_runtime': 41.3367, 'train_samples_per_second': 77.413, 'train_steps_per_second': 9.677, 'total_flos': 273845309567424.0, 'train_loss': 0.6930549144744873, 'epoch': 13.793103448275861})

In [17]:
trainer.evaluate()

{'eval_loss': 0.6924943327903748,
 'eval_accuracy': 0.5172413793103449,
 'eval_f1': 0.5207228116710876,
 'eval_runtime': 0.1459,
 'eval_samples_per_second': 198.815,
 'eval_steps_per_second': 27.423,
 'epoch': 13.793103448275861}

In [18]:
trainer.push_to_hub(new_model_name)



adapter_model.safetensors:   0%|          | 0.00/7.10M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/800k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

events.out.tfevents.1716495924.a58e1acdfa49.459.0:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

events.out.tfevents.1716495966.a58e1acdfa49.459.1:   0%|          | 0.00/457 [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/lenatr99/loha_fine_tuned_rte_sloberta/commit/adaee4c35fd0191fbafdf431de0210e3e1277d36', commit_message='loha_fine_tuned_rte_sloberta', commit_description='', oid='adaee4c35fd0191fbafdf431de0210e3e1277d36', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
from transformers import pipeline

set_seed(42)

adapter_name = "lenatr99/" + new_model_name

# Example
text="premise : Potem ko je sklenila zavezništvo z glavno proti-sirijsko koalicijo, so njegovo ženo Strido na nedavnih volitvah izvolili v parlament. hypothesis : Strido so izvolili v parlament."

classifier = pipeline("sentiment-analysis", model=adapter_name)
classifier(text)

adapter_config.json:   0%|          | 0.00/514 [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'label': 'LABEL_1', 'score': 0.5385059118270874}]

In [19]:
#GPU RAM = 1.9 GB