In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd "drive/MyDrive/nlp"

In [None]:
!pip install -q -U trl transformers accelerate peft datasets bitsandbytes evaluate git+https://github.com/huggingface/huggingface_hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "EMBEDDIA/crosloengual-bert"

In [2]:
from datasets import load_dataset

dataset = load_dataset("lenatr99/Slovene_SuperGLUE_CB")

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)



In [4]:
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    trust_remote_code=True
)
model.config.use_cache = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/crosloengual-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
CONTEXT_COL = "premise"
HYPOTHESIS_COL = "hypothesis"

def preprocess_function(examples):
    """
    The preprocessing function prepares examples for processing by the model.
    It concatenates premise and hypothesis for each example to form a single input string.
    """
    inputs = [f"{premise} {hypothesis}" for premise, hypothesis in zip(examples[CONTEXT_COL], examples[HYPOTHESIS_COL])]
    tokenized_examples = tokenizer(inputs, truncation=True)
    if "label" in examples:
        tokenized_examples["labels"] = [label2id[label] for label in examples["label"]]
    return tokenized_examples

In [6]:
tokenized_dataset = dataset.map(preprocess_function,
                                remove_columns=['idx', 'premise', 'hypothesis', 'label'], batched=True)

In [7]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 110
    })
    eval: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 22
    })
})

In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [10]:
from transformers import set_seed

set_seed(42)


In [11]:
from peft import  get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit

NUM_VIRTUAL_TOKENS = 12

peft_config = PromptTuningConfig(
    peft_type="PROMPT_TUNING",
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=NUM_VIRTUAL_TOKENS,
    num_layers=6,
    token_dim=768,
    num_attention_heads=12,
    tokenizer_name_or_path=model_name #The pre-trained model
)

In [12]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 11,523 || all params: 124,148,742 || trainable%: 0.0093


In [13]:
from transformers import TrainingArguments

new_model_name = "prompt_fine_tuned_CB_croslo"

training_args = TrainingArguments(
    output_dir=new_model_name,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy='steps',
    max_steps=400,
    use_cpu=False,
    load_best_model_at_end=True
)

In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

max_steps is given, it will override any value given in num_train_epochs


In [15]:
trainer.train()

  0%|          | 0/400 [00:00<?, ?it/s]

{'loss': 1.0278, 'grad_norm': 13.051501274108887, 'learning_rate': 1.7500000000000002e-05, 'epoch': 0.45}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.1157597303390503, 'eval_accuracy': 0.3181818181818182, 'eval_f1': 0.23064935064935063, 'eval_runtime': 2.6355, 'eval_samples_per_second': 8.348, 'eval_steps_per_second': 8.348, 'epoch': 0.45}
{'loss': 0.9865, 'grad_norm': 12.213147163391113, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.91}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.1195473670959473, 'eval_accuracy': 0.36363636363636365, 'eval_f1': 0.24300699300699302, 'eval_runtime': 0.3007, 'eval_samples_per_second': 73.173, 'eval_steps_per_second': 73.173, 'epoch': 0.91}
{'loss': 0.8601, 'grad_norm': 11.454643249511719, 'learning_rate': 1.25e-05, 'epoch': 1.36}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.135717511177063, 'eval_accuracy': 0.3181818181818182, 'eval_f1': 0.1536050156739812, 'eval_runtime': 0.2678, 'eval_samples_per_second': 82.141, 'eval_steps_per_second': 82.141, 'epoch': 1.36}
{'loss': 0.8769, 'grad_norm': 16.31204605102539, 'learning_rate': 1e-05, 'epoch': 1.82}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.1594661474227905, 'eval_accuracy': 0.3181818181818182, 'eval_f1': 0.1536050156739812, 'eval_runtime': 0.3314, 'eval_samples_per_second': 66.376, 'eval_steps_per_second': 66.376, 'epoch': 1.82}
{'loss': 0.9026, 'grad_norm': 9.032601356506348, 'learning_rate': 7.500000000000001e-06, 'epoch': 2.27}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.173268437385559, 'eval_accuracy': 0.3181818181818182, 'eval_f1': 0.1536050156739812, 'eval_runtime': 0.2728, 'eval_samples_per_second': 80.653, 'eval_steps_per_second': 80.653, 'epoch': 2.27}
{'loss': 0.8002, 'grad_norm': 8.974824905395508, 'learning_rate': 5e-06, 'epoch': 2.73}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.1884652376174927, 'eval_accuracy': 0.3181818181818182, 'eval_f1': 0.1536050156739812, 'eval_runtime': 0.2949, 'eval_samples_per_second': 74.614, 'eval_steps_per_second': 74.614, 'epoch': 2.73}
{'loss': 0.8093, 'grad_norm': 12.539688110351562, 'learning_rate': 2.5e-06, 'epoch': 3.18}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.1996424198150635, 'eval_accuracy': 0.3181818181818182, 'eval_f1': 0.1536050156739812, 'eval_runtime': 0.2652, 'eval_samples_per_second': 82.941, 'eval_steps_per_second': 82.941, 'epoch': 3.18}
{'loss': 0.7259, 'grad_norm': 6.747017860412598, 'learning_rate': 0.0, 'epoch': 3.64}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.2045663595199585, 'eval_accuracy': 0.3181818181818182, 'eval_f1': 0.1536050156739812, 'eval_runtime': 0.2709, 'eval_samples_per_second': 81.217, 'eval_steps_per_second': 81.217, 'epoch': 3.64}
{'train_runtime': 31.3157, 'train_samples_per_second': 12.773, 'train_steps_per_second': 12.773, 'train_loss': 0.8736630058288575, 'epoch': 3.64}


TrainOutput(global_step=400, training_loss=0.8736630058288575, metrics={'train_runtime': 31.3157, 'train_samples_per_second': 12.773, 'train_steps_per_second': 12.773, 'total_flos': 15245058155940.0, 'train_loss': 0.8736630058288575, 'epoch': 3.6363636363636362})

In [16]:
trainer.evaluate()

  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.2045663595199585,
 'eval_accuracy': 0.3181818181818182,
 'eval_f1': 0.1536050156739812,
 'eval_runtime': 0.2783,
 'eval_samples_per_second': 79.051,
 'eval_steps_per_second': 79.051,
 'epoch': 3.6363636363636362}

In [17]:
trainer.push_to_hub(new_model_name)



training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lenatr99/prompt_fine_tuned_CB_croslo/commit/ce86c3599c5322ed21334990f8563844360714f0', commit_message='prompt_fine_tuned_CB_croslo', commit_description='', oid='ce86c3599c5322ed21334990f8563844360714f0', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
# Example
hypothesis ="Valence je pomagal"
premise = "Valence praznoglavi, Valence krepostni kreten. Zakaj si ga tip raje ni zataknil v ustrezen del lastne titanske anatomije? Je morda mislil, da mi pomaga?"

In [19]:
# We need to set the seed, otherwise some weights of the model are initialized differently every time, and consequently the result can be different each time as well
# set_seed(42)
import torch

adapter_name = "lenatr99/" + new_model_name

tokenizer = AutoTokenizer.from_pretrained(adapter_name)
inputs = tokenizer(f"{premise} {hypothesis}", return_tensors="pt")
label = torch.tensor([1]).unsqueeze(0)  # Batch size 1

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/329k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    trust_remote_code=True
)
outputs = model(**inputs, labels=label)
logits = outputs.logits

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/crosloengual-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Print prediction
logits.argmax().item()

2