In [None]:
!pip install accelerate -U
!pip install datasets
!pip install evaluate

In [None]:
import random
import numpy as np
import pandas as pd
from typing import Dict
from datasets import load_dataset, concatenate_datasets, load_metric, load_dataset, Dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)

from sklearn.metrics import accuracy_score, f1_score
import evaluate
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Main corpus: its "train", "validation" and "test" splits are used by the cells below.
dataset = load_dataset("tommasobonomo/sem_augmented_fever_nli")

# Separate adversarial corpus, only used for the final evaluation cell.
adversarial_testset = load_dataset("iperbole/adversarial_fever_nli")

# Textual NLI label -> integer class id fed to the classifier.
label_to_id = dict(ENTAILMENT=0, NEUTRAL=1, CONTRADICTION=2)

In [None]:
############ In case of training the model on both base dataset and adv-generated dataset ############

# from google.colab import drive
# drive.mount('/content/drive')

# adv_dataset = load_dataset('json', data_files='/content/drive/MyDrive/dataset_adv/train_dataset_r.json')

# base_trainset = dataset['train']
# adv_trainset = adv_dataset['train']
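# # Dataset.shuffle returns a new dataset (it does not shuffle in place), so keep the result.
# base_trainset = base_trainset.shuffle(seed=42)
# adv_trainset = adv_trainset.shuffle(seed=42)

# # Keep as many adversarial examples as half the size of the base training set (this gave better results).
# adv_trainset = adv_trainset.select(range(int(len(base_trainset)*1/2)))
# train_dataset = concatenate_datasets([adv_trainset, base_trainset])
# train_dataset = train_dataset.shuffle(seed=42)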

########################################################

In [None]:
######### base model #########
# Train on the "train" split of the main dataset only.
train_dataset = dataset['train']

In [None]:
# --- Pretrained checkpoint ---
language_model_name = "microsoft/deberta-v3-base"

# --- Optimisation ---
epochs = 3
learning_rate = 2e-5
weight_decay = 0.01
batch_size = 8    # per-device batch size, used for both training and evaluation
custom_gsa = 32   # gradient accumulation steps

# --- Tokenisation ---
max_inpunt_token_len = 512  # max_length passed to the tokenizer for each premise/hypothesis pair

In [None]:
# Load the four evaluation metrics consumed by `compute_metrics`, in a single pass.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name, trust_remote_code=True)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. The last three use ``average='weighted'``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy_metric.compute(
            predictions=predicted_ids, references=labels
        )["accuracy"],
    }
    # The remaining metrics share the same call signature and averaging mode.
    weighted_metrics = (
        ("f1", f1_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
    )
    for key, metric in weighted_metrics:
        scores[key] = metric.compute(
            predictions=predicted_ids, references=labels, average='weighted'
        )[key]
    return scores

# Part-of-speech tag -> integer id. Ids are consecutive and start at 3;
# "<unk>" takes the last id (20).
# NOTE(review): this mapping is not referenced by the other visible cells.
pos_tag_to_id = {
    tag: tag_id
    for tag_id, tag in enumerate(
        [
            'NOUN', 'PROPN', 'SCONJ', 'X', 'CCONJ', 'PUNCT',
            'AUX', 'NUM', 'ADP', 'DET', 'VERB', 'INTJ',
            'ADJ', 'SYM', 'ADV', 'PRON', 'PART', "<unk>",
        ],
        start=3,
    )
}

def tokenize_function(examples):
    """Tokenize a batch of premise/hypothesis pairs and attach integer labels.

    Args:
        examples: A batch with the columns ``"premise"``, ``"hypothesis"`` and
            ``"label"`` (labels are the string keys of ``label_to_id``).

    Returns:
        The tokenizer output for the pairs, truncated to
        ``max_inpunt_token_len`` tokens, with an added ``"label"`` entry
        holding the integer class ids.

    Raises:
        KeyError: If a label is not a key of ``label_to_id``.
    """
    encoded = tokenizer(
        examples["premise"],
        examples["hypothesis"],
        max_length=max_inpunt_token_len,
        truncation=True,
    )
    encoded["label"] = list(map(label_to_id.__getitem__, examples["label"]))
    return encoded

In [None]:
# Used to tokenize the adversarial test set, which comes without the 'wsd' information.
def adv_tokenize_function(examples):
    """Tokenize a batch of adversarial premise/hypothesis pairs.

    Unlike ``tokenize_function``, every sequence is padded to the maximum
    length and a placeholder ``"pos_tags"`` column filled with zeros is added.

    Args:
        examples: A batch with the columns ``"premise"``, ``"hypothesis"`` and
            ``"label"`` (labels are the string keys of ``label_to_id``).

    Returns:
        The tokenizer output padded/truncated to ``max_inpunt_token_len``
        tokens, plus ``"pos_tags"`` (one all-zero list per example) and
        ``"label"`` (integer class ids).

    Raises:
        KeyError: If a label is not a key of ``label_to_id``.
    """
    tokens = tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        max_length=max_inpunt_token_len,
        padding='max_length',
    )

    # The placeholder rows follow the configured maximum length instead of a
    # hard-coded 512, so they always match the padded sequences.
    tokens["pos_tags"] = [[0] * max_inpunt_token_len for _ in examples["premise"]]

    tokens["label"] = [label_to_id[label] for label in examples["label"]]
    return tokens

In [None]:
# Seed every random number generator used by the notebook with the same value.
for _seed_fn in (
    set_seed,
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(42)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Sequence-classification model loaded from the pretrained checkpoint, with 3 output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    language_model_name,
    num_labels=3,
    output_hidden_states=False,
    output_attentions=False,
    ignore_mismatched_sizes=True,
)

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize the training and validation data. Each split drops *its own* original
# columns, so the call stays correct if `train_dataset` is replaced by a dataset
# whose columns differ from `dataset["train"]`.
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=2,
)
tokenized_eval_dataset = dataset["validation"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["validation"].column_names,
    num_proc=2,
)

# Release cached GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Training configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="training_dir",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=custom_gsa, # Accumulates gradients over custom_gsa steps
    # NOTE(review): the warmup length is fixed; check it against the total number of
    # optimizer steps (dataset size / (batch_size * custom_gsa) * epochs).
    warmup_steps=500,                       # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,
    save_strategy="epoch",
    learning_rate=learning_rate,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
    # Mixed precision saves memory but needs a GPU; enable it only when CUDA is
    # available so the notebook's CPU fallback keeps working.
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",      # load best model based on best evaluation loss

    dataloader_num_workers=2,
)



In [None]:
# Re-seed every random number generator right before the Trainer is built.
for _seed_fn in (
    set_seed,
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(42)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Build the Trainer from the model, the tokenized splits and the training configuration.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    # Add early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Free cached GPU memory around the training run.
torch.cuda.empty_cache()
trainer.train()
torch.cuda.empty_cache()

In [None]:
# Run an evaluation pass with the Trainer's default evaluation dataset (none is passed here)
# and keep the returned metrics.
# NOTE(review): despite its name, `train_result` holds the output of `evaluate()`.
train_result = trainer.evaluate()

In [None]:
# Persist the model and the trainer state.
trainer.save_model()
# `train_result` comes from `trainer.evaluate()`, so log it under the "eval" split
# instead of mislabelling the metrics as training metrics.
trainer.log_metrics("eval", train_result)
trainer.save_state()
# Keep the full logging history of the run for later inspection.
logs = trainer.state.log_history

In [None]:
# Evaluate on the held-out test split: tokenize it, score it, then log and print the metrics.
test_dataset = dataset["test"]
tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=test_dataset.column_names,
)

test_result = trainer.evaluate(eval_dataset=tokenized_test_dataset)
trainer.log_metrics("test", test_result)
print(test_result)

In [None]:
# Evaluate on the adversarial test set, tokenized with its dedicated function.
# NOTE(review): `adversarial_testset` is the object returned by `load_dataset` without a
# `split` argument, whereas the regular test cell evaluates a single split; confirm this
# is the intended input for `trainer.evaluate`.
tokenized_adversarial_testset = adversarial_testset.map(
    adv_tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=['part'],
)

adversarial_test_result = trainer.evaluate(eval_dataset=tokenized_adversarial_testset)
trainer.log_metrics("adversarial_test", adversarial_test_result)
print(adversarial_test_result)