## Libraries

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding
import datasets
import numpy as np
import torch
import matplotlib.pyplot as plt

## GPU Settings


In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

GPU check

In [None]:
# Report whether CUDA is usable. The device index and name are only queried
# when a GPU is actually present: torch.cuda.current_device() and
# torch.cuda.get_device_name() raise on CPU-only hosts, which would stop a
# "Run All" even though the notebook supports the CPU fallback chosen above.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available; running on CPU")

Release the cached memory held on the GPU

In [None]:
# Ask PyTorch to release the GPU memory it is currently caching, so a re-run
# of the notebook starts with as much free GPU memory as possible.
torch.cuda.empty_cache()

## Models

In [None]:
# Hugging Face model identifier: used to load the tokenizer, and as the
# starting checkpoint in the (commented-out) "train from scratch" cells below.
language_model_name = "distilbert-base-uncased"

Uncomment the model you wish to train

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(
#                                                     language_model_name,
#                                                     num_labels=3,        # number of categories
#                                                     ignore_mismatched_sizes=True,
#                                                     output_attentions=False,
#                                                     output_hidden_states=False,
#                                                 )

In [1]:
# model_val = AutoModelForSequenceClassification.from_pretrained(
#                                                     language_model_name,
#                                                     num_labels=3,        # number of categories
#                                                     ignore_mismatched_sizes=True,
#                                                     output_attentions=False,
#                                                     output_hidden_states=False,
#                                                 )

In [None]:
# model_wsd = AutoModelForSequenceClassification.from_pretrained(
#                                                     language_model_name,
#                                                     num_labels=3,        # number of categories
#                                                     ignore_mismatched_sizes=True,
#                                                     output_attentions=False,
#                                                     output_hidden_states=False,
#                                                 )

In [None]:
# model_srl = AutoModelForSequenceClassification.from_pretrained(
#                                                     language_model_name,
#                                                     num_labels=3,        # number of categories
#                                                     ignore_mismatched_sizes=True,
#                                                     output_attentions=False,
#                                                     output_hidden_states=False,
#                                                 )

In [None]:
# model_srl_wsd = AutoModelForSequenceClassification.from_pretrained(
#                                                     language_model_name,
#                                                     num_labels=3,        # number of categories
#                                                     ignore_mismatched_sizes=True,
#                                                     output_attentions=False,
#                                                     output_hidden_states=False,
#                                                 )

Here we load the already trained models

In [None]:
# Classifier used by the FEVER trainer, restored from the local checkpoint directory.
model_val = AutoModelForSequenceClassification.from_pretrained(
    './weights/valTrial',
    num_labels=3,  # number of categories
    ignore_mismatched_sizes=True,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Classifier used by the WSD trainer, restored from the local checkpoint directory.
model_wsd = AutoModelForSequenceClassification.from_pretrained(
    './weights/wsdTrial',
    num_labels=3,  # number of categories
    ignore_mismatched_sizes=True,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Classifier used by the SRL trainer, restored from the local checkpoint directory.
model_srl = AutoModelForSequenceClassification.from_pretrained(
    './weights/srlTrial',
    num_labels=3,  # number of categories
    ignore_mismatched_sizes=True,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Classifier used by the SRL + WSD trainer, restored from the local checkpoint directory.
model_srl_wsd = AutoModelForSequenceClassification.from_pretrained(
    './weights/srl_wsd_Trial',
    num_labels=3,  # number of categories
    ignore_mismatched_sizes=True,
    output_attentions=False,
    output_hidden_states=False,
)

## Datasets load

In [None]:
# Base data: train and validation splits read from local parquet files.
dataset = datasets.load_dataset(
    path='parquet',
    data_files={
        'train': 'data/train-00000-of-00001.parquet',
        'validation': 'data/validation-00000-of-00001.parquet',
    },
)

In [None]:
# Adversarial data: a single 'test' split read from a local parquet file.
adv_test_set = datasets.load_dataset(
    path='parquet',
    data_files={'test': 'data/adv-test-00000-of-00001.parquet'},
)

In [None]:
# WSD-augmented training data, exposed under the 'train' split.
wsd_aug_train_dataset = datasets.load_dataset(
    path='parquet',
    data_files={'train': 'data/wsd_aug_train_dataset.parquet'},
)

In [None]:
# WSD-augmented validation data, exposed under the 'validation' split.
wsd_aug_val_dataset = datasets.load_dataset(
    path='parquet',
    data_files={'validation': 'data/wsd_aug_val_dataset.parquet'},
)

In [None]:
# SRL-augmented training data, exposed under the 'train' split.
srl_aug_train_dataset = datasets.load_dataset(
    path='parquet',
    data_files={'train': 'data/srl_aug_train_dataset.parquet'},
)

In [None]:
# SRL-augmented validation data, exposed under the 'validation' split.
srl_aug_val_dataset = datasets.load_dataset(
    path='parquet',
    data_files={'validation': 'data/srl_aug_val_dataset.parquet'},
)

In [None]:
# SRL + WSD augmented training data, exposed under the 'train' split.
srl_wsd_aug_train_dataset = datasets.load_dataset(
    path='parquet',
    data_files={'train': 'data/srl_wsd_aug_train_dataset.parquet'},
)

In [None]:
# SRL + WSD augmented validation data, exposed under the 'validation' split.
srl_wsd_aug_val_dataset = datasets.load_dataset(
    path='parquet',
    data_files={'validation': 'data/srl_wsd_aug_val_dataset.parquet'},
)

## Tokenizer

In [None]:
# Tokenizer for the checkpoint named by `language_model_name`.
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# Collator handed to every Trainer below; it pads the examples of a batch
# using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    """Tokenize the ``sentence`` column of a batch of examples.

    Meant to be used with ``Dataset.map(..., batched=True)``.

    Padding is deliberately NOT applied here: every Trainer in this notebook
    receives ``data_collator`` (a ``DataCollatorWithPadding``), which pads
    each training/evaluation batch on the fly. Padding here as well would pad
    every example to the longest sentence of its *map* batch, storing and
    feeding the model many useless pad tokens.

    Args:
        examples: batch mapping with a ``"sentence"`` entry holding the texts.

    Returns:
        The tokenizer output for the batch; sequences longer than the
        tokenizer's maximum length are truncated.
    """
    return tokenizer(examples["sentence"], truncation=True)

## Main mapping function

Preparing the dataset for the NLI task

In [None]:
def NLImapping(sample):
    """Turn one raw sample into the record format used for NLI training.

    Args:
        sample: mapping with the keys ``id``, ``label``, ``premise`` and
            ``hypothesis``.

    Returns:
        A dict with:
          * ``id``       - copied from the sample,
          * ``label``    - 0 for 'ENTAILMENT', 1 for 'NEUTRAL', 2 for any
                           other label value,
          * ``sentence`` - premise and hypothesis joined into one string with
                           literal '[CLS] ' / ' [SEP] ' markers.
    """
    # NOTE(review): the tokenizer presumably inserts its own special tokens as
    # well, so these literal markers may end up duplicated - confirm before
    # changing, since the saved weights were trained with this exact format.
    label_ids = {'ENTAILMENT': 0, 'NEUTRAL': 1}
    return {
        'id': sample['id'],
        'label': label_ids.get(sample['label'], 2),  # everything else -> 2
        'sentence': '[CLS] ' + sample['premise'] + ' [SEP] ' + sample['hypothesis'],
    }

## Datasets preparation

### FEVER dataset

In [None]:
# Drop the 'wsd' and 'srl' columns; the NLI mapping only reads id, label,
# premise and hypothesis.
Fever_dataset = dataset.remove_columns(['wsd', 'srl'])

In [None]:
# Convert every split to the {id, label, sentence} format; the raw
# premise/hypothesis columns are removed once they have been merged.
NLIDataset = Fever_dataset.map(NLImapping, remove_columns=['premise', 'hypothesis'])

In [None]:
# Add the tokenizer outputs for the 'sentence' column, processing in batches.
tokenized_FEVER_datasets = NLIDataset.map(tokenize_function, batched=True)


### Adversarial dataset

In [None]:
def advNLImapping(sample):
    """Turn one adversarial sample into the record format used for NLI.

    Same conversion as ``NLImapping``, except that the identifier is read
    from the ``cid`` field of the sample.

    Args:
        sample: mapping with the keys ``cid``, ``label``, ``premise`` and
            ``hypothesis``.

    Returns:
        A dict with ``id`` (the sample's ``cid``), ``label`` (0 for
        'ENTAILMENT', 1 for 'NEUTRAL', 2 otherwise) and ``sentence``
        (premise and hypothesis joined with literal '[CLS] ' / ' [SEP] '
        markers).
    """
    sample_id = sample['cid']

    raw_label = sample['label']
    if raw_label == 'ENTAILMENT':
        label_id = 0
    elif raw_label == 'NEUTRAL':
        label_id = 1
    else:
        label_id = 2  # any other label value

    text = '[CLS] ' + sample['premise'] + ' [SEP] ' + sample['hypothesis']
    return {'id': sample_id, 'label': label_id, 'sentence': text}

In [None]:
# Convert the adversarial split to {id, label, sentence}; the source columns
# that were merged or are no longer needed ('cid', 'part') are removed.
advNLIDataset = adv_test_set.map(advNLImapping, remove_columns=['premise', 'hypothesis', 'cid','part'])

In [None]:
# Add the tokenizer outputs for the 'sentence' column, processing in batches.
tokenized_adv = advNLIDataset.map(tokenize_function, batched=True)

### WSD dataset

In [None]:
# Map the WSD-augmented train/validation data to the {id, label, sentence} format.
wsd_aug_train_dataset_NLI = wsd_aug_train_dataset.map(NLImapping, remove_columns=['premise', 'hypothesis'])
wsd_aug_val_dataset_NLI = wsd_aug_val_dataset.map(NLImapping, remove_columns=['premise', 'hypothesis'])

In [None]:
# Tokenize the 'sentence' column of the WSD-augmented data, in batches.
tokenized_wsd_aug_train = wsd_aug_train_dataset_NLI.map(tokenize_function, batched=True)
tokenized_wsd_aug_val = wsd_aug_val_dataset_NLI.map(tokenize_function, batched=True)

### SRL dataset

In [None]:
# Map the SRL-augmented train/validation data to the {id, label, sentence} format.
srl_aug_train_datasetNLI = srl_aug_train_dataset.map(NLImapping, remove_columns=['premise', 'hypothesis'])
srl_aug_val_datasetNLI = srl_aug_val_dataset.map(NLImapping, remove_columns=['premise', 'hypothesis'])

In [None]:
# Tokenize the 'sentence' column of the SRL-augmented data, in batches.
tokenized_srl_aug_train = srl_aug_train_datasetNLI.map(tokenize_function, batched=True)
tokenized_srl_aug_val = srl_aug_val_datasetNLI.map(tokenize_function, batched=True)

### WSD + SRL dataset

In [None]:
# Map the SRL + WSD augmented train/validation data to the {id, label, sentence} format.
srl_wsd_aug_train_datasetNLI = srl_wsd_aug_train_dataset.map(NLImapping, remove_columns=['premise', 'hypothesis'])
srl_wsd_aug_val_datasetNLI = srl_wsd_aug_val_dataset.map(NLImapping, remove_columns=['premise', 'hypothesis'])

In [None]:
# Tokenize the 'sentence' column of the SRL + WSD augmented data, in batches.
tokenized_srl_wsd_aug_train = srl_wsd_aug_train_datasetNLI.map(tokenize_function, batched=True)
tokenized_srl_wsd_aug_val = srl_wsd_aug_val_datasetNLI.map(tokenize_function, batched=True)

## Training common settings

In [None]:
# Hyper-parameters shared by every Trainer created in this notebook.
training_args = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=2,                           # total number of training epochs
    per_device_train_batch_size=8,                # batch size per device during training
    per_device_eval_batch_size=8,                 # batch size per device during evaluation
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=0.004,                           # strength of weight decay
    logging_dir='./logs',                         # directory the training logs are written to
    logging_steps=100,                            # log every 100 steps
    evaluation_strategy="steps",                  # sets the evaluation to happen every certain number of steps.
    eval_steps=500,                               # specifies that evaluation should happen every 500 steps
    save_steps=500,                               # saves the model checkpoint every 500 steps
    load_best_model_at_end=True,                  # ensures that the best model (based on the metric specified) is loaded at the end of training
    metric_for_best_model="accuracy",             # the "accuracy" value returned by compute_metrics
    learning_rate=1e-5,                           # learning rate
    lr_scheduler_type = 'linear'
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from model outputs.

    The metrics are computed directly with numpy rather than through
    ``datasets.load_metric``: the previous version re-loaded both metric
    scripts on every single evaluation, and that loader is deprecated and
    no longer available in recent ``datasets`` releases.

    Args:
        eval_pred: a ``(logits, labels)`` pair. ``logits`` holds one score
            per class on its last axis; ``labels`` holds the integer class
            id of each example.

    Returns:
        ``{"accuracy": float, "f1": float}``. F1 is the unweighted mean of
        the per-class F1 over every class that occurs in the labels or in
        the predictions (the usual "macro" average). Both values are 0.0
        when there are no examples.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).ravel()

    # Nothing to score: avoid NaN from a mean over an empty array.
    if labels.size == 0:
        return {"accuracy": 0.0, "f1": 0.0}

    predictions = np.argmax(logits, axis=-1).ravel()
    accuracy = float(np.mean(predictions == labels))

    per_class_f1 = []
    for cls in np.union1d(labels, predictions):
        predicted_cls = predictions == cls
        actual_cls = labels == cls
        true_pos = np.sum(predicted_cls & actual_cls)
        false_pos = np.sum(predicted_cls & ~actual_cls)
        false_neg = np.sum(~predicted_cls & actual_cls)
        # The denominator is never zero: `cls` occurs at least once in the
        # labels or in the predictions.
        per_class_f1.append(2 * true_pos / (2 * true_pos + false_pos + false_neg))

    return {"accuracy": accuracy, "f1": float(np.mean(per_class_f1))}

## Training on the FEVER Dataset


In [None]:
# Trainer for model_val: FEVER train split for training, FEVER validation split for evaluation.
trainer_val = Trainer(
    model=model_val,
    args=training_args,
    train_dataset=tokenized_FEVER_datasets["train"],
    eval_dataset=tokenized_FEVER_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Uncomment below if you want to train the model


In [None]:
# trainer_val.train()

Uncomment below to save the weights of the just trained model

In [None]:
# model_val.save_pretrained('./newTrial')

## Evaluation on the FEVER validation set

In [None]:
# Evaluate model_val on trainer_val's eval_dataset (the FEVER validation split).
eval_results = trainer_val.evaluate()
print(eval_results)

## Evaluation on the Adversarial Set


In [None]:
# Same model and settings as trainer_val, but evaluating on the adversarial test split.
trainer_adv = Trainer(
    model=model_val,
    args=training_args,
    train_dataset=tokenized_FEVER_datasets["train"],
    eval_dataset=tokenized_adv["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate model_val on the adversarial test split.
eval_adv_results = trainer_adv.evaluate()
print(eval_adv_results)

### Plot and comparison between evaluation on FEVER and Adversarial set

In [None]:
# Bar chart: accuracy of model_val on the FEVER validation split vs. the adversarial test split.
val_accuracy = eval_results['eval_accuracy']
adv_accuracy = eval_adv_results['eval_accuracy']

fig, ax = plt.subplots()

# One bar per result, drawn in this order: (bar name, height, colour).
for bar_name, bar_height, bar_color in (
    ('val_acc', val_accuracy, 'blue'),
    ('adv_acc', adv_accuracy, 'red'),
):
    ax.bar(bar_name, bar_height, color=bar_color, label=bar_name)

ax.set_ylabel('Accuracy')
ax.set_title('Comparison of val_acc and adv_acc')

ax.legend()
plt.savefig('2comparison.png')

## Training the model on the WSD augmented dataset



In [None]:
# Trainer for model_wsd: WSD-augmented train split for training, WSD-augmented validation split for evaluation.
trainer_wsd = Trainer(
    model=model_wsd,
    args=training_args,
    train_dataset=tokenized_wsd_aug_train['train'],
    eval_dataset=tokenized_wsd_aug_val['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Uncomment below in order to train the model


In [None]:
# trainer_wsd.train()

Uncomment below to save the weights of the just trained model

In [None]:
# model_wsd.save_pretrained('./wsdTrial')

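## Evaluation of the model_wsd on the WSD-augmented validation set

The model trained on the WSD-augmented dataset is evaluated on the WSD-augmented validation set (`tokenized_wsd_aug_val['validation']`), which is the `eval_dataset` of `trainer_wsd`.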

In [None]:
# Evaluate model_wsd on trainer_wsd's eval_dataset (the WSD-augmented validation split).
eval_syn_wsd_results = trainer_wsd.evaluate()
print(eval_syn_wsd_results)

## Evaluation of the model_wsd on the adversarial test set

The model trained on the wsd augmented dataset is evaluated on the adversarial test set

In [None]:
# Same model and settings as trainer_wsd, but evaluating on the adversarial test split.
trainer_adv_wsd = Trainer(
    model=model_wsd,
    args=training_args,
    train_dataset=tokenized_wsd_aug_train['train'],
    eval_dataset=tokenized_adv["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate model_wsd on the adversarial test split.
eval_adv_wsd_results = trainer_adv_wsd.evaluate()
print(eval_adv_wsd_results)

### Plot comparison between evaluation on FEVER, Adversarial with base model and Adversarial set with model_wsd

In [None]:
# Bar chart: model_val on FEVER validation, model_val on the adversarial set,
# and model_wsd on the adversarial set.
val_accuracy = eval_results['eval_accuracy']
adv_accuracy = eval_adv_results['eval_accuracy']
wsd_accuracy = eval_adv_wsd_results['eval_accuracy']

fig, ax = plt.subplots()

# One bar per result, drawn in this order: (bar name, height, colour).
for bar_name, bar_height, bar_color in (
    ('val_acc', val_accuracy, 'blue'),
    ('adv_acc', adv_accuracy, 'red'),
    ('wsd_acc', wsd_accuracy, 'green'),
):
    ax.bar(bar_name, bar_height, color=bar_color, label=bar_name)

ax.set_ylabel('Accuracy')
ax.set_title('Comparison of val_acc, adv_acc and wsd_acc')

# plt.savefig('comparison.png')
ax.legend()

## Training the model on the SRL augmented dataset

In [None]:
# Trainer for model_srl: SRL-augmented train split for training, SRL-augmented validation split for evaluation.
trainer_srl = Trainer(
    model=model_srl,
    args=training_args,
    train_dataset=tokenized_srl_aug_train['train'],
    eval_dataset=tokenized_srl_aug_val['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Uncomment to train the model

In [None]:
# trainer_srl.train()

Uncomment to save the weights of the just trained model

In [None]:
# model_srl.save_pretrained('./srlTrial')

## Evaluation of the model_srl on the SRL-augmented validation set

In [None]:
# Evaluate model_srl on trainer_srl's eval_dataset (the SRL-augmented validation split).
eval_srl_results = trainer_srl.evaluate()
print(eval_srl_results)

## Evaluation of the model_srl on the adversarial test set

In [None]:
# Same model and settings as trainer_srl, but evaluating on the adversarial test split.
trainer_srl_adv = Trainer(
    model=model_srl,
    args=training_args,
    train_dataset=tokenized_srl_aug_train['train'],
    eval_dataset=tokenized_adv["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate model_srl on the adversarial test split.
eval_srl_adv_results = trainer_srl_adv.evaluate()
print(eval_srl_adv_results)

## Training the model on the SRL + WSD augmented dataset

In [None]:
# Trainer for model_srl_wsd: SRL + WSD augmented train split for training,
# SRL + WSD augmented validation split for evaluation.
trainer_srl_wsd = Trainer(
    model=model_srl_wsd,
    args=training_args,
    train_dataset=tokenized_srl_wsd_aug_train['train'],
    eval_dataset=tokenized_srl_wsd_aug_val['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Uncomment to train the model

In [None]:
# trainer_srl_wsd.train()

Uncomment to save the weights of the just trained model

In [None]:
# model_srl_wsd.save_pretrained('./srl_wsd_Trial')

## Evaluation of the model_srl_wsd on the SRL + WSD augmented validation set

In [None]:
# Evaluate model_srl_wsd on trainer_srl_wsd's eval_dataset (the SRL + WSD augmented validation split).
eval_srl_wsd_results = trainer_srl_wsd.evaluate()
print(eval_srl_wsd_results)

## Evaluation of the model_srl_wsd on the adversarial set

In [None]:
# Same model and settings as trainer_srl_wsd, but evaluating on the adversarial test split.
trainer_srl_wsd_adv = Trainer(
    model=model_srl_wsd,
    args=training_args,
    train_dataset=tokenized_srl_wsd_aug_train['train'],
    eval_dataset=tokenized_adv["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate model_srl_wsd on the adversarial test split.
eval_srl_wsd_results_adv = trainer_srl_wsd_adv.evaluate()
print(eval_srl_wsd_results_adv)

### Plot comparison between evaluation on:
- Adversarial with base model 
- Adversarial set with model_wsd 
- Adversarial set with model_srl 
- Adversarial set with model_srl_wsd

In [None]:
# Bar chart: accuracy of the four models on the adversarial test split.
adv_accuracy = eval_adv_results['eval_accuracy']
wsd_accuracy = eval_adv_wsd_results['eval_accuracy']
srl_accuracy = eval_srl_adv_results['eval_accuracy']
srl_wsd_accuracy = eval_srl_wsd_results_adv['eval_accuracy']

fig, ax = plt.subplots()

# One bar per model, drawn in this order: (bar name, height, colour).
for bar_name, bar_height, bar_color in (
    ('adv_acc', adv_accuracy, 'red'),
    ('wsd_acc', wsd_accuracy, 'green'),
    ('srl_acc', srl_accuracy, 'yellow'),
    ('srl_wsd_acc', srl_wsd_accuracy, 'purple'),
):
    ax.bar(bar_name, bar_height, color=bar_color, label=bar_name)

ax.set_ylabel('Accuracy')
ax.set_title('Comparison of adv_acc, wsd_acc, srl_acc and wsd+srl_acc')

ax.legend(loc='lower right')
plt.savefig('4comparison.png')