In [1]:
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from src.base_model import BaseModel
from src.KAN_model import KANModel
from src.Augement import Augment

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)


### Model Parameters
# we will use with Distil-BERT
language_model_name = "microsoft/mdeberta-v3-base"
length = 128

### Training Argurments

# this GPU should be enough for this task to handle 32 samples per batch
batch_size = 32

# optim
learning_rate = 1e-4
weight_decay = 0.001 # we could use e.g. 0.01 in case of very low and very high amount of data for regularization

# training
epochs = 1
device = "cuda" if torch.cuda.is_available() else "cpu"


set_seed(42)

In [2]:
# load our dataset
dataset = load_dataset("tommasobonomo/sem_augmented_fever_nli")

In [3]:
## Let's see an example...
print(f"Sentence: {dataset['train'][12]}")

Sentence: {'id': '65960', 'premise': 'Whoopi Goldberg . From 1998 to 2002 , she was co-producer of the television game show Hollywood Squares .', 'hypothesis': 'Whoopi Goldberg co-produced an American dance tournament.', 'label': 'NEUTRAL', 'wsd': {'premise': [{'index': 0, 'text': 'Whoopi', 'pos': 'PROPN', 'lemma': 'Whoopi', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 1, 'text': 'Goldberg', 'pos': 'PROPN', 'lemma': 'Goldberg', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 2, 'text': '.', 'pos': 'PUNCT', 'lemma': '.', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 3, 'text': 'From', 'pos': 'ADP', 'lemma': 'from', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 4, 'text': '1998', 'pos': 'NUM', 'lemma': '1998', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 5, 'text': 'to', 'pos': 'ADP', 'lemma': 'to', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'},

In [4]:
## The structure of the huggingface dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 51086
    })
    validation: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 2288
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 2287
    })
})

In [5]:
augemnter = Augment(dataset, "test")
augmented_dataset =  augemnter.apply()
augmented_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 51086
    })
    validation: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 2288
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 2287
    })
})

### Metric Definition

Looking only at cross entropy loss cannot allow us to understand effectively the real capabilities of our NLP model. So let's define a standard method to compute:

- **Accuracy** metric
- **F1** metric

In [6]:
from datasets import load_metric

# Metrics

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [11]:
## Initialize the model
auto_model = AutoModelForSequenceClassification.from_pretrained(language_model_name,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3) # number of the classes
base_model = BaseModel(device, length, language_model_name)

KAN_model = KANModel(device, length, language_model_name)

tokenizer = AutoTokenizer.from_pretrained(language_model_name)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    label_map = {
        'ENTAILMENT': 0,
        'CONTRADICTION': 1,
        'NEUTRAL': 2
    }
    # Map the labels
    examples['label'] = [label_map[label] for label in examples['label']]
    
    # Tokenize the premise and hypothesis
    
    tokenized = tokenizer(
        examples['premise'], 
        examples['hypothesis'], 
        truncation=True, 
        padding='max_length',
        max_length=length
    )
    
    # Add tokenized fields to the examples
    examples.update(tokenized)
    return examples

def tokenize_sense_function(examples):
    #TODO add word sense
    label_map = {
        'ENTAILMENT': 0,
        'CONTRADICTION': 1,
        'NEUTRAL': 2
    }
    # Map the labels
    examples['label'] = [label_map[label] for label in examples['label']]
    
    # Tokenize the premise and hypothesis
    tokenized = tokenizer(
        examples['premise'], 
        examples['hypothesis'], 
        truncation=True, 
        padding='max_length',
        max_length=length
    )
    
    # Add tokenized fields to the examples
    examples.update(tokenized)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize the dataset put the second phrase as the second parameter to have it concatenated with a <SEP> token
print("Tokenize the dataset ...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_sense_dataset = dataset.map(tokenize_sense_function, batched=True)
tokenized_augmented_dataset = augmented_dataset.map(tokenize_function, batched=True)
tokenized_augmented_sense_dataset = augmented_dataset.map(tokenize_sense_function, batched=True)

Tokenize the dataset ...


Map:   0%|          | 0/51086 [00:00<?, ? examples/s]

Map:   0%|          | 0/2288 [00:00<?, ? examples/s]

Map:   0%|          | 0/2287 [00:00<?, ? examples/s]

Map:   0%|          | 0/51086 [00:00<?, ? examples/s]

Map:   0%|          | 0/2288 [00:00<?, ? examples/s]

Map:   0%|          | 0/2287 [00:00<?, ? examples/s]

Map:   0%|          | 0/51086 [00:00<?, ? examples/s]

Map:   0%|          | 0/51086 [00:00<?, ? examples/s]

Map:   0%|          | 0/2288 [00:00<?, ? examples/s]

Map:   0%|          | 0/2287 [00:00<?, ? examples/s]

In [13]:
print(dataset['train'][15]['label'])

ENTAILMENT


## Model Training

To train a transformer model you can rely on the **Trainer** class of Huggingface (https://huggingface.co/docs/transformers/main_classes/trainer).

The Trainer class allows you to save many lines of code, and makes your code much more readable.

To initialize the Trainer class you have to define a **TrainerArguments** object.

In [14]:
training_args = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate                   # learning rate
)

In [15]:
trainer_auto = Trainer(
   model=auto_model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

trainer_base = Trainer(
   model=base_model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

trainer_KAN = Trainer(
   model=KAN_model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

trainer_sense_auto = Trainer(
   model=auto_model,
   args=training_args,
   train_dataset=tokenized_sense_dataset["train"],
   eval_dataset=tokenized_sense_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

trainer_sense_base = Trainer(
   model=base_model,
   args=training_args,
   train_dataset=tokenized_sense_dataset["train"],
   eval_dataset=tokenized_sense_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

trainer_sense_KAN = Trainer(
   model=KAN_model,
   args=training_args,
   train_dataset=tokenized_sense_dataset["train"],
   eval_dataset=tokenized_sense_dataset["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [16]:
# Let's Train ...
trainer_auto.train()
trainer_base.train()
trainer_KAN.train()

trainer_sense_auto.train()
trainer_sense_base.train()
trainer_sense_KAN.train()

  0%|          | 0/1597 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Evaluate the model ...
trainer_auto.evaluate()
trainer_base.evaluate()
trainer_KAN.evaluate()

trainer_sense_auto.evaluate()
trainer_sense_base.evaluate()
trainer_sense_KAN.evaluate()

: 

In [None]:
trainer_augmented_auto = Trainer(
   model=auto_model,
   args=training_args,
   train_dataset=tokenized_augmented_datasets["train"],
   eval_dataset=tokenized_augmented_datasets["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer_augmented_base = Trainer(
   model=base_model,
   args=training_args,
   train_dataset=tokenized_augmented_datasets["train"],
   eval_dataset=tokenized_augmented_datasets["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer_augmented_KAN = Trainer(
   model=KAN_model,
   args=training_args,
   train_dataset=tokenized_augmented_datasets["train"],
   eval_dataset=tokenized_augmented_datasets["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer_augmented_sense_auto = Trainer(
   model=auto_model,
   args=training_args,
   train_dataset=tokenized_augmented_sense_datasets["train"],
   eval_dataset=tokenized_augmented_sense_datasets["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer_augmented_sense_base = Trainer(
   model=base_model,
   args=training_args,
   train_dataset=tokenized_augmented_sense_datasets["train"],
   eval_dataset=tokenized_augmented_sense_datasets["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer_augmented_sense_KAN = Trainer(
   model=KAN_model,
   args=training_args,
   train_dataset=tokenized_augmented_sense_datasets["train"],
   eval_dataset=tokenized_augmented_sense_datasets["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

: 

In [None]:
# Let's Train ...
trainer_augmented_auto.train()
trainer_augmented_base.train()
trainer_augmented_KAN.train()

trainer_augmented_sense_auto.train()
trainer_augmented_sense_base.train()
trainer_augmented_sense_KAN.train()

: 

In [None]:
# Evaluate the model ...
trainer_augmented_auto.evaluate()
trainer_augmented_base.evaluate()
trainer_augmented_KAN.evaluate()

trainer_augmented_sense_auto.evaluate()
trainer_augmented_sense_base.evaluate()
trainer_augmented_sense_KAN.evaluate()

: 