In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, optimization
import os 
from datasets import load_dataset
import pandas as pd
from copy import deepcopy
from torch.optim import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [3]:
dream_checkpoint = "RicoBorra/DREAM-t5-small"

dream_tokenizer = AutoTokenizer.from_pretrained(dream_checkpoint)
dream_model = AutoModelForSeq2SeqLM.from_pretrained(dream_checkpoint)
dream_data_collator = DataCollatorForSeq2Seq(tokenizer=dream_tokenizer, model=dream_checkpoint)

## FLUTE Data extraction and processing
Using the few instructions from the Git Readme to have the same initial data

In [4]:
# Only the train split is available on HuggingFace
dataset = load_dataset("ColumbiaNLP/FLUTE")

In [5]:
def add_combined_cols(entry):
    
    premise = entry["premise"].strip()
    hypothesis = entry["hypothesis"].strip()
    
    if not premise.endswith("."):
        premise += "."
    assert(premise.endswith("."))
    if not hypothesis.endswith("."):
        hypothesis += "."
    assert(hypothesis.endswith("."))
    
    entry["premise_hypothesis"] = 'Premise: ' + premise + ' Hypothesis: ' + hypothesis + 'Is there a contradiction or entailment between the premise and hypothesis ?'
    entry["label_explanation"] = 'Label: ' + entry["label"] + '. Explanation: ' + entry["explanation"]
    return entry
# combine columns
combined_cols_dataset = dataset['train'].map(add_combined_cols)

# create train test split because given data has only train data
# splits are shuffled by default
dataset_train_test = combined_cols_dataset.train_test_split(test_size=0.2, seed=42)

In [6]:
def preprocess_dataset(examples):
    model_inputs = tokenizer(examples['premise_hypothesis'])
    labels = tokenizer(examples['label_explanation'])
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [7]:
tokenized_ds = dataset_train_test.map(preprocess_dataset, batched=True)
tokenized_ds = tokenized_ds.remove_columns(dataset_train_test['train'].column_names)

Map:   0%|          | 0/6027 [00:00<?, ? examples/s]

Map: 100%|██████████| 6027/6027 [00:01<00:00, 3253.02 examples/s]
Map: 100%|██████████| 1507/1507 [00:00<00:00, 2916.41 examples/s]


In [8]:
import evaluate
rouge = evaluate.load("rouge")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## System 1 : Normal classifier

In [9]:
tokenizer_s1 = deepcopy(tokenizer)
model_s1 = deepcopy(model)

In [10]:
training_args = Seq2SeqTrainingArguments(
    output_dir="D:\Documents\PoliTo\Deep NLP\Project\S1Model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    seed=42,
    #weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    load_best_model_at_end=True,
    #eval_accumulation_steps=8,
    #fp16=True,
    #push_to_hub=True,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    lr_scheduler_type='linear'
)

'''The 50 first elements are chosen just to test quickly the trainer'''
trainer = Seq2SeqTrainer(
    model=model_s1,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer_s1,
    data_collator=data_collator,
    #compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/9042 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  6%|▌         | 501/9042 [00:57<18:35,  7.65it/s]

{'loss': 2.3956, 'learning_rate': 4.723512497235125e-05, 'epoch': 0.17}


 11%|█         | 1001/9042 [02:03<15:41,  8.54it/s]

{'loss': 2.0239, 'learning_rate': 4.44702499447025e-05, 'epoch': 0.33}


 17%|█▋        | 1501/9042 [03:08<16:44,  7.51it/s]

{'loss': 1.9249, 'learning_rate': 4.170537491705375e-05, 'epoch': 0.5}


 22%|██▏       | 2001/9042 [04:16<16:19,  7.19it/s]

{'loss': 1.8541, 'learning_rate': 3.8940499889405e-05, 'epoch': 0.66}


 28%|██▊       | 2501/9042 [05:22<13:01,  8.37it/s]

{'loss': 1.8302, 'learning_rate': 3.617562486175625e-05, 'epoch': 0.83}


 33%|███▎      | 3000/9042 [06:31<14:30,  6.94it/s]

{'loss': 1.8211, 'learning_rate': 3.3410749834107505e-05, 'epoch': 1.0}


 33%|███▎      | 3014/9042 [06:33<15:23,  6.53it/s]
 33%|███▎      | 3014/9042 [07:04<15:23,  6.53it/s]

{'eval_loss': 1.612233281135559, 'eval_runtime': 30.7266, 'eval_samples_per_second': 49.045, 'eval_steps_per_second': 24.539, 'epoch': 1.0}


 39%|███▊      | 3501/9042 [08:10<12:32,  7.36it/s]   

{'loss': 1.7388, 'learning_rate': 3.0645874806458754e-05, 'epoch': 1.16}


 44%|████▍     | 4001/9042 [09:19<11:20,  7.41it/s]

{'loss': 1.7242, 'learning_rate': 2.7880999778809997e-05, 'epoch': 1.33}


 50%|████▉     | 4501/9042 [10:21<08:34,  8.83it/s]

{'loss': 1.7254, 'learning_rate': 2.5116124751161246e-05, 'epoch': 1.49}


 55%|█████▌    | 5001/9042 [11:17<07:36,  8.86it/s]

{'loss': 1.694, 'learning_rate': 2.23512497235125e-05, 'epoch': 1.66}


 61%|██████    | 5501/9042 [12:15<06:37,  8.90it/s]

{'loss': 1.7054, 'learning_rate': 1.9586374695863748e-05, 'epoch': 1.82}


 66%|██████▋   | 6001/9042 [13:11<06:03,  8.36it/s]

{'loss': 1.6438, 'learning_rate': 1.6821499668214997e-05, 'epoch': 1.99}


                                                   
 67%|██████▋   | 6028/9042 [13:37<05:40,  8.85it/s]

{'eval_loss': 1.54267418384552, 'eval_runtime': 22.9407, 'eval_samples_per_second': 65.691, 'eval_steps_per_second': 32.867, 'epoch': 2.0}


 72%|███████▏  | 6501/9042 [14:33<05:06,  8.29it/s]  

{'loss': 1.6772, 'learning_rate': 1.4056624640566246e-05, 'epoch': 2.16}


 77%|███████▋  | 7001/9042 [15:31<03:53,  8.74it/s]

{'loss': 1.6194, 'learning_rate': 1.1291749612917496e-05, 'epoch': 2.32}


 83%|████████▎ | 7501/9042 [16:28<02:49,  9.09it/s]

{'loss': 1.6284, 'learning_rate': 8.526874585268747e-06, 'epoch': 2.49}


 88%|████████▊ | 8001/9042 [17:25<01:56,  8.90it/s]

{'loss': 1.6266, 'learning_rate': 5.761999557619996e-06, 'epoch': 2.65}


 94%|█████████▍| 8501/9042 [18:22<00:59,  9.14it/s]

{'loss': 1.6435, 'learning_rate': 2.9971245299712457e-06, 'epoch': 2.82}


100%|█████████▉| 9001/9042 [19:19<00:04,  9.03it/s]

{'loss': 1.6412, 'learning_rate': 2.3224950232249503e-07, 'epoch': 2.99}


                                                   
100%|██████████| 9042/9042 [19:47<00:00,  9.25it/s]

{'eval_loss': 1.5241432189941406, 'eval_runtime': 23.6921, 'eval_samples_per_second': 63.608, 'eval_steps_per_second': 31.825, 'epoch': 3.0}


100%|██████████| 9042/9042 [19:48<00:00,  7.61it/s]

{'train_runtime': 1188.4944, 'train_samples_per_second': 15.213, 'train_steps_per_second': 7.608, 'train_loss': 1.7722833359618546, 'epoch': 3.0}





TrainOutput(global_step=9042, training_loss=1.7722833359618546, metrics={'train_runtime': 1188.4944, 'train_samples_per_second': 15.213, 'train_steps_per_second': 7.608, 'train_loss': 1.7722833359618546, 'epoch': 3.0})

In [16]:
i = "Premise : I was crushed by an elephant today. Hypothesis : I really never wanted to be crushed by an elephant."
t = tokenizer_s1(i, return_tensors='pt').input_ids
t = t.to(model_s1.device)
o = model_s1.generate(t, max_new_tokens = 100)
d = tokenizer_s1.decode(o[0])
d 

'<pad> Label : Contradiction. Explanation: To be crushed by an elephant is not a good thing because it is not something to be crushed by.</s>'