In [1]:
from transformers import TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np
import os
import pandas as pd
from pathlib import Path
import gc

BASE_DIR = Path(".")

In [24]:
e_snli_train_1 = pd.read_csv(BASE_DIR / "datasets" / "esnli_train_1.csv")
e_snli_train_2 = pd.read_csv(BASE_DIR / "datasets" / "esnli_train_2.csv")
e_snli_train = pd.concat([e_snli_train_1, e_snli_train_2])

e_snli_test = pd.read_csv(BASE_DIR / "datasets" / "esnli_test.csv")

e_snli_train.head()

Unnamed: 0,pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
0,3416050480.jpg#4r1n,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,the person is not necessarily training his horse,AF0PI3RISB5Q7,A person on a horse jumps over a broken down a...,A person is *training* *his* *horse* for a co...,{},345
1,3416050480.jpg#4r1c,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",One cannot be on a jumping horse cannot be a d...,A36ZT2WFIA2HMF,A person *on* *a* *horse* *jumps* over a brok...,"A person *is* *at* *a* *diner,* *ordering* an...",4235,25436
2,3416050480.jpg#4r1e,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",a broken down airplane is outdoors,A2GK75ZQTX2RDZ,A person on a horse jumps over *a* *broken* *...,"A person is *outdoors,* on a horse.",89107,3
3,2267923837.jpg#2r1n,neutral,Children smiling and waving at camera,They are smiling at their parents,Just because they are smiling and waving at a ...,A18TOIDG32QICP,Children smiling and waving at camera,They are smiling *at* *their* *parents*,{},534
4,2267923837.jpg#2r1e,entailment,Children smiling and waving at camera,There are children present,The children must be present to see them smili...,AEX0YE6TUZRHT,*Children* *smiling* *and* *waving* at camera,There are children *present*,0132,3


# Finetuning on E-SNLI

From the FLUTE paper:

> T5e-SNLI: e-SNLI (Camburu et al., 2018) dataset
> comes with supervised ground-truth labels and ra-
> tionales. We fine-tune the 3B version of T5 on
> e-SNLI for one epoch with a batch size of 1024,
> and an AdamW Optimizer with a learning rate of
> 1e − 4. We remove the Neutral examples from
> e-SNLI because our test data does not have such
> a category. We take the longest explanation per
> example in e-SNLI since our data has only one ref-
> erence explanation. In case the explanations are
> more than one sentence we join them using ‘and’
> since our data contains single-sentence explana-
> tions. This leaves us with 366,603 training and
> 6,607 validation examples.


In [5]:
e_snli_train = e_snli_train[e_snli_train["gold_label"] != "neutral"]
e_snli_test = e_snli_test[e_snli_test["gold_label"] != "neutral"]


def join_sentences(explanation):
    return str(explanation).replace(". ", " and ")


e_snli_train["Explanation_1"] = e_snli_train["Explanation_1"].apply(join_sentences)


def find_longest_explanation(row):
    explanations = [row["Explanation_1"], row["Explanation_2"], row["Explanation_3"]]
    return max(explanations, key=len)


e_snli_test["Explanation"] = e_snli_test.apply(find_longest_explanation, axis=1)
e_snli_test["Explanation"] = e_snli_test["Explanation"].apply(join_sentences)

In [6]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [7]:
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import numpy as np

train_dataset = Dataset.from_pandas(e_snli_train)

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def preprocess_train(batch):
    pairIDs = batch["pairID"]
    gold_labels = batch["gold_label"]
    Sentence1s = batch["Sentence1"]
    Sentence2s = batch["Sentence2"]
    explanation_1 = batch["Explanation_1"]

    inputs = [
        f"Does the sentence '{s1}' entail or contradict the sentence '{s2}'? Please answer between 'Entails' or 'Contradicts' and explain your decision in a sentence."
        for s1, s2 in zip(Sentence1s, Sentence2s)
    ]

    targets = [
        f"{label} - {explanation}"
        for label, explanation in zip(gold_labels, explanation_1)
    ]

    inputs = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    targets = tokenizer(
        targets,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}


tokenized_train = train_dataset.map(preprocess_train, batched=True)
tokenized_train = tokenized_train.remove_columns(
    [
        "pairID",
        "gold_label",
        "Sentence1",
        "Sentence2",
        "Explanation_1",
        "WorkerId",
        "Sentence1_marked_1",
        "Sentence2_marked_1",
        "Sentence1_Highlighted_1",
        "Sentence2_Highlighted_1",
        "__index_level_0__",
    ]
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/366603 [00:00<?, ? examples/s]

In [8]:
def preprocess_test(batch):
    pairIDs = batch["pairID"]
    gold_labels = batch["gold_label"]
    Sentence1s = batch["Sentence1"]
    Sentence2s = batch["Sentence2"]
    explanation = batch["Explanation"]

    inputs = [
        f"Does the sentence '{s1}' entail or contradict the sentence '{s2}'? Please answer between 'Entails' or 'Contradicts' and explain your decision in a sentence."
        for s1, s2 in zip(Sentence1s, Sentence2s)
    ]

    targets = [
        f"{label} - {explanation}"
        for label, explanation in zip(gold_labels, explanation)
    ]

    inputs = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    targets = tokenizer(
        targets,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}


test_dataset = Dataset.from_pandas(e_snli_test)
tokenized_test = test_dataset.map(preprocess_test, batched=True)

Map:   0%|          | 0/6605 [00:00<?, ? examples/s]

['entailment - "Filled with song" is a rephrasing of the "choir sings to the masses."', 'contradiction - A choir sing some other songs other than book at church during the base play and they cannot see book and play base ball same time.', 'entailment - One must be happy in order to have a big grin.', 'contradiction - There can be either a woman with a very big grin or a woman who has been shot.', 'entailment - A man poses in front of an ad is the same as a man poses in front of advertisement because ad is an abbreviation for advertisement.', 'contradiction - The man poses in front of the advertisement therefore he did not walk by it.', 'entailment - A statue that no one is looking at would imply that not many people are interested in it.', 'contradiction - If tons of people are gathered around the statue, it is not possible that no one seems to be looking at it.', 'entailment - A Land Rover being driven across a river implies that water will be splashing as it corsses.', 'entailment - 

In [9]:
import numpy as np
import evaluate

metric = evaluate.load("rouge")

# TODO: Is this correct? Test.


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    print("Before decoding")
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(decoded_preds)
    print(decoded_labels)
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    print("result", result)
    return result["rouge1"]

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="models",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=32,  # smaller batch size to run on less VRAM
    per_device_train_batch_size=32,  # smaller batch size to run on less VRAM
    num_train_epochs=1,
    learning_rate=1e-4,
    save_total_limit=1,
    save_steps=10,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [12]:
trainer.train()

del tokenized_train
del tokenized_test
del model
gc.collect()

  0%|          | 0/11457 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3428, 'learning_rate': 9.563585580867592e-05, 'epoch': 0.04}
{'loss': 0.237, 'learning_rate': 9.127171161735184e-05, 'epoch': 0.09}
{'loss': 0.227, 'learning_rate': 8.690756742602775e-05, 'epoch': 0.13}
{'loss': 0.2196, 'learning_rate': 8.254342323470367e-05, 'epoch': 0.17}
{'loss': 0.213, 'learning_rate': 7.817927904337959e-05, 'epoch': 0.22}
{'loss': 0.2091, 'learning_rate': 7.381513485205552e-05, 'epoch': 0.26}
{'loss': 0.2098, 'learning_rate': 6.945099066073143e-05, 'epoch': 0.31}
{'loss': 0.2039, 'learning_rate': 6.508684646940735e-05, 'epoch': 0.35}
{'loss': 0.2036, 'learning_rate': 6.072270227808327e-05, 'epoch': 0.39}
{'loss': 0.2022, 'learning_rate': 5.63585580867592e-05, 'epoch': 0.44}
{'loss': 0.2029, 'learning_rate': 5.1994413895435114e-05, 'epoch': 0.48}
{'loss': 0.2007, 'learning_rate': 4.7630269704111024e-05, 'epoch': 0.52}
{'loss': 0.1984, 'learning_rate': 4.326612551278694e-05, 'epoch': 0.57}
{'loss': 0.1984, 'learning_rate': 3.8901981321462864e-05, 'epoch':

  0%|          | 0/207 [00:00<?, ?it/s]

{'eval_loss': 0.2696358561515808, 'eval_runtime': 17.7659, 'eval_samples_per_second': 371.779, 'eval_steps_per_second': 11.652, 'epoch': 1.0}
{'train_runtime': 3512.2859, 'train_samples_per_second': 104.377, 'train_steps_per_second': 3.262, 'train_loss': 0.20994369561441012, 'epoch': 1.0}


81

In [32]:
model = AutoModelForSeq2SeqLM.from_pretrained("models/checkpoint-11450")

P = "I bet I am blue."
H = "I bet I am like a cherry."
prompt = f"Does the sentence '{P}' entail or contradict the sentence '{H}'? Please answer between 'Entails' or 'Contradicts' and explain your decision in a sentence."
tokens = tokenizer(prompt, return_tensors="pt").input_ids
tokens = tokens.to(model.device)
output = model.generate(tokens, max_new_tokens=100)
output = tokenizer.decode(output[0])
print(output)

<pad> contradiction - Blue is not a cherry.</s>


## Finetuning on FLUTE

From the FLUTE paper:

> We fine-tune the 3B version of T5
> model for 10 epochs with a batch size of 1024, and
> an AdamW Optimizer with a learning rate of 1e−4
> in a multitask fashion with data from all the four
> types of figurative languages combined. Our train-
> ing data consists of 7,035 samples which is 50X
> smaller than e-SNLI. For validation we use 500 ex-
> amples which is used for selecting best checkpoint
> based on loss.


In [None]:
flute_dataset = pd.read_json(BASE_DIR / "datasets" / "train.jsonl")

print(len(flute_dataset))