In [1]:
from transformers import TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np
import os
import pandas as pd
from pathlib import Path

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

BASE_DIR = Path(".")
BATCH_SIZE = 8
NUM_EPOCHS = 5
NUM_SAMPLES = 1_000
MULTI_STAGES = False
FREEZE = False
FROZEN_LAYERS = 15

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "google/t5-efficient-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 256)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 256)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=256, out_features=256, bias=False)
              (k): Linear(in_features=256, out_features=256, bias=False)
              (v): Linear(in_features=256, out_features=256, bias=False)
              (o): Linear(in_features=256, out_features=256, bias=False)
              (relative_attention_bias): Embedding(32, 4)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=256, out_features=1024, bias=False)
              (wo): Linear(in_features=1024, out_features=256, bias=False)
              (dropout): Drop

In [4]:
e_snli_train_1 = pd.read_csv(BASE_DIR / "datasets" / "esnli_train_1.csv")[:NUM_SAMPLES]
e_snli_train_2 = pd.read_csv(BASE_DIR / "datasets" / "esnli_train_2.csv")[:NUM_SAMPLES]
e_snli_train = pd.concat([e_snli_train_1, e_snli_train_2])

e_snli_test = pd.read_csv(BASE_DIR / "datasets" / "esnli_test.csv")[:NUM_SAMPLES]

print(len(e_snli_train_1))
print(len(e_snli_train_2))
print(len(e_snli_train))
e_snli_train.head(1)

1000
1000
2000


Unnamed: 0,pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
0,3416050480.jpg#4r1n,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,the person is not necessarily training his horse,AF0PI3RISB5Q7,A person on a horse jumps over a broken down a...,A person is *training* *his* *horse* for a co...,{},345


In [5]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [6]:
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import numpy as np

dataset = Dataset.from_pandas(e_snli_train)

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def preprocess_function(batch):
    pairIDs = batch["pairID"]
    gold_labels = batch["gold_label"]
    Sentence1s = batch["Sentence1"]
    Sentence2s = batch["Sentence2"]
    Explanation_1s = batch["Explanation_1"]
    WorkerIds = batch["WorkerId"]
    Sentence1_marked_1s = batch["Sentence1_marked_1"]
    Sentence2_marked_1s = batch["Sentence2_marked_1"]
    Sentence1_Highlighted_1s = batch["Sentence1_Highlighted_1"]
    Sentence2_Highlighted_1s = batch["Sentence2_Highlighted_1"]

    inputs = [
        f"Does the sentence '{s1}' entail or contradict the sentence '{s2}'? Please answer between 'Entails' or 'Contradicts' and explain your decision in a sentence."
        for s1, s2 in zip(Sentence1s, Sentence2s)
    ]

    targets = [
        f"{label} - {explanation}"
        for label, explanation in zip(gold_labels, Explanation_1s)
    ]

    inputs = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=20,
        return_tensors="pt",
    )
    targets = tokenizer(
        targets,
        truncation=True,
        padding="max_length",
        max_length=20,
        return_tensors="pt",
    )
    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}


# https://huggingface.co/docs/transformers/main_classes/data_collator

tokenized_datasets = dataset.select(range(500)).map(preprocess_function, batched=True)
tokenized_datasets.remove_columns(
    [
        "pairID",
        "gold_label",
        "Sentence1",
        "Sentence2",
        "Explanation_1",
        "WorkerId",
        "Sentence1_marked_1",
        "Sentence2_marked_1",
        "Sentence1_Highlighted_1",
        "Sentence2_Highlighted_1",
        "__index_level_0__",
    ]
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 500/500 [00:00<00:00, 5172.33 examples/s]


Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 500
})

In [7]:
import numpy as np
import evaluate

metric = evaluate.load("rouge")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    print("Before decoding")
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(decoded_preds)
    print(decoded_labels)
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    print("result", result)
    return result["rouge1"]

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="models",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(500))
small_eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(500))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    # compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [10]:
small_eval_dataset.shape

(500, 13)

In [11]:
trainer.train()

  0%|          | 0/1500 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


 33%|███▎      | 500/1500 [00:21<00:41, 24.12it/s]

{'loss': 3.1964, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


                                                  
 34%|███▎      | 505/1500 [00:25<05:11,  3.20it/s]

{'eval_loss': 2.1255650520324707, 'eval_runtime': 2.9202, 'eval_samples_per_second': 171.221, 'eval_steps_per_second': 171.221, 'epoch': 1.0}


 67%|██████▋   | 1000/1500 [00:46<00:20, 24.46it/s]

{'loss': 2.3043, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                                   
 67%|██████▋   | 1003/1500 [00:50<03:22,  2.45it/s]

{'eval_loss': 1.9352325201034546, 'eval_runtime': 3.1034, 'eval_samples_per_second': 161.111, 'eval_steps_per_second': 161.111, 'epoch': 2.0}


100%|██████████| 1500/1500 [01:11<00:00, 23.70it/s]

{'loss': 2.1629, 'learning_rate': 0.0, 'epoch': 3.0}


                                                   
100%|██████████| 1500/1500 [01:14<00:00, 20.03it/s]

{'eval_loss': 1.8929709196090698, 'eval_runtime': 3.0619, 'eval_samples_per_second': 163.296, 'eval_steps_per_second': 163.296, 'epoch': 3.0}
{'train_runtime': 74.8938, 'train_samples_per_second': 20.028, 'train_steps_per_second': 20.028, 'train_loss': 2.5545413411458333, 'epoch': 3.0}





TrainOutput(global_step=1500, training_loss=2.5545413411458333, metrics={'train_runtime': 74.8938, 'train_samples_per_second': 20.028, 'train_steps_per_second': 20.028, 'train_loss': 2.5545413411458333, 'epoch': 3.0})

In [12]:
predictions = trainer.predict(small_eval_dataset)

decoded_preds = tokenizer.batch_decode(
    predictions.predictions, skip_special_tokens=True
)

print(decoded_preds)

# crashes

100%|██████████| 500/500 [00:10<00:00, 46.68it/s] 