In [1]:
from transformers import TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np
import os
import pandas as pd
from pathlib import Path

os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["WANDB_DISABLED"] = "true"
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

BASE_DIR = Path(".")
BATCH_SIZE = 8
NUM_EPOCHS = 5
NUM_SAMPLES = 1_000
MULTI_STAGES = False
FREEZE = False
FROZEN_LAYERS = 15

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "google/t5-efficient-tiny"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 256)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 256)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=256, out_features=256, bias=False)
              (k): Linear(in_features=256, out_features=256, bias=False)
              (v): Linear(in_features=256, out_features=256, bias=False)
              (o): Linear(in_features=256, out_features=256, bias=False)
              (relative_attention_bias): Embedding(32, 4)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=256, out_features=1024, bias=False)
              (wo): Linear(in_features=1024, out_features=256, bias=False)
              (dropout): Drop

In [4]:
e_snli_train_1 = pd.read_csv(BASE_DIR / "datasets" / "esnli_train_1.csv")[:NUM_SAMPLES]
e_snli_train_2 = pd.read_csv(BASE_DIR / "datasets" / "esnli_train_2.csv")[:NUM_SAMPLES]
e_snli_train = pd.concat([e_snli_train_1, e_snli_train_2])

e_snli_test = pd.read_csv(BASE_DIR / "datasets" / "esnli_test.csv")[:NUM_SAMPLES]

print(len(e_snli_train_1))
print(len(e_snli_train_2))
print(len(e_snli_train))
e_snli_train.head(1)

1000
1000
2000


Unnamed: 0,pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
0,3416050480.jpg#4r1n,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,the person is not necessarily training his horse,AF0PI3RISB5Q7,A person on a horse jumps over a broken down a...,A person is *training* *his* *horse* for a co...,{},345


In [5]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(e_snli_train)

def preprocess_function(batch):
    pairIDs = batch['pairID']
    gold_labels = batch['gold_label']
    Sentence1s = batch['Sentence1']
    Sentence2s = batch['Sentence2']
    Explanation_1s = batch['Explanation_1']
    WorkerIds = batch['WorkerId']
    Sentence1_marked_1s = batch['Sentence1_marked_1']
    Sentence2_marked_1s = batch['Sentence2_marked_1']
    Sentence1_Highlighted_1s = batch['Sentence1_Highlighted_1']
    Sentence2_Highlighted_1s = batch['Sentence2_Highlighted_1']

    inputs = [f"Does the sentence '{s1}' entail or contradict the sentence '{s2}'? Please answer between 'Entails' or 'Contradicts' and explain your decision in a sentence." for s1, s2 in zip(Sentence1s, Sentence2s)]

    targets = [f"{label} - {explanation}" for label, explanation in zip(gold_labels, Explanation_1s)]

    inputs = tokenizer(inputs)
    targets = tokenizer(targets)

    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}

# https://huggingface.co/docs/transformers/main_classes/data_collator

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets.remove_columns(['pairID', 'gold_label', 'Sentence1', 'Sentence2', 'Explanation_1', 'WorkerId', 'Sentence1_marked_1', 'Sentence2_marked_1', 'Sentence1_Highlighted_1', 'Sentence2_Highlighted_1', '__index_level_0__'])
tokenized_datasets

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map: 100%|██████████| 2000/2000 [00:00<00:00, 4138.39 examples/s]


Dataset({
    features: ['pairID', 'gold_label', 'Sentence1', 'Sentence2', 'Explanation_1', 'WorkerId', 'Sentence1_marked_1', 'Sentence2_marked_1', 'Sentence1_Highlighted_1', 'Sentence2_Highlighted_1', '__index_level_0__', 'input_ids', 'labels'],
    num_rows: 2000
})

In [7]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments( 
    output_dir="models", 
    evaluation_strategy="epoch", 
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(500))
small_eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(500))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [10]:
trainer.train()

  0%|          | 0/1500 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


 33%|███▎      | 500/1500 [01:07<03:20,  5.00it/s]

{'loss': 5.4077, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}




OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacty of 1.95 GiB of which 14.62 MiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.85 GiB is allocated by PyTorch, and 32.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF