In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Trainer
import pandas as pd
import torch
import evaluate
import nltk

BATCH_SIZE = 10
NUM_EPOCHS = 8
base_checkpoint = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

def add_cols(entry):

    premise = entry["premise"].strip()
    hypothesis = entry["hypothesis"].strip()

    if not premise.endswith("."):
        premise += "."
    assert(premise.endswith("."))
    if not hypothesis.endswith("."):
        hypothesis += "."
    assert(hypothesis.endswith("."))

    # Columns for System 1
    entry["premise_hypothesis"] = 'Premise: ' + premise + ' Hypothesis: ' + hypothesis + ' Is there a contradiction or entailment between the premise and hypothesis ?'
    #entry["label_explanation"] = 'Explanation: ' + entry["explanation"] + '. Label: ' + entry["label"]
    entry["label_explanation"] = 'Label: ' + entry["label"] + '. Explanation: ' + entry["explanation"]
    return entry

df = pd.read_csv("complete_dataset.csv").fillna("")
df_syn = pd.read_csv("synthetic_data_merge.tsv", sep="\t").fillna("")
ds = Dataset.from_pandas(df).shuffle(seed=42)
ds_syn = Dataset.from_pandas(df_syn).shuffle(seed=42)

ds = ds.map(add_cols)
ds_syn = ds_syn.map(add_cols)

def preprocess_dataset_s1(examples):
    model_inputs = tokenizer(examples['premise_hypothesis'], truncation=True, max_length=512)
    labels = tokenizer(examples['label_explanation'], truncation=True, max_length=512)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

Map:   0%|          | 0/7534 [00:00<?, ? examples/s]

Map:   0%|          | 0/23154 [00:00<?, ? examples/s]

In [2]:
sentence = "translate English to French: Hello big guy, I'm a very strange man"
tokenizer.decode(model.generate(tokenizer(sentence, return_tensors="pt")['input_ids'])[0], skip_special_tokens=True)



'Bonjour grand homme, je suis un homme très étrange'

In [3]:
for now_ds in (ds_syn.train_test_split(test_size=0.2), ds.train_test_split(test_size=0.2)):

    curr_ds = now_ds.map(preprocess_dataset_s1, batched=True).remove_columns(now_ds['train'].column_names)

    training_args = Seq2SeqTrainingArguments(
        output_dir=f"T5-small-synthetic-FLUTE",
        learning_rate=3e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=2*8,
        save_total_limit=2,
        num_train_epochs=NUM_EPOCHS,
        report_to="none",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        eval_accumulation_steps=1,
        logging_steps=1,
        lr_scheduler_type="constant"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=curr_ds["train"],
        eval_dataset=curr_ds["test"].select(range(350)),
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    )

    trainer.train()

    # have to do batched rouge computation otherwise not enough memory
    rouge = evaluate.load("rouge")
    metrics = {'rouge1': 0., 'rouge2': 0., 'rougeL': 0., 'rougeLsum': 0.}
    count = 0
    for i in range(0, len(curr_ds['test']), 80):
        count += 1
        (predictions, _), label_ids, _ = trainer.predict(test_dataset=curr_ds['test'].select(range(i, min(i+80, len(curr_ds['test'])))))
        # delete stuff after EOS token
        predicted_token_ids = torch.argmax(torch.from_numpy(predictions), dim=-1)
        for i in range(predicted_token_ids.shape[0]):
            ind = (predicted_token_ids[i] == 1).nonzero(as_tuple=True)[0]
            if ind.numel() != 0:
                predicted_token_ids[i, ind[0]:] = 1

        decoded_preds = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
        labels = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        new_metrics = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        for k in new_metrics:
            metrics[k] += new_metrics[k]

    for k in metrics:
            metrics[k] /= count

    print(metrics['rouge1'])
    break

Map:   0%|          | 0/18523 [00:00<?, ? examples/s]

Map:   0%|          | 0/4631 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.6954,1.271385
2,1.039,1.169694
3,1.4594,1.124114
4,0.9971,1.0919
5,1.3562,1.075722
6,0.6796,1.06535
7,1.0082,1.054857
8,0.8228,1.051661


0.6888044528955631


In [4]:
from huggingface_hub import login
login()
trainer.push_to_hub()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/RicoBorra/T5-small-synthetic-FLUTE/tree/main/'

In [4]:
decoded_preds

['Label: Contratailment. Explanation: A slap and tickle refers to physical amorous play, which the manentailment is saying that the man is looking looking looking for physical am but but also.',
 'Label: Contradiction. Explanation: To smot something something means to destroy or aggressive, while to avoid something means to to move it off',
 'Label: Contradiction. Explanation: Most fact of  a paper boy deliver to deliver that everyone receive their work on time and so not who does delivers their is time is not awesome their expectation. hence it be awesome awesome awesome.',
 'Label: Entailment. Explanation: It is often very as  and friends are late to  that are been planned for advance and soif friends happens all it friends friends person of friends it is natural to feel upset when inconvenience',
 "Label: Contradiction. Explanation: It is not that someone's ex girlfriend cheating on them with one of their close friends would be understandable because it would the trust and is usuall

In [5]:
decoded_labels

['Label: Entailment. Explanation: A slap and tickle refers to physical amorous play, so the entailment is saying that the man is not just looking for physical play, but for love.',
 'Label: Contradiction. Explanation: To smite something means to strike it hard, while to avoid something means not to hit it.',
 'Label: Contradiction. Explanation: The purpose of having a paper boy is to ensure that people receive their newspapers on time, so someone who never delivers it on time is not meeting that expectation and hence cannot be considered as awesome.',
 'Label: Entailment. Explanation: It is often seen as rude when people are late to things that have been planned in advance and if it happens frequently with the same group of friends it is natural to feel pain or frustration',
 "Label: Contradiction. Explanation: It is unlikely that someone's ex girlfriend cheating on them with one of their close friends would be understandable because it breaks the trust which is usually present in thos

In [8]:
#curr_ds = now_ds.map(preprocess_dataset_s1, batched=True).remove_columns(now_ds['train'].column_names)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir=f"synthetics",
    learning_rate=3e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=2*BATCH_SIZE,
    save_total_limit=2,
    num_train_epochs=8,
    report_to="none",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    eval_accumulation_steps=1,
    logging_steps=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=curr_ds["train"],
    eval_dataset=curr_ds["test"].select(range(350)),
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

trainer.train()

# have to do batched rouge computation otherwise not enough memory
rouge = evaluate.load("rouge")
metrics = {'rouge1': 0., 'rouge2': 0., 'rougeL': 0., 'rougeLsum': 0.}
count = 0
for i in range(0, len(curr_ds['test']), 100):
    count += 1
    (predictions, _), label_ids, _ = trainer.predict(test_dataset=curr_ds['test'].select(range(i, min(i+100, len(curr_ds['test'])))))
    predicted_token_ids = torch.argmax(torch.from_numpy(predictions), dim=-1)
    # delete stuff after EOS token
    for i in range(predicted_token_ids.shape[0]):
        ind = (predicted_token_ids[i] == 1).nonzero(as_tuple=True)[0]
        if ind.numel() != 0:
            predicted_token_ids[i, ind[0]:] = 1

    decoded_preds = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
    labels = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    new_metrics = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    for k in new_metrics:
        metrics[k] += new_metrics[k]

for k in metrics:
        metrics[k] /= count

print(metrics['rouge1'])

Epoch,Training Loss,Validation Loss
1,1.7071,1.504676
2,1.5514,1.418469
3,1.3719,1.367043
4,1.3048,1.344013
5,1.3831,1.328908
6,1.3697,1.316552
7,1.0337,1.316561
8,0.7297,1.31605


0.6565146506379984
