In [1]:
import WebNLG_xmlReader.benchmark_reader as xml_reader
import os.path
import pickle
import pandas as pd
import numpy as np
import re
import tensorflow as tf

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 

tokenizer = AutoTokenizer.from_pretrained("t5-base") #"t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") #TFAutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
from datasets import load_dataset
#dataset = load_dataset("../Data/Cleaned_data/", data_files="Train.csv")
full_dataset = load_dataset("../Data/Cleaned_data/", data_files={"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"})

Using custom data configuration Cleaned_data-94f25176065263b6
Found cached dataset csv (C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-94f25176065263b6/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 3/3 [00:00<00:00, 999.12it/s]


In [4]:
# zie https://github.com/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb

def preprocess_data(data):
    max_length = 256
    RDFs = data["input"]
    texts = data["output"]

    model_inputs = tokenizer(RDFs, truncation=True, padding="max_length")

    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    with tokenizer.as_target_tokenizer():
        target_texts = tokenizer(texts, padding="max_length", truncation=True).input_ids

    #target_texts = tokenizer(texts, truncation=True, padding="max_length").input_ids
    
    #model_inputs['decoder_input_ids'] = np.zeros((len(target_texts), 0))
    model_inputs["labels"] = target_texts
    #print(f'target_texts    {target_texts}')

    # ook nodig

    return model_inputs

In [5]:
encoded_train_ds = full_dataset["train"].map(preprocess_data, batched=True, remove_columns=full_dataset["train"].column_names)  
encoded_dev_ds = full_dataset["dev"].map(preprocess_data, batched=True, remove_columns=full_dataset["dev"].column_names)
encoded_test_ds = full_dataset["test"].map(preprocess_data, batched=True, remove_columns=full_dataset["test"].column_names)


Loading cached processed dataset at C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-94f25176065263b6/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-06c63a61b1dc6bc6.arrow
  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
100%|██████████| 6/6 [00:01<00:00,  5.29ba/s]
Loading cached processed dataset at C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-94f25176065263b6/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-c896e0437fc2affe.arrow


In [6]:
# create smaller dataset chunk
small_train = encoded_train_ds.shard(num_shards = 8, index = 0)
small_val = encoded_dev_ds.shard(num_shards = 8, index = 0)
small_test = encoded_test_ds.shard(num_shards = 8, index = 0)


In [42]:
# Create functions for evaluating the predictions/generations
import datasets
metric = datasets.load_metric("bleu")
def postprocess_text(preds, labels):
    preds = [pred.split() for pred in preds]
    labels = [[label.split()] for label in labels]
    return preds, labels

# def compute_metrics(pred):
#     predictions, labels = pred
#     if isinstance(predictions, tuple):
#         predictions = predictions[0]
#     decoded_preds = tokenizer.batch_decode(predictions,
#     skip_special_tokens=True)

#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels,
#     skip_special_tokens=True)
#     # Some simple post-processing
#     decoded_preds, decoded_labels = postprocess_text(decoded_preds,
#     decoded_labels)

#     bleu_output = bleu.compute(predictions=decoded_preds,
#     references=decoded_labels)
#     return bleu_output

def compute_metrics(eval_preds):
    #print(f"eval_preds   {eval_preds} \n\n")
    preds, labels = eval_preds #only has 2 values to unpack
    
    if isinstance(preds, tuple):
        preds = preds[0]
    
    print(f"preds   {preds} \n\n")
    print(f"labels   {labels} \n\n")

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    #if data_args.ignore_pad_token_for_loss:
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    #print(f"decoded_preds   {decoded_preds} \n\n")
    #print(f"decoded_labels   {decoded_labels} \n\n")

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    print(f'results     {result}')

    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [16]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=0.001,
    do_eval=True, # will be set to true if evaluation strategy is set
    do_predict=True, #Whether to run predictions on the test set or not.
    evaluation_strategy="steps", 
    eval_steps= 100, # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
    save_steps=200, # Number of updates steps before two checkpoint saves if save_strategy="steps".
    max_steps=10, # the total number of training steps to perform
    save_total_limit= 10, # the maximum number of models to keep before deleting the oldest one
    predict_with_generate=True, # Whether to use generate to calculate generative metrics (ROUGE, BLEU).
    generation_num_beams=10,  #The num_beams to use on each evaluation loop when predict_with_generate=True. Will default to the num_beams value of the model configuration
    gradient_checkpointing=True, #
    gradient_accumulation_steps=1, #Number of updates steps to accumulate the gradients for, before performing a backward/update pass
    per_device_train_batch_size=4, #The batch size per GPU/TPU core/CPU for training.
    per_device_eval_batch_size=4, #The batch size per GPU/TPU core/CPU for evaluation.
    optim="adafactor" #The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor.
    #fp16=True
)

#stepsize vergroten (net als medium post)
#fp16 toepassen

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [17]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,  return_tensors="pt")

In [43]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset= small_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)


max_steps is given, it will override any value given in num_train_epochs


In [11]:
trainer.train()

***** Running training *****
  Num examples = 5734
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  0%|          | 0/10 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
 10%|█         | 1/10 [00:02<00:18,  2.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
 20%|██        | 2/10 [00:03<00:15,  1.93s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
 30%|███       | 3/10 [00:08<00:21,  3.04s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache

{'train_runtime': 55.2744, 'train_samples_per_second': 0.724, 'train_steps_per_second': 0.181, 'train_loss': 2.6537424087524415, 'epoch': 0.01}





TrainOutput(global_step=10, training_loss=2.6537424087524415, metrics={'train_runtime': 55.2744, 'train_samples_per_second': 0.724, 'train_steps_per_second': 0.181, 'train_loss': 2.6537424087524415, 'epoch': 0.01})

In [44]:
trainer.evaluate(small_val) #evaluate on validation set
test_op = trainer.predict(small_test) # predict on test set
print(tokenizer.decode(test_op[0][1],skip_special_tokens=True))

***** Running Evaluation *****
  Num examples = 687
  Batch size = 4




[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A




preds   [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 


labels   [[   71  2335  7532 ...     0     0     0]
 [21512     6     3 ...     0     0     0]
 [   71  4509   568 ...     0     0     0]
 ...
 [ 4603    49  6899 ...     0     0     0]
 [10511  4049     9 ...     0     0     0]
 [   86    20  6494 ...     0     0     0]] 




ZeroDivisionError: float division by zero

### future evals
1. https://github.com/WanzhengZhu/GRUEN
2. https://github.com/ufal/nlgi_eval
3. https://github.com/google-research/bleurt