In [3]:
import WebNLG_xmlReader.benchmark_reader as xml_reader
import os.path
import pickle
import pandas as pd
import numpy as np
import re
import torch 
import tensorflow as tf
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline

from datasets import load_dataset


In [1]:
#import model
#tokenizer = AutoTokenizer.from_pretrained("t5-base")
#model = T5ForConditionalGeneration.from_pretrained("t5-base")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 376/376 [00:00<00:00, 376kB/s]
Downloading: 100%|██████████| 702/702 [00:00<00:00, 701kB/s]
Downloading: 100%|██████████| 4.11M/4.11M [00:00<00:00, 5.10MB/s]
Downloading: 100%|██████████| 65.0/65.0 [00:00<00:00, 64.8kB/s]
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Downloading: 100%|██████████| 2.17G/2.17G [00:35<00:00, 65.9MB/s]


## Data Preperation

In [4]:
full_dataset = load_dataset("../Data/Cleaned_data/", data_files={"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"})

Using custom data configuration Cleaned_data-bcb014efcf526ad6
Found cached dataset csv (C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-bcb014efcf526ad6/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 3/3 [00:00<00:00, 998.88it/s]


In [11]:
full_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 45870
    })
    dev: Dataset({
        features: ['input', 'output'],
        num_rows: 5493
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 9084
    })
})

In [13]:
# zie https://github.com/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb

def preprocess_data(data):
    max_length = 256
    RDFs = data["input"]
    texts = data["output"]

    model_inputs = tokenizer(RDFs, truncation=True, padding='max_length', return_tensors='pt',  max_length=max_length)

    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    with tokenizer.as_target_tokenizer():
        target_texts = tokenizer(texts, padding='max_length', truncation=True, return_tensors='pt',  max_length=max_length).input_ids
    
    model_inputs["labels"] = target_texts
    
    return model_inputs

In [15]:
encoded_train_ds = full_dataset["train"].map(preprocess_data, batched=True, remove_columns=full_dataset["train"].column_names)  
encoded_dev_ds = full_dataset["dev"].map(preprocess_data, batched=True, remove_columns=full_dataset["dev"].column_names)
encoded_test_ds = full_dataset["test"].map(preprocess_data, batched=True, remove_columns=full_dataset["test"].column_names)

encoded_train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_dev_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# create smaller dataset chunk
small_train = encoded_train_ds.shard(num_shards = 16, index = 0)
small_val = encoded_dev_ds.shard(num_shards = 16, index = 0)
small_test = encoded_test_ds.shard(num_shards = 16, index = 0)

100%|██████████| 46/46 [00:04<00:00, 10.26ba/s]
100%|██████████| 6/6 [00:00<00:00, 11.82ba/s]
100%|██████████| 10/10 [00:00<00:00, 10.72ba/s]


## Model Prep

In [16]:
# pip install datasets
import datasets
bleu = datasets.load_metric("bleu")
def postprocess_text(preds, labels):
    preds = [pred.split() for pred in preds]
    labels = [[label.split()] for label in labels]
    return preds, labels

def compute_metrics(pred):
    predictions, labels = pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
     
    decoded_preds = tokenizer.batch_decode(predictions,
    skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels,
    skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
    decoded_labels)

    bleu_output = bleu.compute(predictions=decoded_preds,
    references=decoded_labels)
    return bleu_output

  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
model_name = "mt5-base-smallDS-text-generation-1Epoch"
model_dir = f"../Results/{model_name}"


training_args = Seq2SeqTrainingArguments(
                output_dir=model_dir,
                learning_rate=0.001,
                do_eval=True, # will be set to true if evaluation strategy is set
                do_predict=True, #Whether to run predictions on the test set or not.
                num_train_epochs=1,
                evaluation_strategy="epoch", 
                #eval_steps= 100, # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
                save_steps=500, # Number of updates steps before two checkpoint saves if save_strategy="steps".
                #max_steps=10, # the total number of training steps to perform
                save_total_limit= 10, # the maximum number of models to keep before deleting the oldest one
                predict_with_generate=True, # Whether to use generate to calculate generative metrics (ROUGE, BLEU).
                generation_num_beams=10,  #The num_beams to use on each evaluation loop when predict_with_generate=True. Will default to the num_beams value of the model configuration
                gradient_checkpointing=True, #
                gradient_accumulation_steps=1, #Number of updates steps to accumulate the gradients for, before performing a backward/update pass
                per_device_train_batch_size=4, #The batch size per GPU/TPU core/CPU for training.
                per_device_eval_batch_size=4, #The batch size per GPU/TPU core/CPU for evaluation.
                optim="adafactor", #The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor.
                report_to="tensorboard"
)

In [18]:
def model_init():
    """
    Simple function to ensure that a new model is used for finetuning
    """
    return AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

trainer = Seq2SeqTrainer(
            model_init=model_init,
            args=training_args,
            train_dataset=small_train,
            eval_dataset=small_val,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer
            )

loading configuration file https://huggingface.co/google/mt5-base/resolve/main/config.json from cache at C:\Users\Simon/.cache\huggingface\transformers\5ebfd830555547194403d6803baa127970de59b443c04b7a1a60b16a97ed3958.b589da7dac64196f9764abaf2c4c7e507cec8b14b96da3ef270d924f155062de
Model config MT5Config {
  "_name_or_path": "google/mt5-base",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.19.4",
  "use_cache": true,
  "v

In [51]:
# Start TensorBoard before training to monitor it in progress

%load_ext tensorboard
%tensorboard --logdir '{model_dir}'/runs

## Train and Evaluate

In [19]:
trainer.train()

loading configuration file https://huggingface.co/google/mt5-base/resolve/main/config.json from cache at C:\Users\Simon/.cache\huggingface\transformers\5ebfd830555547194403d6803baa127970de59b443c04b7a1a60b16a97ed3958.b589da7dac64196f9764abaf2c4c7e507cec8b14b96da3ef270d924f155062de
Model config MT5Config {
  "_name_or_path": "google/mt5-base",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.19.4",
  "use_cache": true,
  "v

{'loss': 1.9236, 'learning_rate': 0.00030264993026499307, 'epoch': 0.7}


Model weights saved in ../Results/mt5-base-smallDS-text-generation-1Epoch\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ../Results/mt5-base-smallDS-text-generation-1Epoch\checkpoint-500\tokenizer_config.json
Special tokens file saved in ../Results/mt5-base-smallDS-text-generation-1Epoch\checkpoint-500\special_tokens_map.json
Copy vocab file to ../Results/mt5-base-smallDS-text-generation-1Epoch\checkpoint-500\spiece.model
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
 70%|██████▉   | 501/717 [3:42:57<1:39:10, 27.55s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
 70%|███████   | 502/717 [3:43:23<1:37:06, 27.10s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
 70%|███████   | 503/717 [3:43:49<1:35:41, 26.83s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
 70%|███████   | 504/717 [3:44:15<

{'eval_loss': 0.43952852487564087, 'eval_bleu': 0.0, 'eval_precisions': [0.08303484207098664, 0.008434176751008434, 0.00041911148365465214, 0.0], 'eval_brevity_penalty': 0.4662917972068701, 'eval_length_ratio': 0.5672330993719985, 'eval_translation_length': 3071, 'eval_reference_length': 5414, 'eval_runtime': 1511.3112, 'eval_samples_per_second': 0.228, 'eval_steps_per_second': 0.057, 'epoch': 1.0}
{'train_runtime': 20631.5873, 'train_samples_per_second': 0.139, 'train_steps_per_second': 0.035, 'train_loss': 1.494947712790517, 'epoch': 1.0}





TrainOutput(global_step=717, training_loss=1.494947712790517, metrics={'train_runtime': 20631.5873, 'train_samples_per_second': 0.139, 'train_steps_per_second': 0.035, 'train_loss': 1.494947712790517, 'epoch': 1.0})

In [20]:
trainer.save_model(f"../Models/{model_name}") 

Saving model checkpoint to ../Models/mt5-base-smallDS-text-generation-1Epoch
Configuration saved in ../Models/mt5-base-smallDS-text-generation-1Epoch\config.json
Model weights saved in ../Models/mt5-base-smallDS-text-generation-1Epoch\pytorch_model.bin
tokenizer config file saved in ../Models/mt5-base-smallDS-text-generation-1Epoch\tokenizer_config.json
Special tokens file saved in ../Models/mt5-base-smallDS-text-generation-1Epoch\special_tokens_map.json
Copy vocab file to ../Models/mt5-base-smallDS-text-generation-1Epoch\spiece.model


In [21]:
trainer.evaluate(small_val)
test_op = trainer.predict(small_test)
print(tokenizer.decode(test_op[0][1],skip_special_tokens=True))

***** Running Evaluation *****
  Num examples = 344
  Batch size = 4


KeyboardInterrupt: 

# MBART Implementation

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

bart_tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")

bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
full_dataset = load_dataset("../Data/Cleaned_data/", data_files={"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"})

Using custom data configuration Cleaned_data-bcb014efcf526ad6
Found cached dataset csv (C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-bcb014efcf526ad6/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 3/3 [00:00<00:00, 749.43it/s]


In [22]:
# zie https://github.com/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb

def preprocess_data(data):
    #print(data)
    max_length = 256
    RDFs = data["input"]
    #print(f'RDF    {RDFs} \n\n\n')
    texts = data["output"]
    #print(f'texts    {texts} \n\n\n')

    model_inputs = bart_tokenizer(RDFs, truncation=True, padding='max_length', return_tensors='pt',  max_length=max_length)

    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    #with bart_tokenizer.as_target_tokenizer():
    #    pass
    target_texts = bart_tokenizer(texts, padding='max_length', truncation=True, return_tensors='pt',  max_length=max_length).input_ids
    
    model_inputs["labels"] = target_texts
    
    return model_inputs

In [23]:
encoded_train_ds = full_dataset["train"].map(preprocess_data, batched=True, remove_columns=full_dataset["train"].column_names)  
encoded_dev_ds = full_dataset["dev"].map(preprocess_data, batched=True, remove_columns=full_dataset["dev"].column_names)
encoded_test_ds = full_dataset["test"].map(preprocess_data, batched=True, remove_columns=full_dataset["test"].column_names)

encoded_train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_dev_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# create smaller dataset chunk
small_train = encoded_train_ds.shard(num_shards = 16, index = 0)
small_val = encoded_dev_ds.shard(num_shards = 16, index = 0)
small_test = encoded_test_ds.shard(num_shards = 16, index = 0)

100%|██████████| 46/46 [00:04<00:00,  9.97ba/s]
100%|██████████| 6/6 [00:00<00:00, 11.22ba/s]
100%|██████████| 10/10 [00:00<00:00, 11.28ba/s]


In [24]:
# pip install datasets
import datasets
bleu = datasets.load_metric("bleu")
def postprocess_text(preds, labels):
    preds = [pred.split() for pred in preds]
    labels = [[label.split()] for label in labels]
    return preds, labels

def compute_metrics(pred):
    predictions, labels = pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
     
    decoded_preds = bart_tokenizer.batch_decode(predictions,
    skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, bart_tokenizer.pad_token_id)
    decoded_labels = bart_tokenizer.batch_decode(labels,
    skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
    decoded_labels)

    bleu_output = bleu.compute(predictions=decoded_preds,
    references=decoded_labels)
    return bleu_output

  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
model_name = "mBart-base-smallDS-text-generation-1Epoch"
model_dir = f"../Results/{model_name}"


training_args_bart = Seq2SeqTrainingArguments(
                output_dir=model_dir,
                learning_rate=0.001,
                do_eval=True, # will be set to true if evaluation strategy is set
                do_predict=True, #Whether to run predictions on the test set or not.
                num_train_epochs=1,
                evaluation_strategy="epoch", 
                #eval_steps= 100, # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
                save_steps=500, # Number of updates steps before two checkpoint saves if save_strategy="steps".
                #max_steps=10, # the total number of training steps to perform
                save_total_limit= 10, # the maximum number of models to keep before deleting the oldest one
                predict_with_generate=True, # Whether to use generate to calculate generative metrics (ROUGE, BLEU).
                generation_num_beams=10,  #The num_beams to use on each evaluation loop when predict_with_generate=True. Will default to the num_beams value of the model configuration
                gradient_checkpointing=True, #
                gradient_accumulation_steps=1, #Number of updates steps to accumulate the gradients for, before performing a backward/update pass
                per_device_train_batch_size=4, #The batch size per GPU/TPU core/CPU for training.
                per_device_eval_batch_size=4, #The batch size per GPU/TPU core/CPU for evaluation.
                optim="adafactor", #The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor.
                report_to="tensorboard"
)

In [26]:
def model_init_bart():
    """
    Simple function to ensure that a new model is used for finetuning
    """
    return AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

bart_trainer = Seq2SeqTrainer(
            model_init=model_init_bart,
            args=training_args_bart,
            train_dataset=small_train,
            eval_dataset=small_val,
            compute_metrics=compute_metrics,
            tokenizer=bart_tokenizer
            )

loading configuration file https://huggingface.co/facebook/mbart-large-50/resolve/main/config.json from cache at C:\Users\Simon/.cache\huggingface\transformers\f05465e59eda7f301b62d284f0aff5987c4eafb42ddef0f71b9a8e4c4f6e00f2.b12b1c70d50e4d5e2fcb7773b69c8bbdd6d9f2e18e435c2b368caea016a3ef77
Model config MBartConfig {
  "_name_or_path": "facebook/mbart-large-50",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_toke

In [27]:
bart_trainer.train()

loading configuration file https://huggingface.co/facebook/mbart-large-50/resolve/main/config.json from cache at C:\Users\Simon/.cache\huggingface\transformers\f05465e59eda7f301b62d284f0aff5987c4eafb42ddef0f71b9a8e4c4f6e00f2.b12b1c70d50e4d5e2fcb7773b69c8bbdd6d9f2e18e435c2b368caea016a3ef77
Model config MBartConfig {
  "_name_or_path": "facebook/mbart-large-50",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_toke

{'loss': 1.1941, 'learning_rate': 0.00030264993026499307, 'epoch': 0.7}


Model weights saved in ../Results/mBart-base-smallDS-text-generation-1Epoch\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ../Results/mBart-base-smallDS-text-generation-1Epoch\checkpoint-500\tokenizer_config.json
Special tokens file saved in ../Results/mBart-base-smallDS-text-generation-1Epoch\checkpoint-500\special_tokens_map.json
100%|██████████| 717/717 [5:27:59<00:00, 26.46s/it]  ***** Running Evaluation *****
  Num examples = 344
  Batch size = 4
Trainer is attempting to log a value of "[0.11467889908256881, 0.02178649237472767, 0.0013404825737265416, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                   
100%|██████████| 717/717 [7:44:48<00:00, 26.46s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 717/717 [7:44:48<00:00, 38.90s/it]

{'eval_loss': 0.5359098315238953, 'eval_bleu': 0.0, 'eval_precisions': [0.11467889908256881, 0.02178649237472767, 0.0013404825737265416, 0.0], 'eval_brevity_penalty': 0.22684547083525528, 'eval_length_ratio': 0.402659770964167, 'eval_translation_length': 2180, 'eval_reference_length': 5414, 'eval_runtime': 8209.7727, 'eval_samples_per_second': 0.042, 'eval_steps_per_second': 0.01, 'epoch': 1.0}
{'train_runtime': 27889.138, 'train_samples_per_second': 0.103, 'train_steps_per_second': 0.026, 'train_loss': 1.0206961957646878, 'epoch': 1.0}





TrainOutput(global_step=717, training_loss=1.0206961957646878, metrics={'train_runtime': 27889.138, 'train_samples_per_second': 0.103, 'train_steps_per_second': 0.026, 'train_loss': 1.0206961957646878, 'epoch': 1.0})

In [28]:
bart_trainer.save_model(f"../Models/{model_name}") 


Saving model checkpoint to ../Models/mBart-base-smallDS-text-generation-1Epoch
Configuration saved in ../Models/mBart-base-smallDS-text-generation-1Epoch\config.json
Model weights saved in ../Models/mBart-base-smallDS-text-generation-1Epoch\pytorch_model.bin
tokenizer config file saved in ../Models/mBart-base-smallDS-text-generation-1Epoch\tokenizer_config.json
Special tokens file saved in ../Models/mBart-base-smallDS-text-generation-1Epoch\special_tokens_map.json


In [None]:
bart_trainer.evaluate(small_val)
test_op = bart_trainer.predict(small_test)
print(tokenizer.decode(test_op[0][1],skip_special_tokens=True))

## Testing

In [57]:
# Comparing input RDF to trainer.predict output
for pred_iteration, predictions in enumerate(test_op[0][:5]):
    print(f"input RDF:  {tokenizer.decode(small_test['input_ids'][pred_iteration],skip_special_tokens=True) }\nGeneration:  {tokenizer.decode(predictions,skip_special_tokens=True)}\n\n")

input RDF:  ['accidentAddress | southeast_Houston', 'accidentDate | overnight','shootingType | violent_shooting']
Generation:  A violent shooting occurred overnight in southeast Houston.


input RDF:  ['victimAge | 19', 'victimGender | men']
Generation:  A 19-year-old man and a 19-year-old woman were also shot


input RDF:  ['victimAge | 39-year-old', 'victimGender | female', 'victimRace | black', 'victimStatus | major_injuries']
Generation:  A 39-year-old woman and a 39-year-old woman with major injuries


input RDF:  ['victimName | Adrian_Potts']
Generation:  Adrian Potts was pronounced dead at the scene.


input RDF:  ["hospitalName | Cook_Children's_Medical_Center", 'takenToHospital | True', 'victimAge | 3-year-olds', 'victimAge_Group | children', 'victimNumber | two', 'victimStatus | serious_condition']
Generation:  The children were taken to Cook Children's Medical Center in serious condition.




In [29]:
# get the saved model and tokenizer
saved_model = T5ForConditionalGeneration.from_pretrained('../Models/mBart-base-smallDS-text-generation-1Epoch', local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained('../Models/mBart-base-smallDS-text-generation-1Epoch',local_files_only=True)

loading configuration file ../Models/mBart-base-smallDS-text-generation-1Epoch\config.json
You are using a model of type mbart to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
Model config T5Config {
  "_name_or_path": "facebook/mbart-large-50",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "dropout_rate": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token

In [30]:
#split the testset into input and output, so that we easily generate predictions and compare them to the true version
true_articles = small_test['labels']
input_information = small_test.remove_columns("labels")

# set-up a dataloader to load in the tokenized test dataset
test_dataloader = torch.utils.data.DataLoader(input_information, batch_size=32)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(test_dataloader):
  #if i < 5:
  predictions = saved_model.generate(**batch) #, min_length=30, max_length=100, num_beams=5, top_p=0.6, repetition_penalty=1.3)
  #print(f'prediction: {tokenizer.decode(predictions[0],skip_special_tokens=True)}  ') # 
  all_predictions.append(predictions)
  #else:
  #  break

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]



KeyboardInterrupt: 

In [None]:
# Comparing input RDF to trainer.predict output
for pred_iteration, predictions in enumerate(true_articles[0][:5]):
    print(f"Gold:  {tokenizer.decode(true_articles[pred_iteration],skip_special_tokens=True) }\nGeneration:  {tokenizer.decode(all_predictions_flattened[pred_iteration],skip_special_tokens=True)}\n\n")

Gold:  A bar fight ends in a violent shooting overnight in southeast Houston.
Generation:  Police are investigating a violent shooting in southeast Houston overnight.


Gold:  The wounded men range in age from 19 to 21.
Generation:  The men, who are all expected to survive, are between 19 and 19 years old.


Gold:  39-year-old black female with major injuries
Generation:  A 39-year-old female was listed in critical condition and a male was listed in


Gold:  Police on Saturday night identified him as Adrian Potts.
Generation:  Adrian Potts was shot in the right leg, but he was never hurt.


Gold:  The two children, both 3-year-olds, were transported in serious condition to Cook Children's Medical Center.
Generation:  The children, who are all taken to Cook Chili's Medical Center in serious condition, are




#### pickle generated predictions and gold standard 

In [None]:
import pickle
with open(f'../Testingdata/Predictions_{model_name}.pickle', 'wb') as f:
    pickle.dump(all_predictions_flattened, f)

with open(f'../Testingdata/Labels_{model_name}.pickle', 'wb') as f:
    pickle.dump(true_articles, f)



In [None]:
# compute metrics
predictions_labels = [all_predictions_flattened, true_articles]
compute_metrics(predictions_labels) # This calculates blue metric

In [9]:
decoded_predictions = []
decoded_true_articles = []

for iteration, text in enumerate(true_articles):
    decoded_predictions.append(tokenizer.decode(all_predictions_flattened[iteration],skip_special_tokens=True))
    decoded_true_articles.append(tokenizer.decode(text,skip_special_tokens=True))

#### Rouge test

In [21]:
import evaluate
rouge = evaluate.load('rouge')
rouge_results = rouge.compute(predictions=decoded_predictions, references=decoded_true_articles)
rouge_results

{'rouge1': 0.49186400017973353,
 'rouge2': 0.29073247525987433,
 'rougeL': 0.4282798132067188,
 'rougeLsum': 0.4283660260971445}

#### Bertscore

In [102]:
from evaluate import load
bertscore = load("bertscore")
#predictions = ["hello there", "general kenobi"]
##references = ["hello there", "general kenobi"]
bertscore_results = bertscore.compute(predictions=decoded_predictions, references=decoded_true_articles, lang="en")

In [103]:
print(bertscore_results['f1'])

[0.9499719738960266, 0.8786465525627136, 0.9425523281097412, 0.888322651386261, 0.9639273881912231, 0.9058011770248413, 0.922014057636261, 0.9419782757759094, 0.872467041015625, 0.9157440662384033, 0.9387620091438293, 0.8557538390159607, 0.9083451628684998, 0.9227930307388306, 0.9603692889213562, 0.9999998807907104, 0.9119024276733398, 0.9661831259727478, 0.9350420832633972, 0.8840372562408447, 0.9284369945526123, 0.8676361441612244, 0.8574816584587097, 0.9095780849456787, 0.9454705119132996, 0.9572874307632446, 0.9291267395019531, 0.9404165744781494, 0.8495192527770996, 0.8651273250579834, 0.8772913217544556, 0.9339597225189209, 0.9348630309104919, 0.8939517736434937, 0.9057127237319946, 0.9151082038879395, 0.9366471767425537, 0.90410315990448, 0.9384956955909729, 0.9110017418861389, 0.8989789485931396, 0.9116221070289612, 0.9441954493522644, 0.972309410572052, 0.9292705655097961, 0.8936586976051331, 0.9400068521499634, 0.8933810591697693, 0.8447283506393433, 0.9088520407676697, 0.914

#### Perplexity

In [109]:
from evaluate import load
perplexity = load("perplexity", module_type="metric")
perp_results = perplexity.compute(predictions=decoded_predictions, model_id='gpt2')
perp_results["mean_perplexity"]

Using pad_token, but it is not set yet.
100%|██████████| 71/71 [00:10<00:00,  6.57it/s]


761.7258675585331

#### Meteor

In [110]:
meteor = evaluate.load('meteor')
meteor_results = meteor.compute(predictions=decoded_predictions, references=decoded_true_articles)
meteor_results

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'meteor': 0.36306766407957697}

### BARTScore

https://github.com/neulab/BARTScore

In [12]:
### Code for Bartscore
# %%
import torch
import torch.nn as nn
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration


class BARTScorer:
    def __init__(self, device='cuda:0', max_length=256, checkpoint='facebook/bart-large-cnn'):
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """ Load model from paraphrase finetuning """
        if path is None:
            path = 'models/bart.pth'
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """ Score a batch of examples """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                exit(0)
        return score_list

    def multi_ref_score(self, srcs, tgts, agg="mean", batch_size=4):
        # Assert we have the same number of references
        ref_nums = [len(x) for x in tgts]
        if len(set(ref_nums)) > 1:
            raise Exception("You have different number of references per test sample.")

        ref_num = len(tgts[0])
        score_matrix = []
        for i in range(ref_num):
            curr_tgts = [x[i] for x in tgts]
            scores = self.score(srcs, curr_tgts, batch_size)
            score_matrix.append(scores)
        if agg == "mean":
            score_list = np.mean(score_matrix, axis=0)
        elif agg == "max":
            score_list = np.max(score_matrix, axis=0)
        else:
            raise NotImplementedError
        return list(score_list)

    def test(self, batch_size=3):
        """ Test """
        src_list = [
            'This is a very good idea. Although simple, but very insightful.',
            'Can I take a look?',
            'Do not trust him, he is a liar.'
        ]

        tgt_list = [
            "That's stupid.",
            "What's the problem?",
            'He is trustworthy.'
        ]

        print(self.score(src_list, tgt_list, batch_size))

In [13]:
bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')

In [15]:
bart_scores = bart_scorer.score(srcs=decoded_predictions, tgts=decoded_true_articles, batch_size=1) 

In [17]:
print(len(bart_scores))
bart_scores

1136


[-2.5916852951049805,
 -2.8240702152252197,
 -1.751649260520935,
 -3.2559080123901367,
 -1.5100139379501343,
 -4.154699802398682,
 -2.406635284423828,
 -2.4597301483154297,
 -3.315011739730835,
 -3.2063775062561035,
 -1.6033596992492676,
 -3.407088279724121,
 -2.2348411083221436,
 -2.855161428451538,
 -1.1250760555267334,
 -0.4073900282382965,
 -2.483037233352661,
 -1.2667436599731445,
 -2.0463755130767822,
 -3.5058274269104004,
 -1.4195690155029297,
 -3.1969618797302246,
 -2.915322780609131,
 -2.538642644882202,
 -1.7162305116653442,
 -1.9518054723739624,
 -1.8608429431915283,
 -1.7569050788879395,
 -4.102962970733643,
 -4.241076946258545,
 -3.9985783100128174,
 -1.7357791662216187,
 -1.4153032302856445,
 -2.3941750526428223,
 -2.353602170944214,
 -2.5307979583740234,
 -2.136995315551758,
 -2.352670907974243,
 -2.213817834854126,
 -1.5588377714157104,
 -1.8158177137374878,
 -2.2451202869415283,
 -1.289177417755127,
 -1.0434327125549316,
 -2.2734375,
 -1.9201128482818604,
 -1.434446930

#### BLEURT ---> Giving TF error?

https://github.com/google-research/bleurt


Tf error occurs, this is a known issue. Apparently this PR should fix it: https://github.com/google-research/bleurt/pull/44 --> Fix doesnt seem so work

In [51]:
#code for Bleurt
#from bleurt import score
import bleurt.score as score


In [6]:
import pickle
# open a file, where you stored the pickled data
file = open('../Testingdata/Labels_t5-base-smallDS-story-generation-1Epoch.pickle', 'rb')
# dump information to that file
all_predictions_flattened = pickle.load(file)
# close the file
file.close()

# open a file, where you stored the pickled data
file = open('../Testingdata/Predictions_t5-base-smallDS-story-generation-1Epoch.pickle', 'rb')
# dump information to that file
true_articles = pickle.load(file)
# close the file
file.close()

In [7]:
decoded_predictions = []
decoded_true_articles = []

for iteration, text in enumerate(true_articles):
    decoded_predictions.append(tokenizer.decode(all_predictions_flattened[iteration],skip_special_tokens=True))
    decoded_true_articles.append(tokenizer.decode(text,skip_special_tokens=True))

In [49]:
checkpoint = "bleurt/test_checkpoint"
references = decoded_true_articles
candidates = decoded_predictions

In [52]:
scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
assert type(scores) == list and len(scores) == 1
print(scores)

INFO:tensorflow:Reading checkpoint bleurt/test_checkpoint.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint dbleurt_tiny
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:dbleurt_tiny
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


InvalidArgumentError: cannot compute __inference_pruned_4209 as input #0(zero-based) was expected to be a int64 tensor but is a int32 tensor [Op:__inference_pruned_4209]

### Visualizations (SHAP / ECCO)

In [59]:
import ecco

lm = ecco.from_pretrained('../Models/t5-base') #t5-base-smallDS-story-generation-1Epoch

ValueError: ("The model '../Models/t5-base' is not defined in Ecco's 'model-config.yaml' file and so is not explicitly supported yet. Supported models are:", ['gpt2', 'gpt2-medium', 'gpt2-xl', 'distilgpt2', 'distilbert-base-uncased', 'bert-base-uncased', 'bert-large-uncased', 'distilbert-base-uncased-finetuned-sst-2-english', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'albert-base-v2', 'albert-large-v2', 'albert-xxlarge-v2', 'distilroberta-base', 'roberta-base', 'roberta-large', 'google/electra-small-discriminator', 'google/electra-base-discriminator', 'sentence-transformers/distilbert-base-nli-stsb-mean-tokens', 't5-small', 't5-base', 'bigscience/T0_3B', 'bigscience/T0', 'bigscience/T0p', 'bigscience/T0pp', 'tscholak/1wnr382e', 'valhalla/t5-small-qg-hl', 'valhalla/t5-small-qa-qg-hl', 'valhalla/t5-base-e2e-qg', 'facebook/bart-large-mnli', 'sshleifer/tiny-gpt2', 'julien-c/bert-xsmall-dummy', 'EleutherAI/gpt-neo-125M', 'EleutherAI/gpt-neo-1.3B', 'EleutherAI/gpt-neo-2.7B'])

In [None]:
## SHAP
explainer = shap.DeepExplainer(model, background)
shap_values = explainer.shap_values(x_test_each_class)


In [8]:
import shap

In [10]:
# load a transformers pipeline model
#model = transformers.pipeline('sentiment-analysis', return_all_scores=True)
#model = T5ForConditionalGeneration.from_pretrained('../Models/t5-base-smallDS-story-generation-1Epoch/', local_files_only=True)

model_pipeline = pipeline('text-generation', model=saved_model, tokenizer=tokenizer)




The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['XGLMForCausalLM', 'PLBartForCausalLM', 'QDQBertLMHeadModel', 'TrOCRForCausalLM', 'GPTJForCausalLM', 'RemBertForCausalLM', 'RoFormerForCausalLM', 'BigBirdPegasusForCausalLM', 'GPTNeoForCausalLM', 'BigBirdForCausalLM', 'CamembertForCausalLM', 'XLMRobertaXLForCausalLM', 'XLMRobertaForCausalLM', 'RobertaForCausalLM', 'BertLMHeadModel', 'OpenAIGPTLMHeadModel', 'GPT2LMHeadModel', 'TransfoXLLMHeadModel', 'XLNetLMHeadModel', 'XLMWithLMHeadModel', 'ElectraForCausalLM', 'CTRLLMHeadModel', 'ReformerModelWithLMHead', 'BertGenerationDecoder', 'XLMProphetNetForCausalLM', 'ProphetNetForCausalLM', 'BartForCausalLM', 'OPTForCausalLM', 'MBartForCausalLM', 'PegasusForCausalLM', 'MarianForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'MegatronBertForCausalLM', 'Speech2Text2ForCausalLM', 'Data2VecTextForCausalLM'].


In [21]:

# explain the model on two sample inputs
#explainer = shap.Explainer(model_pipeline) 
explainer = shap.Explainer(saved_model, tokenizer)
shap_values = explainer(["What a great movie!"])

# visualize the first prediction's explanation for the POSITIVE output class
#shap.plots.text(shap_values[0, :, "POSITIVE"])


In [22]:
shap.plots.text(shap_values)

In [23]:
shap_values = explainer(['Scientists confirmed the worst possible outcome: the massive asteroid will collide with Earth'])

Partition explainer: 2it [00:17, 17.29s/it]               


In [24]:
shap.plots.text(shap_values)
