In [1]:
import os.path
import pickle
import pandas as pd
import numpy as np
import re
import torch 
import nltk

from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline, AutoModelForSeq2SeqLM
import datasets
import evaluate

#import Evaluation_Code.Parent as parent ## code for PARENT metric
import Evaluation_Code.Bartscore as bartscore ## code for Bartscore

import json


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def ensure_cuda_compatability():
    print(f'Torch version: {torch.__version__}')
    print(f'Cuda version: {torch.version.cuda}')
    print(f'Cudnn version: {torch.backends.cudnn.version()}')
    print(f'Is cuda available: {torch.cuda.is_available()}')
    print(f'Number of cuda devices: {torch.cuda.device_count()}')
    print(f'Current default device: {torch.cuda.current_device()}')
    print(f'First cuda device: {torch.cuda.device(0)}')
    print(f'Name of the first cuda device: {torch.cuda.get_device_name(0)}\n\n')


def preprocess_model(model_name):
    """
    Setup the model and tokenizer for preprocessing. This will be a pre-trained model collected from huggingface
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name) 
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    
    print('LOGGING: preprocess_model DONE \n')
    return model, tokenizer


def load_CACAPO_data():
    """
    This function retrieves the csv files and creates a dataset
    """
    print('LOGGING: load_CACAPO_data DONE \n')

    return datasets.load_dataset("../Data/Cleaned_data/", data_files={"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"})


def preprocess_data(data):
    """
    Tokenize the data
    """
    max_length = 256
    RDFs = data["input"]
    texts = data["output"]

    model_inputs = tokenizer(RDFs, truncation=True, padding='max_length', return_tensors='pt',  max_length=max_length)

    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    with tokenizer.as_target_tokenizer():
        target_texts = tokenizer(texts, padding='max_length', truncation=True, return_tensors='pt',  max_length=max_length).input_ids
    
    model_inputs["labels"] = target_texts
    
    return model_inputs



def transform_datasets(dataset):
    """
    After loading in and creating the initial dataset, the text data is transformed, by tokenizing the input and output texts. The initial dataset is also split into train,val,test for training use. 
    NOTE That the test set will not be preprocessed here yet, this will be done in a different function
    """

    ## Create smaller versions of the dataset
    small_train = dataset["train"].shard(num_shards = 512, index = 0)
    small_val = dataset["dev"].shard(num_shards = 512, index = 0)
    small_test = dataset["test"].shard(num_shards = 512, index = 0)

    ## Process the data in batches
    small_train = small_train.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)  
    small_val = small_val.map(preprocess_data, batched=True, remove_columns=dataset["dev"].column_names)
    #small_test = small_test.map(preprocess_data, batched=True, remove_columns=dataset["test"].column_names)

    # transform the datasets into torch sensors, as the model will expect this format 
    small_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    small_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    #small_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    print('LOGGING: transform_datasets DONE \n')

    return small_train, small_val, small_test



def load_eval_metrics():
    """
    Loads in all metrics that will be used later on during evaluation. This is seperated to not load in the metrics a dozen of times during training.
    """
    bleu = datasets.load_metric("bleu")
    rouge = evaluate.load('rouge')
    meteor = evaluate.load('meteor')
    perplexity = evaluate.load("perplexity", module_type="metric")
    bertscore = evaluate.load("bertscore")
    bart_scorer = bartscore.BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')

    print('LOGGING: load_eval_metrics DONE \n')

    return bleu, rouge, meteor, perplexity, bertscore, bart_scorer



def postprocess_text(preds, labels):
    """
    Supplementary Method called in decode_text.
    
    Returns list of split decoded labels and predictions for evaluation
    """
    preds = [pred.split() for pred in preds]
    labels = [[label.split()] for label in labels]
    return preds, labels

def decode_text(predictions, labels):
    """
    Supplementary Method called in compute_metrics.
    
    Returns decoded labels and predictions for evaluation
    """
    if isinstance(predictions, tuple):
            predictions = predictions[0]
        
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    return decoded_preds, decoded_labels


def evaluate_texts(decoded_preds, decoded_labels):
    """
    Calculates metrics given a list of decoded predictions and decoded labels
    """
    #post_process for BLEU
    blue_preds, blue_labels = postprocess_text(decoded_preds,  decoded_labels)

    # setup metrics for use
    bleu, rouge, meteor, perplexity, bertscore, bart_scorer = load_eval_metrics()

    # Calculate the metrics
    print(f'\n LOGGING: Calculating Blue')
    bleu_output = bleu.compute(predictions=blue_preds, references=blue_labels)
    print(f'\n LOGGING: Calculating Rouge')
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print(f'\n LOGGING: Calculating Meteor')
    meteor_output = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    print(f'\n LOGGING: Calculating Perplexity')
    perp_output = perplexity.compute(predictions=decoded_preds, model_id='gpt2')
    print(f'\n LOGGING: Calculating Bertscore')
    bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    print(f'\n LOGGING: Calculating Bartscore')
    bart_scores_output = bart_scorer.score(srcs=decoded_preds, tgts=decoded_labels, batch_size=8) 
    ### Need to add parent 

    return bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output



def compute_metrics(pred):
    """"
    Metrics to be evaluated during training and validation
    Metrics used: BLEU, ROUGE, METEOR, Bertscore, BARTScore
    """
    # decode the predictions and labels for eval
    predictions, labels = pred
    decoded_preds, decoded_labels = decode_text(predictions, labels)

    bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output = evaluate_texts(decoded_preds, decoded_labels)

    ## Huggingsface trainer requires a dict if multiple metrics are used
    return {"blue_output": bleu_output, "rouge_output": rouge_output, "meteor_results": meteor_output, 
            "perp_output": perp_output, "bertscore_output": bertscore_output, "bart_scores_output": bart_scores_output}



def set_training_args(model_name, learning_rate, num_train_epochs, evaluation_strategy, generation_num_beams, gradient_accumulation_steps, per_device_train_batch_size, per_device_eval_batch_size):
    """
    Setup the training arguments that will be used during training.
    """
    #model_name = "t5-fp16-test"
    model_dir = f"../Results/{model_name}"

    training_args = Seq2SeqTrainingArguments(
                output_dir=model_dir,
                learning_rate=learning_rate,
                do_eval=True, # will be set to true if evaluation strategy is set
                do_predict=True, #Whether to run predictions on the test set or not.
                num_train_epochs=num_train_epochs,
                evaluation_strategy= evaluation_strategy, 
                #eval_steps= 100, # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
                save_steps=500, # Number of updates steps before two checkpoint saves if save_strategy="steps".
                #max_steps=10, # the total number of training steps to perform
                save_total_limit= 10, # the maximum number of models to keep before deleting the oldest one
                predict_with_generate=True, # Whether to use generate to calculate generative metrics (ROUGE, BLEU).
                generation_num_beams=generation_num_beams,  #The num_beams to use on each evaluation loop when predict_with_generate=True. Will default to the num_beams value of the model configuration
                gradient_checkpointing=True, #
                gradient_accumulation_steps=gradient_accumulation_steps, #Number of updates steps to accumulate the gradients for, before performing a backward/update pass
                per_device_train_batch_size=per_device_train_batch_size, #The batch size per GPU/TPU core/CPU for training.
                per_device_eval_batch_size=per_device_eval_batch_size, #The batch size per GPU/TPU core/CPU for evaluation.
                optim="adafactor", #The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor.
                #report_to="tensorboard",
                fp16=True
    )

    print('LOGGING: set_training_args DONE \n')

    return training_args


def get_clean_model(model_name):
    """
    Simple function to ensure that a new model is used for finetuning
    """
    return AutoModelForSeq2SeqLM.from_pretrained(model_name)


def set_trainer(model_name, training_args, train_ds, val_ds, tokenizer):
    """
    Initializes a trainer
    Takes in: Model name, training arguments, training dataset, validation dataset, and tokenizer
    Returns: Trainer instance
    """
    clean_model = get_clean_model(model_name)
    trainer = Seq2SeqTrainer(
                model=clean_model,
                args=training_args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer
                )
    
    print('LOGGING: set_trainer DONE \n')

    return trainer

def train_and_save(trainer, model_name):
    trainer.train()
    trainer.save_model(f"../Models/{model_name}") 


def get_saved_model(model_name):
  saved_model = T5ForConditionalGeneration.from_pretrained(f'../Models/{model_name}', local_files_only=True)
  tokenizer = AutoTokenizer.from_pretrained(f'../Models/{model_name}' ,local_files_only=True)
  return saved_model, tokenizer



def generate_predictions(test_set, saved_model):
  """
  Generates predictions based on the test set, returns a list of predictions and the corresponding "true" articles
  """
  #split the testset into input and output, so that we easily generate predictions and compare them to the true version
  true_articles = test_set['output']

  print(f' true_articles  {true_articles}' )

  encoded_test_set = test_set.map(preprocess_data, batched=True, remove_columns=test_set.column_names)
  encoded_test_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
  encoded_inputs = encoded_test_set.remove_columns("labels")

  # set-up a dataloader to load in the tokenized test dataset
  test_dataloader = torch.utils.data.DataLoader(encoded_inputs, batch_size=32)

  # generate text for each batch
  all_predictions = []
  for i,batch in enumerate(test_dataloader):

    predictions = saved_model.generate(**batch)

    all_predictions.append(predictions)

  # flatten predictions
  all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

  return all_predictions_flattened, true_articles



def decode_predictions(predictions, tokenizer):
  """
  Decode the predictions made by the model
  """
  decoded_predictions = []

  for iteration, prediction in enumerate(predictions):
      decoded_predictions.append(tokenizer.decode(prediction,skip_special_tokens=True))

  return decoded_predictions



def evaluate_test_set(test_set, model_name):
  """
  Transforms test set, retrieves predictions, and evaluates these predictions
  """
  saved_model, saved_tokenizer = get_saved_model(model_name)

  predictions, test_articles = generate_predictions(test_set, saved_model)
  
  decoded_test_predictions = decode_predictions(predictions, saved_tokenizer)

  bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output = evaluate_texts(decoded_test_predictions, test_articles)

  evaluation_results = {"blue_output": bleu_output, "rouge_output": rouge_output, "meteor_results": meteor_output, "perp_output": perp_output, "bertscore_output": bertscore_output, "bart_scores_output": bart_scores_output}

  log_results(evaluation_results)



  # Additional PARENT evaluation
  tables = test_set['input']
  references = test_articles
  generations = decoded_test_predictions
  parent_attempt(generations, references, tables)
  #

  #bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output = evaluate_texts(decoded_test_predictions, test_articles)

  ## Huggingsface trainer requires a dict if multiple metrics are used
  return evaluation_results


def write_to_text_parent(decoded_predictions, true_articles, rdfs):


    with open('../Parent_test/true_articles.txt', 'w') as f:
        for articles in true_articles:
            f.write(f'{articles} \n')
   
    with open('../Parent_test/decode_predictions.txt', 'w') as f:
        for predictions in decoded_predictions:
            f.write(f'{predictions} \n')

    with open('../Parent_test/rdfs.txt', 'w') as f:
        for pairs in rdfs:
            f.write(f'{pairs} \n')


def prepare_inputs_parent(RDFs):
    """
    Cleans the RDF pairs and transforms them in the proper format so that the parent module can calculate with it.
    """
    
    attribute_value_pairs = []

    for iteration, inputRDF in enumerate(RDFs):
        if 'â‚¹' in inputRDF:
        #print(f'{inputRDF} \n')
            inputRDF = inputRDF.replace('â‚¹', '')
        
        split_RDF = inputRDF.split(", ")
        entry=[]
        for connected_pair in split_RDF:
            if '[' in connected_pair:
                connected_pair = connected_pair.replace('[', '')
            if ']' in connected_pair:
                connected_pair = connected_pair.replace(']', '')
            if '_' in connected_pair:
                connected_pair = connected_pair.replace('_', ' ')    
            split_pair = tuple(connected_pair.split(' | '))
            entry.append((split_pair))
        attribute_value_pairs.append(entry)
    return attribute_value_pairs



def parent_attempt(generations, references, rdfs):
    """
    The Parent metric needs special treatment, as it only accepts specific inputs and file types.
    """
    prepared_rdfs = prepare_inputs_parent(rdfs)
    write_to_text_parent(generations, references, rdfs)

    %run -i "~E:/ArriaThesis/MscThesis/Code/Evaluation_Code/Parent.py" --references "E:/ArriaThesis/MscThesis/Parent_test/true_articles.txt" \
                                                                       --generations "E:/ArriaThesis/MscThesis/Parent_test/decode_predictions.txt"  \
                                                                       --tables "E:/ArriaThesis/MscThesis/Parent_test/rdfs.txt" 

def log_results(results):
    with open('../Logging_Results/logResults.json', 'w') as convert_file:
        convert_file.write(json.dumps(results))

In [3]:
def fine_tune_model(model_name):
    # ensure cuda compatability
    ensure_cuda_compatability()

    # I instantiate the tokenizer as a global variable, as the .map function in transform_datasets was not working properly. 
    # This should not be an issue, as the tokenizer remains consistent during training and evaluation.
    global tokenizer
    
    # retrieve model and tokenizer from huggingface to prepare dataset
    model, tokenizer = preprocess_model(model_name)
    
    #retrieve the unprocessed data from the csv files
    entire_dataset = load_CACAPO_data()
    
    # process the dataset and split it into its natural train, val, test split
    train_ds, val_ds, test_ds = transform_datasets(entire_dataset)

    # setup the training arguments 
    # parameters = (model_name, learning_rate, num_train_epochs, evaluation_strategy, generation_num_beams, gradient_accumulation_steps, per_device_train_batch_size, per_device_eval_batch_size)
    training_args = set_training_args(model_name, 0.001, 1, 'epoch', 10, 4, 8, 8)

    # create a trainer instance 
    trainer = set_trainer(model_name, training_args, train_ds, val_ds, tokenizer)

    # Finally fine-tune the model and save it
    train_and_save(trainer, model_name)

    testset_evaluation_results = evaluate_test_set(test_ds, model_name)

    #log_results(testset_evaluation_results)

    return testset_evaluation_results

def main():
    model_name = 't5-base'
    results = fine_tune_model(model_name)
    print(results)

In [4]:
main()

Torch version: 1.12.1
Cuda version: 11.3
Cudnn version: 8302
Is cuda available: True
Number of cuda devices: 1
Current default device: 0
First cuda device: <torch.cuda.device object at 0x000001FFCE92A888>
Name of the first cuda device: NVIDIA GeForce GTX 1070




For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Using custom data configuration Cleaned_data-3c9b553877c933dc
Found cached dataset csv (C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-3c9b553877c933dc/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


LOGGING: preprocess_model DONE 

LOGGING: load_CACAPO_data DONE 



100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:00<00:00, 998.80it/s]
  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
  0%|          | 0/1 [00:00<?, ?ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]

LOGGING: transform_datasets DONE 

LOGGING: set_training_args DONE 




Using cuda_amp half precision backend
***** Running training *****
  Num examples = 30
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 1


LOGGING: set_trainer DONE 



  0%|          | 0/1 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:06<00:00,  6.46s/it]***** Running Evaluation *****
  Num examples = 4
  Batch size = 8
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simon\AppData\Roaming

LOGGING: load_eval_metrics DONE 


 LOGGING: Calculating Blue

 LOGGING: Calculating Rouge

 LOGGING: Calculating Meteor

 LOGGING: Calculating Perplexity


loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--gpt2\snapshots\75e09b43581151bd1d9ef6700faa605df408979f\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfor


 LOGGING: Calculating Bertscore


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--roberta-large\snapshots\5069d8a2a32a7df4c69ef9b56348be04152a2341\config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at C:\Users\Simon/.cac


 LOGGING: Calculating Bartscore
{'eval_loss': 17.07703399658203, 'eval_blue_output': {'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0.0], 'brevity_penalty': 0.014541200706177667, 'length_ratio': 0.19117647058823528, 'translation_length': 13, 'reference_length': 68}, 'eval_rouge_output': {'rouge1': 0.2264957264957265, 'rouge2': 0.08575581395348837, 'rougeL': 0.20058275058275057, 'rougeLsum': 0.1987179487179487}, 'eval_meteor_results': {'meteor': 0.03596663752913753}, 'eval_perp_output': {'perplexities': [1904.3201904296875, 868.296142578125, 3095.555419921875, 3135.15283203125], 'mean_perplexity': 2250.8311462402344}, 'eval_bertscore_output': {'precision': [0.805133581161499, 0.805006206035614, 0.790022611618042, 0.7857273817062378], 'recall': [0.802182674407959, 0.8623934388160706, 0.761752188205719, 0.7797167301177979], 'f1': [0.8036553859710693, 0.8327122330665588, 0.7756299376487732, 0.7827105522155762], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.23.1)'}, 'eval_b

Model weights saved in ../Models/t5-base\pytorch_model.bin
tokenizer config file saved in ../Models/t5-base\tokenizer_config.json
Special tokens file saved in ../Models/t5-base\special_tokens_map.json
loading configuration file ../Models/t5-base\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_pe

 true_articles  ['A bar fight ends in a violent shooting overnight in southeast Houston.', "The Minnesota right-hander threw his first major league complete game and Carlos Gomez had four hits and two RBIs, leading the Twins to a 5-1 victory Thursday night that stretched KC's losing streak to 11 straight.", 'ETF is a basket of securities traded on an exchange similar to stocks.', 'Ms Diamond said clear skies at night would result in a widespread frost from Wednesday.', 'Ook dat neemt PSV mee naar volgende week, naar de sleutelwedstrijd in de Kuip.', 'N i e u w bericht, vervangt: AEX opent met nipte winst']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
loading file vocab.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--facebook--bart-large-cnn\snapshots\c5121e42f57eca153aea31729f71cbedcd77a656\vocab.json
loading file merges.txt from cache at C:\Users\Simon/.cache\huggingface\hub\models--facebook--bart-large-cnn\snapshots\c5121e42f57eca153aea31729f71cbedcd77a656\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.

LOGGING: load_eval_metrics DONE 


 LOGGING: Calculating Blue

 LOGGING: Calculating Rouge

 LOGGING: Calculating Meteor

 LOGGING: Calculating Perplexity


loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--gpt2\snapshots\75e09b43581151bd1d9ef6700faa605df408979f\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfor


 LOGGING: Calculating Bertscore


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--roberta-large\snapshots\5069d8a2a32a7df4c69ef9b56348be04152a2341\config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at C:\Users\Simon/.cac


 LOGGING: Calculating Bartscore
sum(precisions):  0.20412414523193154    len(precisions  6)
sum(recalls):  0.20005000000000003    len(recalls) 6)
sum(all_f_scores) :  0.2020410238678085    len(all_f_scores)  6)
{'blue_output': {'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0.0], 'brevity_penalty': 0.05376799253902105, 'length_ratio': 0.2549019607843137, 'translation_length': 26, 'reference_length': 102}, 'rouge_output': {'rouge1': 0.19494454347395526, 'rouge2': 0.0898917259211377, 'rougeL': 0.1873712616359675, 'rougeLsum': 0.18732005295472787}, 'meteor_results': {'meteor': 0.028067127721660073}, 'perp_output': {'perplexities': [1461.142578125, 2210.488525390625, 124.29325103759766, 3785.825927734375, 2463.470947265625, 6312.71044921875], 'mean_perplexity': 2726.3219464619956}, 'bertscore_output': {'precision': [0.7856204509735107, 0.7702969312667847, 0.7882493734359741, 0.7831587195396423, 0.7831859588623047, 0.8187204599380493], 'recall': [0.8525662422180176, 0.7806869149208069, 0.82337