## This file serves as a cleaner version of fine-tuning a model. 

Below several models can be added to the main function, afterwhich the code will prepare, train, evaluate, and save the model

In [1]:
import WebNLG_xmlReader.benchmark_reader as xml_reader
import os.path
import pickle
import pandas as pd
import numpy as np
import re
import torch 
import nltk
#import tensorflow as tf
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline, AutoModelForSeq2SeqLM

import datasets
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def ensure_cuda_compatability():
    print(f'Torch version: {torch.__version__}')
    print(f'Cuda version: {torch.version.cuda}')
    print(f'Cudnn version: {torch.backends.cudnn.version()}')
    print(f'Is cuda available: {torch.cuda.is_available()}')
    print(f'Number of cuda devices: {torch.cuda.device_count()}')
    print(f'Current default device: {torch.cuda.current_device()}')
    print(f'First cuda device: {torch.cuda.device(0)}')
    print(f'Name of the first cuda device: {torch.cuda.get_device_name(0)}\n\n')


def preprocess_model(model_name):
    """
    Setup the model and tokenizer for preprocessing. This will be a pre-trained model collected from huggingface
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name) 
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    
    print('LOGGING: preprocess_model DONE \n')
    return model, tokenizer

def load_CACAPO_data():
    """
    This function retrieves the csv files and creates a dataset
    """
    print('LOGGING: load_CACAPO_data DONE \n')

    return datasets.load_dataset("../Data/Cleaned_data/", data_files={"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"})

def preprocess_data(data):
    """
    Tokenize the data
    """
    max_length = 256
    RDFs = data["input"]
    texts = data["output"]

    model_inputs = tokenizer(RDFs, truncation=True, padding='max_length', return_tensors='pt',  max_length=max_length)

    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    with tokenizer.as_target_tokenizer():
        target_texts = tokenizer(texts, padding='max_length', truncation=True, return_tensors='pt',  max_length=max_length).input_ids
    
    model_inputs["labels"] = target_texts
    
    return model_inputs


def transform_datasets(dataset):
    """
    After loading in and creating the initial dataset, the text data is transformed, by tokenizing the input and output texts. The initial dataset is also split into train,val,test for training use. 
    """

    ## Create smaller versions of the dataset
    small_train_test = dataset["train"].shard(num_shards = 64, index = 0)
    small_val_test = dataset["dev"].shard(num_shards = 64, index = 0)
    small_test_test = dataset["test"].shard(num_shards = 64, index = 0)

    ## Process the data in batches
    small_train_test = small_train_test.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)  
    small_val_test = small_val_test.map(preprocess_data, batched=True, remove_columns=dataset["dev"].column_names)
    small_test_test = small_test_test.map(preprocess_data, batched=True, remove_columns=dataset["test"].column_names)

    # transform the datasets into torch sensors, as the model will expect this format 
    small_train_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    small_val_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    small_test_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    print('LOGGING: transform_datasets DONE \n')

    return small_train_test, small_val_test, small_test_test


def load_eval_metrics():
    """
    Loads in all metrics that will be used later on during evaluation. This is seperated to not load in the metrics a dozen of times during training.
    """
    bleu = datasets.load_metric("bleu")
    rouge = evaluate.load('rouge')
    meteor = evaluate.load('meteor')
    perplexity = evaluate.load("perplexity", module_type="metric")
    bertscore = evaluate.load("bertscore")
    
    print('LOGGING: load_eval_metrics DONE \n')

    return bleu, rouge, meteor, perplexity, bertscore

def postprocess_text(preds, labels):
    """
    Supplementary Method called in decode_text.
    
    Returns list of split decoded labels and predictions for evaluation
    """
    preds = [pred.split() for pred in preds]
    labels = [[label.split()] for label in labels]
    return preds, labels

# def rouge_postprocess(decoded_preds, decoded_labels):
#        # Rouge expects a newline after each sentence
#     decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
#                       for pred in decoded_preds]
#     decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
#                       for label in decoded_labels]
                
    # return decoded_preds, decoded_labels

def decode_text(predictions, labels):
    """
    Supplementary Method called in compute_metrics.
    
    Returns decoded labels and predictions for evaluation
    """
    if isinstance(predictions, tuple):
            predictions = predictions[0]
        
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # # Some simple post-processing
    # decoded_preds, decoded_labels = postprocess_text(decoded_preds,
    # decoded_labels)

    return decoded_preds, decoded_labels



def compute_metrics(pred):
    """"
    Metrics to be evaluated during training and validation
    Metrics used: BLEU, ROUGE, METEOR, BARTScore, PARENT
    """
    # decode the predictions and labels for eval
    predictions, labels = pred
    decoded_preds, decoded_labels = decode_text(predictions, labels)

    #post_process for BLEU
    blue_preds, blue_labels = postprocess_text(decoded_preds,  decoded_labels)

    # setup metrics for use
    bleu, rouge, meteor, perplexity, bertscore = load_eval_metrics()

    # Calculate the metrics
    print(f'\n Calculating Blue')
    bleu_output = bleu.compute(predictions=blue_preds, references=blue_labels)
    print(f'\n Calculating rouge')
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print(f'\n Calculating meteor')
    meteor_results = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    print(f'\n Calculating perplexity')
    perp_results = perplexity.compute(predictions=decoded_preds, model_id='gpt2')
    print(f'\n Calculating bertscore')
    bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    ### Need to add parent and bartscore

    ## Huggingsface trainer requires a dict if multiple metrics are used
    return {" blue_output": bleu_output, "rouge_output": rouge_output, "meteor_results": meteor_results, "perp_results": perp_results, "bertscore_output": bertscore_output}



def set_training_args(model_name, learning_rate, num_train_epochs, evaluation_strategy, generation_num_beams, gradient_accumulation_steps, per_device_train_batch_size, per_device_eval_batch_size):
    """
    Setup the training arguments that will be used during training.
    """
    #model_name = "t5-fp16-test"
    model_dir = f"../Results/{model_name}"

    training_args = Seq2SeqTrainingArguments(
                output_dir=model_dir,
                learning_rate=learning_rate,
                do_eval=True, # will be set to true if evaluation strategy is set
                do_predict=True, #Whether to run predictions on the test set or not.
                num_train_epochs=num_train_epochs,
                evaluation_strategy= evaluation_strategy, 
                #eval_steps= 100, # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
                save_steps=500, # Number of updates steps before two checkpoint saves if save_strategy="steps".
                #max_steps=10, # the total number of training steps to perform
                save_total_limit= 10, # the maximum number of models to keep before deleting the oldest one
                predict_with_generate=True, # Whether to use generate to calculate generative metrics (ROUGE, BLEU).
                generation_num_beams=generation_num_beams,  #The num_beams to use on each evaluation loop when predict_with_generate=True. Will default to the num_beams value of the model configuration
                gradient_checkpointing=True, #
                gradient_accumulation_steps=gradient_accumulation_steps, #Number of updates steps to accumulate the gradients for, before performing a backward/update pass
                per_device_train_batch_size=per_device_train_batch_size, #The batch size per GPU/TPU core/CPU for training.
                per_device_eval_batch_size=per_device_eval_batch_size, #The batch size per GPU/TPU core/CPU for evaluation.
                optim="adafactor", #The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor.
                #report_to="tensorboard",
                fp16=True
    )

    print('LOGGING: set_training_args DONE \n')

    return training_args


def get_clean_model(model_name):
    """
    Simple function to ensure that a new model is used for finetuning
    """
    return AutoModelForSeq2SeqLM.from_pretrained(model_name)


def set_trainer(model_name, training_args, train_ds, val_ds, tokenizer):
    """
    Initializes a trainer
    Takes in: Model name, training arguments, training dataset, validation dataset, and tokenizer
    Returns: Trainer instance
    """

    #metrics = load_eval_metrics()
    clean_model = get_clean_model(model_name)
    trainer = Seq2SeqTrainer(
                model=clean_model,
                args=training_args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer
                )
    
    print('LOGGING: set_trainer DONE \n')

    return trainer

def train_and_save(trainer, model_name):
    trainer.train()
    trainer.save_model(f"../Models/{model_name}") 


In [5]:
def fine_tune_model(model_name):
    # ensure cuda compatability
    ensure_cuda_compatability()
    # I instantiate the tokenizer as a global variable, as the .map function in transform_datasets was not working properly. 
    # This should not be an issue, as the tokenizer remains consistent during training and evaluation.
    global tokenizer
    # retrieve model and tokenizer from huggingface to prepare dataset
    model, tokenizer = preprocess_model(model_name)
    
    #retrieve the unprocessed data from the csv files
    entire_dataset = load_CACAPO_data()
    
    # process the dataset and split it into its natural train, val, test split
    train_ds, val_ds, test_ds = transform_datasets(entire_dataset)

    # setup the training arguments 
    # parameters = (model_name, learning_rate, num_train_epochs, evaluation_strategy, generation_num_beams, gradient_accumulation_steps, per_device_train_batch_size, per_device_eval_batch_size)
    training_args = set_training_args(model_name, 0.001, 1, 'epoch', 10, 4, 8, 8)

    # create a trainer instance 
    trainer = set_trainer(model_name, training_args, train_ds, val_ds, tokenizer)

    # Finally fine-tune the model and save it
    train_and_save(trainer, model_name)

def main():
    model_name = 't5-base'
    fine_tune_model(model_name)

In [6]:
main()

Torch version: 1.12.1
Cuda version: 11.3
Cudnn version: 8302
Is cuda available: True
Number of cuda devices: 1
Current default device: 0
First cuda device: <torch.cuda.device object at 0x0000015290B19188>
Name of the first cuda device: NVIDIA GeForce GTX 1070




For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Using custom data configuration Cleaned_data-bcb014efcf526ad6
Found cached dataset csv (C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-bcb014efcf526ad6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


LOGGING: preprocess_model DONE 

LOGGING: load_CACAPO_data DONE 



100%|██████████| 3/3 [00:00<00:00, 599.53it/s]
  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
  0%|          | 0/1 [00:00<?, ?ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]


LOGGING: transform_datasets DONE 

LOGGING: set_training_args DONE 



Using cuda_amp half precision backend
***** Running training *****
  Num examples = 717
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 22


LOGGING: set_trainer DONE 



  0%|          | 0/22 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  5%|▍         | 1/22 [00:06<02:14,  6.42s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpoi

LOGGING: load_eval_metrics DONE 


 Calculating Blue

 Calculating rouge

 Calculating meteor

 Calculating perplexity


loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--gpt2\snapshots\909a290700bd99135e67c64eefc166960b67cfd2\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfor


 Calculating bertscore


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--roberta-large\snapshots\5069d8a2a32a7df4c69ef9b56348be04152a2341\config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at C:\Users\Simon/.cac

{'eval_loss': 0.2718319892883301, 'eval_ blue_output': {'bleu': 0.07287266233295649, 'precisions': [0.5236593059936908, 0.2791970802919708, 0.16846652267818574, 0.10290237467018469], 'brevity_penalty': 0.32478051013616505, 'length_ratio': 0.4706755753526355, 'translation_length': 634, 'reference_length': 1347}, 'eval_\n\n rouge_output': {'rouge1': 0.4567588175569386, 'rouge2': 0.24496511777338179, 'rougeL': 0.3927351243033804, 'rougeLsum': 0.3917153867199384}, 'eval_\n\n meteor_results': {'meteor': 0.32032203156915295}, 'eval_\n\n perp_results': {'perplexities': [20.996877670288086, 31.80162811279297, 130.133056640625, 21.80083465576172, 314.10064697265625, 307.6853942871094, 43.46910858154297, 63.4985237121582, 48.2667350769043, 199.16017150878906, 239.6354522705078, 53.45401382446289, 188.5612335205078, 75.34443664550781, 110.3641586303711, 284.09051513671875, 114.99337768554688, 390.0467834472656, 2665.14013671875, 1136.6309814453125, 1509.013671875, 104.45072937011719, 2010.5133056

Model weights saved in ../Models/t5-base\pytorch_model.bin
tokenizer config file saved in ../Models/t5-base\tokenizer_config.json
Special tokens file saved in ../Models/t5-base\special_tokens_map.json
