In [2]:
import os.path
import pickle
import pandas as pd
import numpy as np
import re
import torch 
import nltk

from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline, AutoModelForSeq2SeqLM
import datasets
import evaluate

import Evaluation_Code.Parent as parent ## code for PARENT metric
import Evaluation_Code.Bartscore as bartscore ## code for Bartscore

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def ensure_cuda_compatability():
    """
    Print a short report of the installed Torch / CUDA / cuDNN stack and the visible GPU(s).

    The general information (versions, availability, device count) is always printed.
    The per-device details (current device, handle and name of device 0) are only
    queried when CUDA is usable, because torch raises on those calls otherwise.
    """
    cuda_available = torch.cuda.is_available()

    print(f'Torch version: {torch.__version__}')
    print(f'Cuda version: {torch.version.cuda}')
    print(f'Cudnn version: {torch.backends.cudnn.version()}')
    print(f'Is cuda available: {cuda_available}')
    print(f'Number of cuda devices: {torch.cuda.device_count()}')

    if not cuda_available:
        # current_device() / get_device_name() raise without a usable GPU, which would
        # turn this diagnostic helper into a crash on exactly the machines it should diagnose.
        print('No usable cuda device found, skipping the per-device details\n\n')
        return

    print(f'Current default device: {torch.cuda.current_device()}')
    print(f'First cuda device: {torch.cuda.device(0)}')
    print(f'Name of the first cuda device: {torch.cuda.get_device_name(0)}\n\n')


def preprocess_model(model_name):
    """
    Fetch the pre-trained checkpoint `model_name` together with its tokenizer.

    The tokenizer is loaded first (AutoTokenizer), then the seq2seq model (AutoModelForSeq2SeqLM).
    Returns: (model, tokenizer)
    """
    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)
    pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    print('LOGGING: preprocess_model DONE \n')

    loaded = (pretrained_model, pretrained_tokenizer)
    return loaded


def load_CACAPO_data(data_dir="../Data/Cleaned_data/"):
    """
    Load the cleaned CACAPO csv files into a dataset with the splits "train", "dev" and "test".

    data_dir: directory that contains Train.csv, Dev.csv and Test.csv
              (defaults to the location used by this project).
    Returns: whatever datasets.load_dataset returns for these three csv files.
    """
    split_files = {"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"}
    dataset = datasets.load_dataset(data_dir, data_files=split_files)

    # Only report success once loading has actually happened, so a failed load
    # does not leave a misleading "DONE" line in the log.
    print('LOGGING: load_CACAPO_data DONE \n')

    return dataset


def preprocess_data(data, max_length=256):
    """
    Tokenize one batch of examples for seq2seq training.

    data: mapping with the columns "input" (source RDF strings) and "output" (target texts).
    max_length: every sequence is padded / truncated to exactly this many tokens.

    Uses the module-level `tokenizer` (set in fine_tune_model).
    Returns the tokenizer output for the inputs, extended with a "labels" entry that holds
    the target token ids, where every padding position is replaced by -100.
    """
    RDFs = data["input"]
    texts = data["output"]

    model_inputs = tokenizer(RDFs, truncation=True, padding='max_length', return_tensors='pt', max_length=max_length)

    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(texts, padding='max_length', truncation=True, return_tensors='pt', max_length=max_length).input_ids

    # Padding in the labels must not contribute to the loss: -100 is the value the
    # Hugging Face seq2seq models skip when computing it (decode_text maps it back to
    # the pad token before decoding). Without this, most of the loss comes from padding.
    target_ids = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

    model_inputs["labels"] = target_ids

    return model_inputs


def transform_datasets(dataset, num_shards=256):
    """
    Turn the raw "train" / "dev" / "test" splits into small, tokenized, torch-formatted datasets.

    For every split:
      1. keep only shard 0 out of `num_shards` shards (a smaller version of the split),
      2. tokenize it in batches with preprocess_data, dropping the original columns,
      3. expose 'input_ids', 'attention_mask' and 'labels' as torch tensors, as the model expects this format.

    dataset: mapping with the splits "train", "dev" and "test".
    num_shards: number of shards each split is divided into; only the first one is kept.
    Returns: (small_train, small_val, small_test)
    """
    tensor_columns = ['input_ids', 'attention_mask', 'labels']

    processed_splits = []
    for split_name in ("train", "dev", "test"):
        full_split = dataset[split_name]

        ## Create a smaller version of the split
        subset = full_split.shard(num_shards=num_shards, index=0)

        ## Process the data in batches
        subset = subset.map(preprocess_data, batched=True, remove_columns=full_split.column_names)

        # transform the dataset into torch tensors
        subset.set_format(type='torch', columns=tensor_columns)

        processed_splits.append(subset)

    print('LOGGING: transform_datasets DONE \n')

    small_train, small_val, small_test = processed_splits
    return small_train, small_val, small_test


def load_eval_metrics():
    """
    Loads in all metrics that will be used later on during evaluation.

    The metrics are loaded only on the first call; the resulting tuple is cached on the
    function itself and handed back on every later call, so that repeated evaluations
    during training do not reload them (the BART scorer in particular) a dozen times.

    Returns: (bleu, rouge, meteor, perplexity, bertscore, bart_scorer)
    """
    cached_metrics = getattr(load_eval_metrics, "_cached_metrics", None)
    if cached_metrics is not None:
        return cached_metrics

    bleu = datasets.load_metric("bleu")
    rouge = evaluate.load('rouge')
    meteor = evaluate.load('meteor')
    perplexity = evaluate.load("perplexity", module_type="metric")
    bertscore = evaluate.load("bertscore")

    # Fall back to the CPU instead of failing when no cuda device is usable
    scorer_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    bart_scorer = bartscore.BARTScorer(device=scorer_device, checkpoint='facebook/bart-large-cnn')

    print('LOGGING: load_eval_metrics DONE \n')

    load_eval_metrics._cached_metrics = (bleu, rouge, meteor, perplexity, bertscore, bart_scorer)
    return load_eval_metrics._cached_metrics

def postprocess_text(preds, labels):
    """
    Supplementary Method called in compute_metrics (on the output of decode_text).

    Splits every decoded prediction on whitespace, and splits every decoded label on
    whitespace and wraps it in a one-element list (a single reference per prediction).

    Returns: (list of token lists, list of single-reference lists of token lists)
    """
    tokenized_preds = []
    for prediction in preds:
        tokenized_preds.append(prediction.split())

    wrapped_references = []
    for reference in labels:
        wrapped_references.append([reference.split()])

    return tokenized_preds, wrapped_references

def decode_text(predictions, labels):
    """
    Supplementary Method called in compute_metrics.

    predictions: token ids of the generated texts; if a tuple is passed, its first element is used.
    labels: token ids of the reference texts.

    Both may contain -100 as a filler value. It is replaced by the tokenizer's pad token id
    before decoding, as -100 can't be decoded. Uses the module-level `tokenizer`.

    Returns: (decoded predictions, decoded labels), special tokens skipped.
    """
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 in the predictions as well: generated sequences can be padded
    # with -100 when batches are gathered, and batch_decode fails on negative ids.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    return decoded_preds, decoded_labels


def compute_metrics(pred):
    """
    Metrics to be evaluated during training and validation.
    Metrics computed here: BLEU, ROUGE, METEOR, Perplexity (model_id 'gpt2'), BERTScore, BARTScore.
    PARENT is not computed yet.

    pred: a (predictions, labels) pair of token ids.
    Returns: dict with one entry per metric, holding that metric's raw output.
    """
    # turn the token ids back into text
    raw_predictions, raw_labels = pred
    pred_texts, reference_texts = decode_text(raw_predictions, raw_labels)

    # BLEU works on whitespace-split tokens rather than on plain strings
    bleu_preds, bleu_references = postprocess_text(pred_texts, reference_texts)

    # fetch the metric objects
    bleu, rouge, meteor, perplexity, bertscore, bart_scorer = load_eval_metrics()

    ## Huggingsface trainer requires a dict if multiple metrics are used
    results = {}

    print('\n LOGGING: Calculating Blue')
    results["blue_output"] = bleu.compute(predictions=bleu_preds, references=bleu_references)

    print('\n LOGGING: Calculating Rouge')
    results["rouge_output"] = rouge.compute(predictions=pred_texts, references=reference_texts)

    print('\n LOGGING: Calculating Meteor')
    results["meteor_results"] = meteor.compute(predictions=pred_texts, references=reference_texts)

    print('\n LOGGING: Calculating Perplexity')
    results["perp_output"] = perplexity.compute(predictions=pred_texts, model_id='gpt2')

    print('\n LOGGING: Calculating Bertscore')
    results["bertscore_output"] = bertscore.compute(predictions=pred_texts, references=reference_texts, lang="en")

    print('\n LOGGING: Calculating Bartscore')
    results["bart_scores_output"] = bart_scorer.score(srcs=pred_texts, tgts=reference_texts, batch_size=8)

    ### Need to add parent

    return results


def set_training_args(model_name, learning_rate, num_train_epochs, evaluation_strategy, generation_num_beams, gradient_accumulation_steps, per_device_train_batch_size, per_device_eval_batch_size):
    """
    Build the Seq2SeqTrainingArguments used for a fine-tuning run.

    The values passed in by the caller are forwarded under the same names; all remaining
    options are fixed in this function. Outputs are written to ../Results/<model_name>.

    Returns: the Seq2SeqTrainingArguments instance
    """
    # options that are the same for every run
    fixed_options = {
        "output_dir": f"../Results/{model_name}",
        "do_eval": True,  # will be set to true if evaluation strategy is set
        "do_predict": True,  # Whether to run predictions on the test set or not.
        "save_steps": 500,  # Number of updates steps before two checkpoint saves if save_strategy="steps".
        "save_total_limit": 10,  # the maximum number of models to keep before deleting the oldest one
        "predict_with_generate": True,  # Whether to use generate to calculate generative metrics (ROUGE, BLEU).
        "gradient_checkpointing": True,
        "optim": "adafactor",  # The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor.
        "fp16": True,
        # currently disabled options: eval_steps=100, max_steps=10, report_to="tensorboard"
    }

    # options chosen by the caller
    caller_options = {
        "learning_rate": learning_rate,
        "num_train_epochs": num_train_epochs,
        "evaluation_strategy": evaluation_strategy,
        "generation_num_beams": generation_num_beams,  # The num_beams to use on each evaluation loop when predict_with_generate=True.
        "gradient_accumulation_steps": gradient_accumulation_steps,  # Number of updates steps to accumulate the gradients for, before performing a backward/update pass
        "per_device_train_batch_size": per_device_train_batch_size,  # The batch size per GPU/TPU core/CPU for training.
        "per_device_eval_batch_size": per_device_eval_batch_size,  # The batch size per GPU/TPU core/CPU for evaluation.
    }

    training_args = Seq2SeqTrainingArguments(**fixed_options, **caller_options)

    print('LOGGING: set_training_args DONE \n')

    return training_args


def get_clean_model(model_name):
    """
    Load the pre-trained seq2seq checkpoint `model_name` again, so that
    finetuning starts from a new model instance.
    """
    fresh_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return fresh_model


def set_trainer(model_name, training_args, train_ds, val_ds, tokenizer):
    """
    Initializes a Seq2SeqTrainer around a newly loaded copy of the model.

    Takes in: Model name, training arguments, training dataset, validation dataset, and tokenizer
    Returns: Trainer instance (evaluation is done with compute_metrics)
    """
    trainer_options = {
        "model": get_clean_model(model_name),  # always start from an untouched checkpoint
        "args": training_args,
        "train_dataset": train_ds,
        "eval_dataset": val_ds,
        "compute_metrics": compute_metrics,
        "tokenizer": tokenizer,
    }
    trainer = Seq2SeqTrainer(**trainer_options)

    print('LOGGING: set_trainer DONE \n')

    return trainer

def train_and_save(trainer, model_name):
    """
    Run the training loop of `trainer` and afterwards save the model to ../Models/<model_name>
    """
    trainer.train()

    save_path = f"../Models/{model_name}"
    trainer.save_model(save_path)

In [6]:
def fine_tune_model(model_name):
    """
    Run the complete fine-tuning pipeline for the checkpoint `model_name`:
    report the cuda setup, load the tokenizer, load and tokenize the data,
    set up the training arguments and the trainer, then train and save the model.

    Side effect: sets the module-level `tokenizer`, which preprocess_data and decode_text read.
    """
    # I instantiate the tokenizer as a global variable, as the .map function in transform_datasets was not working properly.
    # This should not be an issue, as the tokenizer remains consistent during training and evaluation.
    global tokenizer

    # ensure cuda compatability
    ensure_cuda_compatability()

    # retrieve model and tokenizer from huggingface to prepare dataset
    model, tokenizer = preprocess_model(model_name)

    # Only the tokenizer is needed here: set_trainer loads its own clean copy of the model.
    # Release this one right away so it is not kept in memory for the whole training run.
    del model

    # retrieve the unprocessed data from the csv files
    entire_dataset = load_CACAPO_data()

    # process the dataset and split it into its natural train, val, test split
    # (the test split is not used during fine-tuning)
    train_ds, val_ds, _test_ds = transform_datasets(entire_dataset)

    # setup the training arguments
    training_args = set_training_args(
        model_name,
        learning_rate=0.001,
        num_train_epochs=1,
        evaluation_strategy='epoch',
        generation_num_beams=10,
        gradient_accumulation_steps=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
    )

    # create a trainer instance
    trainer = set_trainer(model_name, training_args, train_ds, val_ds, tokenizer)

    # Finally fine-tune the model and save it
    train_and_save(trainer, model_name)

def main():
    """
    Entry point of the notebook: fine-tunes the 't5-base' checkpoint.
    """
    fine_tune_model('t5-base')

In [8]:
# Start the fine-tuning run: main() calls fine_tune_model with the 't5-base' checkpoint.
main()

Torch version: 1.12.1
Cuda version: 11.3
Cudnn version: 8302
Is cuda available: True
Number of cuda devices: 1
Current default device: 0
First cuda device: <torch.cuda.device object at 0x000002CDE0244A88>
Name of the first cuda device: NVIDIA GeForce GTX 1070




Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--t5-base\snapshots\23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalt

LOGGING: preprocess_model DONE 

LOGGING: load_CACAPO_data DONE 




100%|██████████| 3/3 [00:00<00:00, 88.31it/s]
Loading cached processed dataset at C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-bcb014efcf526ad6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-1c80317fa3b1799d.arrow
Loading cached processed dataset at C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-bcb014efcf526ad6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-bdd640fb06671ad1.arrow
Loading cached processed dataset at C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-bcb014efcf526ad6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-3eb13b9046685257.arrow
PyTorch: setting up devices


LOGGING: transform_datasets DONE 



The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


LOGGING: set_training_args DONE 



loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--t5-base\snapshots\23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
  

LOGGING: set_trainer DONE 



Using bos_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
***** Running training *****
  Num examples = 2867
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 89

[AYou're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
