In [1]:
import os.path
import pickle
import pandas as pd
import numpy as np
import re
import torch 
import nltk

from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline, AutoModelForSeq2SeqLM
import datasets
import evaluate

#import Evaluation_Code.Parent as parent ## code for PARENT metric
import Evaluation_Code.Bartscore as bartscore ## code for Bartscore

import json
from ast import literal_eval


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def ensure_cuda_compatability():
    print(f'Torch version: {torch.__version__}')
    print(f'Cuda version: {torch.version.cuda}')
    print(f'Cudnn version: {torch.backends.cudnn.version()}')
    print(f'Is cuda available: {torch.cuda.is_available()}')
    print(f'Number of cuda devices: {torch.cuda.device_count()}')
    print(f'Current default device: {torch.cuda.current_device()}')
    print(f'First cuda device: {torch.cuda.device(0)}')
    print(f'Name of the first cuda device: {torch.cuda.get_device_name(0)}\n\n')


def preprocess_model(model_name):
    """
    Setup the model and tokenizer for preprocessing. This will be a pre-trained model collected from huggingface
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.cuda()

    print('LOGGING: preprocess_model DONE \n')
    return model, tokenizer


def load_CACAPO_data():
    """
    This function retrieves the csv files and creates a dataset
    """
    print('LOGGING: load_CACAPO_data DONE \n')

    return datasets.load_dataset("../Data/Cleaned_data/", data_files={"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"})


def preprocess_data(data):
    """
    Tokenize the data
    """
    max_length = 256
    RDFs = data["input"]
    texts = data["output"]

    ## When converting a pandas df to csv (used for loading dataset), a list of lists can transform to a long string
    ## Here we convert it back with literal_eval

    for rdf_iteration, rdf in enumerate(RDFs):
        RDFs[rdf_iteration] = literal_eval(rdf)

    model_inputs = tokenizer(RDFs, truncation=True, padding='max_length', return_tensors='pt',  max_length=max_length, is_split_into_words=True).to('cuda')
    
    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    with tokenizer.as_target_tokenizer():
        target_texts = tokenizer(texts, padding='max_length', truncation=True, return_tensors='pt',  max_length=max_length).input_ids
        
    model_inputs["labels"] = target_texts.to('cuda')

    return model_inputs



def transform_datasets(dataset):
    """
    After loading in and creating the initial dataset, the text data is transformed, by tokenizing the input and output texts. The initial dataset is also split into train,val,test for training use.
    NOTE That the test set will not be preprocessed here yet, this will be done in a different function
    """

    ## Create smaller versions of the dataset
    small_train = dataset["train"].shard(num_shards = 1, index = 0)
    small_val = dataset["dev"].shard(num_shards = 1, index = 0)
    small_test = dataset["test"].shard(num_shards = 1, index = 0)

    # to use the actual articles for evaluation
    true_articles_test = small_test['output']
    # The Parent Metric requires the original RDFs
    test_rdf_input = small_test['input']


    ## Process the data in batches
    small_train = small_train.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)
    small_val = small_val.map(preprocess_data, batched=True, remove_columns=dataset["dev"].column_names)
    small_test = small_test.map(preprocess_data, batched=True, remove_columns=dataset["test"].column_names)

    # transform the datasets into torch sensors, as the model will expect this format
    small_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'], device="cuda")
    small_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'], device="cuda")
    small_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'], device="cuda")

    print('LOGGING: transform_datasets DONE \n')

    return small_train, small_val, small_test, true_articles_test, test_rdf_input



def load_eval_metrics():
    """
    Loads in all metrics that will be used later on during evaluation. This is seperated to not load in the metrics a dozen of times during training.
    """
    bleu = datasets.load_metric("bleu")
    rouge = evaluate.load('rouge')
    meteor = evaluate.load('meteor')
    perplexity = evaluate.load("perplexity", module_type="metric")
    bertscore = evaluate.load("bertscore")
    bart_scorer = bartscore.BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')

    print('LOGGING: load_eval_metrics DONE \n')

    return bleu, rouge, meteor, perplexity, bertscore, bart_scorer



def postprocess_text(preds, labels):
    """
    Supplementary Method called in decode_text.

    Returns list of split decoded labels and predictions for evaluation
    """
    preds = [pred.split() for pred in preds]
    labels = [[label.split()] for label in labels]
    return preds, labels



def decode_text(predictions, labels):
    """
    Supplementary Method called in compute_metrics.

    Returns decoded labels and predictions for evaluation
    """
    if isinstance(predictions, tuple):
            predictions = predictions[0]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)#.to(device)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)#.to(device)

    return decoded_preds, decoded_labels


def evaluate_texts(decoded_preds, decoded_labels):
    """
    Calculates metrics given a list of decoded predictions and decoded labels
    """
    #post_process for BLEU
    blue_preds, blue_labels = postprocess_text(decoded_preds,  decoded_labels)

    # setup metrics for use
    bleu, rouge, meteor, perplexity, bertscore, bart_scorer = load_eval_metrics()

    # Calculate the metrics
    print(f'\n LOGGING: Calculating Blue')
    bleu_output = bleu.compute(predictions=blue_preds, references=blue_labels)
    print(f'\n LOGGING: Calculating Rouge')
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print(f'\n LOGGING: Calculating Meteor')
    meteor_output = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    print(f'\n LOGGING: Calculating Perplexity')
    perp_output = perplexity.compute(predictions=decoded_preds, model_id='gpt2')
    print(f'\n LOGGING: Calculating Bertscore')
    bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    print(f'\n LOGGING: Calculating Bartscore')
    bart_scores_output = bart_scorer.score(srcs=decoded_preds, tgts=decoded_labels, batch_size=8)
    ### Need to add parent

    return bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output



def compute_metrics(pred):
    """"
    Metrics to be evaluated during training and validation
    Metrics used: BLEU, ROUGE, METEOR, Bertscore, BARTScore
    """
    # decode the predictions and labels for eval
    predictions, labels = pred
    decoded_preds, decoded_labels = decode_text(predictions, labels)

    bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output = evaluate_texts(decoded_preds, decoded_labels)

    ## Huggingsface trainer requires a dict if multiple metrics are used
    return {"blue_output": bleu_output, "rouge_output": rouge_output, "meteor_results": meteor_output,
            "perp_output": perp_output, "bertscore_output": bertscore_output, "bart_scores_output": bart_scores_output}



def set_training_args(model_name, learning_rate, num_train_epochs, evaluation_strategy, generation_num_beams, gradient_accumulation_steps, per_device_train_batch_size, per_device_eval_batch_size):
    """
    Setup the training arguments that will be used during training.
    """
    #model_name = "t5-fp16-test"
    model_dir = f"../Results/{model_name}"

    training_args = Seq2SeqTrainingArguments(
                output_dir=model_dir,
                learning_rate=learning_rate,
                do_eval=True, # will be set to true if evaluation strategy is set
                do_predict=True, #Whether to run predictions on the test set or not.
                num_train_epochs=num_train_epochs,
                evaluation_strategy= evaluation_strategy,
                #eval_steps= 100, # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
                save_steps=500, # Number of updates steps before two checkpoint saves if save_strategy="steps".
                #max_steps=10, # the total number of training steps to perform
                save_total_limit= 10, # the maximum number of models to keep before deleting the oldest one
                predict_with_generate=True, # Whether to use generate to calculate generative metrics (ROUGE, BLEU).
                generation_num_beams=generation_num_beams,  #The num_beams to use on each evaluation loop when predict_with_generate=True. Will default to the num_beams value of the model configuration
                gradient_checkpointing=True, #
                gradient_accumulation_steps=gradient_accumulation_steps, #Number of updates steps to accumulate the gradients for, before performing a backward/update pass
                per_device_train_batch_size=per_device_train_batch_size, #The batch size per GPU/TPU core/CPU for training.
                per_device_eval_batch_size=per_device_eval_batch_size, #The batch size per GPU/TPU core/CPU for evaluation.
                optim="adafactor", #The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor.
                #report_to="tensorboard",
                fp16=True
    )

    print('LOGGING: set_training_args DONE \n')

    return training_args


def get_clean_model(model_name):
    """
    Simple function to ensure that a new model is used for finetuning
    """
    return AutoModelForSeq2SeqLM.from_pretrained(model_name)


def set_trainer(model_name, training_args, train_ds, val_ds, tokenizer):
    """
    Initializes a trainer
    Takes in: Model name, training arguments, training dataset, validation dataset, and tokenizer
    Returns: Trainer instance
    """
    clean_model = get_clean_model(model_name)
    trainer = Seq2SeqTrainer(
                model=clean_model,
                args=training_args,
                train_dataset=train_ds,
                eval_dataset=val_ds,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer
                )

    print('LOGGING: set_trainer DONE \n')

    return trainer

def train_and_save(trainer, model_name):
    trainer.train()
    trainer.save_model(f"../Models/{model_name}")

    print('LOGGING: train_and_save DONE \n')


def get_saved_model(model_name):
    saved_model = T5ForConditionalGeneration.from_pretrained(f'../Models/{model_name}', local_files_only=True)
    saved_model.cuda()
    tokenizer = AutoTokenizer.from_pretrained(f'../Models/{model_name}' ,local_files_only=True)
    return saved_model, tokenizer



def generate_predictions(saved_model, test_set):
    """
    Generates predictions based on the test set, returns a list of predictions and the corresponding "true" articles
    """
    encoded_inputs = test_set.remove_columns("labels")
    encoded_inputs["input_ids"].to('cuda')
    encoded_inputs["attention_mask"].to('cuda')
    #encoded_inputs.to('cuda')
    # set-up a dataloader to load in the tokenized test dataset
    test_dataloader = torch.utils.data.DataLoader(encoded_inputs, batch_size=8)

    # generate text for each batch
    all_predictions = []
    for i,batch in enumerate(test_dataloader):
        predictions = saved_model.generate(**batch) # .to(device) --> RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)
        all_predictions.append(predictions)

    # flatten predictions
    all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

    print('LOGGING: generate_predictions DONE \n')


    return all_predictions_flattened #, true_articles


def decode_predictions(predictions, tokenizer):
    """
    Decode the predictions made by the model
    """
    decoded_predictions = []

    for iteration, prediction in enumerate(predictions):
        decoded_predictions.append(tokenizer.decode(prediction,skip_special_tokens=True))

    print('LOGGING: decode_predictions DONE \n')

    return decoded_predictions



def evaluate_test_set(model_name, test_set, true_articles_test, test_rdf_input):
    """
    Transforms test set, retrieves predictions, and evaluates these predictions
    """
    #parent_attempt(model_name, _, _, _)
    saved_model, saved_tokenizer = get_saved_model(model_name)

    #predictions, test_articles = generate_predictions(saved_model, test_set)
    predictions = generate_predictions(saved_model, test_set)

    #predictions = generate_pred(trainer, test_set)

    decoded_test_predictions = decode_predictions(predictions, saved_tokenizer)

    #bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output = evaluate_texts(decoded_test_predictions, test_articles)

    #evaluation_results = {"blue_output": bleu_output, "rouge_output": rouge_output, "meteor_results": meteor_output, "perp_output": perp_output, "bertscore_output": bertscore_output, "bart_scores_output": bart_scores_output}

    #log_results(model_name, evaluation_results)

    ##Additional PARENT evaluation
    tables = test_rdf_input
    references = true_articles_test
    generations = decoded_test_predictions
    parent_attempt(model_name, generations, references, tables)
    

    #

    #bleu_output, rouge_output, meteor_output, perp_output, bertscore_output, bart_scores_output = evaluate_texts(decoded_test_predictions, test_articles)

    ## Huggingsface trainer requires a dict if multiple metrics are used
    #return evaluation_results


def write_to_text_parent(model_name, decoded_predictions, true_articles, rdfs):


    with open('../Parent_test/true_articles.txt', 'w', encoding='utf-8') as f:
        for articles in true_articles:
            f.write(f'{articles} \n')

    with open('../Parent_test/decode_predictions.txt', 'w', encoding='utf-8') as f:
        for predictions in decoded_predictions:
            f.write(f'{predictions} \n')

    with open('../Parent_test/rdfs.txt', 'w', encoding='utf-8') as f:
        for pairs in rdfs:
            f.write(f'{pairs} \n')


def prepare_inputs_parent(RDFs):
    """
    Cleans the RDF pairs and transforms them in the proper format so that the parent module can calculate with it.
    """

    attribute_value_pairs = []

    for iteration, inputRDF in enumerate(RDFs):
        split_RDF = inputRDF.split(", ")
        entry=[]
        for connected_pair in split_RDF:
            if '[' in connected_pair:
                connected_pair = connected_pair.replace('[', '')
            if ']' in connected_pair:
                connected_pair = connected_pair.replace(']', '')
            if '_' in connected_pair:
                connected_pair = connected_pair.replace('_', ' ')
            split_pair = tuple(connected_pair.split(' | '))
            entry.append((split_pair))
        attribute_value_pairs.append(entry)
    return attribute_value_pairs


def parent_attempt(model_name, generations, references, rdfs):
    """
    The Parent metric needs special treatment, as it only accepts specific inputs and file types.
    """
    prepared_rdfs = prepare_inputs_parent(rdfs)
    #prepared_text = remove_unk_char(references)
    write_to_text_parent(model_name, generations, references, prepared_rdfs)

    %run -i "~E:/ArriaThesis/MscThesis/Code/Evaluation_Code/Parent.py" --references "E:/ArriaThesis/MscThesis/Parent_test/true_articles.txt" \
                                                                        --generations "E:/ArriaThesis/MscThesis/Parent_test/decode_predictions.txt"  \
                                                                        --tables "E:/ArriaThesis/MscThesis/Parent_test/rdfs.txt"

def log_results(model_name, results):
    with open(f'../Logging_Results/{model_name}_logResults.json', 'w') as convert_file:
        convert_file.write(json.dumps(results))

def fine_tune_model(model_name):
    # ensure cuda compatability
    ensure_cuda_compatability()

    # I instantiate the tokenizer as a global variable, as the .map function in transform_datasets was not working properly. 
    # This should not be an issue, as the tokenizer remains consistent during training and evaluation.
    global tokenizer
    
    #global device
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    ## retrieve model and tokenizer from huggingface to prepare dataset
    model, tokenizer = preprocess_model(model_name)
    
    ### retrieve the unprocessed data from the csv files
    entire_dataset = load_CACAPO_data()
    
    ### process the dataset and split it into its natural train, val, test split
    train_ds, val_ds, test_ds, true_articles_test, test_rdf_inputs = transform_datasets(entire_dataset)

    #### setup the training arguments 
    #### parameters = (model_name, learning_rate, num_train_epochs, evaluation_strategy, generation_num_beams, gradient_accumulation_steps, per_device_train_batch_size, per_device_eval_batch_size)
    #training_args = set_training_args(model_name, 0.005, 1, 'epoch', 5, 2, 8, 8)

    # ###create a trainer instance 
    #trainer = set_trainer(model_name, training_args, train_ds, val_ds, tokenizer)

    #### Finally fine-tune the model and save it
    #train_and_save(trainer, model_name)

    testset_evaluation_results = evaluate_test_set( model_name, test_ds, true_articles_test, test_rdf_inputs)
    #testset_evaluation_results = evaluate_test_set(_, model_name)

    #return testset_evaluation_results

def main():
    global model_name
    model_name = 't5-base'  #'google/mt5-base'
    results = fine_tune_model(model_name)
    print(results)

In [4]:
main()

Torch version: 1.12.1
Cuda version: 11.3
Cudnn version: 8302
Is cuda available: True
Number of cuda devices: 1
Current default device: 0
First cuda device: <torch.cuda.device object at 0x0000016A709F5148>
Name of the first cuda device: NVIDIA GeForce GTX 1070




For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Using custom data configuration Cleaned_data-4b8a9b4c5ecd8560
Found cached dataset csv (C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-4b8a9b4c5ecd8560/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


LOGGING: preprocess_model DONE 

LOGGING: load_CACAPO_data DONE 



100%|██████████| 3/3 [00:00<00:00, 499.48it/s]
  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
 94%|█████████▍| 15/16 [00:02<00:00,  7.17ba/s]
 50%|█████     | 1/2 [00:00<00:00,  2.59ba/s]
 75%|███████▌  | 3/4 [00:00<00:00,  7.38ba/s]


LOGGING: transform_datasets DONE 





text_writing_pairs: [("'accidentAddress", "southeast Houston'"), ("'accidentDate", "overnight'"), ("'shootingType", "violent shooting'")]
text_writing_pairs: [("'accidentAddress", "La Revolucion Sports Bar on San Antonio and Channelside Streets'"), ("'victimGender", "men'"), ("'victimNumber", "at least two'")]
text_writing_pairs: [("'victimNumber", "One'"), ("'victimNumber", "as many as four'"), ("'victimStatus", "critical condition'")]
text_writing_pairs: [("'hospitalName", "Bayshore Medical Center'"), ("'takenToHospital", "True'"), ("'victimNumber", "Two'")]
text_writing_pairs: [("'suspectStatus", "on the run'"), ("'suspectVehicle", "baby blue Chrysler 300'")]
text_writing_pairs: [("'accidentAddress", "west Phoenix house party'"), ("'shootingType", "drive-by shooting'"), ("'victimAge", "19-year-old'"), ("'victimGender", "woman'"), ("'victimStatus", "fatally wounded'")]
text_writing_pairs: [("'accidentDate", "early Sunday'"), ("'shootingType", "shooting'"), ("'takenToHospital", "True'

I1021 23:38:52.950217 19692 Parent.py:463] Evaluated 3028 examples.
I1021 23:38:53.867279 19692 Parent.py:465] Precision = 0.1848 Recall = 0.0001 F-score = 0.0002


FileNotFoundError: [Errno 2] No such file or directory: 'E:/ArriaThesis/MscThesis/Logging_Results/t5-baselogResults.json'

None


Models voor experimenten:

- yhavinga/t5-base-dutch --> puurly dutch
- t5-base --> english
- google/mT5-base --> multilingual