In [1]:
import WebNLG_xmlReader.benchmark_reader as xml_reader
import os.path
import pickle
import pandas as pd
import numpy as np
import re
import tensorflow as tf

In [2]:
#gpus = tf.config.list_physical_devices('GPU')

# if gpus:
#     for gpu in gpus:
#         tf.config.experimental.set_virtual_device_configuration(gpu,[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
#         tf.config.experimental.set_memory_growth(gpu,True)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
  try:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=4096)]) 
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


# Dataset Cleaning and Preperations

CACAPO contains data for both pipeline and neural end-to-end structures. As this project only focuses on E2E models, we will not need a majority of the data. The code below extracts the data and makes it easier to retrieve for model fine-tuning

In [3]:
combined_train_dataset = [
                '../Data/CACAPO/en/Incidents/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/en/Sports/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/en/Stocks/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/en/Weather/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Incidents/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Sports/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Stocks/WebNLGFormatTrain.xml', 
                '../Data/CACAPO/nl/Weather/WebNLGFormatTrain.xml']

combined_dev_dataset = [
                '../Data/CACAPO/en/Incidents/WebNLGFormatDev.xml', 
                '../Data/CACAPO/en/Sports/WebNLGFormatDev.xml', 
                '../Data/CACAPO/en/Stocks//WebNLGFormatDev.xml', 
                '../Data/CACAPO/en/Weather/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Incidents/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Sports/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Stocks/WebNLGFormatDev.xml', 
                '../Data/CACAPO/nl/Weather/WebNLGFormatDev.xml']

combined_test_dataset = [
                '../Data/CACAPO/en/Incidents/WebNLGFormatTest.xml', 
                '../Data/CACAPO/en/Sports/WebNLGFormatTest.xml', 
                '../Data/CACAPO/en/Stocks/WebNLGFormatTest.xml', 
                '../Data/CACAPO/en/Weather/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Incidents/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Sports/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Stocks/WebNLGFormatTest.xml', 
                '../Data/CACAPO/nl/Weather/WebNLGFormatTest.xml']

all_data = [combined_train_dataset, combined_dev_dataset, combined_test_dataset]

In [4]:

# create instance of benchmark class to transform xml
train_instance =  xml_reader.Benchmark()
dev_instance =  xml_reader.Benchmark()
test_instance =  xml_reader.Benchmark()



def transform_data(data):
    """
    This function calls the xml_reader code to transform the xml into a more suitable code version to use.
    The function takes in the 3 different datasets, which it then transforms and collects into usable variables.
    """
    labels = ["Train", "Dev", "Test"]
    #loop through the different datasets groups and transform the xml into usable code
    for iteration, datasets in enumerate(data):
        #choose the right files
        files = xml_reader.select_files(datasets)
        
        #For each datasplit, transform the xml and store the transformation into a usable variable
        try:
            if datasets == combined_train_dataset:
                train_instance.fill_benchmark(files)
            elif datasets == combined_dev_dataset:
                dev_instance.fill_benchmark(files)
            elif datasets == combined_test_dataset:
                test_instance.fill_benchmark(files)
            
            print(f'Completed the transformation of the {labels[iteration]} datasets \n')
        except:
            print("Error: The proper datasets have not been found. Please check that all dataset splits are available")
    
    return train_instance, dev_instance, test_instance


def total_data_check(data_instance, iteration):
    labels = ['Train', 'Dev', 'Test']
    print(f"Number of entries: in {labels[iteration]}:      {data_instance.entry_count()} ") 
    print(f"Number of texts: in {labels[iteration]}:      {data_instance.total_lexcount()} ")
    print(f"Number of distinct properties in {labels[iteration]}:      {len(list(data_instance.unique_p_mtriples()))}")
    print("\n")

    
def single_entry_check(data_instance):
    
    for entry in data_instance.entries:
        #print(f'entry.id        {entry.id}')
        if entry.id == 'Id1':
            print(f"Info about {entry.id} in category '{entry.category}' in size '{entry.size}':")
            print("# of lexicalisations", entry.count_lexs())
            print("Properties: ", entry.relations())
            print("RDF triples: ", entry.list_triples())
            print("Subject:", entry.modifiedtripleset.triples[0].s)
            print("Predicate:", entry.modifiedtripleset.triples[0].p)
            print("Lexicalisation:", entry.lexs[0].lex)
            #print("Another lexicalisation:", entry.lexs[1].lex)
            if entry.dbpedialinks:
                # dbpedialinks is a list where each element is a Triple instance
                print("DB link, en:", entry.dbpedialinks[0].s)  # subject in English

            print("Article text", entry.lexs[0].return_text()) 
            

def extract_data(data_instance):
    RDF_set, text_set = [], []

    for entry in data_instance.entries:
        RDF_set.append(entry.list_triples())
        text_set.append(entry.lexs[0].return_text())

    return RDF_set, text_set 


def clean_text(data):
    text_set = data.copy()
    for iteration, article in enumerate(data):
        text_set[iteration] = re.sub('\\n|', '', article)

    print(f'Text_set    {text_set}')

    return text_set

def create_dataframe(input_data, output_data):
    
    dataframe = pd.DataFrame({'input': input_data, 'output': output_data})
    return dataframe

def write_to_csv(data, iteration):
    """
    Data = dataset
    Iteration = iteration to determine the dataset split
    data_type = is the dataset RDF or text
    """
    labels = ['Train', 'Dev', 'Test']

    if isinstance(data, pd.DataFrame):
        print(f'data is a dataframe')


    try:
        print("Entered try check")
        if (os.path.exists(f'Data/Cleaned_data/{labels[iteration]}') == False) :
            print("Entered path check")
            save_path = 'C:/Users/Simon/Desktop/ArriaThesis/MscThesis/Data/Cleaned_data/'            
            name_of_file = f'{labels[iteration]}'
            data.to_csv(os.path.join(save_path, f"{name_of_file}.csv"), index=False)    

    except:
        print(f'file for {labels[iteration]} already exists')


# def retrieve_data(file_name):
    
#     dataset_path = f"../Data/Cleaned_data/{file_name}.pkl"
#     with open(dataset_path, 'rb') as f:
#         dataset = pickle.load(f)
    
#     return dataset

def read_csv_file(file_name):
    file_path = f"../Data/Cleaned_data/{file_name}.csv"
    dataset = pd.read_csv(file_path)
    return dataset

def Overal_function(data):
    transformed_train, transformed_dev, transformed_test = transform_data(data)

    combined_transformation = [transformed_train, transformed_dev, transformed_test]
    RDF_text_datasets =[]

    for iteration, dataset in enumerate(combined_transformation):

        RDF_set, Text_set = extract_data(dataset)
        clean_text_set = clean_text(Text_set)
        data_dataframe = create_dataframe(RDF_set, clean_text_set)
        #print(f'Text_set    {Text_set}')
        write_to_csv(data_dataframe, iteration)
        print("\n\n\n")

    #return RDF_text_datasets

In [15]:
Overal_function(all_data)

Completed the transformation of the datasets 

Completed the transformation of the datasets 

Completed the transformation of the datasets 

dataframe is a dataframe
data is a dataframe
Entered try check
Entered path check
save_path: C:/Users/Simon/Desktop/ArriaThesis/MscThesis/Data/Cleaned_data/
name_of_file: Train




dataframe is a dataframe
data is a dataframe
Entered try check
Entered path check
save_path: C:/Users/Simon/Desktop/ArriaThesis/MscThesis/Data/Cleaned_data/
name_of_file: Dev




dataframe is a dataframe
data is a dataframe
Entered try check
Entered path check
save_path: C:/Users/Simon/Desktop/ArriaThesis/MscThesis/Data/Cleaned_data/
name_of_file: Test






# Transformer modelling

# Dataset object creation

In [2]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, AutoModelForCausalLM 

tokenizer = AutoTokenizer.from_pretrained("t5-base") #"t5-base"

model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base") #TFAutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [3]:
from datasets import load_dataset
#dataset = load_dataset("../Data/Cleaned_data/", data_files="Train.csv")
full_dataset = load_dataset("../Data/Cleaned_data/", data_files={"train": "Train.csv", "dev": "Dev.csv", "test": "Test.csv"})

Using custom data configuration Cleaned_data-587c560a7f9fd76e
Found cached dataset csv (C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-587c560a7f9fd76e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 3/3 [00:00<00:00, 272.47it/s]


In [4]:
# zie https://github.com/NielsRogge/Transformers-Tutorials/blob/master/T5/Fine_tuning_Dutch_T5_base_on_CNN_Daily_Mail_for_summarization_(on_TPU_using_HuggingFace_Accelerate).ipynb

def preprocess_data(data):
    max_length = 256
    RDFs = data["input"]
    texts = data["output"]

    model_inputs = tokenizer(RDFs, truncation=True, padding="max_length")

    # specially for seq2seq tokenizer, "Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to sequence-to-sequence models that need a slightly different processing for the labels."
    with tokenizer.as_target_tokenizer():
        target_texts = tokenizer(texts, padding="max_length", truncation=True).input_ids

    #target_texts = tokenizer(texts, truncation=True, padding="max_length").input_ids
    
    #model_inputs['decoder_input_ids'] = np.zeros((len(target_texts), 0))
    model_inputs["labels"] = target_texts
    #print(f'target_texts    {target_texts}')

    # ook nodig

    return model_inputs

In [6]:
full_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 45870
    })
    dev: Dataset({
        features: ['input', 'output'],
        num_rows: 5493
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 9084
    })
})

In [5]:
encoded_train_ds = full_dataset["train"].map(preprocess_data, batched=True, remove_columns=full_dataset["train"].column_names)  
encoded_dev_ds = full_dataset["dev"].map(preprocess_data, batched=True, remove_columns=full_dataset["dev"].column_names)
encoded_test_ds = full_dataset["test"].map(preprocess_data, batched=True, remove_columns=full_dataset["test"].column_names)


Loading cached processed dataset at C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-587c560a7f9fd76e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-87225235fffdee6f.arrow
100%|██████████| 6/6 [00:01<00:00,  4.47ba/s]
Loading cached processed dataset at C:/Users/Simon/.cache/huggingface/datasets/csv/Cleaned_data-587c560a7f9fd76e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-3bcc14842f1eb062.arrow


In [57]:
encoded_train_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'],
    num_rows: 45870
})

In [66]:
encoded_train_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 45870
})

In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")

tf_train = encoded_train_ds.to_tf_dataset(
  columns=["input_ids", "attention_mask", 'decoder_input_ids',"labels"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=1,
)

tf_val = encoded_dev_ds.to_tf_dataset(
  columns=["input_ids", "attention_mask", 'decoder_input_ids',"labels"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=1,
)

tf_test = encoded_test_ds.to_tf_dataset(
  columns=["input_ids", "attention_mask", 'decoder_input_ids',"labels"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=1,
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Model compile and fit 


In [10]:
# from tensorflow.python.client import device_lib 
# print(device_lib.list_local_devices())


In [7]:
batch_size = 2
num_epochs = 3

num_train_steps = len(tf_train) * num_epochs


model.compile(
  optimizer=tf.keras.optimizers.Adam(3e-5)
  #metrics=tf.metrics.SparseCategoricalAccuracy(),

)


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [12]:
# gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#   # Restrict TensorFlow to only use the first GPU
#   try:
#     tf.config.set_visible_devices(gpus[0], 'GPU')
#     logical_gpus = tf.config.list_logical_devices('GPU')
#     print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
#   except RuntimeError as e:
#     # Visible devices must be set before GPUs have been initialized
#     print(e)

In [8]:
# To avoid GraphExecution erros
tf.config.run_functions_eagerly(True)
#tf.keras.backend.clear_session()
#gc.collect()

model.fit(
  tf_train,
  epochs=3,
  batch_size=1
  #validation_data=tf_val
  )
    


Epoch 1/3


ResourceExhaustedError: Exception encountered when calling layer "self_attn" (type TFBartAttention).

OOM when allocating tensor with shape[12,1024,1024] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Softmax]

Call arguments received:
  • hidden_states=tf.Tensor(shape=(1, 1024, 768), dtype=float32)
  • key_value_states=None
  • past_key_value=None
  • attention_mask=tf.Tensor(shape=(1, 1, 1024, 1024), dtype=float32)
  • layer_head_mask=None
  • training=True

In [13]:
print(tf.__version__)

2.8.0


In [None]:
# Extracted from https://discuss.huggingface.co/t/fine-tuning-t5-on-tensorflow/12253/2

import tensorflow as tf
import numpy as np
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, create_optimizer

dataset = load_dataset('jfleg')
dataset = concatenate_datasets([dataset['validation'], dataset['test']])
dataset = dataset.filter(lambda x: len(x['sentence']) > 16)

pd_dataset = dataset.to_pandas()
pd_dataset = pd_dataset.explode('corrections', ignore_index=True)
dataset = Dataset.from_pandas(pd_dataset)

dataset = dataset.map(lambda x: {'correction': x['corrections'], 'sentence': 'grammar:' + x['sentence']})
dataset = dataset.remove_columns(['corrections'])

def preprocess(examples):
  model_inputs = tokenizer(examples['sentence'], max_length=128, truncation=True)

  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples['correction'], max_length=128, truncation=True)

  model_inputs['labels'] = labels['input_ids']
  model_inputs['decoder_input_ids'] = np.zeros((len(labels['input_ids']), 0))
  return model_inputs

inputs = dataset.map(preprocess, batched=True)
inputs = inputs.remove_columns(['sentence', 'correction'])

model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-small')

batch_size = 8
num_epochs = 3

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")

tf_train = inputs.to_tf_dataset(
  columns=["attention_mask", "input_ids", 'decoder_input_ids'],
  label_cols=["labels"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=batch_size,
)

num_train_steps = len(tf_train) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(
  optimizer=optimizer,
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=tf.metrics.SparseCategoricalAccuracy(),
)

model.fit(
  tf_train,
  epochs=num_epochs,
  batch_size=batch_size
)

# Resources

Sources discussing OOM issues

1. https://www.cnblogs.com/sherrydatascience/p/13894511.html
2. https://www.reddit.com/r/tensorflow/comments/qaj3r4/out_of_memory_when_repeatedly_predicting_during/
3. https://github.com/tensorflow/tensorflow/issues/44711#issuecomment-727050449

1. https://github.com/huggingface/transformers/blob/main/examples/tensorflow/language-modeling/run_clm.py#L275
2. Extracted from https://discuss.huggingface.co/t/fine-tuning-t5-on-tensorflow/12253/2
3. https://huggingface.co/docs/transformers/v4.22.1/en/main_classes/trainer#transformers.TrainingArguments
4. https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb#scrollTo=tBK5TwkLWK_d
5. https://huggingface.co/docs/transformers/notebooks#tensorflow-examples

# could be interesting
https://huggingface.co/docs/optimum/index

# Trying memory optimization 
from https://huggingface.co/docs/transformers/v4.22.2/en/perf_train_gpu_one

In [3]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [4]:
print_gpu_utilization()

GPU memory occupied: 1509 MB.


In [5]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, AutoModelForCausalLM 

#tokenizer = AutoTokenizer.from_pretrained("t5-base") #"t5-base"

model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")#TFAutoModelForSeq2SeqLM
print_gpu_utilization()

  from .autonotebook import tqdm as notebook_tqdm
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


GPU memory occupied: 5716 MB.


In [7]:
from transformers import AutoModelForSequenceClassification


model = TFAutoModelForSequenceClassification.from_pretrained("bert-large-uncased")
print_gpu_utilization()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

AssertionError: Torch not compiled with CUDA enabled

In [67]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, TFTrainer, AutoModelForSequenceClassification, T5Config, AutoModelForSeq2SeqLM



model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_checkpoint}-finetune-generation",
    learning_rate=4e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    gradient_accumulation_steps=4, 
    # Gradient checkpointing saves strategically selected activations throughout the computational 
    # graph so only a fraction of the activations need to be re-computed for the gradients
    gradient_checkpointing=True,
    optim="adafactor",
)

# train_args = Seq2SeqTrainingArguments(
#     gradient_accumulation_steps=4, 
#     # Gradient checkpointing saves strategically selected activations throughout the computational 
#     # graph so only a fraction of the activations need to be re-computed for the gradients
#     gradient_checkpointing=True,
#     optim="adafactor",
#     output_dir=f"{model_checkpoint}-finetune-generation",
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size = 4, 
#     learning_rate =2e-5,)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--t5-base\snapshots\23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalt

In [68]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,  return_tensors="pt")

In [69]:

# # Function that returns an untrained model to be trained
# def model_init():
#     #config=T5Config.from_pretrained('t5-base')
#     return TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) #, config=config)


model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) 

def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_train_ds,
    eval_dataset= encoded_dev_ds,
    data_collator=data_collator,
    tokenizer=tokenizer
)



# trainer = Seq2SeqTrainer(
    
#     model=model,
#     # The training arguments.
#     args=train_args,
#     # The training dataset.
#     train_dataset=tf_train,
#     # The evaluation dataset. We use a small subset of the validation set
#     # composed of 150 samples to speed up computations...
#     eval_dataset=tf_val
#     # Even though the training set and evaluation set are already tokenized, the
#     # tokenizer is needed to pad the "input_ids" and "attention_mask" tensors
#     # to the length managed by the model. It does so one batch at a time, to
#     # use less memory as possible.
#     #tokenizer=tokenizer_init
# )

loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--t5-base\snapshots\23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
  

In [70]:
# Train pre-trained model
trainer.train()

loading configuration file config.json from cache at C:\Users\Simon/.cache\huggingface\hub\models--t5-base\snapshots\23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9\config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
  

KeyboardInterrupt: 

In [1]:
# nvcc --version --> Cuda version

## Fast fine-tune test

In [6]:
# create smaller dataset chunk
small_train = encoded_train_ds.shard(num_shards = 5, index = 0)
small_val = encoded_dev_ds.shard(num_shards = 5, index = 0)
small_test = encoded_test_ds.shard(num_shards = 5, index = 0)


In [9]:
small_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1817
})

In [None]:
#https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97

### Code from prev student

In [None]:


import torch


training_args = Seq2SeqTrainingArguments(
 output_dir="./results",
 learning_rate=0.001,
 do_eval=True, # will be set to true if evaluation strategy is set
 do_predict=True,
 evaluation_strategy="steps",
 eval_steps= 5,
 save_steps=5,
 max_steps=10, # the total number of training steps to perform
 save_total_limit= 10, # the maximum number of models to keep before deleting the oldest one
 predict_with_generate=True,
 generation_num_beams=10
)


# pip install datasets
import datasets
bleu = datasets.load_metric("bleu")
def postprocess_text(preds, labels):
    preds = [pred.split() for pred in preds]
    labels = [[label.split()] for label in labels]
    return preds, labels

def compute_metrics(pred):
    predictions, labels = pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    decoded_preds = tokenizer.batch_decode(predictions,
    skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels,
    skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds,
    decoded_labels)

    bleu_output = bleu.compute(predictions=decoded_preds,
    references=decoded_labels)
    return bleu_output


trainer = Seq2SeqTrainer(
 model=model,
 args=training_args,
 train_dataset=train_dataset,
 eval_dataset=val_dataset,
 compute_metrics=compute_metrics,
 tokenizer=tokenizer
)



# trainer.train(resume_from_checkpoint=True)
trainer.train()
trainer.evaluate(val_dataset)
test_op = trainer.predict(test_dataset)
print(tokenizer.decode(test_op[0][1],skip_special_tokens=True))