In [1]:
from transformers import BartTokenizerFast, BartForConditionalGeneration, BartConfig, TrainingArguments, Trainer
import torch
import transformers
from datasets import load_dataset
from torch.utils.data import DataLoader
import pandas as pd

In [2]:
tokenizer = BartTokenizerFast.from_pretrained("/kaggle/input/nepbart-tokenizer/nepbart_tokenizer")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BartTokenizerFast'.


Dataset Size: 2_248_625

In [3]:
parameters = {
    'VOCAB_SIZE': 32_768,
    'SEQUENCE_LENGTH': 512,
    'EMBEDDING_DIMENSION': 512,

    'ENCODER_LAYERS': 6,
    'ENCODER_ATTENTION_HEADS': 8,
    'ENCODER_FFN_DIM': 2048,

    'DECODER_LAYERS': 6,
    'DECODER_ATTENTION_HEADS': 8,
    'DECODER_FFN_DIM': 2048,

    'DROPOUT': 0.1,

    'TRAIN_BATCH_SIZE': 32,
    'EVAL_BATCH_SIZE': 32,
    'EPOCHS': 5,
    'LEARNING_RATE': 1e-4,
    'WARMUP_STEPS': 5_000,
    'GRADIENT_ACCUMULATION_STEPS': 4,
    'L2_REG': 0.01,
    'MAX_STEPS': 12_000 # dataset_size / (BATCH*GRADIENT_ACCUMULATION_STEPS)
}

In [4]:
dataset_files = {
    "train": "/kaggle/input/nepbart-dataset-tokenized-masked-padded/nepbart_tokenized_masked_padded_train.parquet",
    "eval": "/kaggle/input/nepbart-dataset-tokenized-masked-padded/nepbart_tokenized_masked_padded_test.parquet"
}

In [5]:
dataset = load_dataset("parquet", data_files=dataset_files, streaming=True)
train_dataset = dataset["train"].repeat(None)
eval_dataset = dataset["eval"]

In [6]:
def collate_fn(batch: torch.tensor):
    return {key: torch.tensor([(d[key]) for d in batch]) for key in batch[0]}

In [7]:
model = BartForConditionalGeneration(
    BartConfig(
        vocab_size=parameters['VOCAB_SIZE'],
        max_position_embeddings=parameters['SEQUENCE_LENGTH'],
        encoder_layers=parameters['ENCODER_LAYERS'],
        encoder_ffn_dim=parameters['ENCODER_FFN_DIM'],
        encoder_attention_heads=parameters['ENCODER_ATTENTION_HEADS'],
        decoder_layers=parameters['DECODER_LAYERS'],
        decoder_ffn_dim=parameters['DECODER_FFN_DIM'],
        decoder_attention_heads=parameters['DECODER_ATTENTION_HEADS'],
        d_model=parameters['EMBEDDING_DIMENSION'],
        dropout=parameters['DROPOUT'],
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        is_encoder_decoder=True,
        decoder_start_token_id=tokenizer.eos_token_id,
    )
)

In [8]:
print(f'{model.num_parameters():_}')

61_444_096


In [9]:
training_args = TrainingArguments(
    output_dir='/kaggle/tmp/',
    overwrite_output_dir=True,
    # eval_strategy='epoch',
    eval_strategy='steps',
    eval_steps=10_000,
    per_device_train_batch_size=parameters['TRAIN_BATCH_SIZE'],
    per_device_eval_batch_size=parameters['EVAL_BATCH_SIZE'],
    learning_rate=parameters['LEARNING_RATE'],
    max_steps=parameters['MAX_STEPS'],
    warmup_steps=parameters['WARMUP_STEPS'],
    weight_decay=parameters['L2_REG'],
    logging_dir="/kaggle/tmp/",
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    report_to="none",
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn
)

In [11]:
trainer.train()

Step,Training Loss,Validation Loss
10000,0.4701,0.448746


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=12000, training_loss=2.1275572967529297, metrics={'train_runtime': 19246.7626, 'train_samples_per_second': 19.951, 'train_steps_per_second': 0.623, 'total_flos': 5.2070304448512e+16, 'train_loss': 2.1275572967529297, 'epoch': 1.0})

In [12]:
print(trainer.evaluate(eval_dataset))

{'eval_loss': 0.4487462639808655, 'eval_runtime': 3672.1709, 'eval_samples_per_second': 70.333, 'eval_steps_per_second': 2.198, 'epoch': 1.0}


In [13]:
trainer.save_model('/kaggle/tmp/final_model')

In [14]:
import tarfile
with tarfile.open("/kaggle/working/model_checkpoints.tar.gz", "w:gz") as tar:
    tar.add("/kaggle/tmp", arcname="model_checkpoints")