In [1]:
# Snapshot the currently installed package versions into the Kaggle working directory.
# NOTE(review): this runs before the cell that installs transformers==4.47.0, so the
# snapshot may not reflect the version actually used for training — confirm, or move
# the freeze after the install.
!pip freeze > '/kaggle/working/requirements.txt'

In [2]:
!pip install transformers==4.47.0 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from transformers import BartTokenizerFast, BartForConditionalGeneration, BartConfig, TrainingArguments, Trainer
import torch
import transformers
from datasets import load_dataset
from torch.utils.data import DataLoader

2025-05-13 16:05:52.496543: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747152352.686662      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747152352.737962      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Load the tokenizer shipped as a Kaggle input dataset. In the cells shown here it is
# only used to supply the pad/bos/eos token ids for the model config.
# NOTE(review): transformers warns on this call that the checkpoint was saved as
# 'PreTrainedTokenizerFast' while it is loaded through BartTokenizerFast — verify that
# the special-token ids match the ones used when the dataset was tokenized.
tokenizer = BartTokenizerFast.from_pretrained(
    "/kaggle/input/nepbart-tokenizer/nepbart_tokenizer"
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BartTokenizerFast'.


In [5]:
# Single table of hyperparameters; later cells look values up by key.
# NOTE(review): 'EPOCHS' and 'GRADIENT_ACCUMULATION_STEPS' are not read by any of the
# cells visible in this notebook — confirm whether they are meant to be wired in.
parameters = dict(
    # Vocabulary and model width
    VOCAB_SIZE=32_768,
    SEQUENCE_LENGTH=512,
    EMBEDDING_DIMENSION=512,
    # Encoder stack
    ENCODER_LAYERS=6,
    ENCODER_ATTENTION_HEADS=8,
    ENCODER_FFN_DIM=2048,
    # Decoder stack
    DECODER_LAYERS=6,
    DECODER_ATTENTION_HEADS=8,
    DECODER_FFN_DIM=2048,
    # Regularisation
    DROPOUT=0.1,
    # Optimisation schedule
    TRAIN_BATCH_SIZE=32,
    EVAL_BATCH_SIZE=32,
    EPOCHS=5,
    LEARNING_RATE=1e-4,
    WARMUP_STEPS=5_000,
    GRADIENT_ACCUMULATION_STEPS=4,
    L2_REG=0.01,
    MAX_STEPS=123_000,  # dataset_size / (BATCH*GRADIENT_ACCUMULATION_STEPS)
)

In [6]:
# Parquet files for each split, keyed by the split name used when loading the dataset.
dataset_files = dict(
    train="/kaggle/input/nepbart-dataset-tokenized-masked-padded/nepbart_tokenized_masked_padded_train.parquet",
    eval="/kaggle/input/nepbart-dataset-tokenized-masked-padded/nepbart_tokenized_masked_padded_test.parquet",
)

In [7]:
# Open both parquet splits in streaming mode instead of loading them eagerly.
dataset = load_dataset(
    "parquet",
    data_files=dataset_files,
    streaming=True,
)

# The evaluation split is used as-is.
eval_dataset = dataset["eval"]

# repeat(None): per the `datasets` documentation, None repeats the stream indefinitely,
# so the length of the run has to be bounded elsewhere (e.g. via max_steps).
train_dataset = dataset["train"].repeat(None)

In [8]:
def collate_fn(batch: list[dict]) -> dict[str, torch.Tensor]:
    """Stack a list of example dicts into one dict of batched tensors.

    Args:
        batch: Non-empty list of examples. Each example maps a feature name to a
            value accepted by ``torch.tensor``. The feature names are taken from the
            first example, and every example must provide all of them. Values for
            one feature must have the same length across examples, otherwise
            ``torch.tensor`` cannot build a rectangular tensor.

    Returns:
        A dict with one entry per feature name of the first example; each value is a
        tensor whose leading dimension is ``len(batch)``.

    Raises:
        IndexError: If ``batch`` is empty.
        KeyError: If an example is missing a feature present in the first example.
    """
    feature_names = batch[0]
    return {
        name: torch.tensor([example[name] for example in batch])
        for name in feature_names
    }

In [9]:
# Describe the architecture first, then build the model from that config
# (constructed from a config object, not loaded via from_pretrained).
bart_config = BartConfig(
    # Vocabulary, positions and hidden width
    vocab_size=parameters['VOCAB_SIZE'],
    max_position_embeddings=parameters['SEQUENCE_LENGTH'],
    d_model=parameters['EMBEDDING_DIMENSION'],
    dropout=parameters['DROPOUT'],
    # Encoder stack
    encoder_layers=parameters['ENCODER_LAYERS'],
    encoder_attention_heads=parameters['ENCODER_ATTENTION_HEADS'],
    encoder_ffn_dim=parameters['ENCODER_FFN_DIM'],
    # Decoder stack
    decoder_layers=parameters['DECODER_LAYERS'],
    decoder_attention_heads=parameters['DECODER_ATTENTION_HEADS'],
    decoder_ffn_dim=parameters['DECODER_FFN_DIM'],
    # Special-token ids come from the tokenizer; decoding starts from the EOS id.
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    decoder_start_token_id=tokenizer.eos_token_id,
    is_encoder_decoder=True,
)
model = BartForConditionalGeneration(bart_config)

In [10]:
# Training configuration for the Trainer.
#
# The run length is set by `max_steps`; when `max_steps` is positive it overrides
# `num_train_epochs`, so no epoch count is passed here.
# NOTE(review): parameters['GRADIENT_ACCUMULATION_STEPS'] is defined, and the MAX_STEPS
# comment refers to it, but it is not passed below, so the Trainer default applies.
# Confirm which is intended before wiring it in — it changes the effective batch size
# and therefore what one optimisation step means for a resumed run.
training_args = TrainingArguments(
    # Outputs: checkpoints and logs share the same scratch directory.
    output_dir='/kaggle/tmp/',
    overwrite_output_dir=True,
    logging_dir="/kaggle/tmp/",
    logging_strategy="steps",
    report_to="none",
    # Batch sizes and optimisation schedule.
    per_device_train_batch_size=parameters['TRAIN_BATCH_SIZE'],
    per_device_eval_batch_size=parameters['EVAL_BATCH_SIZE'],
    learning_rate=parameters['LEARNING_RATE'],
    warmup_steps=parameters['WARMUP_STEPS'],
    weight_decay=parameters['L2_REG'],
    max_steps=parameters['MAX_STEPS'],
    fp16=True,
    # Evaluate and checkpoint on the same step interval so that the best checkpoint
    # (lowest eval loss) can be restored when training ends.
    eval_strategy='steps',
    eval_steps=10_000,
    save_strategy="steps",
    save_steps=10_000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)


In [11]:
# Bind the model, its training configuration, both dataset splits and the custom
# batch collator into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [13]:
# Continue training from a previously saved checkpoint directory rather than from
# the freshly constructed model (the path suggests step 116000 — confirm).
trainer.train(
    resume_from_checkpoint='/kaggle/input/nepbart-checkpoint-116000-steps/checkpoint-116000',
)

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Step,Training Loss,Validation Loss
120000,0.3225,0.310537


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=123000, training_loss=0.01824139094158886, metrics={'train_runtime': 16142.2572, 'train_samples_per_second': 243.832, 'train_steps_per_second': 7.62, 'total_flos': 5.33720620597248e+17, 'train_loss': 0.01824139094158886, 'epoch': 1.0})

In [14]:
# Evaluate on the evaluation split and print the metrics the Trainer returns.
eval_metrics = trainer.evaluate(eval_dataset)
print(eval_metrics)

{'eval_loss': 0.3105374276638031, 'eval_runtime': 3654.2885, 'eval_samples_per_second': 70.677, 'eval_steps_per_second': 2.209, 'epoch': 1.0}


In [15]:
# Save the trainer's current model into its own directory next to the checkpoints.
trainer.save_model(output_dir='/kaggle/tmp/final_model')

In [16]:
import tarfile

# Pack everything under /kaggle/tmp into one gzip-compressed tarball in the working
# directory; inside the archive the tree is rooted at "model_checkpoints".
with tarfile.open(name="/kaggle/working/model_checkpoints.tar.gz", mode="w:gz") as archive:
    archive.add(name="/kaggle/tmp", arcname="model_checkpoints")