In [None]:
#| default_exp train

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| include: false
import os

# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [None]:
#| include: false
from nbdev.showdoc import *

In [None]:
#| export
import torch
import wandb

import pytorch_lightning as pl

from codecarbon import EmissionsTracker
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import WandbLogger

2023-03-10 21:49:16.743509: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-10 21:49:17.177108: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64:
2023-03-10 21:49:17.177150: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.6/lib64:


In [None]:
#| include: false
from completeformer.data import CompleteformerDataset
from completeformer.models import Completeformer
from pathlib import Path

# Batch size and sequence-length limits handed to the dataset below.
BATCH_SIZE = 16
ENC_MAX_LEN = 1024
DEC_MAX_LEN = 128

# Hyperparameters that are unpacked into the `Completeformer` constructor later on.
# NOTE(review): enc_max_len/dec_max_len here (512/256) differ from the dataset
# limits ENC_MAX_LEN/DEC_MAX_LEN (1024/128) -- confirm this is intentional.
config = dict(
    position_type="sinusoidal",
    dim=512,
    enc_max_len=512,
    enc_layers=6,
    enc_heads=8,
    dec_max_len=256,
    dec_layers=6,
    dec_heads=8,
    lr=1e-4,
    num_warmup_steps=20,
)

# Build the "short" split of the dataset and run its preparation step.
short_ds = CompleteformerDataset(
    length="short",
    tokenizer_name="semeru/completeformer_tokenizer",
    batch_size=BATCH_SIZE,
    enc_max_len=ENC_MAX_LEN,
    dec_max_len=DEC_MAX_LEN,
    num_workers=4,
)
short_ds.prepare_data()

Found cached dataset completeformer (/work/.cache/huggingface/datasets/semeru___completeformer/short/1.1.0/73b388644fdc749387b0ed1f17dad849561b5ca75db0faa9bfdecd5e8d6d1d91)


  0%|          | 0/3 [00:00<?, ?it/s]

      

#0:   0%|          | 0/71 [00:00<?, ?ba/s]

#1:   0%|          | 0/71 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/71 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/71 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/9 [00:00<?, ?ba/s]

#2:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/9 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/9 [00:00<?, ?ba/s]

#1:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/9 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/9 [00:00<?, ?ba/s]

In [None]:
#| export
def train(
    model,
    data_module,
    num_epochs,
    output_dir,
    name=None,
    limit_train_batches=1.0,
    limit_val_batches=1.0,
    accumulate_grad_batches=1,
    val_check_interval=20_000,
):
    """
    Fit `model` on `data_module` with a PyTorch Lightning `Trainer` that logs to
    Weights & Biases and checkpoints on the monitored `val_loss` metric.

    Args:
        model (LightningModule): The model to train.
        data_module: The data source handed to `trainer.fit` together with the
            model (e.g. a LightningDataModule).
        num_epochs (int): Passed to the trainer as `max_epochs`.
        output_dir (str | Path): Base directory; checkpoints are written to
            `<output_dir>/checkpoints`.
        name (str, optional): Run name given to the W&B logger.
        limit_train_batches (int | float): Forwarded to `pl.Trainer`.
        limit_val_batches (int | float): Forwarded to `pl.Trainer`.
        accumulate_grad_batches (int): Forwarded to `pl.Trainer`.
        val_check_interval (int | float): NOTE(review): accepted but currently
            NOT forwarded to `pl.Trainer`, so it has no effect. Forwarding the
            default of 20_000 could make Lightning reject runs with fewer
            training batches -- confirm the intended behaviour before wiring it in.

    Returns:
        tuple: `(model, best_model_path, trainer)`, where `best_model_path` is the
        path reported by the checkpoint callback (an empty string if no
        checkpoint was saved) and `trainer` is the fitted `pl.Trainer`.
    """
    # Local import: the exported imports cell of this module does not bring in pathlib.
    from pathlib import Path

    pl.seed_everything(115, workers=True)
    wandb_logger = WandbLogger(project="Completeformer", name=name)

    # Coerce so that both `str` and `Path` values of `output_dir` work with `/`.
    checkpoint_path = Path(output_dir) / "checkpoints"

    # Keeps the 5 checkpoints with the lowest `val_loss`; files are named like
    # completeformer-epoch=02-val_loss=0.32.ckpt
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=str(checkpoint_path),
        filename="completeformer-{epoch:02d}-{val_loss:.2f}",
        save_top_k=5,
        mode="min",
    )
    trainer = pl.Trainer(
        log_every_n_steps=1,
        logger=wandb_logger,
        default_root_dir=str(checkpoint_path),
        gpus=torch.cuda.device_count(),
        accumulate_grad_batches=accumulate_grad_batches,
        max_epochs=num_epochs,
        limit_train_batches=limit_train_batches,
        limit_val_batches=limit_val_batches,
        precision=16,
        callbacks=[
            checkpoint_callback,
            TQDMProgressBar(refresh_rate=1),
        ],
    )

    # train the model
    trainer.fit(model, data_module)

    # save the last model
    trainer.save_checkpoint(str(checkpoint_path / "final_checkpoint.ckpt"))

    # Upload the best checkpoint to wandb. `best_model_path` starts out as an
    # empty string (not None), so test truthiness to skip the upload when no
    # checkpoint was recorded.
    best_model_path = checkpoint_callback.best_model_path
    if best_model_path:
        wandb.save(best_model_path)

    return model, best_model_path, trainer

In [None]:
# Smoke test for `train`: fit for one epoch on 10% of the train/val batches of
# the short dataset and check that a best checkpoint was produced.
num_epochs = 1
output_dir = Path("/tmp/completeformer/models")

model = Completeformer(short_ds.tokenizer, length="short", max_epochs=num_epochs, **config)
model, best_model_path, trainer = train(
    model,
    short_ds,
    num_epochs=num_epochs,
    output_dir=output_dir,
    name="test",
    limit_train_batches=0.1,
    limit_val_batches=0.1,
    val_check_interval=124,
)

# `Path("")` is the current directory, so a bare `.exists()` check would pass
# even if no checkpoint had been written; require a non-empty path first.
assert best_model_path, "train() did not report a best checkpoint path"
assert Path(best_model_path).exists(), f"best checkpoint not found: {best_model_path}"

[nltk_data] Downloading package wordnet to /home/nathan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/nathan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nathan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Global seed set to 115
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnatedog[0m. Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_deprecation(
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type         | Params
---------------------------------------
0 | model | XTransformer | 111 M 
---------------------------------------
111 M     Trainable params
0         Non-trainable params
111 M     Total params
223.363   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training: 0it [00:00, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation: 0it [00:00, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
# Run the test loop on the short dataset's test dataloader and keep the first
# entry of the returned list.
test_outputs = trainer.test(model=model, dataloaders=short_ds.test_dataloader())
short_results = test_outputs[0]

{'tst_loss': 3.730475902557373,
 'bleu_short': 0.09326142817735672,
 'chrf_short': 28.752025604248047,
 'exact_match_short': 0.0,
 'leven_dist_short': 0.8018017197124175,
 'meteor_short': 0.46674807470162394,
 'rouge_short': 0.290683514636622}

# The test results must contain exactly these metric names -- no more, no fewer.
expected_metric_names = {
    "tst_loss",
    "bleu_short",
    "chrf_short",
    "exact_match_short",
    "leven_dist_short",
    "meteor_short",
    "rouge_short",
}
assert set(short_results) == expected_metric_names

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
#| include: false
from nbdev import nbdev_export

# Run nbdev's export step for this project.
nbdev_export()