# Training validation

Triton-based training, for loss curve comparison / validation between the various kernel implementations

In [None]:
# Notebook cell: build a small RWKV7 Goose model on the triton tmix backend
# and run it through SimpleTestTrainer, for loss curve comparison against
# the other kernel implementations.
import os
import sys
import time  # not used in this cell; kept in case later cells rely on it

# Memory segmenting fix (expandable segments).
# Set BEFORE torch is imported, so the allocator setting is guaranteed to be
# in place before any CUDA initialization can happen.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch

# Configure the parent path to be the proj folder,
# the project imports below are resolved through these entries
sys.path.append('../../')
sys.path.append('../../test')

# Import the model classes
from rwkv_block.v7_goose.model.rwkv7_goose_model import RWKV7GooseModel
from trainer.SimpleTestTrainer import SimpleTestTrainer

# Device to run on
RUN_DEVICE = "cuda:0"

# If multiple cuda devices are available (8 or more),
# we use a dedicated device, so that multiple notebooks can run in parallel
#
# Comment out this logic if you intend to manually set the device
if torch.cuda.device_count() >= 8:
    RUN_DEVICE = "cuda:2"

# Training batch size
BATCH_SIZE = 4

# Model shape and size
LAYER_COUNT = 12
DIM_SIZE = 512
TMIX_BACKEND = "triton"

# Create and initialize the model
model = RWKV7GooseModel({
    "n_layer": LAYER_COUNT,
    "n_dim": DIM_SIZE,
    "tmix_backend": TMIX_BACKEND,
    "device": RUN_DEVICE,
    "dtype": "bfloat16",
    "n_vocab": 50432,
})
model.init_parameters()

# Setup the trainer
trainer = SimpleTestTrainer(model, device=RUN_DEVICE, batch_size=BATCH_SIZE)

# Trigger the train process
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm


---------------------------------------------
[SimpleTestTrainer] Initializing the trainer for:  RWKV-Block.SimpleTestTrainer
- hf_dataset:          recursal/SuperWiki-Tiny
- dataset_ctx_length:  4096
- dataset_min_length:  4096
- tokenizer_name:      EleutherAI/gpt-neox-20b
- batch_size:          4
- learning_rate:       0.001
- num_epochs:          1
---------------------------------------------
[SimpleTestTrainer] Loading the tokenizer:  EleutherAI/gpt-neox-20b ...
[SimpleTestTrainer] Loading the dataset:  recursal/SuperWiki-Tiny ...
[SimpleTestTrainer] Preparing the training dataset...


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m ([33mrwkv-x-dev[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[SimpleTestTrainer] Training dataset size:    718044
[SimpleTestTrainer] Validation dataset size:  719
[SimpleTestTrainer] Preparing the data loaders...
[SimpleTestTrainer] Training batch count:    50000
[SimpleTestTrainer] Validation batch count:  179
[SimpleTestTrainer] Setting up the optimizer, loss function...
[SimpleTestTrainer] Initializing wandb...
[SimpleTestTrainer] wandb is logged in.


[SimpleTestTrainer] Initialization complete.
---------------------------------------------
Epoch 1/1


Training:   0%|▌                                                                                                                                    | 202/50000 [02:47<4:27:59,  3.10it/s, loss=1.7]