# Eagle 7B: Training on 1 Trillion More Tokens!

In [1]:
# =================================================================
# User-tunable settings for this run
# =================================================================

# Weights & Biases logging: on/off switch, run-name prefix, target project
ENABLE_WANDB = True
WANDB_PREFIX = "Baseline"
WANDB_PROJECT = "RWKV-x-v5-Expansion-Test"

# Relative path from the notebook directory up to the project root
# (update this if the notebook is moved into another directory)
PROJECT_DIR_OFFSET = "../../../../"

# GPU device selection, forwarded as-is to the trainer's `--trainer.devices` flag
GPU_DEVICES = "auto"

# =================================================================
# Training settings (explicit values used in place of any "auto" defaults)
# =================================================================
DEEPSPEED_STRAT = "deepspeed_stage_2"
TRAINING_CTX_LEN = 4096
MICROBATCH_SIZE = 8

# =================================================================
# Model shape: number of layers and embedding width
# =================================================================
LAYER_COUNT = 32
DIM_SIZE = 4096

# W&B mode string derived from the on/off switch; exported to the trainer as WANDB_MODE
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# Echo the key settings so they are recorded in the cell output
print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")
print(f"DEEPSPEED_STRAT: {DEEPSPEED_STRAT}")
print(f"TRAINING_CTX_LEN: {TRAINING_CTX_LEN}")

# Resolve the notebook, project, and trainer directories
import os

# A notebook has no `__file__`, so the kernel's current working directory is
# used as the notebook directory (assumes the kernel was started next to the
# notebook file).
NOTEBOOK_DIR = os.path.abspath(os.getcwd())
# Walk up from the notebook directory to the project root, then down into the trainer
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, PROJECT_DIR_OFFSET))
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

# Fail fast if the trainer directory is missing (or is not a directory), since
# every later cell starts with `cd "{TRAINER_DIR}"`.
if not os.path.isdir(TRAINER_DIR):
    raise FileNotFoundError(
        f"The trainer directory does not exist: {TRAINER_DIR!r}. "
        "Did you move the notebook? If so, update PROJECT_DIR_OFFSET."
    )

ENABLE_WANDB: True
GPU_DEVICES: auto
DEEPSPEED_STRAT: deepspeed_stage_2
TRAINING_CTX_LEN: 4096
NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/rwkv-x-exp/v5-exp/expansion-test
TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer


## Init the empty model

In [2]:
# Initialize the empty model checkpoint for the configured size.
# Runs init_model.py from TRAINER_DIR with LAYER_COUNT layers, a DIM_SIZE embedding
# and the "world" vocab; `{NAME}` is IPython's interpolation of notebook variables.
# The output path is relative to TRAINER_DIR (i.e. <project>/model/...).
# With --skip-if-exists an existing file is reused (the recorded output below shows
# "Model exists, skipping init_model"), so re-running this cell is cheap.
# NOTE(review): the exact effect of --safe-init is not visible here - confirm in init_model.py.
!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer {LAYER_COUNT} --n_embd {DIM_SIZE} \
    --vocab_size world \
    --skip-if-exists --safe-init \
    ../model/L{LAYER_COUNT}-D{DIM_SIZE}-world-init.pth

[2024-02-11 09:31:11,758] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
---- Initializing model ----
No of layers: 32
Embedding size: 4096
Output model path: ../model/L32-D4096-world-init.pth
Vocab size: 65536
Emb scale: 0.0001
Note: this process takes a significant time (and ram) for large models
---- ----- ----
Model exists, skipping init_model


## Start the training run!

In [4]:
# Set up the checkpoint directory.
# Note: the target is an absolute path (/checkpoint/...), so the preceding `cd` does not
# change where it is created; it matches the dirpath passed to the trainer below.
!cd "{PROJECT_DIR}" && mkdir -p "/checkpoint/v5-expansion-test/baseline/L{LAYER_COUNT}-D{DIM_SIZE}/"

# Let's start the training.
# Runs `lightning_trainer.py fit` from TRAINER_DIR using the YAML config that lives in
# this notebook's config/ directory; the remaining flags pass the values from the
# settings cell (init model path, checkpoint dir, W&B run name/project, strategy,
# microbatch size, context length, devices) on the command line.
# WANDB_MODE is "online" or "disabled" depending on ENABLE_WANDB.
# NOTE(review): RWKV_NO_CUDA=1 / RWKV_TORCH_COMPILE=0 presumably disable the custom CUDA
# kernel and torch.compile (the recorded log reports 'torch-jit') - confirm in the trainer.
!cd "{TRAINER_DIR}" && \
    export RWKV_NO_CUDA=1 && \
    export RWKV_TORCH_COMPILE=0 && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki-100k-world-16k-rechunk.yaml" \
        --model.load_model="../model/L{LAYER_COUNT}-D{DIM_SIZE}-world-init.pth" \
        --data.skip_datapath_setup=True \
        --trainer.callbacks.init_args.dirpath="/checkpoint/v5-expansion-test/baseline/L{LAYER_COUNT}-D{DIM_SIZE}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - L{LAYER_COUNT}-D{DIM_SIZE} (tctxlen={TRAINING_CTX_LEN}, {DEEPSPEED_STRAT})" \
        --trainer.logger.init_args.project="{WANDB_PROJECT}" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.microbatch_size={MICROBATCH_SIZE} \
        --model.ctx_len={TRAINING_CTX_LEN} \
        --trainer.devices="{GPU_DEVICES}"


[2024-02-11 17:57:34,033] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/rwkv-x-exp/v5-exp/expansion-test/config/enwiki-100k-world-16k-rechunk.yaml', '--model.load_model=../model/L32-D4096-world-init.pth', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/v5-expansion-test/baseline/L32-D4096/', '--trainer.logger.init_args.name=Baseline - L32-D4096 (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-x-v5-Expansion-Test', '--trainer.strategy=deepspeed_stage_2',