In [1]:
# =================================================================
# User-tunable settings for this notebook
# =================================================================

# Weights & Biases: on/off switch, run-name prefix, and project name
ENABLE_WANDB = True
WANDB_PREFIX = "Core-Expand"
WANDB_PROJECT = "RWKV-x-v5-Expansion-Test"

# Relative path from the notebook's directory up to the project root
# (update this if the notebook is moved into a different directory)
PROJECT_DIR_OFFSET = "../../../../"

# GPU devices to train on; handed to the trainer's --trainer.devices option
GPU_DEVICES = "auto"

# =================================================================
# Training settings (passed to the trainer as command-line overrides)
# =================================================================
DEEPSPEED_STRAT = "deepspeed_stage_2"
TRAINING_CTX_LEN = 4096
MICROBATCH_SIZE = 8

# =================================================================
# Model shape: the expanded model uses DIM_SIZE, the baseline it is
# expanded from uses HALF_DIM_SIZE
# =================================================================
LAYER_COUNT = 32
DIM_SIZE = 1024
HALF_DIM_SIZE = DIM_SIZE // 2

# Value exported as the WANDB_MODE environment variable by the training cells
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# Echo the key settings so they are recorded in the cell output
print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")
print(f"DEEPSPEED_STRAT: {DEEPSPEED_STRAT}")
print(f"TRAINING_CTX_LEN: {TRAINING_CTX_LEN}")

# Resolve the notebook, project, and trainer directories
import os

# A notebook kernel has no real __file__, so the notebook's directory is taken
# to be the kernel's current working directory. Asking for it explicitly avoids
# routing the literal string "__file__" through abspath()/dirname() to reach
# the same place.
NOTEBOOK_DIR = os.getcwd()
# PROJECT_DIR_OFFSET is the relative hop from the notebook up to the repo root
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, PROJECT_DIR_OFFSET))
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

# Fail fast if the trainer directory is missing. isdir() is used (rather than
# exists()) so a stray regular file with the same name is also rejected.
# FileNotFoundError is still an Exception subclass, so existing handlers work.
if not os.path.isdir(TRAINER_DIR):
    raise FileNotFoundError(
        f"The trainer directory does not exist: {TRAINER_DIR}. "
        "Did you move the notebook? If so, update PROJECT_DIR_OFFSET."
    )

ENABLE_WANDB: True
GPU_DEVICES: auto
DEEPSPEED_STRAT: deepspeed_stage_2
TRAINING_CTX_LEN: 4096
NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/rwkv-x-exp/v5-exp/expansion-test
TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer


# Exporting the original models (epoch 2 and epoch 5)

In [15]:
# Export the epoch 2 and epoch 5 baseline L{LAYER_COUNT}-D{HALF_DIM_SIZE} checkpoints to .pth files.
# The {NAME} placeholders in the `!` lines below are expanded by IPython from the
# notebook's Python variables before each command is handed to the shell.
# Create the output directory (the path is relative to TRAINER_DIR because of the `cd`).
!cd "{TRAINER_DIR}" && \
    mkdir -p "../model/expansion-test/"
# The "epoch=1-step=256" checkpoint is exported as E02
# (checkpoint epoch numbers appear to be zero-based — TODO confirm).
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "/checkpoint/v5-expansion-test/baseline/L{LAYER_COUNT}-D{HALF_DIM_SIZE}/epoch=1-step=256.ckpt" "../model/expansion-test/Base-L{LAYER_COUNT}-D{HALF_DIM_SIZE}-E02.pth"
# The "epoch=4-step=640" checkpoint is exported as E05.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "/checkpoint/v5-expansion-test/baseline/L{LAYER_COUNT}-D{HALF_DIM_SIZE}/epoch=4-step=640.ckpt" "../model/expansion-test/Base-L{LAYER_COUNT}-D{HALF_DIM_SIZE}-E05.pth"
# List both exported files to confirm they were written and to show their size.
!cd "{TRAINER_DIR}" && ls -alh "../model/expansion-test/Base-L{LAYER_COUNT}-D{HALF_DIM_SIZE}-E02.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/expansion-test/Base-L{LAYER_COUNT}-D{HALF_DIM_SIZE}-E05.pth"

[2024-02-12 00:28:19,474] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '/checkpoint/v5-expansion-test/baseline/L32-D512/epoch=1-step=256.ckpt/checkpoint'
Detected checkpoint of type zero stage 2, world_size: 8
Parsing checkpoint created by deepspeed==0.12.6
Reconstructed fp32 state dict with 710 params 176392192 elements
Saving bf16 state dict to ../model/expansion-test/Base-L32-D512-E02.pth
[2024-02-12 00:28:24,963] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Processing zero checkpoint '/checkpoint/v5-expansion-test/baseline/L32-D512/epoch=4-step=640.ckpt/checkpoint'
Detected checkpoint of type zero stage 2, world_size: 8
Parsing checkpoint created by deepspeed==0.12.6
Reconstructed fp32 state dict with 710 params 176392192 elements
Saving bf16 state dict to ../model/expansion-test/Base-L32-D512-E05.pth
-rw-r--r-- 1 nobody root 337M Feb 12 00:28 ../model/expans

In [16]:
# Merge each exported D{HALF_DIM_SIZE} baseline model with the D{DIM_SIZE} "world-init" weights.
# merge_model.py receives three paths, which it reports in its log as, in order:
# source model, target model, output model.
# --resize_mode "core" is the merge mode used throughout this notebook.
# NOTE(review): --skip-if-exists presumably leaves an existing output file alone; the
# recorded run printed "Output model exists, skipping init_model" — confirm in merge_model.py.
# Epoch 2 baseline (E02) -> Core-...-E02.pth
!cd "{TRAINER_DIR}" && \
    python3 ./merge_model.py \
        --skip-if-exists --resize_mode "core" \
        "../model/expansion-test/Base-L{LAYER_COUNT}-D{HALF_DIM_SIZE}-E02.pth" \
        "../model/L{LAYER_COUNT}-D{DIM_SIZE}-world-init.pth" \
        "../model/expansion-test/Core-L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E02.pth"

# Epoch 5 baseline (E05) -> Core-...-E05.pth
!cd "{TRAINER_DIR}" && \
    python3 ./merge_model.py \
        --skip-if-exists --resize_mode "core" \
        "../model/expansion-test/Base-L{LAYER_COUNT}-D{HALF_DIM_SIZE}-E05.pth" \
        "../model/L{LAYER_COUNT}-D{DIM_SIZE}-world-init.pth" \
        "../model/expansion-test/Core-L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E05.pth"

---- Merging models ----
Source model path: ../model/expansion-test/Base-L32-D512-E02.pth
Target model path: ../model/L32-D1024-world-init.pth
Output model path: ../model/expansion-test/Core-L32-D1024-from-D512-E02.pth

Write mode : overwrite
Resize mode: core
Note: this process takes a significant time (and ram) for large models
---- ----- ----
Output model exists, skipping init_model
---- Merging models ----
Source model path: ../model/expansion-test/Base-L32-D512-E05.pth
Target model path: ../model/L32-D1024-world-init.pth
Output model path: ../model/expansion-test/Core-L32-D1024-from-D512-E05.pth

Write mode : overwrite
Resize mode: core
Note: this process takes a significant time (and ram) for large models
---- ----- ----
Output model exists, skipping init_model


# Train from the E02 (epoch 2) merged model

In [3]:
# Train the D{DIM_SIZE} model initialised from the E02 merged weights.
# Create the checkpoint output directory for this run first.
!mkdir -p "/checkpoint/v5-expansion-test/core/L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E02/"
# Launch lightning_trainer.py from TRAINER_DIR using the YAML config stored next to this
# notebook, overriding on the command line: the model to load, the checkpoint directory,
# the W&B run name and project, the DeepSpeed strategy, the microbatch size, the context
# length, and the devices.
# RWKV_NO_CUDA, RWKV_TORCH_COMPILE and WANDB_MODE are exported as environment variables
# for the trainer process.
# NOTE(review): the recorded log reports the 'torch-jit' backend, presumably because of
# RWKV_NO_CUDA=1 — confirm against the trainer source.
!cd "{TRAINER_DIR}" && \
    export RWKV_NO_CUDA=1 && \
    export RWKV_TORCH_COMPILE=0 && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki-100k-world-16k-rechunk.yaml" \
        --model.load_model="../model/expansion-test/Core-L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E02.pth" \
        --data.skip_datapath_setup=True \
        --trainer.callbacks.init_args.dirpath="/checkpoint/v5-expansion-test/core/L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E02/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E02 (tctxlen={TRAINING_CTX_LEN}, {DEEPSPEED_STRAT})" \
        --trainer.logger.init_args.project="{WANDB_PROJECT}" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.microbatch_size={MICROBATCH_SIZE} \
        --model.ctx_len={TRAINING_CTX_LEN} \
        --trainer.devices="{GPU_DEVICES}"


[2024-02-12 00:39:22,692] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/rwkv-x-exp/v5-exp/expansion-test/config/enwiki-100k-world-16k-rechunk.yaml', '--model.load_model=../model/expansion-test/Core-L32-D1024-from-D512-E02.pth', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/v5-expansion-test/core/L32-D1024-from-D512-E02/', '--trainer.logger.init_args.name=Core-Expand - L32-D1024-from-D512-E02 (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-x-v5-Expans

# Train from the E05 (epoch 5) merged model

In [2]:
# Train the D{DIM_SIZE} model initialised from the E05 merged weights.
# Create the checkpoint output directory for this run first.
!mkdir -p "/checkpoint/v5-expansion-test/core/L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E05/"
# Launch lightning_trainer.py from TRAINER_DIR using the YAML config stored next to this
# notebook, overriding on the command line: the model to load, the checkpoint directory,
# the W&B run name and project, the DeepSpeed strategy, the microbatch size, the context
# length, and the devices.
# RWKV_NO_CUDA, RWKV_TORCH_COMPILE and WANDB_MODE are exported as environment variables
# for the trainer process.
# NOTE(review): the recorded log reports the 'torch-jit' backend, presumably because of
# RWKV_NO_CUDA=1 — confirm against the trainer source.
!cd "{TRAINER_DIR}" && \
    export RWKV_NO_CUDA=1 && \
    export RWKV_TORCH_COMPILE=0 && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki-100k-world-16k-rechunk.yaml" \
        --model.load_model="../model/expansion-test/Core-L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E05.pth" \
        --data.skip_datapath_setup=True \
        --trainer.callbacks.init_args.dirpath="/checkpoint/v5-expansion-test/core/L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E05/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - L{LAYER_COUNT}-D{DIM_SIZE}-from-D{HALF_DIM_SIZE}-E05 (tctxlen={TRAINING_CTX_LEN}, {DEEPSPEED_STRAT})" \
        --trainer.logger.init_args.project="{WANDB_PROJECT}" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.microbatch_size={MICROBATCH_SIZE} \
        --model.ctx_len={TRAINING_CTX_LEN} \
        --trainer.devices="{GPU_DEVICES}"


[2024-02-12 04:08:11,695] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/rwkv-x-exp/v5-exp/expansion-test/config/enwiki-100k-world-16k-rechunk.yaml', '--model.load_model=../model/expansion-test/Core-L32-D1024-from-D512-E05.pth', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/v5-expansion-test/core/L32-D1024-from-D512-E05/', '--trainer.logger.init_args.name=Core-Expand - L32-D1024-from-D512-E05 (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-x-v5-Expans