# Testing the trainer code against various model sizes

This notebook helps validate that the RWKV trainer can run against various model sizes (1.5B, 3B, 7B and 14B).

It does not perform any actual checkpointing or export (not yet, at least).

In [1]:
# Create the model, datapath and checkpoint directories used by this notebook
# (paths are relative to the current working directory; -p makes this safe to re-run)
!mkdir -p ../../model/ ../../datapath/ ../../checkpoint/

In [14]:
# Configure the deepspeed strategy / GPU devices to be tested,
# and resolve the directories every later cell relies on.
import os

# Values forwarded to the trainer's --trainer.strategy / --trainer.devices flags
DEEPSPEED_STRAT = "deepspeed_stage_3"
GPU_DEVICES = "auto"

print("GPU_DEVICES:", GPU_DEVICES)
print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)

# NOTEBOOK_DIR is the kernel's current working directory;
# the project root sits two levels above it, and the trainer lives inside the project root.
NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir, os.pardir))
TRAINER_DIR = os.path.join(PROJECT_DIR, "RWKV-v4neo")

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

GPU_DEVICES: auto
DEEPSPEED_STRAT: deepspeed_stage_3
NOTEBOOK_DIR: /root/picocreator-dev-infctx/notebook/trainer-validation
TRAINER_DIR: /root/picocreator-dev-infctx/RWKV-v4neo
PROJECT_DIR: /root/picocreator-dev-infctx


In [3]:
# Preload the dataset required by the training runs (config: minimal-enwiki.yaml).
# The script is invoked from inside the trainer directory.
!cd "{TRAINER_DIR}" && python3 preload_datapath.py "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml"

Found cached dataset parquet (/root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 779.90it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-1062ba7da82e617a_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-49681c8aebfc1cc9_*_of_00064.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/teven___parquet/teven--enwiki_10k-de63a925546e70ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-47774371f91aea93_*_of_00064.arrow
                                    

## 1.5B Size (L24-D2048)

In [4]:
# Initialize a 24-layer, 2048-embedding model with the neox vocab.
# The output path is relative to the trainer directory.
!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 24 \
    --n_embd 2048 \
    --vocab_size neox \
    ../model/L24-D2048-neox-init.pth

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 24
Embedding size: 2048
Output model path: ../model/L24-D2048-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_2048_bf16...
[RWKV.model]: Finished initial model load
50277 2048  -0.1 emb.weight
2048  2048  0    blocks.0.att.key.weight
2048  2048  1.0  blocks.0.att.value.weight
2048  2048  0    blocks.0.att.receptance.weight
2048  2048  0    blocks.0.att.output.weight
8192  2048  1.0  blocks.0.ffn.key.weight
2048  

In [5]:
# Run the trainer against the L24-D2048 init model,
# using the deepspeed strategy and GPU devices configured at the top of the notebook.
!cd "{TRAINER_DIR}" && python3 lightning_trainer.py fit \
    -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
    --model.load_model "../model/L24-D2048-neox-init.pth" \
    --trainer.strategy="{DEEPSPEED_STRAT}" \
    --trainer.devices="{GPU_DEVICES}"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 3061363964
[RWKV.model]: Preloading model from '../model/L24-D2048-neox-init.pth'
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_512_bf16...
[RWKV.model]: Loading model weights ( L24-D2048-V50277 )
[RWKV.model]: Finished initial model load
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Found cached dataset parquet (/root/.cache

## 3B Size (L32-D2560)

In [6]:
# Initialize a 32-layer, 2560-embedding model with the neox vocab.
# The output path is relative to the trainer directory.
!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 32 \
    --n_embd 2560 \
    --vocab_size neox \
    ../model/L32-D2560-neox-init.pth

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 32
Embedding size: 2560
Output model path: ../model/L32-D2560-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_2048_bf16...
[RWKV.model]: Finished initial model load
50277 2560  -0.1 emb.weight
2560  2560  0    blocks.0.att.key.weight
2560  2560  1.0  blocks.0.att.value.weight
2560  2560  0    blocks.0.att.receptance.weight
2560  2560  0    blocks.0.att.output.weight
10240 2560  1.0  blocks.0.ffn.key.weight
2560  

In [7]:
# Run the trainer against the L32-D2560 init model,
# using the deepspeed strategy and GPU devices configured at the top of the notebook.
!cd "{TRAINER_DIR}" && python3 lightning_trainer.py fit \
    -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
    --model.load_model "../model/L32-D2560-neox-init.pth" \
    --trainer.strategy="{DEEPSPEED_STRAT}" \
    --trainer.devices="{GPU_DEVICES}"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2615317921
[RWKV.model]: Preloading model from '../model/L32-D2560-neox-init.pth'
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_512_bf16...
[RWKV.model]: Loading model weights ( L32-D2560-V50277 )
[RWKV.model]: Finished initial model load
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Found cached dataset parquet (/root/.cache

## 7B Size (L32-D4096)

In [8]:
# Initialize a 32-layer, 4096-embedding model with the neox vocab.
# The output path is relative to the trainer directory.
!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 32 \
    --n_embd 4096 \
    --vocab_size neox \
    ../model/L32-D4096-neox-init.pth

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 32
Embedding size: 4096
Output model path: ../model/L32-D4096-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_2048_bf16...
[RWKV.model]: Finished initial model load
50277 4096  -0.1 emb.weight
4096  4096  0    blocks.0.att.key.weight
4096  4096  1.0  blocks.0.att.value.weight
4096  4096  0    blocks.0.att.receptance.weight
4096  4096  0    blocks.0.att.output.weight
16384 4096  1.0  blocks.0.ffn.key.weight
4096  

In [9]:
# Run the trainer against the L32-D4096 init model,
# using the deepspeed strategy and GPU devices configured at the top of the notebook.
!cd "{TRAINER_DIR}" && python3 lightning_trainer.py fit \
    -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
    --model.load_model "../model/L32-D4096-neox-init.pth" \
    --trainer.strategy="{DEEPSPEED_STRAT}" \
    --trainer.devices="{GPU_DEVICES}"

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 1719644420
[RWKV.model]: Preloading model from '../model/L32-D4096-neox-init.pth'
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_512_bf16...
[RWKV.model]: Loading model weights ( L32-D4096-V50277 )
[RWKV.model]: Finished initial model load
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Found cached dataset parquet (/root/.cache

## 14B Size (L40-D5120)

In [10]:
# Initialize a 40-layer, 5120-embedding model with the neox vocab.
# The output path is relative to the trainer directory.
!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 40 \
    --n_embd 5120 \
    --vocab_size neox \
    ../model/L40-D5120-neox-init.pth

Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 40
Embedding size: 5120
Output model path: ../model/L40-D5120-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_2048_bf16...
[RWKV.model]: Finished initial model load
50277 5120  -0.1 emb.weight
5120  5120  0    blocks.0.att.key.weight
5120  5120  1.0  blocks.0.att.value.weight
5120  5120  0    blocks.0.att.receptance.weight
5120  5120  0    blocks.0.att.output.weight
20480 5120  1.0  blocks.0.ffn.key.weight
5120  

In [15]:
# Run the trainer against the L40-D5120 init model,
# using the deepspeed strategy and GPU devices configured at the top of the notebook.
!cd "{TRAINER_DIR}" && python3 lightning_trainer.py fit \
    -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
    --model.load_model "../model/L40-D5120-neox-init.pth" \
    --trainer.strategy="{DEEPSPEED_STRAT}" \
    --trainer.devices="{GPU_DEVICES}"

[RWKV.lightning_trainer.py] Detected deepspeed_stage_3, disabling JIT using RWKV_JIT_ON=0
Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-native' with torch '2.0.1+cu118'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2913082349
[RWKV.model]: Preloading model from '../model/L40-D5120-neox-init.pth'
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_512_bf16...
[RWKV.model]: Loading model weights ( L40-D5120-V50277 )
[RWKV.model]: Finished initial model load
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False,