# Testing the trainer code, against various model sizes

This notebook helps validate that the RWKV trainer can run against various model sizes.

It does not perform any actual checkpointing / export (yet).

In [1]:
# First lets setup the various directories required
!mkdir -p ../../model/
!mkdir -p ../../datapath/
!mkdir -p ../../checkpoint/

In [4]:
# Configure the deepspeed strategy / gpu count to be tested
import os

DEEPSPEED_STRAT = "deepspeed_stage_2_offload"
GPU_DEVICES = "auto"

print("GPU_DEVICES:", GPU_DEVICES)
print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)

# Compute the notebook directory and the paths derived from it.
# The kernel's current working directory stands in for the notebook's own
# location (assumes the kernel was started in the notebook's folder).
NOTEBOOK_DIR = os.path.abspath(os.getcwd())
# The project root sits two levels above the notebook directory.
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "..", ".."))
# The trainer scripts are invoked from the RWKV-v4neo folder inside the project.
TRAINER_DIR = os.path.join(PROJECT_DIR, "RWKV-v4neo")

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

GPU_DEVICES: auto
DEEPSPEED_STRAT: deepspeed_stage_2_offload
NOTEBOOK_DIR: /home/picocreator/rwkv-proj/infctx-dev/notebook/trainer-validation
TRAINER_DIR: /home/picocreator/rwkv-proj/infctx-dev/RWKV-v4neo
PROJECT_DIR: /home/picocreator/rwkv-proj/infctx-dev


In [6]:
# Let's preload the required dataset
# Runs preload_datapath.py from TRAINER_DIR, using this notebook's
# minimal-enwiki.yaml config.
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml"

Map (num_proc=16): 100%|█████████| 10000/10000 [00:01<00:00, 5197.21 examples/s]
Filter (num_proc=16): 100%|█████| 10000/10000 [00:00<00:00, 11547.48 examples/s]
Map (num_proc=16): 100%|███████████| 8110/8110 [00:01<00:00, 5340.63 examples/s]
Saving the dataset (1/1 shards): 100%|█| 5318/5318 [00:00<00:00, 127176.21 examp
Saving the dataset (1/1 shards): 100%|█| 54/54 [00:00<00:00, 25572.14 examples/s


## 1.5B Size (L24-D2048)

In [16]:
# Initialize a fresh 1.5B-size model: 24 layers, embedding size 2048, neox vocab.
# The output path is relative to TRAINER_DIR, because the command runs after the cd.
!cd "{TRAINER_DIR}" && \
    python3 init_model.py \
        --n_layer 24 --n_embd 2048 \
        --vocab_size neox \
        ../model/L24-D2048-neox-init.pth

[2023-08-01 13:42:48,867] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230725'
---- Initializing model ----
No of layers: 24
Embedding size: 2048
Output model path: ../model/L24-D2048-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /home/picocreator/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_2048_bf16...
50277 2048  -0.1 emb.weight
2048  2048  0    blocks.0.att.key.weight
2048  2048  1.0  blocks.0.att.value.weight
2048  2048  0    blocks.0.att.receptance.weight
2048  2048  0    blocks.0.at

In [18]:
# Run the lightning trainer (fit) against the 1.5B-size init model, using the
# minimal-enwiki.yaml config with the DEEPSPEED_STRAT / GPU_DEVICES values set earlier.
# The model path is relative to TRAINER_DIR, because the command runs after the cd.
!cd "{TRAINER_DIR}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
        --model.load_model "../model/L24-D2048-neox-init.pth" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"

[2023-08-01 14:38:42,592] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230725'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2917114153
[RWKV.model][Rank 0]: Preloading model from '../model/L24-D2048-neox-init.pth'
Using /home/picocreator/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_512_bf16...
[RWKV.model][Rank 0]: Loading model weights
[RWKV.model][Rank 0]: Finished initial model load
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 

## 3B Size (L32-D2560)

In [9]:
# Initialize a fresh 3B-size model: 32 layers, embedding size 2560, neox vocab.
# The output path is relative to TRAINER_DIR, because the command runs after the cd.
!cd "{TRAINER_DIR}" && \
    python3 init_model.py \
        --n_layer 32 --n_embd 2560 \
        --vocab_size neox \
        ../model/L32-D2560-neox-init.pth

[2023-08-01 12:45:33,496] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230725'
---- Initializing model ----
No of layers: 32
Embedding size: 2560
Output model path: ../model/L32-D2560-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /home/picocreator/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_2048_bf16...
50277 2560  -0.1 emb.weight
2560  2560  0    blocks.0.att.key.weight
2560  2560  1.0  blocks.0.att.value.weight
2560  2560  0    blocks.0.att.receptance.weight
2560  2560  0    blocks.0.at

In [10]:
# Run the lightning trainer (fit) against the 3B-size init model, using the
# minimal-enwiki.yaml config with the DEEPSPEED_STRAT / GPU_DEVICES values set earlier.
# The model path is relative to TRAINER_DIR, because the command runs after the cd.
!cd "{TRAINER_DIR}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
        --model.load_model "../model/L32-D2560-neox-init.pth" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"

[2023-08-01 12:46:52,754] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230725'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2966926537
Using /home/picocreator/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Creating extension directory /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_512_bf16...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] /usr/local/cuda/bin/nvcc  -DTORCH_EXTENSION_NAME=wkv_512_bf16 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/pico

## 7B Size (L32-D4096)

In [11]:
# Initialize a fresh 7B-size model: 32 layers, embedding size 4096, neox vocab.
# The output path is relative to TRAINER_DIR, because the command runs after the cd.
!cd "{TRAINER_DIR}" && \
    python3 init_model.py \
        --n_layer 32 --n_embd 4096 \
        --vocab_size neox \
        ../model/L32-D4096-neox-init.pth

[2023-08-01 12:52:24,720] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230725'
---- Initializing model ----
No of layers: 32
Embedding size: 4096
Output model path: ../model/L32-D4096-neox-init.pth
Vocab size: 50277
---- ----- ----
Using /home/picocreator/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_2048_bf16/build.ninja...
Building extension module wkv_2048_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_2048_bf16...
50277 4096  -0.1 emb.weight
4096  4096  0    blocks.0.att.key.weight
4096  4096  1.0  blocks.0.att.value.weight
4096  4096  0    blocks.0.att.receptance.weight
4096  4096  0    blocks.0.at

In [14]:
# Run the lightning trainer (fit) against the 7B-size init model, using the
# minimal-enwiki.yaml config with the DEEPSPEED_STRAT / GPU_DEVICES values set earlier.
# The model path is relative to TRAINER_DIR, because the command runs after the cd.
!cd "{TRAINER_DIR}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
        --model.load_model "../model/L32-D4096-neox-init.pth" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"

[2023-08-01 13:01:51,639] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230725'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2454860410
Using /home/picocreator/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_512_bf16...
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Saving the dataset (1/1 shards): 100%|█| 5318/5318 [00:00<00:00, 88561.28 exampl
Saving t

## 14B Size (L40-D5120)

In [None]:
# Initialize a fresh 14B-size model: 40 layers, embedding size 5120, neox vocab.
# The output path is relative to TRAINER_DIR, because the command runs after the cd.
# NOTE(review): this cell has no recorded execution count or output - confirm it
# was run before the trainer cell that loads the file it writes.
!cd "{TRAINER_DIR}" && \
    python3 init_model.py \
        --n_layer 40 --n_embd 5120 \
        --vocab_size neox \
        ../model/L40-D5120-neox-init.pth

In [None]:
# Run the lightning trainer (fit) against the 14B-size init model, using the
# minimal-enwiki.yaml config with the DEEPSPEED_STRAT / GPU_DEVICES values set earlier.
# The model path is relative to TRAINER_DIR, because the command runs after the cd.
!cd "{TRAINER_DIR}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/minimal-enwiki.yaml" \
        --model.load_model "../model/L40-D5120-neox-init.pth" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"

[2023-08-01 12:56:30,793] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.0.dev20230725'
  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 3980896932
Using /home/picocreator/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu118/wkv_512_bf16/build.ninja...
Building extension module wkv_512_bf16...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module wkv_512_bf16...
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Saving the dataset (1/1 shards): 100%|█| 5318/5318 [00:00<00:00, 34667.39 exampl
Saving t