# RWKV v5 Baseline 1B5
This model is based on the standard RWKV 1B5 model:

- 24 layers
- 2048 embedding size

This notebook goes through the modified memory training process for v5 models.

**Note:** This project assumes you have the `rwkv-infctx` conda env set up.

## Basic Setup

In [None]:
# First lets setup the various directories, and init the model
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

# Init the model
!cd ../../../../RWKV-v5 && python3 ./init_model.py --n_layer 24 --n_embd 2048 --vocab_size neox --skip-if-exists ../model/L24-D2048-neox-v5-init.pth

In [2]:
# Notebook-wide configuration: training strategy, devices, W&B logging, and paths.
import os

DEEPSPEED_STRAT = "deepspeed_stage_1"
GPU_DEVICES = "auto"
ENABLE_WANDB = True
WANDB_PREFIX = "V5-Base-1B5-C"

print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)

# Passed to the training commands as the WANDB_MODE environment variable.
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# Computing the notebook directory and the various project paths.
# `__file__` is not available inside a notebook kernel, so the kernel's
# working directory is used as the notebook directory.
NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
# The trainer and the inference scripts both resolve to the RWKV-v5 directory.
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("INFERENCE_DIR:", INFERENCE_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

DEEPSPEED_STRAT: deepspeed_stage_1
ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A/notebook/experiment/rwkv-x-exp/V5-Base-1B5
INFERENCE_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A/RWKV-v5
TRAINER_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A/RWKV-v5
PROJECT_DIR: /home/ubuntu/rwkv5x-tokenshift-exp-A


## Stage 1 : Foundation model training

In [3]:
# Lets preload the requried dataset (enwiki_100k)
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/V5-Base-1B5-enwiki.yaml"

Downloading readme: 100%|██████████████████| 5.26k/5.26k [00:00<00:00, 9.37MB/s]
Downloading and preparing dataset json/RyokoExtra--SuperWIKI-Cleaned to /home/ubuntu/.cache/huggingface/datasets/RyokoExtra___json/RyokoExtra--SuperWIKI-Cleaned-abb930ec61eb61a3/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...
Downloading data files:   0%|                             | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|                              | 0.00/442M [00:00<?, ?B/s][A
Downloading data:   3%|▋                     | 14.2M/442M [00:00<00:03, 142MB/s][A
Downloading data:   6%|█▎                   | 28.4M/442M [00:00<00:08, 50.4MB/s][A
Downloading data:   8%|█▋                   | 36.3M/442M [00:00<00:08, 45.4MB/s][A
Downloading data:  11%|██▎                  | 48.0M/442M [00:00<00:07, 50.8MB/s][A
Downloading data:  14%|██▉                  | 62.1M/442M [00:01<00:06, 59.2MB/s][A
Downloading data:  16%|███▎                 | 68.8M/442M [00:01<00:07, 46.9MB/s][

In [None]:
# Start the foundation model training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/V5-Base-1B5-enwiki.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki Foundation (train-ctx=16k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=4

In [None]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/V5-Base-1B5-enwiki/last.ckpt" "../model/V5-Base-1B5-Stage1.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/V5-Base-1B5-Stage1.pth"

In [None]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && python3 dragon_test.py ../model/V5-Base-1B5-Stage1.pth "cuda fp32"

In [None]:
# # Let's do a quick memory test
# # (We don't expect this to work, as we have not finetuned for memory recall, but it's a baseline)
# !python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/V5-Base-1B5-Stage1.pth"

## Stage 2 : Instruct Tuning

In [None]:
# Lets preload the requried dataset
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/V5-Base-1B5-instruct.yaml"

In [None]:
# Start the instruct finetuning
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/V5-Base-1B5-instruct.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Instruct (train-ctx=4096, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"

In [None]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/V5-Base-1B5-instruct/last.ckpt" "../model/V5-Base-1B5-Stage2.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/V5-Base-1B5-Stage2.pth"

In [None]:
# Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && python3 dragon_test.py "../model/V5-Base-1B5-Stage2.pth" "cuda fp32"

In [None]:
# # Let's do a quick memory test
# # (We don't expect this to work, as we have not finetuned for memory recall, but it's a baseline)
# !python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/V5-Base-1B5-Stage2.pth"