# RWKV v5

Simple memory training for a small model.

This has been adjusted for 64 nodes, with a target batch size of 512.

**Note:** This project assumes you have the rwkv-infctx conda env set up

# Basic Setup

In [None]:
# Show what is in the project root (three levels up), then make sure the
# model / dataset / datapath / checkpoint directories exist there.
# `mkdir -p` leaves already-existing directories untouched.
!ls ../../../
!mkdir -p ../../../model/ ../../../dataset/ ../../../datapath/ ../../../checkpoint/

In [None]:
import os

# --- Trainer / logging switches shared by every training cell below ---
DEEPSPEED_STRAT = "deepspeed_stage_1"
GPU_DEVICES = "8"
ENABLE_WANDB = True

# Layer count and embed dim to start with
LAYER_COUNT = 48
EMBED_DIM = 2048

# Embedding scale, plus a variant with "." replaced by "_" for use in file names
EMBED_SCALE = 0.1
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")

# Prefixes used for the wandb run names and for checkpoint / model file names
WANDB_PREFIX = f"[3B Coding Model] v5r4-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}"
FILENAME_PREFIX = f"3B-CM-v5r4-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}"

print(f"DEEPSPEED_STRAT: {DEEPSPEED_STRAT}")
print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")

# Value exported as the WANDB_MODE environment variable by the training cells
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# --- Paths ---
# "__file__" is deliberately a plain string here: abspath() resolves it against
# the current working directory, so dirname() yields that working directory.
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
CONFIG_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "./config/"))
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../"))
# Trainer and inference scripts are both looked up in the same RWKV-v5 directory
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

# Name of the directory holding this notebook
DIR_NAME = os.path.basename(NOTEBOOK_DIR)

# Log names and dir
print(f"DIR_NAME: {DIR_NAME}")
print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"CONFIG_DIR: {CONFIG_DIR}")
print(f"INFERENCE_DIR: {INFERENCE_DIR}")
print(f"TRAINER_DIR: {TRAINER_DIR}")
print(f"PROJECT_DIR: {PROJECT_DIR}")

# Enwiki Stage 2 : Basic Instruct Tuning

In [None]:
# Preload the required dataset: run preload_datapath.py from the trainer
# directory against the Enwiki-Instruct config.
!cd "{TRAINER_DIR}" && python3 preload_datapath.py "{CONFIG_DIR}/config-enwiki-instruct.yaml"

In [None]:
# Start the instruct finetuning.
#
# Runs `lightning_trainer.py fit` from TRAINER_DIR using config-enwiki-instruct.yaml,
# with these command-line overrides:
#   - logger run name, trainer strategy and device count (values from the setup cell)
#   - checkpoint callback dirpath: ../checkpoint/<FILENAME_PREFIX>-enwiki-instruct/
#   - starting weights: ../model/<FILENAME_PREFIX>-enwiki-4k.pth
#     (not produced by any cell visible in this notebook, so it must already exist)
#   - ctx_len=4096 and bptt_learning_range=1
# WANDB_MODE is exported first so the trainer process picks up the setup cell's choice.
# NOTE(review): `--auto-resume-ckpt-dir "auto"` presumably resumes from the checkpoint
# dirpath given above -- confirm against lightning_trainer.py.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{CONFIG_DIR}/config-enwiki-instruct.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/" \
        --model.load_model="../model/{FILENAME_PREFIX}-enwiki-4k.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1 \
        --auto-resume-ckpt-dir "auto"

In [None]:
# Export the last Enwiki-Instruct checkpoint to a .pth model file
# (third argument "bf16" is passed through to export_checkpoint.py),
# then list the exported file.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt" \
        "../model/{FILENAME_PREFIX}-enwiki-instruct.pth" \
        "bf16"
!cd "{TRAINER_DIR}" && \
    ls -alh "../model/{FILENAME_PREFIX}-enwiki-instruct.pth"

In [None]:
# Quick dragon prompt validation: run dragon_test.py on the exported
# Enwiki-Instruct model with the "cuda fp32" setting.
!cd "{INFERENCE_DIR}" && python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-instruct.pth" "cuda fp32"

## Tune 1 : Simple Memory instruct finetuning

- Tune 1: Low ctx size (512), Training with only the input masked. This does very limited memory training, and is used primarily to train the instruction set.

In [None]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Go to the project root (three levels up from the current directory).
# Abort if that fails, so the `rm -rf` below never runs from an unexpected place.
cd "../../../" || exit 1

# Reset the dataset dir
mkdir -p ./dataset/memory-tune/
rm -rf ./dataset/memory-tune/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

# Generator script and output directory shared by every call below.
# Each call is: <output jsonl> <word count> <third numeric arg, presumably sample count>
GEN_SCRIPT="./notebook/util-scripts/memory_script/gen_limited_segmented_jsonl.py"
OUT_DIR="./dataset/memory-tune"

# We do a strong bias for smaller word count, to teach the concept from scratch
# so that the model can learn the function.
#
# Note that all document samples, are randomized between the target word count,
# to half of the target word count.
#
# The file name and the word count come from the same loop variable, so they
# cannot drift apart (the 60-word file was previously generated with 80 words).
for word_count in 2 5; do
    python3 "$GEN_SCRIPT" "$OUT_DIR/word-${word_count}-count.jsonl" "$word_count" 5000 &
done
for word_count in 10 15 20 25 40 50 60 80; do
    python3 "$GEN_SCRIPT" "$OUT_DIR/word-${word_count}-count.jsonl" "$word_count" 2500 &
done

# With a slight mix of the larger word count
for word_count in 100 200; do
    python3 "$GEN_SCRIPT" "$OUT_DIR/word-${word_count}-count.jsonl" "$word_count" 2500 &
done

# All generators run in the background; block until every one has finished
wait
echo "## Done ##"

ls -alh ./dataset/memory-tune/

In [None]:
# Start the finetune model training (Tune 1: Mem-Instruct).
#
# Runs `lightning_trainer.py fit` from TRAINER_DIR using config-mem-instruct.yaml,
# with these command-line overrides:
#   - logger run name, trainer strategy and device count (values from the setup cell)
#   - checkpoint callback dirpath: ../checkpoint/<FILENAME_PREFIX>-mem-instruct/
#   - starting weights: ../model/<FILENAME_PREFIX>-enwiki-instruct.pth
#     (the file exported by the Enwiki-Instruct export cell)
#   - ctx_len=512 and bptt_learning_range=1
# WANDB_MODE is exported first so the trainer process picks up the setup cell's choice.
# NOTE(review): `--auto-resume-ckpt-dir "auto"` presumably resumes from the checkpoint
# dirpath given above -- confirm against lightning_trainer.py.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{CONFIG_DIR}/config-mem-instruct.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Instruct (train-ctx=512, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-instruct/" \
        --model.load_model="../model/{FILENAME_PREFIX}-enwiki-instruct.pth" \
        --model.ctx_len=512 \
        --model.bptt_learning_range=1 \
        --auto-resume-ckpt-dir "auto"

In [None]:
# Export the last Mem-Instruct checkpoint to a .pth model file
# (third argument "bf16" is passed through to export_checkpoint.py),
# then list the exported file.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-instruct/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-instruct.pth" \
        "bf16"
!cd "{TRAINER_DIR}" && \
    ls -alh "../model/{FILENAME_PREFIX}-mem-instruct.pth"

## Tune 2 : Low ctx size (512), memory training

- Tune 2: Low ctx size (512), Training with instruction & input masked. This forces the actual memory training on the output tokens.

In [None]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Go to the project root (three levels up from the current directory).
# Abort if that fails, so the `rm -rf` below never runs from an unexpected place.
cd "../../../" || exit 1

# Reset the dataset dir
mkdir -p ./dataset/memory-tune/
rm -rf ./dataset/memory-tune/*.jsonl

# Generate the various datasets
echo "## Generating word reptition dataset ##"

# Generator scripts and output directory shared by every call below.
# Each call is: <output jsonl> <word count> <third numeric arg, presumably sample count>
PROMPT_GEN="./notebook/util-scripts/memory_script/gen_limited_prompt_completion_jsonl.py"
SHUFFLE_GEN="./notebook/util-scripts/memory_script/shuffle_limited_prompt_completion_jsonl.py"
OUT_DIR="./dataset/memory-tune"

#
# We switch over to fully masked instruct+input, to properly learn the memorization task
#
python3 "$PROMPT_GEN" "$OUT_DIR/word-2-count.jsonl" 2 5000 &
# Word counts 5, 10, ..., 95 (stepped brace expansion needs bash 4+)
for i in {5..95..5}
do
    python3 "$PROMPT_GEN" "$OUT_DIR/gen-word-${i}-count.jsonl" "$i" 5000 &
done
python3 "$PROMPT_GEN" "$OUT_DIR/word-100-count.jsonl" 100 5000 &
python3 "$PROMPT_GEN" "$OUT_DIR/word-200-count.jsonl" 200 5000 &

#
# We mixin the shuffled word list, so that we ensure all words / tokens are learned
# however this might introduce an exclusion bias (if seen this word, never repeat it),
# so we limit the mixture of this data samples
#
# Each entry is "<word count>:<third argument passed to the shuffle script>"
for spec in 10:20 15:20 25:30 50:50 75:50 100:50 200:50
do
    word_count="${spec%%:*}"
    shuffle_arg="${spec##*:}"
    python3 "$SHUFFLE_GEN" "$OUT_DIR/shuffle-word-${word_count}-count.jsonl" "$word_count" "$shuffle_arg" &
done

# All generators run in the background; block until every one has finished
wait
echo "## Done ##"

ls -alh ./dataset/memory-tune/

In [None]:
# Start the finetune model training (Tune 2: Mem-Tune ctx-512).
#
# Runs `lightning_trainer.py fit` from TRAINER_DIR using config-mem-template.yaml,
# with these command-line overrides:
#   - logger run name, trainer strategy and device count (values from the setup cell)
#   - checkpoint callback dirpath: ../checkpoint/<FILENAME_PREFIX>-mem-ctx-512/
#   - bptt_learning_range=1
#   - starting weights: ../model/<FILENAME_PREFIX>-mem-instruct.pth
#     (the file exported by the Mem-Instruct export cell)
# WANDB_MODE is exported first so the trainer process picks up the setup cell's choice.
# NOTE(review): unlike the earlier training cells, no `--model.ctx_len` override is
# passed here, so the context length comes from config-mem-template.yaml -- confirm
# it is 512 to match the run name.
# NOTE(review): `--auto-resume-ckpt-dir "auto"` presumably resumes from the checkpoint
# dirpath given above -- confirm against lightning_trainer.py.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{CONFIG_DIR}/config-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-512 (train-ctx=512, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/" \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-instruct.pth" \
        --auto-resume-ckpt-dir "auto"

In [None]:
# Export the last Mem-Tune ctx-512 checkpoint to a .pth model file
# (third argument "bf16" is passed through to export_checkpoint.py),
# then list the exported file.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-512.pth" \
        "bf16"
!cd "{TRAINER_DIR}" && \
    ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-512.pth"