# RWKV v5

Simple memory training for a small model.

This has been adjusted for 64 nodes, with a target batch size of 512.

**Note:** This project assumes you have the rwkv-infctx conda env set up

# Basic Setup

In [None]:
# First lets setup the various directories, and init the model
!ls ../../../
!mkdir -p ../../../model/
!mkdir -p ../../../dataset/
!mkdir -p ../../../datapath/
!mkdir -p ../../../checkpoint/

In [None]:
import os

# ---- Run configuration -------------------------------------------------
DEEPSPEED_STRAT = "deepspeed_stage_1"
GPU_DEVICES = "8"
ENABLE_WANDB = True

# Layer count and embed dim to start with
LAYER_COUNT = 48
EMBED_DIM = 2048

# Embedding scale, plus a variant with "." replaced by "_" for use in file names
EMBED_SCALE = 0.1
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")

# Prefixes reused by the later cells for W&B run names and checkpoint/model file names
WANDB_PREFIX = f"[3B Coding Model] v5r4-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}"
FILENAME_PREFIX = f"3B-CM-v5r4-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}"

print(f"DEEPSPEED_STRAT: {DEEPSPEED_STRAT}")
print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")

# Value exported as the WANDB_MODE environment variable by the training cells
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# ---- Paths -------------------------------------------------------------
# "__file__" is a plain string literal here, not the module attribute, so it is
# resolved against the current working directory: NOTEBOOK_DIR is the kernel's cwd.
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
CONFIG_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "./config/"))
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../"))
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
# Resolves to the same directory as TRAINER_DIR
INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

# Name of the directory the notebook runs from
DIR_NAME = os.path.basename(NOTEBOOK_DIR)

# Log names and dir
print(f"DIR_NAME: {DIR_NAME}")
print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"CONFIG_DIR: {CONFIG_DIR}")
print(f"INFERENCE_DIR: {INFERENCE_DIR}")
print(f"TRAINER_DIR: {TRAINER_DIR}")
print(f"PROJECT_DIR: {PROJECT_DIR}")

## Tune 3 : Low ctx size (1024) memory training

In [None]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Move three levels up from the notebook's working directory.
# Abort if that fails: the `rm -rf` below uses a relative glob and must never
# run from an unexpected directory.
cd "../../../" || { echo "ERROR: could not cd to ../../../" >&2; exit 1; }

# Reset the dataset dir
mkdir -p ./dataset/memory-tune/
rm -rf ./dataset/memory-tune/*.jsonl

# NOTE(review): the generator scripts and their output files are addressed with
# `../` (relative to the directory entered above), whereas the directory that is
# reset above and listed at the end is `./dataset/memory-tune/`. Confirm which
# location is intended; the paths are left exactly as they were.

# Start, in the background, one "gen" job and one "shuffle" job for every word
# count from $1 to $2 (inclusive) in steps of $3. $4 / $5 are forwarded as the
# last positional argument of the gen / shuffle script respectively.
launch_word_jobs() {
    local first=$1 last=$2 step=$3 gen_arg=$4 shuffle_arg=$5
    local i
    for ((i = first; i <= last; i += step)); do
        python3 ../memory_script/gen_limited_prompt_completion_jsonl.py "../dataset/memory-tune/gen-word-$i-count.jsonl" "$i" "$gen_arg" &
        python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py "../dataset/memory-tune/shuffle-word-$i-count.jsonl" "$i" "$shuffle_arg" &
    done
}

# Generate the various datasets
echo "## Generating word reptition dataset ##"

# 110 word counts in total -> 220 background jobs are started at once
launch_word_jobs 5 45 5 800 20
launch_word_jobs 50 550 5 800 20

# Block until every background job has exited
wait
echo "## Done ##"

ls -alh ./dataset/memory-tune/

In [None]:
# Start the finetune model training
#
# Runs `lightning_trainer.py fit` from TRAINER_DIR with config-mem-template.yaml,
# overriding on the command line: the logger run name, the strategy, the device
# count, the checkpoint directory, `lr_init` / `lr_final`, and a 1024 max token
# size / context length. WANDB_MODE is exported into the trainer's environment.
# The `{NAME}` placeholders are filled in by IPython from the notebook's variables.
# Starting weights are read from the `-mem-ctx-512.pth` model file; no cell in the
# visible part of this notebook creates it, so it must already exist.
# NOTE(review): `--auto-resume-ckpt-dir "auto"` presumably resumes from an existing
# checkpoint directory - confirm against lightning_trainer.py.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{CONFIG_DIR}/config-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-1k (train-ctx=1k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/" \
        --model.lr_init=5e-4 \
        --model.lr_final=2e-4 \
        --data.max_token_size=1024 \
        --model.ctx_len=1024 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-512.pth" \
        --auto-resume-ckpt-dir "auto"

In [None]:
# Lets export the model from the checkpoint
#
# Runs export_checkpoint.py from TRAINER_DIR on the ctx-1k run's `last.ckpt`,
# passing the `-mem-ctx-1k.pth` model path as the second argument, then lists
# that file so a missing export shows up in the cell output.
# NOTE(review): the trailing "bf16" argument is presumably the export precision -
# confirm against export_checkpoint.py.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-1k.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-1k.pth"

## Tune 4 : Mid ctx size (2048) memory training

In [None]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Move three levels up from the notebook's working directory.
# Abort if that fails: the `rm -rf` below uses a relative glob and must never
# run from an unexpected directory.
cd "../../../" || { echo "ERROR: could not cd to ../../../" >&2; exit 1; }

# Reset the dataset dir
mkdir -p ./dataset/memory-tune/
rm -rf ./dataset/memory-tune/*.jsonl

# NOTE(review): the generator scripts and their output files are addressed with
# `../` (relative to the directory entered above), whereas the directory that is
# reset above and listed at the end is `./dataset/memory-tune/`. Confirm which
# location is intended; the paths are left exactly as they were.

# Start, in the background, one "gen" job and one "shuffle" job for every word
# count from $1 to $2 (inclusive) in steps of $3. $4 / $5 are forwarded as the
# last positional argument of the gen / shuffle script respectively.
launch_word_jobs() {
    local first=$1 last=$2 step=$3 gen_arg=$4 shuffle_arg=$5
    local i
    for ((i = first; i <= last; i += step)); do
        python3 ../memory_script/gen_limited_prompt_completion_jsonl.py "../dataset/memory-tune/gen-word-$i-count.jsonl" "$i" "$gen_arg" &
        python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py "../dataset/memory-tune/shuffle-word-$i-count.jsonl" "$i" "$shuffle_arg" &
    done
}

# Generate the various datasets
echo "## Generating word reptition dataset ##"

# 400 word counts in total -> 800 background jobs are started at once
launch_word_jobs 5 100 5 100 1
launch_word_jobs 105 2000 5 200 20

# Block until every background job has exited
wait
echo "## Done ##"

ls -alh ./dataset/memory-tune/

In [None]:
# Start the finetune model training
#
# Runs `lightning_trainer.py fit` from TRAINER_DIR with config-mem-template.yaml,
# overriding on the command line: the logger run name, the strategy, the device
# count, the checkpoint directory, `lr_init` / `lr_final`, and a 2048 max token
# size / context length. WANDB_MODE is exported into the trainer's environment.
# The `{NAME}` placeholders are filled in by IPython from the notebook's variables.
# Starting weights are read from the `-mem-ctx-1k.pth` model file, i.e. the path
# the ctx-1k export cell writes to.
# NOTE(review): `--auto-resume-ckpt-dir "auto"` presumably resumes from an existing
# checkpoint directory - confirm against lightning_trainer.py.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{CONFIG_DIR}/config-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-2k (train-ctx=2k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/" \
        --model.lr_init=4e-4 \
        --model.lr_final=2e-4 \
        --data.max_token_size=2048 \
        --model.ctx_len=2048 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-1k.pth" \
        --auto-resume-ckpt-dir "auto"

In [None]:
# Lets export the model from the checkpoint
#
# Runs export_checkpoint.py from TRAINER_DIR on the ctx-2k run's `last.ckpt`,
# passing the `-mem-ctx-2k.pth` model path as the second argument, then lists
# that file so a missing export shows up in the cell output.
# NOTE(review): the trailing "bf16" argument is presumably the export precision -
# confirm against export_checkpoint.py.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-2k.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-2k.pth"

## Tune 5 : Mid ctx size (4k) memory training

In [None]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Move three levels up from the notebook's working directory.
# Abort if that fails: the `rm -rf` below uses a relative glob and must never
# run from an unexpected directory.
cd "../../../" || { echo "ERROR: could not cd to ../../../" >&2; exit 1; }

# Reset the dataset dir
mkdir -p ./dataset/memory-tune/
rm -rf ./dataset/memory-tune/*.jsonl

# NOTE(review): the generator scripts and their output files are addressed with
# `../` (relative to the directory entered above), whereas the directory that is
# reset above and listed at the end is `./dataset/memory-tune/`. Confirm which
# location is intended; the paths are left exactly as they were.

# Start, in the background, one "gen" job and one "shuffle" job for every word
# count from $1 to $2 (inclusive) in steps of $3. $4 / $5 are forwarded as the
# last positional argument of the gen / shuffle script respectively.
launch_word_jobs() {
    local first=$1 last=$2 step=$3 gen_arg=$4 shuffle_arg=$5
    local i
    for ((i = first; i <= last; i += step)); do
        python3 ../memory_script/gen_limited_prompt_completion_jsonl.py "../dataset/memory-tune/gen-word-$i-count.jsonl" "$i" "$gen_arg" &
        python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py "../dataset/memory-tune/shuffle-word-$i-count.jsonl" "$i" "$shuffle_arg" &
    done
}

# Generate the various datasets
echo "## Generating word reptition dataset ##"

# 800 word counts in total -> 1600 background jobs are started at once
launch_word_jobs 5 500 5 100 1
launch_word_jobs 505 4000 5 200 20

# Block until every background job has exited
wait
echo "## Done ##"

ls -alh ./dataset/memory-tune/

In [None]:
# Start the finetune model training
#
# Runs `lightning_trainer.py fit` from TRAINER_DIR with config-mem-template.yaml,
# overriding on the command line: the logger run name, the strategy, the device
# count, the checkpoint directory, `lr_init` / `lr_final`, and a 4096 max token
# size / context length. WANDB_MODE is exported into the trainer's environment.
# The `{NAME}` placeholders are filled in by IPython from the notebook's variables.
# Starting weights are read from the `-mem-ctx-2k.pth` model file, i.e. the path
# the ctx-2k export cell writes to.
# NOTE(review): `--auto-resume-ckpt-dir "auto"` presumably resumes from an existing
# checkpoint directory - confirm against lightning_trainer.py.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{CONFIG_DIR}/config-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-4k (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/" \
        --model.lr_init=4e-4 \
        --model.lr_final=2e-4 \
        --data.max_token_size=4096 \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-2k.pth" \
        --auto-resume-ckpt-dir "auto"

In [None]:
# Lets export the model from the checkpoint
#
# Runs export_checkpoint.py from TRAINER_DIR on the ctx-4k run's `last.ckpt`,
# passing the `-mem-ctx-4k.pth` model path as the second argument, then lists
# that file so a missing export shows up in the cell output.
# NOTE(review): the trailing "bf16" argument is presumably the export precision -
# confirm against export_checkpoint.py.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-4k.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-4k.pth"

## Tune 6 : Mid ctx size (8k) memory training

In [None]:
%%script bash

########################################
# Generate the required jsonl dataset
########################################

# Move three levels up from the notebook's working directory.
# Abort if that fails: the `rm -rf` below uses a relative glob and must never
# run from an unexpected directory.
cd "../../../" || { echo "ERROR: could not cd to ../../../" >&2; exit 1; }

# Reset the dataset dir
mkdir -p ./dataset/memory-tune/
rm -rf ./dataset/memory-tune/*.jsonl

# NOTE(review): the generator scripts and their output files are addressed with
# `../` (relative to the directory entered above), whereas the directory that is
# reset above and listed at the end is `./dataset/memory-tune/`. Confirm which
# location is intended; the paths are left exactly as they were.

# Start, in the background, one "gen" job and one "shuffle" job for every word
# count from $1 to $2 (inclusive) in steps of $3. $4 / $5 are forwarded as the
# last positional argument of the gen / shuffle script respectively.
launch_word_jobs() {
    local first=$1 last=$2 step=$3 gen_arg=$4 shuffle_arg=$5
    local i
    for ((i = first; i <= last; i += step)); do
        python3 ../memory_script/gen_limited_prompt_completion_jsonl.py "../dataset/memory-tune/gen-word-$i-count.jsonl" "$i" "$gen_arg" &
        python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py "../dataset/memory-tune/shuffle-word-$i-count.jsonl" "$i" "$shuffle_arg" &
    done
}

# Generate the various datasets
echo "## Generating word reptition dataset ##"

# 270 word counts in total -> 540 background jobs are started at once
launch_word_jobs 5 1000 5 50 1
launch_word_jobs 1100 8000 100 2000 20

# Block until every background job has exited
wait
echo "## Done ##"

ls -alh ./dataset/memory-tune/

In [None]:
# Start the finetune model training
#
# Runs `lightning_trainer.py fit` from TRAINER_DIR with config-mem-template.yaml,
# overriding on the command line: the logger run name, the strategy, the device
# count, the checkpoint directory, `lr_init` / `lr_final`, and an 8192 max token
# size / context length. WANDB_MODE is exported into the trainer's environment.
# The `{NAME}` placeholders are filled in by IPython from the notebook's variables.
# Starting weights are read from the `-mem-ctx-4k.pth` model file, i.e. the path
# the ctx-4k export cell writes to.
# NOTE(review): `--auto-resume-ckpt-dir "auto"` presumably resumes from an existing
# checkpoint directory - confirm against lightning_trainer.py.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{CONFIG_DIR}/config-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-8k (train-ctx=8k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/" \
        --model.lr_init=3e-4 \
        --model.lr_final=1e-4 \
        --data.max_token_size=8192 \
        --model.ctx_len=8192 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/{FILENAME_PREFIX}-mem-ctx-4k.pth" \
        --auto-resume-ckpt-dir "auto"

In [None]:
# Lets export the model from the checkpoint
#
# Runs export_checkpoint.py from TRAINER_DIR on the ctx-8k run's `last.ckpt`,
# passing the `-mem-ctx-8k.pth` model path as the second argument, then lists
# that file so a missing export shows up in the cell output.
# NOTE(review): the trailing "bf16" argument is presumably the export precision -
# confirm against export_checkpoint.py.
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py \
        "../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/last.ckpt" \
        "../model/{FILENAME_PREFIX}-mem-ctx-8k.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-mem-ctx-8k.pth"