# RWKV v5 Baseline 1B5
This model is based on the RWKV standard 1B5 model

- 24 layers
- 2048 embedding size

This notebook goes through the modified memory training for v5 models, across various initial embedding model weights.

**Note:** This project assumes you have the rwkv-infctx conda env set up.

# Basic Setup

In [None]:
# First, set up the shared working directories.
# The paths are relative to the notebook's working directory and point four
# levels up; `mkdir -p` does not fail if a directory already exists, so this
# cell is safe to re-run.
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [None]:
import os

# ---------------------------------------------------------------------------
# Run configuration: the values a reader is expected to tune
# ---------------------------------------------------------------------------
DEEPSPEED_STRAT = "deepspeed_stage_1"
GPU_DEVICES = "auto"
ENABLE_WANDB = True
EMBED_SCALE = 0.1

# Labels derived from the configuration above.
# EMBED_SCALE_LABEL swaps "." for "_" so the scale can be embedded in file names.
WANDB_PREFIX = f"EWR-1B5-{EMBED_SCALE}"
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")

print(f"DEEPSPEED_STRAT: {DEEPSPEED_STRAT}")
print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")
print(f"WANDB_PREFIX: {WANDB_PREFIX}")

# Value exported as the WANDB_MODE environment variable by the training cells
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# ---------------------------------------------------------------------------
# Notebook and project paths
# ---------------------------------------------------------------------------
# "__file__" is passed as a literal string, so abspath() resolves it against
# the current working directory and dirname() then yields that directory.
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
# The trainer and inference scripts live in the same RWKV-v5 directory
TRAINER_DIR = INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"INFERENCE_DIR: {INFERENCE_DIR}")
print(f"TRAINER_DIR: {TRAINER_DIR}")
print(f"PROJECT_DIR: {PROJECT_DIR}")

## Enwiki Stage 1 : Foundation 16k model training

In [None]:
# Initialise the starting model weights.
# Runs init_model.py from TRAINER_DIR with 24 layers, embedding size 2048, the
# "neox" vocab option and the configured EMBED_SCALE. The output path is
# relative to TRAINER_DIR.
# NOTE(review): `--skip-if-exists` presumably makes re-running this cell a
# no-op once the file exists (inferred from the flag name; confirm against
# init_model.py).
!cd "{TRAINER_DIR}" && \
    python3 ./init_model.py \
        --n_layer 24 --n_embd 2048 \
        --emb-scale "{EMBED_SCALE}" \
        --vocab_size neox --skip-if-exists \
        "../model/L24-D2048-E{EMBED_SCALE_LABEL}-neox-v5-init.pth"

In [None]:
# Preload the required dataset.
# Runs preload_datapath.py from TRAINER_DIR with the enwiki-16k YAML config
# found under NOTEBOOK_DIR.
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/EWR-1B5-enwiki-16k.yaml"

In [None]:
# Start the foundation model training.
# Runs `lightning_trainer.py fit` from TRAINER_DIR with the enwiki-16k YAML
# config, overriding on the command line: the logger run name, the trainer
# strategy and devices, the checkpoint directory, the initial weights (the
# init model file created by the init cell), ctx_len (4096) and
# bptt_learning_range (4).
# WANDB_MODE is exported for the trainer process ("online" or "disabled",
# as chosen in the config cell).
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/EWR-1B5-enwiki-16k.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-16k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-16k/" \
        --model.load_model="../model/L24-D2048-E{EMBED_SCALE_LABEL}-neox-v5-init.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=4

In [None]:
# Export the model from the `last.ckpt` checkpoint into a standalone .pth file,
# then list the exported file (fails visibly if the export did not produce it).
# Both paths are relative to TRAINER_DIR.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-16k/last.ckpt" "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-16k.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-16k.pth"

In [None]:
# Quick dragon prompt validation of the exported Stage 1 model.
# NOTE(review): the second argument ("cuda fp32") is passed straight to
# dragon_test.py; presumably device and precision, so confirm against the script.
!cd "{INFERENCE_DIR}" && python3 dragon_test.py "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-16k.pth" "cuda fp32"

In [None]:
# Quick memory test of the exported Stage 1 model.
# There is no `cd` here: the script path is relative to the notebook's working
# directory, so the model is referenced by absolute path via PROJECT_DIR.
!python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-16k.pth"

## Enwiki Stage 2 : Basic Instruct Tuning

In [None]:
# Preload the required dataset.
# Runs preload_datapath.py from TRAINER_DIR with the enwiki-instruct YAML
# config found under NOTEBOOK_DIR.
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/EWR-1B5-enwiki-instruct.yaml"

In [None]:
# Start the instruct finetuning.
# Runs `lightning_trainer.py fit` from TRAINER_DIR with the enwiki-instruct
# YAML config. Training starts from the exported Stage 1 (enwiki-16k) model,
# with ctx_len=4096 and bptt_learning_range=1; checkpoints go to the
# enwiki-instruct checkpoint directory.
# WANDB_MODE is exported for the trainer process.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/EWR-1B5-enwiki-instruct.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-instruct/" \
        --model.load_model="../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-16k.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1

In [None]:
# Export the model from the `last.ckpt` checkpoint into a standalone .pth file,
# then list the exported file (fails visibly if the export did not produce it).
# Both paths are relative to TRAINER_DIR.
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py "../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-instruct/last.ckpt" "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-instruct.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-instruct.pth"

In [None]:
# Quick dragon prompt validation of the exported Stage 2 (instruct) model.
# NOTE(review): the second argument ("cuda fp32") is passed straight to
# dragon_test.py; presumably device and precision, so confirm against the script.
!cd "{INFERENCE_DIR}" && python3 dragon_test.py "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-instruct.pth" "cuda fp32"

In [None]:
# Quick memory test of the exported Stage 2 (instruct) model.
# There is no `cd` here: the script path is relative to the notebook's working
# directory, so the model is referenced by absolute path via PROJECT_DIR.
!python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-instruct.pth"

## Tune 1 : Simple Memory instruct finetuning

- Tune 1: Low ctx size (512), Training with only the input masked. This does very limited memory training, and is used primarily to train the instruction set.

In [None]:
%%script bash

# ----------------------------------------------------------------------
# Build the jsonl datasets consumed by the Tune 1 (memory instruct) run.
# All paths are relative to the notebook's working directory.
# ----------------------------------------------------------------------

GEN_SCRIPT=../memory_script/gen_limited_segmented_jsonl.py
DATASET_DIR=../dataset

# Start from a clean slate: keep the directory, drop any previous *.jsonl files
mkdir -p "$DATASET_DIR"
rm -rf "$DATASET_DIR"/*.jsonl

echo "## Generating word reptition dataset ##"

# Datasets intended for ctx len <= 512.
# Every generator is launched as a background job so they run in parallel;
# the `wait` below blocks until all of them have finished.
python "$GEN_SCRIPT" "$DATASET_DIR/word-2-count.jsonl" 2 1000 &
for (( i = 5; i <= 250; i += 5 )); do
    python "$GEN_SCRIPT" "$DATASET_DIR/gen-word-$i-count.jsonl" "$i" 1000 &
done

wait
echo "## Done ##"

# Show what was produced
ls -alh "$DATASET_DIR"/

In [None]:
# Start the finetune model training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/EWR-1B5-mem-instruct.yam" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Instruct (train-ctx=512, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-instruct/" \
        --model.load_model="../model/EWR-1B5-E{EMBED_SCALE_LABEL}-enwiki-instruct.pth" \
        --model.ctx_len=512 \
        --model.bptt_learning_range=1

In [None]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-instruct/" \
        "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-instruct.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-instruct.pth"

In [None]:
# Quick memory test of the exported Tune 1 (mem-instruct) model.
# There is no `cd` here: the script path is relative to the notebook's working
# directory, so the model is referenced by absolute path via PROJECT_DIR.
!python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-instruct.pth"

## Tune 2 : Low ctx size (512), memory training

- Tune 2: Low ctx size (512), Training with instruction & input masked. This forces the actual memory training on the output tokens.

In [None]:
%%script bash

# ----------------------------------------------------------------------
# Build the jsonl datasets consumed by the Tune 2 (ctx-512 memory) run.
# All paths are relative to the notebook's working directory.
# ----------------------------------------------------------------------

PROMPT_GEN=../memory_script/gen_limited_prompt_completion_jsonl.py
SHUFFLE_GEN=../memory_script/shuffle_limited_prompt_completion_jsonl.py
DATASET_DIR=../dataset

# Start from a clean slate: keep the directory, drop any previous *.jsonl files
mkdir -p "$DATASET_DIR"
rm -rf "$DATASET_DIR"/*.jsonl

echo "## Generating word reptition dataset ##"

# This stage switches to the prompt/completion generators (fully masked
# instruct+input) so that the memorization task itself is what gets learned.
# Every generator is launched as a background job so they run in parallel;
# the `wait` below blocks until all of them have finished.
python "$PROMPT_GEN" "$DATASET_DIR/word-2-count.jsonl" 2 1000 &
for (( i = 5; i <= 250; i += 5 )); do
    python "$PROMPT_GEN" "$DATASET_DIR/gen-word-$i-count.jsonl" "$i" 1000 &
    python "$SHUFFLE_GEN" "$DATASET_DIR/shuffle-word-$i-count.jsonl" "$i" 100 &
done

wait
echo "## Done ##"

# Show what was produced
ls -alh "$DATASET_DIR"/

In [None]:
# Start the Tune 2 (ctx-512 memory) finetune training.
# Runs `lightning_trainer.py fit` from TRAINER_DIR with the mem-template YAML
# config found under NOTEBOOK_DIR, overriding on the command line: learning
# rate (lr_init=5e-4, lr_final=4e-4), data.max_token_size=512, ctx_len=512
# and bptt_learning_range=1. Training starts from the exported Tune 1
# (mem-instruct) model; checkpoints go to the mem-ctx-512 checkpoint directory.
# WANDB_MODE is exported for the trainer process.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/EWR-1B5-mem-template.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Mem-Tune ctx-512 (train-ctx=512, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}"  \
        --trainer.callbacks.init_args.dirpath="../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-ctx-512/" \
        --model.lr_init=5e-4 \
        --model.lr_final=4e-4 \
        --data.max_token_size=512 \
        --model.ctx_len=512 \
        --model.bptt_learning_range=1 \
        --model.load_model="../model/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-instruct.pth"

In [None]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
    python export_checkpoint.py \
        "../checkpoint/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-ctx-512/" \
        "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-ctx-512.pth"
!cd "{TRAINER_DIR}" && ls -alh "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-ctx-512.pth"

In [None]:
# Lets do a quick memory test
!python3 ../memory_script/eval_v5_memory_guided.py "../model/EWR-1B5-E{EMBED_SCALE_LABEL}-mem-ctx-512.pth"