# Build the datapack

In [4]:
# -----------------------------------------------------------------
# Notebook configuration cell
#
# Edit the settings below, then run this cell. It prints the effective
# settings, derives the directory paths used by the later cells, and
# stops early if the trainer directory cannot be found.
# -----------------------------------------------------------------
import os

# -----------------------------------------------------------------
# Your configurable settings
# -----------------------------------------------------------------

# WANDB settings
ENABLE_WANDB=True
WANDB_PREFIX="Baseline"
WANDB_PROJECT="RWKV-x-v5-Expansion-Test"

# Project directory offset, relative to the notebook directory
# (you need to modify it if you move the notebook into another dir)
PROJECT_DIR_OFFSET="../../../../"

# GPU count to use
GPU_DEVICES="auto"

# -----------------------------------------------------------------
# Training settings you can use to override the "auto" default above
# -----------------------------------------------------------------
DEEPSPEED_STRAT="deepspeed_stage_2"
TRAINING_CTX_LEN=4096
MICROBATCH_SIZE=8

# ---
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)
print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("TRAINING_CTX_LEN:", TRAINING_CTX_LEN)

# WANDB mode string derived from the ENABLE_WANDB toggle
WANDB_MODE="online" if ENABLE_WANDB else "disabled"

# Computing the notebook directory, and various paths.
# A notebook has no usable __file__, so the current working directory is
# taken as the notebook directory (assumes the kernel was started next to
# the notebook - adjust PROJECT_DIR_OFFSET otherwise).
NOTEBOOK_DIR=os.path.abspath(os.getcwd())
PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, PROJECT_DIR_OFFSET))
TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

# Fail fast if the trainer directory is missing. isdir() also rejects a
# regular file with that name, which the later `cd` commands could not enter.
if not os.path.isdir(TRAINER_DIR):
    raise FileNotFoundError(
        f"The trainer directory does not exist: {TRAINER_DIR}. Did you move the notebook?"
    )

ENABLE_WANDB: True
GPU_DEVICES: auto
DEEPSPEED_STRAT: deepspeed_stage_2
TRAINING_CTX_LEN: 4096
NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/rwkv-x-exp/v5-exp/expansion-test
TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer


In [10]:
# Lets build the dataset
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/config/enwiki-100k-world-16k-rechunk.yaml"

Map (num_proc=160): 100%|███| 1000000/1000000 [00:18<00:00, 54863.81 examples/s]
Filter (num_proc=160): 100%|█| 1000000/1000000 [00:06<00:00, 163552.93 examples/
Map (num_proc=160): 100%|█████| 648479/648479 [00:10<00:00, 61206.17 examples/s]
Map (num_proc=160): 100%|████████| 32835/32835 [00:06<00:00, 5159.39 examples/s]
Saving the dataset (7/7 shards): 100%|█| 32835/32835 [00:09<00:00, 3548.78 examp
Saving the dataset (1/1 shards): 100%|█| 165/165 [00:00<00:00, 3786.28 examples/


In [12]:
# Lets initialized the various models, and their sizes
!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 32 --n_embd 4096 \
    --vocab_size world \
    --skip-if-exists --safe-init \
    ../model/L32-D4096-world-init.pth

!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 32 --n_embd 2048 \
    --vocab_size world \
    --skip-if-exists --safe-init \
    ../model/L32-D2048-world-init.pth

!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 32 --n_embd 1024 \
    --vocab_size world \
    --skip-if-exists --safe-init \
    ../model/L32-D1024-world-init.pth

!cd "{TRAINER_DIR}" && python3 init_model.py \
    --n_layer 32 --n_embd 512 \
    --vocab_size world \
    --skip-if-exists --safe-init \
    ../model/L32-D512-world-init.pth

[2024-02-11 09:07:45,896] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
---- Initializing model ----
No of layers: 32
Embedding size: 4096
Output model path: ../model/L32-D4096-world-init.pth
Vocab size: 65536
Emb scale: 0.0001
Note: this process takes a significant time (and ram) for large models
---- ----- ----
Model exists, skipping init_model
[2024-02-11 09:07:49,619] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.1+cu121'
---- Initializing model ----
No of layers: 32
Embedding size: 2048
Output model path: ../model/L32-D2048-world-init.pth
Vocab size: 65536
Emb scale: 0.0001
Note: this process takes a significant time (and ram) for large models
---- ----- ----
Model exists, skipping init_model
[2024-02-11 09:07:53,474] [INFO] [real_accelerator.py: