# Dataset repacking implementation

Advanced dataset operations: sorting, offset, and length support.

In [1]:
import os

# ---- Settings a reader may want to tune ----
GPU_DEVICES = "auto"
ENABLE_WANDB = True
WANDB_PREFIX = "infctx-v5-datapack"

print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")

# W&B mode string, later exported as the WANDB_MODE environment variable
# by the training cells.
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# ---- Notebook, project and trainer directories ----
# "__file__" is intentionally a *string literal*: abspath() resolves it
# against the current working directory, so dirname() yields the directory
# the kernel was started in (i.e. where this notebook is run from).
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
# The project root is two levels above the notebook directory.
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../"))
# Trainer scripts are invoked from <project>/RWKV-v5.
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"TRAINER_DIR: {TRAINER_DIR}")
print(f"PROJECT_DIR: {PROJECT_DIR}")

ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /home/recursal/RWKV-infctx-trainer/notebook/trainer-v5-validation
TRAINER_DIR: /home/recursal/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /home/recursal/RWKV-infctx-trainer


In [3]:
# Initialise the base model checkpoint by running init_model.py from inside
# TRAINER_DIR: 6 layers, embedding size 2048, "world" vocab.
# The output path is relative to TRAINER_DIR (because of the `cd`).
# With --skip-if-exists the script reports "Model exists, skipping init_model"
# when the file is already present (see the recorded output of this cell).
!cd "{TRAINER_DIR}" && \
    python3 ./init_model.py \
        --n_layer 6 --n_embd 2048 \
        --vocab_size world --skip-if-exists \
        "../model/L6-D2048-world-v5base-init.pth"

[2024-01-28 02:00:22,765] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'
---- Initializing model ----
No of layers: 6
Embedding size: 2048
Output model path: ../model/L6-D2048-world-v5base-init.pth
Vocab size: 65536
Emb scale: 0.0001
Note: this process takes a significant time (and ram) for large models
---- ----- ----
Model exists, skipping init_model


# Build the datapack

In [25]:
# Let's prepare the required dataset: run datapack_build.py from inside
# TRAINER_DIR against this notebook's config/datapack-example.yaml.
!cd "{TRAINER_DIR}" && \
    python3 datapack_build.py "{NOTEBOOK_DIR}/config/datapack-example.yaml"

>> Starting datapack build process for: /home/recursal/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/datapack-example.yaml


In [None]:
# Training run using config/test-dataset-repack-chunks.yaml.
# - Runs from inside TRAINER_DIR, so the relative ../model and ../checkpoint
#   paths resolve against that directory.
# - Exports WANDB_MODE (derived from ENABLE_WANDB in the first cell) into the
#   environment of the trainer process.
# - Starts from the init checkpoint created earlier in this notebook and
#   overrides the checkpoint directory, logger run name, strategy
#   (deepspeed_stage_1), microbatch size and devices on the command line.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/test-dataset-repack-chunks.yaml" \
        --model.load_model="../model/L6-D2048-world-v5base-init.pth" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v5-enwiki-10k-chunked/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Chunking 4096 - (deepspeed_stage_1)" \
        --trainer.strategy="deepspeed_stage_1" \
        --trainer.microbatch_size=2 \
        --trainer.devices="{GPU_DEVICES}"

# With dataset packing

In [None]:
# Let's preload the required dataset: run preload_datapath.py from inside
# TRAINER_DIR against this notebook's config/test-dataset-repack.yaml.
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/config/test-dataset-repack.yaml"

In [None]:
# Training run using config/test-dataset-repack.yaml (the "Packing" run).
# - Runs from inside TRAINER_DIR, so the relative ../model and ../checkpoint
#   paths resolve against that directory.
# - Exports WANDB_MODE (derived from ENABLE_WANDB in the first cell) into the
#   environment of the trainer process.
# - Starts from the same init checkpoint as the chunked run, but writes
#   checkpoints to a separate directory and logs under a separate run name.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/test-dataset-repack.yaml" \
        --model.load_model="../model/L6-D2048-world-v5base-init.pth" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v5-enwiki-10k-packing/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - Packing - (deepspeed_stage_1)" \
        --trainer.strategy="deepspeed_stage_1" \
        --trainer.microbatch_size=2 \
        --trainer.devices="{GPU_DEVICES}"