# L12 Channel Mix & Time Mix Quantized Training

Starting from scratch

In [1]:
# Notebook-wide run configuration. Later cells read these names directly or
# through {NAME} interpolation inside their `!` shell commands.
import os

# Device selection and Weights & Biases logging switches
GPU_DEVICES = "auto"
ENABLE_WANDB = True
WANDB_PREFIX = "MQT-"

print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)

# W&B mode string handed to the trainer process via the WANDB_MODE env var
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# Directory layout: the notebook directory is the kernel's working directory,
# the project root sits three levels above it, and the trainer lives in the
# project's RWKV-v6-QT folder.
NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../"))
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v6-QT/"))

print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"TRAINER_DIR: {TRAINER_DIR}")
print(f"PROJECT_DIR: {PROJECT_DIR}")

# Baseline model shape: number of layers and embedding width
L_SIZE = 12
D_SIZE = 512

# DeepSpeed strategy name and per-device microbatch size passed to the trainer
DEEPSPEED_STAGE = "deepspeed_stage_2"
BATCH_SIZE = 4

ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits
TRAINER_DIR: /home/recursal/RWKV-infctx-trainer/RWKV-v6-QT
PROJECT_DIR: /home/recursal/RWKV-infctx-trainer


# Dataset & Model Setup

In [3]:
# Preload the required dataset: runs preload_datapath.py from TRAINER_DIR
# against the 4k-rechunk config stored next to this notebook.
# NOTE(review): the training cells in this notebook pass the 32k-rechunk config
# instead — confirm both configs resolve to the same prepared dataset.
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/config/enwiki_100k-world-4k-rechunk.yaml"

Map (num_proc=160): 100%|███| 1000000/1000000 [00:37<00:00, 26659.45 examples/s]
Filter (num_proc=160): 100%|█| 1000000/1000000 [00:06<00:00, 149382.73 examples/
Map (num_proc=160): 100%|█████| 472276/472276 [00:24<00:00, 19407.40 examples/s]
Map (num_proc=160): 100%|██████| 124119/124119 [00:20<00:00, 5977.08 examples/s]
Saving the dataset (7/7 shards): 100%|█| 124119/124119 [00:03<00:00, 39297.29 ex
Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 4449.34 examples/


In [11]:
# Baseline layer count, and embedding size
# (same values as the notebook's first configuration cell, repeated here)
L_SIZE=12
D_SIZE=512

# Init the model: runs init_model.py from TRAINER_DIR with the layer count and
# embedding size above. --skip-if-exists is passed, so an existing output file
# is expected to be left alone.
# NOTE(review): the output saved under this cell reports 6 layers and an
# RWKV-v6 path, which does not match L_SIZE=12 / TRAINER_DIR — it looks like it
# was captured from an earlier revision; re-run to refresh it.
!cd "{TRAINER_DIR}" && \
    python3 ./init_model.py \
        --n_layer {L_SIZE} --n_embd {D_SIZE} \
        --vocab_size world --skip-if-exists \
        "../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth"
# Show the size and timestamp of the init checkpoint
!ls -lh "{TRAINER_DIR}/../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth"

[2024-04-22 05:29:04,801] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'
---- Initializing model ----
No of layers: 6
Embedding size: 512
Output model path: ../model/L6-D512-world-v6base-init.pth
Vocab size: 65536
Emb scale: 0.0001
Note: this process takes a significant time (and ram) for large models
---- ----- ----
Output model exists, skipping init_model
-rw-rw-r-- 1 recursal recursal 170M Apr 22 02:46 /home/recursal/RWKV-infctx-trainer/RWKV-v6/../model/L6-D512-world-v6base-init.pth


# L12-D512 : Multiple quantization perf

In [3]:
# Experiment: nf4 quantization with time-mix vars "RKV" and channel-mix var "R".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and the
# logger run name both encode the quantization settings.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-28 05:33:37,910] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1
[RWKV] CMIX reuse multiplier : 1
[RWKV] TMIX Quantize type    : nf4 (RKV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k-L12-D512-Q

In [4]:
# Experiment: nf4 quantization with time-mix vars "KV" and channel-mix var "R".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and the
# logger run name both encode the quantization settings.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-28 05:59:20,387] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1
[RWKV] CMIX reuse multiplier : 1
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k-L12-D512-Qn

In [5]:
# Experiment: nf4 quantization with time-mix vars "KV" and channel-mix vars "0".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
# NOTE(review): "0" presumably means no channel-mix weights are quantized — TODO confirm
RWKV_CMIX_QVARS="0"
RWKV_TMIX_QVARS="KV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and the
# logger run name both encode the quantization settings.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-28 06:24:53,437] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1
[RWKV] CMIX reuse multiplier : 1
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (0)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k-L12-D512-Qn

# L12-D512 : Multiple quantization perf + Layer repeat

In [2]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "2" on vars "KV"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="2"
RWKV_CMIX_REUSE_VARS="KV"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-29 18:17:06,406] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1 (RKV)
[RWKV] CMIX reuse multiplier : 2 (KV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k-

In [3]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "2" on var "R"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="2"
RWKV_CMIX_REUSE_VARS="R"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-29 18:41:26,195] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1 (RKV)
[RWKV] CMIX reuse multiplier : 2 (R)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k-L

In [4]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "2" on vars "RKV"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="2"
RWKV_CMIX_REUSE_VARS="RKV"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-29 19:06:53,179] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1 (RKV)
[RWKV] CMIX reuse multiplier : 2 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k

In [5]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "4" on vars "RKV"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-29 19:31:34,807] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1 (RKV)
[RWKV] CMIX reuse multiplier : 4 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k

In [7]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "4" on vars "KV"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="KV"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found


[2024-04-29 21:18:47,603] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 1 (RKV)
[RWKV] CMIX reuse multiplier : 4 (KV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k-L12-D512-CLR-4_KV-Qnf4-TM_

In [None]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "4" on var "R"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="R"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

In [None]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "6" on vars "RKV"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="6"
RWKV_CMIX_REUSE_VARS="RKV"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

In [None]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "6" on vars "KV"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="6"
RWKV_CMIX_REUSE_VARS="KV"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

In [None]:
# Experiment: nf4 quantization (time-mix "KV", channel-mix "R") combined with
# channel-mix reuse multiplier "6" on var "R"; time-mix reuse multiplier is "1".
# Needs TRAINER_DIR, NOTEBOOK_DIR, WANDB_MODE, WANDB_PREFIX, L_SIZE, D_SIZE,
# DEEPSPEED_STAGE, BATCH_SIZE and GPU_DEVICES from an earlier cell.

# Learning rate setting (initial and final values are equal)
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
# NOTE(review): "TIMX" is spelled differently from the "TMIX" used by the other
# settings; left unchanged — confirm this is the name the trainer reads.
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars (presumably which channel-mix / time-mix weights get quantized)
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

# Channel-mix reuse: multiplier and the vars it applies to
RWKV_CMIX_REUSE_MULTIPLIER="6"
RWKV_CMIX_REUSE_VARS="R"

# Time-mix reuse: multiplier and the vars it applies to
RWKV_TMIX_REUSE_MULTIPLIER="1"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
# WARNING: this SIGKILLs every process named python3 that the current user may
# signal, not only leftovers from a previous training cell.
!killall -9 python3

# Run the training: export the settings above as environment variables, then
# launch lightning_trainer.py from TRAINER_DIR. The checkpoint directory and run
# name encode the channel-mix reuse ("CLR-") and quantization settings, but not
# the time-mix reuse values.
# NOTE(review): the run name says "Rechunk 4k" while the config passed with -c
# is the 32k-rechunk YAML — confirm which is intended.
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

# C and R mix

In [2]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="R"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 07:12:57,976] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 2 (R)
[RWKV] CMIX reuse multiplier : 4 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k-L

In [None]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="RV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

In [None]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="KV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

In [None]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

In [2]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="RKVO"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 12:13:50,602] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 2 (RKVO)
[RWKV] CMIX reuse multiplier : 4 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100

In [3]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="RKVG"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 12:37:54,943] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 2 (RKVG)
[RWKV] CMIX reuse multiplier : 4 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100

In [4]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="RKVOG"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 13:01:51,763] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 2 (RKVOG)
[RWKV] CMIX reuse multiplier : 4 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-10

In [5]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="6"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="2"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 13:25:49,110] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 2 (RKV)
[RWKV] CMIX reuse multiplier : 6 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k

In [6]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="4"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="4"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 13:50:02,136] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 4 (RKV)
[RWKV] CMIX reuse multiplier : 4 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k

In [7]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="6"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="4"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 14:30:13,816] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 4 (RKV)
[RWKV] CMIX reuse multiplier : 6 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k

In [8]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="6"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="6"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 14:54:19,030] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 6 (RKV)
[RWKV] CMIX reuse multiplier : 6 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-100k

In [9]:
# Learning rate setting
LR_INIT="3e-4"
LR_FINAL="3e-4"

# Channel and timemix quantized settings
RWKV_CMIX_QTYPE="nf4"
RWKV_TIMX_QTYPE="nf4"

# mix quantized vars
RWKV_CMIX_QVARS="R"
RWKV_TMIX_QVARS="KV"

RWKV_CMIX_REUSE_MULTIPLIER="12"
RWKV_CMIX_REUSE_VARS="RKV"

RWKV_TMIX_REUSE_MULTIPLIER="12"
RWKV_TMIX_REUSE_VARS="RKV"

# Nuke python3 (for back to back run cleanup)
!killall -9 python3

# Run the training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    export RWKV_JIT_ON="0" && \
    export RWKV_TORCH_COMPILE="0" && \
    export RWKV_CMIX_QTYPE="{RWKV_CMIX_QTYPE}" && \
    export RWKV_TIMX_QTYPE="{RWKV_TIMX_QTYPE}" && \
    export RWKV_CMIX_QVARS="{RWKV_CMIX_QVARS}" && \
    export RWKV_TMIX_QVARS="{RWKV_TMIX_QVARS}" && \
    export RWKV_TMIX_REUSE_MULTIPLIER="{RWKV_TMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_MULTIPLIER="{RWKV_CMIX_REUSE_MULTIPLIER}" && \
    export RWKV_CMIX_REUSE_VARS="{RWKV_CMIX_REUSE_VARS}" && \
    export RWKV_TMIX_REUSE_VARS="{RWKV_TMIX_REUSE_VARS}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/config/enwiki_100k-world-32k-rechunk.yaml" \
        --model.load_model="../model/L{L_SIZE}-D{D_SIZE}-world-v6base-init.pth" \
        --model.lr_init="{LR_INIT}" \
        --model.lr_final="{LR_FINAL}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/v6-enwiki-100k-L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS}/" \
        --trainer.logger.init_args.name="{WANDB_PREFIX}L{L_SIZE}-D{D_SIZE}-TLR-{RWKV_TMIX_REUSE_MULTIPLIER}_{RWKV_TMIX_REUSE_VARS}-CLR-{RWKV_CMIX_REUSE_MULTIPLIER}_{RWKV_CMIX_REUSE_VARS}-Q{RWKV_TIMX_QTYPE}-TM_{RWKV_TMIX_QVARS}-CM_{RWKV_CMIX_QVARS} (Rechunk 4k, {DEEPSPEED_STAGE})" \
        --trainer.strategy="{DEEPSPEED_STAGE}" \
        --trainer.microbatch_size={BATCH_SIZE} \
        --trainer.devices="{GPU_DEVICES}"

python3: no process found
[2024-04-30 16:51:24,401] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[RWKV.model] Running RWKV infctx using 'torch-native' with torch '2.1.2'
[RWKV] TMIX reuse multiplier : 12 (RKV)
[RWKV] CMIX reuse multiplier : 12 (RKV)
[RWKV] TMIX Quantize type    : nf4 (KV)
[RWKV] CMIX Quantize type    : nf4 (R)
/home/recursal/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/recursal/RWKV-infctx-trainer/notebook/rwkv-x-exp/v6-layerNbits/config/enwiki_100k-world-32k-rechunk.yaml', '--model.load_model=../model/L12-D512-world-v6base-init.pth', '--model.lr_init=3e-4', '--model.lr_final=3e-4', '--trainer.callbacks.init_args.dirpath=../checkpoint/v6-enwiki-10