# Train Music Transformer
Since Fri. Feb. 25th, 2022

Set up and run training in Google Colab.


## Setup


### IPython



In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2



### Colab



In [2]:
import os
import sys

import torch


if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

    ! pip3 install sty icecream music21 transformers datasets
    ! pip3 install stefutils

    path = os.path.join('drive', 'My Drive', 'Research', 'Music with NLP', 'Symbolic-Music-Generation')
    sys.path.append(path)
    ! ls "{path}"


    import time, os
    os.environ['TZ'] = 'US/Eastern'
    time.tzset()

    # if torch.cuda.is_available():
    #     %env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

if torch.cuda.is_available():
    ! nvidia-smi


from stefutil import *
from musicnlp.util import *

mic(u.proj_dir, u.pkg_nm)



Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sty
  Downloading sty-1.0.4-py3-none-any.whl (11 kB)
Collecting icecream
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 8.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 57.7 MB/s 
Collecting executing>=0.3.1
  Downloading executing-0.9.1-py2.py3-none-any.whl (16 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████

ic| u.proj_dir: 'Symbolic-Music-Generation', u.pkg_nm: 'musicnlp'


('Symbolic-Music-Generation', 'musicnlp')

### code

In [3]:
from musicnlp.trainer import train


# Random seed looked up under the 'random-seed' key via `sconfig`
# (NOTE(review): presumably a project-config accessor from the star imports - confirm).
# It is passed below as the dataset `shuffle_seed`; the recorded run logged "Shuffling with seed 77".
seed = sconfig('random-seed')



## Prep for training


In [4]:
# ---- Model selection -------------------------------------------------------
md_nm = 'reformer'
md_sz = 'base'  # alternatives: 'debug', 'tiny'
mic(md_nm, md_sz)

# TODO: smaller seq-len for now, until it shows longer dependency
#   shorter alternative: max_position_embeddings=1024, axial_pos_shape=(32, 32)
model_config = {'max_position_embeddings': 4096, 'axial_pos_shape': (64, 64)}

# ---- Training arguments ----------------------------------------------------
n_ep = 32
train_args = {'save_strategy': 'epoch', 'num_train_epochs': n_ep}

augment_key = False
wordpiece_tokenize = False
my_train_args = {
    'tqdm': True,
    'logging_strategy': 'epoch',
    'save_epochs': 1,  # overwritten below for every model size
    'augment_key': augment_key,
    'wordpiece_tokenize': wordpiece_tokenize,
}

if md_sz == 'tiny' or 'debug' in md_sz:
    # Small models: small batches, more epochs, checkpoint every 16 epochs.
    train_args.update(per_device_train_batch_size=4, num_train_epochs=64)
    my_train_args['save_epochs'] = 16
else:
    # Full-size models: mixed precision whenever a GPU is present.
    train_args.update(
        learning_rate=3e-5,  # alternative: 1e-5
        fp16=torch.cuda.is_available(),
        per_device_train_batch_size=17,  # alternative: 64
        per_device_eval_batch_size=16,
    )
    my_train_args['save_epochs'] = 4

# ---- Datasets --------------------------------------------------------------
mode = 'full'  # alternative: 'melody'
if mode != 'melody':
    pop = 'musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17'
    mst = 'musicnlp music extraction, dnm=MAESTRO, n=1276, meta={mode=full, prec=5, th=1}, 2022-08-02_20-12-23'
    dnms = [pop, mst]
else:
    pop = 'musicnlp music extraction, dnm=POP909, n=909, meta={mode=melody, prec=5, th=1}, 2022-05-20_14-52-04'
    mst = 'musicnlp music extraction, dnm=MAESTRO, n=1276, meta={mode=melody, prec=5, th=1}, 2022-05-20_14-52-28'
    lmd = 'musicnlp music extraction, dnm=LMD, n=176640, meta={mode=melody, prec=5, th=1}, 2022-05-27_15-23-20'
    dnms = [pop, mst, lmd]

# Value forwarded as the dataset `n_sample`; 64 is the alternative used for quick runs.
n = None

# Checkpoint inspection, currently disabled:
# checkpoint_path = os.path.join(u.model_path, '2022-07-12_00-09-14_reformer', 'checkpoint-16434')
# mic(os.listdir(checkpoint_path))

# ---- Build model, tokenizer and trainer ------------------------------------
mdl, tokenizer, trainer = train.get_all_setup(
    model_name=md_nm,
    model_size=md_sz,
    model_config=model_config,
    dataset_names=dnms,
    dataset_args={'n_sample': n, 'shuffle_seed': seed, 'pbar': True},
    train_args=train_args,
    my_train_args=my_train_args,
    trainer_args={'disable_train_metrics': True},
)



ic| md_nm: 'reformer', md_sz: 'base'


[38;2;0;186;142m2022-08-04 05:45:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[Get Setup][38;2;97;175;239m::[38;2;198;120;221mget_all_setup[38;2;97;175;239m::[38;2;198;120;221mtrain.py[38;2;97;175;239m:[38;2;198;120;221m273[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mInitializing training with {
    [94m"model_name"[39;49;00m: [33m"reformer"[39;49;00m,
    [94m"model_size"[39;49;00m: [33m"base"[39;49;00m,
    [94m"model_config"[39;49;00m: {
        [94m"max_position_embeddings"[39;49;00m: [34m4096[39;49;00m,
        [94m"axial_pos_shape"[39;49;00m: [
            [34m64[39;49;00m,
            [34m64[39;49;00m
        ]
    },
    [94m"dataset_names"[39;49;00m: [
        [33m"musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17"[39;49;00m,
   

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

    

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 05:46:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[Get Dataset][38;2;97;175;239m::[38;2;198;120;221mget_dataset[38;2;97;175;239m::[38;2;198;120;221mdataset.py[38;2;97;175;239m:[38;2;198;120;221m77[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mShuffling with seed [34m77[39m[49m[0m... [39m[49m[22m[23m[24m[25m[27m[28m[29m


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

## Train


Check that the log and TensorBoard files are being written.



In [5]:
# Run training with the `trainer` built in the setup cell.
trainer.train()
# Alternative, currently disabled: pass a checkpoint directory to `train`
# (NOTE(review): presumably resumes training from that checkpoint - confirm).
# checkpoint_path = os.path.join(u.model_path, '2022-08-02_10-48-15_reformer', 'checkpoint-1088')
# trainer.train(checkpoint_path)

# Save the final model to a `trained` sub-directory of the trainer's output directory.
save_path = os.path.join(trainer.args.output_dir, 'trained')
trainer.save_model(save_path)


[38;2;0;186;142m2022-08-04 05:47:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221mon_train_begin[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m173[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mTraining started with model [35m{[39m[49m[0mmodel name[35m: [39m[49m[0m[34mMyReformerModelWithLMHead[39m[49m[0m, max length[35m: [39m[49m[0m[34m4096[39m[49m[0m, axial_pos_shape[35m: [39m[49m[0m[34m(64, 64)[39m[49m[0m, n_layer[35m: [39m[49m[0m[34m12[39m[49m[0m, hidden_size[35m: [39m[49m[0m[34m768[39m[49m[0m, ff_size[35m: [39m[49m[0m[34m3072[39m[49m[0m, attention_shape[35m: [39m[49m[0m[34m12x64[39m[49m[0m, parameter_count[35m: [39m[49m[0m[34m82.5M[39m

Train Epoch  1/32:   0%|          | 0/126 [00:00<?, ?ba/s]

config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...


[38;2;0;186;142m2022-08-04 06:06:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 126/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 1.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.7014[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  1/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 06:07:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 126/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 1/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.7342[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.61[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m  0.00[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  2/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 06:26:52[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 252/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 2.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.870e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6532[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  2/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 06:27:00[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 252/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 2/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6664[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.65[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 3.94[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  3/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 06:46:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 378/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 3.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.810e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6368[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  3/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 06:46:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 378/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 3/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6314[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.75[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  4/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 07:06:44[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 504/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 4.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.990e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.721[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  4/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 07:06:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 504/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 4/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.7549[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m25.01[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m41.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  5/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 07:26:45[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 630/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 5.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.970e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5888[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  5/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 07:26:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 630/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 5/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6397[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m27.64[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.98[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  6/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 07:46:42[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 756/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 6.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.930e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6319[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  6/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 07:46:52[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 756/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 6/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5907[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m28.69[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.66[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  7/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 08:06:40[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 882/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 7.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.870e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5336[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  7/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 08:06:49[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 882/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 7/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5694[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m28.83[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.41[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  8/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 08:26:38[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1008/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 8.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.800e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4806[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  8/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 08:26:52[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1008/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 8/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5541[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m29.21[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 44.3[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  9/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 08:46:41[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1134/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 9.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.710e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3685[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  9/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 08:46:50[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1134/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 9/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5357[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.59[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.74[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 10/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 09:06:38[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1260/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m10.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.610e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5838[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 10/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 09:06:47[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1260/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m10/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5106[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.51[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.03[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 11/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 09:26:35[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1386/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m11.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.490e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4289[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 11/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 09:26:44[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1386/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m11/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4653[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.03[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.64[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 12/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 09:46:32[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1512/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m12.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.360e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3562[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 12/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 09:46:47[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1512/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m12/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4159[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.06[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 51.4[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 13/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 10:06:35[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1638/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m13.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.220e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2756[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 13/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 10:06:44[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1638/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m13/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3659[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m34.96[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.11[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 14/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 10:26:33[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1764/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m14.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.070e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3568[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 14/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 10:26:42[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1764/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m14/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3024[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.09[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m51.25[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 15/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 10:46:31[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1890/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m15.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.920e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2182[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 15/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 10:46:40[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1890/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m15/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.248[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m39.04[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m53.05[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 16/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 11:06:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2016/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m16.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.760e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.1488[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 16/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 11:06:43[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2016/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m16/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2296[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m38.96[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m54.64[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 17/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 11:26:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2142/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m17.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.600e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2123[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 17/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 11:26:39[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2142/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m17/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.1943[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m40.72[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m54.07[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 18/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 11:46:27[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2268/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m18.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.440e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2295[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 18/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 11:46:37[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2268/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m18/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.1426[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m41.71[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m59.35[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 19/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 12:06:25[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2394/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m19.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.270e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.1737[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 19/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 12:06:34[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2394/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m19/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.1144[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m41.99[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m64.48[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 20/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 12:26:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2520/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m20.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.110e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.9964[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 20/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 12:26:35[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2520/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m20/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.1024[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 42.3[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m67.74[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 21/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 12:46:23[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2646/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m21.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.9908[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 21/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 12:46:34[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2646/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m21/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0755[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m42.88[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.66[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 22/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 13:06:22[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2772/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m22.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.9969[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 22/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 13:06:31[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2772/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m22/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0763[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m42.94[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 67.5[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 23/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 13:26:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2898/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m23.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.121[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 23/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 13:26:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2898/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m23/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0672[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m42.97[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 66.7[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 24/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 13:46:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3024/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m24.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2199[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 24/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 13:46:34[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3024/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m24/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0518[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.32[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m67.73[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 25/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 14:06:23[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3150/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m25.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.0348[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 25/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 14:06:32[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3150/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m25/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0495[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.45[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.02[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 26/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 14:26:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3276/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m26.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.0064[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 26/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 14:26:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3276/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m26/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0462[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.37[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m69.19[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 27/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 14:46:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3402/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m27.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m 2.01[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 27/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 14:46:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3402/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m27/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0482[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.43[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.95[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 28/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 15:06:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3528/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m28.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.115[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 28/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 15:06:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3528/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m28/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0389[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.54[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.86[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 29/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 15:26:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3654/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m29.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.0519[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 29/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 15:26:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3654/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m29/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0385[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.55[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.44[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 30/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 15:46:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3780/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m30.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.0262[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 30/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 15:46:26[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3780/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m30/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0419[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.52[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.98[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 31/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 16:06:15[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3906/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m31.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m 2.18[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 31/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 16:06:24[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3906/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m31/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0379[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.56[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.68[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 32/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 16:26:13[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4032/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m32.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.0632[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 32/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-04 16:26:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4032/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m32/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0397[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m43.54[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m68.61[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m
[38;2;0;186;142m2022-08-04 16:26:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;2

In [6]:
# Inspect the run's output directory: print its path, then list what it contains.
# NOTE: the recorded output shows `mic` printing each argument's source expression next to
# its value (`ic| trainer.args.output_dir: ...`), so the expressions are written out in full
# rather than hoisted into a local variable, which would change the printed labels.
mic(trainer.args.output_dir)
# Last expression of the cell: besides the `ic|` print, the listing also shows up as the
# cell's displayed result.
mic(os.listdir(trainer.args.output_dir))



ic| trainer.args.output_dir: 'drive/My Drive/Research/Music with NLP/models/2022-08-04_05-46-55_reformer'
ic| os.listdir(trainer.args.output_dir): ['md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, '
                                          'n_param=82.5M}, n=2140, a=3e-05, bsz=17, n_ep=32.log',
                                          'tb - md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, '
                                          'n_param=82.5M}, n=2140, a=3e-05, bsz=17, n_ep=32',
                                          'checkpoint-504',
                                          'checkpoint-1008',
                                          'checkpoint-1512',
                                          'checkpoint-2016',
                                          'checkpoint-2520',
                                          'checkpoint-3024',
                          

['md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, n_param=82.5M}, n=2140, a=3e-05, bsz=17, n_ep=32.log',
 'tb - md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, n_param=82.5M}, n=2140, a=3e-05, bsz=17, n_ep=32',
 'checkpoint-504',
 'checkpoint-1008',
 'checkpoint-1512',
 'checkpoint-2016',
 'checkpoint-2520',
 'checkpoint-3024',
 'checkpoint-3528',
 'checkpoint-4032',
 'trained']