# Train Music Transformer
Since Fri. Feb. 25th, 2022

Set up training in Colab.


## Setup


### IPython



In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before each
# cell executes, so edits to imported .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2



### Colab



In [None]:
import os
import sys

import torch


if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

    ! pip3 install sty icecream music21 transformers datasets
    ! pip3 install stefutils

    path = os.path.join('drive', 'My Drive', 'Research', 'Music with NLP', 'Symbolic-Music-Generation')
    sys.path.append(path)
    ! ls "{path}"


    import time, os
    os.environ['TZ'] = 'US/Eastern'
    time.tzset()

    # if torch.cuda.is_available():
    #     %env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

if torch.cuda.is_available():
    ! nvidia-smi


from stefutil import *
from musicnlp.util import *

mic(u.proj_dir, u.pkg_nm)



Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sty
  Downloading sty-1.0.4-py3-none-any.whl (11 kB)
Collecting icecream
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 11.9 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 54.7 MB/s 
[?25hCollecting executing>=0.3.1
  Downloading executing-0.9.1-py2.py3-none-any.whl (16 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.1 MB/s 
Collecting pyyaml>

ic| u.proj_dir: 'Symbolic-Music-Generation', u.pkg_nm: 'musicnlp'


('Symbolic-Music-Generation', 'musicnlp')

### Code

In [None]:
# Project training module; its `get_all_setup` builds the model, tokenizer and trainer below.
from musicnlp.trainer import train


# Random seed looked up under the 'random-seed' key; later passed as the dataset `shuffle_seed`.
# `sconfig` is presumably exported by one of the wildcard imports in the setup cell — confirm.
seed = sconfig('random-seed')



## Prep for training


In [None]:
# ---- Model selection ----
md_nm = 'reformer'
# md_sz = 'debug'
# md_sz = 'tiny'
md_sz = 'base'
mic(md_nm, md_sz)

# TODO: smaller seq-len for now, until it shows longer dependency
model_config = dict(max_position_embeddings=1024, axial_pos_shape=(32, 32))

# Fail fast, before the (slow) dataset load: Reformer's axial position encodings require the
# product of `axial_pos_shape` to equal the training sequence length.
# NOTE(review): assumes training sequences are padded to `max_position_embeddings` — confirm
# against `train.get_all_setup`.
if 'axial_pos_shape' in model_config:
    _axial = model_config['axial_pos_shape']
    if _axial[0] * _axial[1] != model_config['max_position_embeddings']:
        raise ValueError(
            f'axial_pos_shape {_axial} must multiply to '
            f'max_position_embeddings={model_config["max_position_embeddings"]}'
        )

# ---- Training arguments ----
n_ep = 128
train_args = dict(save_strategy='epoch', num_train_epochs=n_ep)

augment_key = False
wordpiece_tokenize = False
# `save_epochs` is 16 for every model size, so it is set once here instead of being
# overwritten identically in each branch below.
my_train_args = dict(
    tqdm=True, logging_strategy='epoch', save_epochs=16,
    augment_key=augment_key,
    wordpiece_tokenize=wordpiece_tokenize,
)

if 'debug' in md_sz or md_sz == 'tiny':
    # Small models: small batches and a shorter schedule
    train_args.update(
        per_device_train_batch_size=4,
        num_train_epochs=64,
    )
else:
    train_args.update(
        learning_rate=1e-5,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        per_device_train_batch_size=64,
    )

# ---- Datasets ----
# mode = 'melody'
mode = 'full'
if mode == 'melody':
    pop = 'musicnlp music extraction, dnm=POP909, n=909, meta={mode=melody, prec=5, th=1}, 2022-05-20_14-52-04'
    mst = 'musicnlp music extraction, dnm=MAESTRO, n=1276, meta={mode=melody, prec=5, th=1}, 2022-05-20_14-52-28'
    lmd = 'musicnlp music extraction, dnm=LMD, n=176640, meta={mode=melody, prec=5, th=1}, 2022-05-27_15-23-20'
    dnms = [pop, mst, lmd]
else:
    # Only the POP909 and MAESTRO extractions are listed for 'full' mode
    pop = 'musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17'
    mst = 'musicnlp music extraction, dnm=MAESTRO, n=1276, meta={mode=full, prec=5, th=1}, 2022-08-02_20-12-23'
    dnms = [pop, mst]

# Passed as `n_sample`; `None` presumably means "use every sample" — confirm in the dataset loader.
# n = 64
n = None

# checkpoint_path = os.path.join(u.model_path, '2022-07-12_00-09-14_reformer', 'checkpoint-16434')
# mic(os.listdir(checkpoint_path))

# ---- Build model, tokenizer and trainer ----
mdl, tokenizer, trainer = train.get_all_setup(
    model_name=md_nm, model_size=md_sz, model_config=model_config,
    dataset_names=dnms, dataset_args=dict(n_sample=n, shuffle_seed=seed, pbar=True),
    train_args=train_args, my_train_args=my_train_args, trainer_args=dict(
        disable_train_metrics=True
    )
)



ic| md_nm: 'reformer', md_sz: 'base'


[38;2;0;186;142m2022-08-02 23:54:26[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[Get Setup][38;2;97;175;239m::[38;2;198;120;221mget_all_setup[38;2;97;175;239m::[38;2;198;120;221mtrain.py[38;2;97;175;239m:[38;2;198;120;221m273[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mInitializing training with {
    [94m"model_name"[39;49;00m: [33m"reformer"[39;49;00m,
    [94m"model_size"[39;49;00m: [33m"base"[39;49;00m,
    [94m"model_config"[39;49;00m: {
        [94m"max_position_embeddings"[39;49;00m: [34m1024[39;49;00m,
        [94m"axial_pos_shape"[39;49;00m: [
            [34m32[39;49;00m,
            [34m32[39;49;00m
        ]
    },
    [94m"dataset_names"[39;49;00m: [
        [33m"musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17"[39;49;00m,
   

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-02 23:55:45[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[Get Dataset][38;2;97;175;239m::[38;2;198;120;221mget_dataset[38;2;97;175;239m::[38;2;198;120;221mdataset.py[38;2;97;175;239m:[38;2;198;120;221m77[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mShuffling with seed [34m77[39m[49m[0m... [39m[49m[22m[23m[24m[25m[27m[28m[29m


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

## Train


Check that the log and TensorBoard files are written.



In [5]:
# Start training with no checkpoint argument. The commented-out lines below show the
# alternative call that passes a saved checkpoint directory (presumably to resume a run — confirm).
trainer.train()
# checkpoint_path = os.path.join(u.model_path, '2022-08-02_10-48-15_reformer', 'checkpoint-1088')
# trainer.train(checkpoint_path)

# Save the model into a 'trained' sub-directory of the trainer's output directory.
save_path = os.path.join(trainer.args.output_dir, 'trained')
trainer.save_model(save_path)


[38;2;0;186;142m2022-08-02 23:55:53[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221mon_train_begin[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m173[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mTraining started with model [35m{[39m[49m[0mmodel name[35m: [39m[49m[0m[34mMyReformerModelWithLMHead[39m[49m[0m, max length[35m: [39m[49m[0m[34m1024[39m[49m[0m, axial_pos_shape[35m: [39m[49m[0m[34m(32, 32)[39m[49m[0m, n_layer[35m: [39m[49m[0m[34m12[39m[49m[0m, hidden_size[35m: [39m[49m[0m[34m768[39m[49m[0m, ff_size[35m: [39m[49m[0m[34m3072[39m[49m[0m, attention_shape[35m: [39m[49m[0m[34m12x64[39m[49m[0m, parameter_count[35m: [39m[49m[0m[34m82.5M[39m

Train Epoch   1/128:   0%|          | 0/34 [00:00<?, ?ba/s]

config.num_buckets is not set. Setting config.num_buckets to 32...
config.num_buckets is not set. Setting config.num_buckets to 32...
config.num_buckets is not set. Setting config.num_buckets to 32...
config.num_buckets is not set. Setting config.num_buckets to 32...
config.num_buckets is not set. Setting config.num_buckets to 32...
config.num_buckets is not set. Setting config.num_buckets to 32...


[38;2;0;186;142m2022-08-03 00:00:53[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m  34/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  1.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m6.0836[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   1/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:00:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m  34/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  1/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m6.0454[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m  0.7[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m55.46[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   2/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:05:53[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m  68/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  2.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m5.1148[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   2/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:05:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m  68/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  2/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m5.0141[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 9.54[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.53[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   3/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:10:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 102/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  3.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m4.2515[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   3/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:10:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 102/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  3/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m4.168[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m10.04[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   4/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:15:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 136/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  4.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.8473[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   4/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:15:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 136/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  4/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.8266[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m10.66[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   5/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:20:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 170/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  5.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.769[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   5/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:20:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 170/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  5/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.7206[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.01[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   6/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:25:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 204/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  6.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.7561[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   6/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:25:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 204/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  6/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6805[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.07[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   7/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:30:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 238/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  7.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6977[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   7/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:30:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 238/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  7/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6579[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m10.97[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m34.97[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   8/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:35:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 272/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  8.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6613[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   8/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:35:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 272/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  8/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6462[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m10.94[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch   9/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:40:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 306/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  9.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6248[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch   9/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:40:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 306/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m  9/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6377[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.08[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m17.79[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  10/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:45:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 340/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 10.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6915[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  10/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:45:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 340/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 10/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6239[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m10.97[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  11/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:50:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 374/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 11.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6684[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  11/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:50:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 374/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 11/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6194[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 11.1[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  12/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:55:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 408/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 12.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m 3.64[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  12/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 00:55:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 408/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 12/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6096[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.29[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  13/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:00:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 442/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 13.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.5983[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  13/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:00:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 442/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 13/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6035[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.13[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  14/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:05:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 476/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 14.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.5911[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  14/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:05:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 476/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 14/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.5936[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.07[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  15/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:10:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 510/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 15.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.5828[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  15/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:10:58[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 510/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 15/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.5821[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.33[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  16/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:15:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 544/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 16.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.5298[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  16/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:16:02[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 544/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 16/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.5704[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m12.48[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  17/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:21:00[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 578/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 17.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.5517[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  17/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:21:02[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 578/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 17/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.5392[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m16.64[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  18/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:26:00[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 612/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 18.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.4611[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  18/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:26:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 612/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 18/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.4493[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m17.49[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  19/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:31:01[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 646/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 19.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.3938[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  19/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:31:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 646/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 19/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.2764[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m18.59[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  20/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:36:01[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 680/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 20.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.0587[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  20/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:36:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 680/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 20/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.8884[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m25.91[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  21/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:41:02[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 714/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 21.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7051[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  21/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:41:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 714/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 21/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.729[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m25.23[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  22/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:46:02[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 748/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 22.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7258[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  22/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:46:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 748/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 22/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6903[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m26.35[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  23/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:51:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 782/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 23.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7364[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  23/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:51:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 782/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 23/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6673[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m27.13[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  24/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:56:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 816/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 24.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.764[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  24/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 01:56:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 816/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 24/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6618[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m26.97[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  25/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:01:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 850/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 25.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7142[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  25/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:01:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 850/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 25/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6471[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m27.35[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  26/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:06:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 884/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 26.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6776[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  26/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:06:07[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 884/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 26/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6347[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 27.9[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.68[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  27/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:11:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 918/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 27.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7322[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  27/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:11:07[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 918/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 27/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6196[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m28.09[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.04[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  28/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:16:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 952/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 28.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7154[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  28/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:16:08[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 952/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 28/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6159[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m28.08[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.72[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  29/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:21:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 986/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 29.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6232[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  29/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:21:09[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 986/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 29/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5929[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m29.23[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.33[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  30/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:26:07[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1020/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 30.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5349[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  30/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:26:09[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1020/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 30/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5804[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m28.91[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.35[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  31/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:31:07[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1054/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 31.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6784[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  31/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:31:10[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1054/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 31/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5746[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.12[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 48.9[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  32/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:36:08[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1088/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 32.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6689[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  32/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:36:15[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1088/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 32/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5638[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m29.86[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.95[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  33/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:41:13[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1122/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 33.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6185[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  33/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:41:15[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1122/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 33/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.559[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m28.99[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.14[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  34/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:46:13[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1156/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 34.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5858[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  34/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:46:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1156/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 34/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5516[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m29.26[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.53[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  35/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:51:14[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1190/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 35.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6033[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  35/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:51:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1190/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 35/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5428[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.88[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.73[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  36/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:56:14[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1224/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 36.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6195[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  36/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 02:56:17[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1224/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 36/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5364[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.63[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.07[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  37/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:01:15[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1258/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 37.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6664[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  37/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:01:17[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1258/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 37/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5419[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.07[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.49[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  38/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:06:15[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1292/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 38.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.517[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  38/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:06:17[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1292/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 38/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5353[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.56[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.27[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  39/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:11:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1326/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 39.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6172[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  39/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:11:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1326/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 39/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5192[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.99[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.56[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  40/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:16:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1360/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 40.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.505[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  40/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:16:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1360/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 40/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.519[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.57[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 48.00[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  41/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:21:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1394/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 41.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5036[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  41/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:21:19[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1394/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 41/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5187[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.65[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.52[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  42/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:26:17[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1428/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 42.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5492[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  42/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:26:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1428/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 42/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5164[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m29.88[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.22[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  43/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:31:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1462/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 43.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4431[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  43/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:31:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1462/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 43/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5178[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m29.75[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 49.1[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  44/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:36:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1496/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 44.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5844[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  44/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:36:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1496/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 44/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5107[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.22[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.54[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  45/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:41:19[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1530/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 45.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5054[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  45/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:41:22[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1530/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 45/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5105[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.61[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 48.8[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  46/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:46:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1564/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 46.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5061[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  46/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:46:22[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1564/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 46/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4944[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.46[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.63[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  47/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:51:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1598/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 47.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5261[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  47/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:51:23[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1598/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 47/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.504[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.52[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.03[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  48/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:56:22[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1632/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 48.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.482[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  48/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 03:56:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1632/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 48/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5039[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.22[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m46.94[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  49/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:01:27[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1666/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 49.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6273[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  49/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:01:29[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1666/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 49/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4906[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.51[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.64[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  50/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:06:27[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1700/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 50.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.482[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  50/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:06:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1700/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 50/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m 2.49[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.34[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.27[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  51/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:11:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1734/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 51.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5836[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  51/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:11:31[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1734/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 51/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4878[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.55[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.68[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  52/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:16:29[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1768/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 52.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5585[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  52/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:16:31[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1768/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 52/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4924[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.25[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.82[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  53/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:21:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1802/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 53.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5357[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  53/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:21:32[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1802/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 53/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5014[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m30.67[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.91[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  54/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:26:31[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1836/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 54.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5013[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  54/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:26:33[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1836/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 54/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4803[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.87[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.01[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  55/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:31:31[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1870/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 55.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4644[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  55/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:31:34[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1870/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 55/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4786[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.75[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.14[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  56/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:36:32[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1904/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 56.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5152[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  56/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:36:35[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1904/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 56/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4722[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.14[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.22[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  57/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:41:33[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1938/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 57.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.479[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  57/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:41:36[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1938/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 57/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4824[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.67[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m46.87[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  58/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:46:34[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1972/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 58.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4234[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  58/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:46:37[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1972/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 58/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4748[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 32.00[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.44[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  59/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:51:35[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2006/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 59.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4985[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  59/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:51:37[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2006/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 59/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4699[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.25[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.68[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  60/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:56:36[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2040/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 60.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6223[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  60/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 04:56:38[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2040/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 60/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4834[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.91[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.41[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  61/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:01:36[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2074/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 61.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4752[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  61/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:01:39[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2074/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 61/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4746[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.03[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.72[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  62/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:06:37[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2108/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 62.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4681[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  62/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:06:40[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2108/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 62/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4657[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 32.6[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.92[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  63/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:11:37[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2142/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 63.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5085[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  63/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:11:40[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2142/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 63/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4763[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.92[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.64[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  64/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:16:38[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2176/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 64.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4867[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  64/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:16:45[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2176/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 64/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4811[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m31.86[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.05[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  65/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:21:43[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2210/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 65.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3978[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  65/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:21:46[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2210/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 65/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4572[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.48[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  66/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:26:44[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2244/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 66.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4721[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  66/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:26:46[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2244/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 66/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.453[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.75[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m47.74[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  67/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:31:45[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2278/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 67.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3861[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  67/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:31:47[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2278/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 67/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4629[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.53[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.37[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  68/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:36:46[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2312/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 68.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4206[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  68/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:36:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2312/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 68/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4522[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.76[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.69[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  69/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:41:46[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2346/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 69.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m 2.47[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  69/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:41:49[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2346/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 69/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4472[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.02[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 48.4[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  70/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:46:47[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2380/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 70.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4768[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  70/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:46:50[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2380/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 70/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4484[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.65[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.16[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  71/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:51:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2414/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 71.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4221[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  71/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:51:51[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2414/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 71/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4434[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.16[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.76[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  72/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:56:49[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2448/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 72.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.5087[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  72/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 05:56:51[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2448/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 72/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4322[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.41[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 49.5[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  73/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:01:49[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2482/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 73.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4682[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  73/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:01:52[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2482/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 73/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4268[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.32[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.89[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  74/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:06:50[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2516/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 74.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3564[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  74/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:06:53[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2516/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 74/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4345[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.43[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.05[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  75/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:11:51[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2550/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 75.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4469[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  75/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:11:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2550/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 75/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4204[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.74[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.65[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  76/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:16:52[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2584/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 76.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3945[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  76/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:16:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2584/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 76/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4244[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.55[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.12[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  77/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:21:53[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2618/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 77.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4329[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  77/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:21:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2618/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 77/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.415[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 33.9[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.13[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  78/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:26:53[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2652/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 78.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3004[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  78/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:26:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2652/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 78/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4129[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m33.99[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 49.1[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  79/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:31:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2686/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 79.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2982[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  79/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:31:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2686/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 79/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4011[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m34.17[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.21[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  80/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:36:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2720/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 80.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4462[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  80/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:37:02[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2720/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 80/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3854[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m34.68[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 49.00[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  81/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:42:00[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2754/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 81.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4687[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  81/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:42:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2754/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 81/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3798[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 34.6[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.61[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  82/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:47:01[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2788/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 82.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3767[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  82/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:47:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2788/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 82/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3782[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m34.74[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.25[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  83/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:52:02[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2822/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 83.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2502[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  83/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:52:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2822/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 83/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3659[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m35.31[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.62[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  84/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:57:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2856/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 84.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2176[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  84/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 06:57:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2856/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 84/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3612[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m35.18[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.86[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  85/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:02:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2890/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 85.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4302[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  85/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:02:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2890/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 85/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3503[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m35.51[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.47[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  86/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:07:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2924/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 86.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4139[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  86/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:07:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2924/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 86/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3458[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m35.63[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.36[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  87/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:12:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2958/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 87.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.275[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  87/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:12:07[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2958/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 87/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3463[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m35.19[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.52[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  88/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:17:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2992/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 88.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3014[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  88/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:17:08[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2992/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 88/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3265[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m35.92[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.31[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  89/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:22:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3026/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 89.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2758[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  89/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:22:08[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3026/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 89/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3175[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.22[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.86[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  90/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:27:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3060/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 90.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.413[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  90/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:27:09[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3060/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 90/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3124[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.28[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m49.84[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  91/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:32:07[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3094/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 91.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3485[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  91/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:32:10[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3094/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 91/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3116[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.26[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.51[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  92/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:37:08[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3128/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 92.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2376[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  92/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:37:10[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3128/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 92/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.3045[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.28[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.83[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  93/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:42:08[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3162/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 93.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2338[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  93/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:42:11[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3162/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 93/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2995[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.47[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m52.01[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  94/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:47:09[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3196/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 94.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3711[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  94/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:47:12[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3196/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 94/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2924[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.79[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.91[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  95/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:52:10[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3230/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 95.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.900e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3178[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  95/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:52:13[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3230/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 95/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2884[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.86[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m51.01[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  96/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:57:11[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3264/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 96.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.800e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3886[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  96/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 07:57:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3264/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 96/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.288[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m36.96[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.31[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  97/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:02:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3298/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 97.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2242[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  97/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:02:19[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3298/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 97/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2814[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.07[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m51.16[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  98/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:07:17[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3332/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 98.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.600e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3832[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  98/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:07:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3332/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 98/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2844[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 36.8[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m51.63[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  99/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:12:18[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3366/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 99.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.500e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.4364[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  99/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:12:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3366/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 99/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2822[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 36.9[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.59[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 100/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:17:19[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3400/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m100.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.400e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.1943[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 100/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:17:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3400/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m100/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2748[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.06[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.49[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 101/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:22:19[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3434/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m101.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.183[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 101/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:22:22[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3434/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m101/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2712[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.24[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m51.52[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 102/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:27:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3468/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m102.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m 2.28[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 102/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:27:23[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3468/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m102/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2671[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.25[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.37[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 103/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:32:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3502/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m103.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.100e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.1999[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 103/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:32:24[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3502/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m103/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2632[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.37[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.87[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 104/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:37:22[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3536/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m104.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2775[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 104/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:37:24[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3536/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m104/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2607[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.65[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.28[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 105/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:42:23[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3570/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m105.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3538[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 105/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:42:25[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3570/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m105/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2627[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.33[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.68[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 106/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:47:24[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3604/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m106.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2249[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 106/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:47:26[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3604/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m106/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2594[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.49[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.89[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 107/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:52:25[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3638/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m107.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2629[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 107/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:52:27[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3638/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m107/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2566[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.75[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m51.23[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 108/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:57:26[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3672/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m108.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2961[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 108/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 08:57:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3672/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m108/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2553[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.73[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.44[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 109/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:02:27[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3706/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m109.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3081[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 109/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:02:29[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3706/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m109/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2576[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.78[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.34[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 110/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:07:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3740/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m110.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3258[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 110/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:07:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3740/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m110/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2511[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.86[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 111/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:12:29[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3774/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m111.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3682[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 111/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:12:32[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3774/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m111/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2511[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.79[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.69[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 112/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:17:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3808/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m112.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2725[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 112/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:17:37[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3808/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m112/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2483[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.97[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.71[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 113/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:22:35[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3842/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m113.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2658[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 113/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:22:38[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3842/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m113/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.251[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.82[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.79[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 114/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:27:36[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3876/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m114.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2719[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 114/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:27:39[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3876/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m114/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2497[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.88[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.47[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 115/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:32:38[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3910/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m115.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2844[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 115/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:32:40[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3910/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m115/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2482[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.92[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.84[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 116/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:37:39[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3944/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m116.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2964[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 116/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:37:41[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3944/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m116/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2469[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 37.9[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.87[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 117/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:42:40[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3978/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m117.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.268[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 117/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:42:43[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3978/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m117/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2477[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.97[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.79[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 118/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:47:41[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4012/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m118.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3439[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 118/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:47:44[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4012/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m118/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2482[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.94[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.88[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 119/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:52:42[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4046/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m119.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3122[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 119/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:52:45[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4046/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m119/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2452[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.93[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.62[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 120/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:57:43[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4080/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m120.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3037[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 120/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 09:57:46[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4080/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m120/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2456[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m38.01[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.93[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 121/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:02:44[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4114/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m121.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2795[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 121/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:02:47[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4114/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m121/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2467[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.95[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.68[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 122/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:07:45[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4148/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m122.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.347[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 122/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:07:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4148/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m122/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2455[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.95[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.97[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 123/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:12:46[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4182/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m123.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3284[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 123/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:12:49[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4182/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m123/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2461[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m38.04[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.84[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 124/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:17:47[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4216/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m124.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2123[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 124/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:17:50[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4216/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m124/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2453[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m38.07[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 50.9[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 125/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:22:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4250/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m125.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3334[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 125/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:22:51[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4250/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m125/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.246[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m37.96[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 50.9[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 126/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:27:50[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4284/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m126.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.0765[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 126/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:27:52[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4284/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m126/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2467[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 37.9[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m51.22[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 127/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:32:51[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4318/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m127.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2866[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 127/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:32:53[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4318/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m127/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2479[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 38.00[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.63[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 128/128:   0%|          | 0/34 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:37:51[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4352/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m128.000/128[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.3297[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 128/128:   0%|          | 0/2 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-03 10:37:59[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m255[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4352/4352[39m[49m[0m, epoch[35m: [39m[49m[0m[34m128/128[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2465[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m38.02[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m50.82[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m
[38;2;0;186;142m2022-08-03 10:37:59[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175

In [6]:
# Sanity check after training: show where the run was written and what it contains
# (log file, tensorboard dir, checkpoints, final model).
# NOTE(review): `mic` appears to be an icecream-style debug printer (its output is prefixed `ic|`
# and labelled with the argument's source text), so the expressions are kept inline on purpose;
# binding the path to a local first would change the printed label. Confirm against `stefutil`.
mic(trainer.args.output_dir)
# `os.listdir` returns entries in arbitrary order, so the checkpoint order shown is not guaranteed.
# As the last expression of the cell, the returned list is also echoed as the cell result,
# which is why the listing appears twice in the output.
mic(os.listdir(trainer.args.output_dir))



ic| trainer.args.output_dir: 'drive/My Drive/Research/Music with NLP/models/2022-08-02_23-55-45_reformer'
ic| os.listdir(trainer.args.output_dir): ['md={nm=MyReformerModelWithLMHead, l=1024, ax_pos_sp=(32, 32), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, '
                                          'n_param=82.5M}, n=2140, a=1e-05, bsz=64, n_ep=128.log',
                                          'tb - md={nm=MyReformerModelWithLMHead, l=1024, ax_pos_sp=(32, 32), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, '
                                          'n_param=82.5M}, n=2140, a=1e-05, bsz=64, n_ep=128',
                                          'checkpoint-544',
                                          'checkpoint-1088',
                                          'checkpoint-1632',
                                          'checkpoint-2176',
                                          'checkpoint-2720',
                                          'checkpoint-3264',
                        

['md={nm=MyReformerModelWithLMHead, l=1024, ax_pos_sp=(32, 32), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, n_param=82.5M}, n=2140, a=1e-05, bsz=64, n_ep=128.log',
 'tb - md={nm=MyReformerModelWithLMHead, l=1024, ax_pos_sp=(32, 32), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, n_param=82.5M}, n=2140, a=1e-05, bsz=64, n_ep=128',
 'checkpoint-544',
 'checkpoint-1088',
 'checkpoint-1632',
 'checkpoint-2176',
 'checkpoint-2720',
 'checkpoint-3264',
 'checkpoint-3808',
 'checkpoint-4352',
 'trained']