# Train Music Transformer
Since Fri. Feb. 25th, 2022

Set up and run training in Google Colab.


## Setup


### IPython



In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to project source files are picked up without a restart.
%load_ext autoreload
%autoreload 2



### Colab



In [2]:
import os
import sys

import torch


if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

    ! pip3 install sty icecream music21 transformers datasets
    ! pip3 install stefutils

    path = os.path.join('drive', 'My Drive', 'Research', 'Music with NLP', 'Symbolic-Music-Generation')
    sys.path.append(path)
    ! ls "{path}"


    import time, os
    os.environ['TZ'] = 'US/Eastern'
    time.tzset()

    # if torch.cuda.is_available():
    #     %env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

if torch.cuda.is_available():
    ! nvidia-smi


from stefutil import *
from musicnlp.util import *

mic(u.proj_dir, u.pkg_nm)



Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sty
  Downloading sty-1.0.4-py3-none-any.whl (11 kB)
Collecting icecream
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 52.8 MB/s 
[?25hCollecting colorama>=0.3.9
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting executing>=0.3.1
  Downloading executing-0.9.1-py2.py3-none-any.whl (16 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████

ic| u.proj_dir: 'Symbolic-Music-Generation', u.pkg_nm: 'musicnlp'


('Symbolic-Music-Generation', 'musicnlp')

### code

In [3]:
# Project training module; provides `train.get_all_setup`, called further down.
from musicnlp.trainer import train


# `sconfig` is supplied by one of the earlier star imports; presumably it looks up
# the project-wide random seed from the shared config — TODO confirm.
seed = sconfig('random-seed')



## Prep for training


In [4]:
# ---- Model selection ----------------------------------------------------
md_nm = 'reformer'
md_sz = 'base'  # alternatives: 'debug', 'tiny'
mic(md_nm, md_sz)

# ---- Model config overrides ---------------------------------------------
# Active setting is the 4096-token context. A shorter 1024-token variant is
# kept below (note 64*64 == 4096 and 32*32 == 1024).
# TODO: fall back to the smaller seq-len until the model shows longer dependency
# model_config = dict(max_position_embeddings=1024, axial_pos_shape=(32, 32))
model_config = {'max_position_embeddings': 4096, 'axial_pos_shape': (64, 64)}

# ---- Trainer arguments ---------------------------------------------------
n_ep = 32
train_args = {'save_strategy': 'epoch', 'num_train_epochs': n_ep}

augment_key = True  # alternative: False
wordpiece_tokenize = False
my_train_args = {
    'tqdm': True,
    'logging_strategy': 'epoch',
    'save_epochs': 1,
    'augment_key': augment_key,
    'wordpiece_tokenize': wordpiece_tokenize,
}

# Size-dependent overrides: the small/debug sizes get a small batch, more
# epochs and sparse checkpoints; every other size gets the full-run settings.
if md_sz == 'tiny' or 'debug' in md_sz:
    train_args.update(per_device_train_batch_size=4, num_train_epochs=64)
    my_train_args['save_epochs'] = 16
else:
    train_args.update(
        learning_rate=1e-4,  # alternatives: 1e-5, 3e-5
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        per_device_train_batch_size=17,  # alternative: 64
        per_device_eval_batch_size=16,
    )
    my_train_args['save_epochs'] = 4

# ---- Datasets -------------------------------------------------------------
# Each name identifies one processed extraction; the set depends on `mode`.
mode = 'full'  # alternative: 'melody'
if mode != 'melody':
    pop = 'musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17'
    mst = 'musicnlp music extraction, dnm=MAESTRO, n=1276, meta={mode=full, prec=5, th=1}, 2022-08-02_20-12-23'
    dnms = [pop, mst]
else:
    pop = 'musicnlp music extraction, dnm=POP909, n=909, meta={mode=melody, prec=5, th=1}, 2022-05-20_14-52-04'
    mst = 'musicnlp music extraction, dnm=MAESTRO, n=1276, meta={mode=melody, prec=5, th=1}, 2022-05-20_14-52-28'
    lmd = 'musicnlp music extraction, dnm=LMD, n=176640, meta={mode=melody, prec=5, th=1}, 2022-05-27_15-23-20'
    dnms = [pop, mst, lmd]

# Passed on as `n_sample`; None here, with 64 kept as the alternative value.
# n = 64
n = None

# checkpoint_path = os.path.join(u.model_path, '2022-07-12_00-09-14_reformer', 'checkpoint-16434')
# mic(os.listdir(checkpoint_path))

# ---- Build model, tokenizer and trainer -----------------------------------
mdl, tokenizer, trainer = train.get_all_setup(
    model_name=md_nm,
    model_size=md_sz,
    model_config=model_config,
    dataset_names=dnms,
    dataset_args={'n_sample': n, 'shuffle_seed': seed, 'pbar': True},
    train_args=train_args,
    my_train_args=my_train_args,
    trainer_args={'disable_train_metrics': True},
)



ic| md_nm: 'reformer', md_sz: 'base'


[38;2;0;186;142m2022-08-07 05:12:43[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[Get Setup][38;2;97;175;239m::[38;2;198;120;221mget_all_setup[38;2;97;175;239m::[38;2;198;120;221mtrain.py[38;2;97;175;239m:[38;2;198;120;221m274[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mInitializing training with {
    [94m"model_name"[39;49;00m: [33m"reformer"[39;49;00m,
    [94m"model_size"[39;49;00m: [33m"base"[39;49;00m,
    [94m"model_config"[39;49;00m: {
        [94m"max_position_embeddings"[39;49;00m: [34m4096[39;49;00m,
        [94m"axial_pos_shape"[39;49;00m: [
            [34m64[39;49;00m,
            [34m64[39;49;00m
        ]
    },
    [94m"dataset_names"[39;49;00m: [
        [33m"musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17"[39;49;00m,
   

Loading cached shuffled indices for dataset at drive/My Drive/Research/Music with NLP/datasets/processed/hf/musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17/train/cache-f2b0a100acd13ff7.arrow
Loading cached shuffled indices for dataset at drive/My Drive/Research/Music with NLP/datasets/processed/hf/musicnlp music extraction, dnm=POP909, n=909, meta={mode=full, prec=5, th=1}, 2022-08-02_20-11-17/test/cache-5c0482695d1bfa01.arrow


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

## Train


Check that the log and TensorBoard files are being written.



In [5]:
# Run training from scratch with the trainer built in the previous cell.
trainer.train()
# Alternative kept for reference: pass a saved checkpoint directory to `train`
# (presumably to resume from that checkpoint — confirm before use).
# checkpoint_path = os.path.join(u.model_path, '2022-08-02_10-48-15_reformer', 'checkpoint-1088')
# trainer.train(checkpoint_path)

# Save the final model to a `trained` subdirectory of the trainer's output directory.
save_path = os.path.join(trainer.args.output_dir, 'trained')
trainer.save_model(save_path)


[38;2;0;186;142m2022-08-07 05:13:01[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221mon_train_begin[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m173[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29mTraining started with model [35m{[39m[49m[0mmodel name[35m: [39m[49m[0m[34mMyReformerModelWithLMHead[39m[49m[0m, max length[35m: [39m[49m[0m[34m4096[39m[49m[0m, axial_pos_shape[35m: [39m[49m[0m[34m(64, 64)[39m[49m[0m, n_layer[35m: [39m[49m[0m[34m12[39m[49m[0m, hidden_size[35m: [39m[49m[0m[34m768[39m[49m[0m, ff_size[35m: [39m[49m[0m[34m3072[39m[49m[0m, attention_shape[35m: [39m[49m[0m[34m12x64[39m[49m[0m, parameter_count[35m: [39m[49m[0m[34m82.5M[39m

Train Epoch  1/32:   0%|          | 0/126 [00:00<?, ?ba/s]

config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...
config.num_buckets is not set. Setting config.num_buckets to 128...


[38;2;0;186;142m2022-08-07 05:37:43[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 126/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 1.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.120e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.6756[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  1/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 05:38:00[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 126/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 1/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6886[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.56[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m  0.00[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  2/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 06:02:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 252/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 2.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.240e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m3.5249[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  2/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 06:02:44[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 252/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 2/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m3.6043[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m11.65[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m48.89[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  3/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 06:27:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 378/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 3.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.360e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6867[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  3/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 06:27:32[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 378/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 3/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.7429[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m24.65[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 44.2[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  4/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 06:52:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 504/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 4.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.980e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7134[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  4/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 06:52:28[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 504/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 4/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.6178[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m26.55[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m54.62[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  5/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 07:16:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 630/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 5.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.900e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.7732[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  5/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 07:17:12[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 630/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 5/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.5487[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m29.81[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m54.73[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  6/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 07:41:39[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 756/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 6.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.770e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.6705[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  6/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 07:41:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 756/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 6/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.4536[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m32.85[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m53.03[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  7/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 08:06:21[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 882/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 7.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.580e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.1394[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  7/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 08:06:37[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m 882/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 7/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.2243[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m38.97[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m64.12[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  8/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 08:31:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1008/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 8.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.330e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.2541[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  8/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 08:31:26[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1008/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 8/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.1264[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m41.12[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m67.33[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch  9/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 08:55:57[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1134/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 9.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m9.030e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.9244[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch  9/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 08:56:13[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1134/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m 9/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m2.0588[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m42.39[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m73.27[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 10/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 09:20:43[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1260/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m10.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.690e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.1943[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 10/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 09:20:58[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1260/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m10/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.9899[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m44.05[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m71.63[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 11/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 09:45:25[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1386/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m11.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m8.300e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.0476[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 11/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 09:45:41[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1386/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m11/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.932[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m44.87[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m74.92[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 12/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 10:10:11[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1512/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m12.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.870e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.9176[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 12/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 10:10:33[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1512/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m12/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.895[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m45.63[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m77.87[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 13/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 10:35:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1638/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m13.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.410e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m2.1263[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 13/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 10:35:20[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1638/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m13/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.8781[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m46.19[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m74.39[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 14/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 10:59:52[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1764/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m14.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.920e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.6746[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 14/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 11:00:09[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1764/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m14/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.8158[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m47.14[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m73.71[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 15/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 11:24:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1890/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m15.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m6.400e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.7263[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 15/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 11:25:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m1890/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m15/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.7828[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m48.02[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m83.75[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 16/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 11:49:41[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2016/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m16.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.870e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.6896[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 16/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 11:50:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2016/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m16/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.7526[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m48.77[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 78.7[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 17/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 12:14:38[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2142/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m17.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m5.330e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.7442[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 17/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 12:14:54[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2142/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m17/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.7484[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m48.98[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m78.86[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 18/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 12:39:41[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2268/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m18.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.780e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.8813[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 18/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 12:39:59[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2268/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m18/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.7179[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 49.6[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m78.79[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 19/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 13:04:46[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2394/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m19.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.240e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.7509[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 19/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 13:05:03[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2394/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m19/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.7064[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 49.8[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 78.00[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 20/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 13:29:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2520/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m20.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.710e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.6397[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 20/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 13:30:17[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2520/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m20/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.7101[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m49.98[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m77.37[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 21/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 13:55:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2646/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m21.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.190e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.8235[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 21/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 13:55:24[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2646/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m21/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6931[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m50.26[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m79.28[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 22/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 14:20:16[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2772/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m22.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.690e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.8201[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 22/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 14:20:34[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2772/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m22/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6812[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m 50.5[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m80.98[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 23/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 14:45:25[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2898/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m23.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.220e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.5534[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 23/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 14:45:41[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m2898/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m23/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6668[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m50.78[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m82.04[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 24/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 15:10:32[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3024/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m24.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.790e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.5241[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 24/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 15:10:56[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3024/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m24/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6699[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m50.77[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 81.3[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 25/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 15:35:48[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3150/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m25.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.390e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.6102[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 25/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 15:36:05[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3150/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m25/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6695[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m50.76[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m 76.8[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 26/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 16:00:55[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3276/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m26.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.030e-05[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.5866[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 26/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 16:01:12[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3276/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m26/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6595[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m51.03[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m82.14[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 27/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 16:26:01[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3402/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m27.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m7.300e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.6454[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 27/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 16:26:17[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3402/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m27/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6522[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m51.21[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m81.34[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 28/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 16:51:04[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3528/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m28.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m4.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.6239[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 28/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 16:51:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3528/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m28/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6511[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m51.23[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m81.62[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 29/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 17:16:13[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3654/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m29.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m2.700e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.7381[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 29/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 17:16:29[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3654/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m29/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6512[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m51.14[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m79.34[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 30/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 17:41:14[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3780/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m30.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m1.200e-06[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.527[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 30/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 17:41:30[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3780/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m30/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6503[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m51.18[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m83.91[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 31/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 18:06:23[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3906/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m31.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m3.000e-07[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.6694[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 31/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 18:06:40[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m3906/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m31/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6511[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m51.26[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m80.64[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Train Epoch 32/32:   0%|          | 0/126 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 18:31:39[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4032/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m32.000/32[39m[49m[0m, train/learning_rate[35m: [39m[49m[0m[34m0.000e+00[39m[49m[0m, train/loss[35m: [39m[49m[0m[34m1.7762[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m


Eval Epoch 32/32:   0%|          | 0/3 [00:00<?, ?ba/s]

[38;2;0;186;142m2022-08-07 18:32:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;239m::[38;2;198;120;221m_log[38;2;97;175;239m::[38;2;198;120;221mtrain_util_wrap.py[38;2;97;175;239m:[38;2;198;120;221m257[38;2;97;175;239m:[39m[49m[22m[23m[24m[25m[27m[28m[29mINFO[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;97;175;239m: [39m[49m[22m[23m[24m[25m[27m[28m[29m[35m{[39m[49m[0mstep[35m: [39m[49m[0m[34m4032/4032[39m[49m[0m, epoch[35m: [39m[49m[0m[34m32/32[39m[49m[0m, eval/loss[35m: [39m[49m[0m[34m1.6507[39m[49m[0m, eval/ntp_acc[35m: [39m[49m[0m[34m51.17[39m[49m[0m, eval/ikr[35m: [39m[49m[0m[34m80.54[39m[49m[0m[35m}[39m[49m[0m[39m[49m[22m[23m[24m[25m[27m[28m[29m
[38;2;0;186;142m2022-08-07 18:32:06[38;2;97;175;239m|[39m[49m[22m[23m[24m[25m[27m[28m[29m[38;2;198;120;221m[MyReformerModelWithLMHead Train][38;2;97;175;2

In [6]:
# Show the directory this run's `trainer` is configured to write to
# (`trainer` is created in an earlier cell).
mic(trainer.args.output_dir)
# List that directory's entries to check which artifacts exist on disk.
# This call is the cell's last expression, so whatever `mic` returns is also
# echoed as the cell result -- presumably the listing itself; confirm against
# stefutil's `mic`.
# NOTE(review): `os.listdir` gives no ordering guarantee; do not rely on the
# displayed order of the entries.
mic(os.listdir(trainer.args.output_dir))



ic| trainer.args.output_dir: 'drive/My Drive/Research/Music with NLP/models/2022-08-07_05-12-51_reformer'
ic| os.listdir(trainer.args.output_dir): ['md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, '
                                          'n_param=82.5M}, n=2140, a=0.0001, bsz=17, n_ep=32.log',
                                          'tb - md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, '
                                          'n_param=82.5M}, n=2140, a=0.0001, bsz=17, n_ep=32',
                                          'checkpoint-504',
                                          'checkpoint-1008',
                                          'checkpoint-1512',
                                          'checkpoint-2016',
                                          'checkpoint-2520',
                                          'checkpoint-3024',
                        

['md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, n_param=82.5M}, n=2140, a=0.0001, bsz=17, n_ep=32.log',
 'tb - md={nm=MyReformerModelWithLMHead, l=4096, ax_pos_sp=(64, 64), n_l=12, hd_sz=768, ff_sz=3072, attn_sh=12x64, n_param=82.5M}, n=2140, a=0.0001, bsz=17, n_ep=32',
 'checkpoint-504',
 'checkpoint-1008',
 'checkpoint-1512',
 'checkpoint-2016',
 'checkpoint-2520',
 'checkpoint-3024',
 'checkpoint-3528',
 'checkpoint-4032',
 'trained']