In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.glow_tts_config import GlowTTSConfig

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

In [2]:
output_path = "train"
dataset_path = "LJSpeech-1.1"

In [3]:

dataset_config = BaseDatasetConfig(
    formatter="ljspeech", meta_file_train="metadata.csv", path=dataset_path
)

In [4]:
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=8,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
)

In [5]:
ap = AudioProcessor.init_from_config(config)

tokenizer, config = TTSTokenizer.init_from_config(config)

In [6]:
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size
)

In [7]:
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

In [None]:
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 8


 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=train/run-February-22-2025_01+01AM-9b6e3e6

 > Model has 28610449 parameters


: 

In [None]:
trainer.fit()


[4m[1m > EPOCH: 0/10[0m
 --> train/run-February-22-2025_01+01AM-9b6e3e6

[1m > TRAINING (2025-02-22 01:01:20) [0m
d͡ʒoʊsɪf di. nɪkɔl,
Character '͡' not found in the vocabulary. Discarding it.
ðə ɪŋkaʊntɚ ɪn ðə lʌnt͡ʃɹum.
Character '͡' not found in the vocabulary. Discarding it.
noʊ soʊld͡ʒɚz iðɚ.
Character '͡' not found in the vocabulary. Discarding it.
æftɚ fɚðɚ kwɛst͡ʃənɪŋ
Character '͡' not found in the vocabulary. Discarding it.
ðə soʊld͡ʒɚz ðɛn?
Character '͡' not found in the vocabulary. Discarding it.
ɪn kwɪɡliz d͡ʒʌd͡ʒmənt,
Character '͡' not found in the vocabulary. Discarding it.
ðə dʌt͡ʃəs əv kɛnt.
Character '͡' not found in the vocabulary. Discarding it.
hɚ kæptən wəz d͡ʒɑn smɪθ,
Character '͡' not found in the vocabulary. Discarding it.

[1m   --> TIME: 2025-02-22 01:01:28 -- STEP: 0/811 -- GLOBAL_STEP: 0[0m
     | > current_lr: 2.5e-07 
     | > step_time: 4.5945  (4.594481706619263)
     | > loader_time: 3.4383  (3.438286781311035)

 [!] `train_step()` retuned `None`

In [8]:
# resume training

trainer_args = TrainerArgs(
    restore_path=os.path.join(output_path, "run-February-22-2025_02+19AM-9b6e3e6/best_model.pth")
)

In [9]:
trainer = Trainer(
    trainer_args, config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 8
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False


 > Start Tensorboard: tensorboard --logdir=train/run-February-22-2025_10+59PM-fa84af3
 > Restoring from best_model.pth ...
 > Restoring Model...
 > Restoring Optimizer...
 > Restoring Scaler...
 > Model restored from step 22729

 > Model has 28610449 parameters


In [10]:
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-22 22:59:43) [0m

[1m   --> TIME: 2025-02-22 22:59:59 -- STEP: 20/406 -- GLOBAL_STEP: 22750[0m
     | > loss: 0.06345455348491669  (0.04589533656835556)
     | > log_mle: -0.1602722406387329  (-0.17688823938369752)
     | > loss_dur: 0.2237267941236496  (0.22278357595205306)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(1.3683, device='cuda:0')  (tensor(2.2992, device='cuda:0'))
     | > current_lr: 2.5e-07 
     | > step_time: 0.4049  (0.5272315979003906)
     | > loader_time: 0.003  (0.006970047950744629)


[1m   --> TIME: 2025-02-22 23:00:16 -- STEP: 45/406 -- GLOBAL_STEP: 22775[0m
     | > loss: 0.046437233686447144  (0.04957967334323459)
     | > log_mle: -0.1792736053466797  (-0.17695660061306423)
     | > loss_dur: 0.22571083903312683  (0.22653627395629883)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(0.9909, device='cuda:0')  (tensor




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.003610953688621521 [0m(+0.0)
     | > avg_loss: 0.02802815195173025 [0m(+0.0)
     | > avg_log_mle: -0.20404601097106934 [0m(+0.0)
     | > avg_loss_dur: 0.2320741629227996 [0m(+0.0)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_23136.pth

[4m[1m > EPOCH: 1/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-22 23:05:41) [0m

[1m   --> TIME: 2025-02-22 23:05:48 -- STEP: 14/406 -- GLOBAL_STEP: 23150[0m
     | > loss: 0.052101537585258484  (0.04619621272597994)
     | > log_mle: -0.19037580490112305  (-0.17942217418125697)
     | > loss_dur: 0.24247734248638153  (0.22561838690723693)
     | > amp_scaler: 16384.0  (16384.0)
     | > grad_norm: tensor(4.0855, device='cuda:0')  (tensor(2.2254, device='cuda:0'))
     | > current_lr: 2.5e-07 
     | > step_time: 0.7128  (0.3739775078637259)
     | > loader_time: 0.0034  (0.00515672138759068)


[1m   --> TIME: 2025-02-22




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.00457213819026947 [0m(+0.0007495731115341182)
     | > avg_loss:[92m 0.026591720059514046 [0m(-0.0013372628018260002)
     | > avg_log_mle:[92m -0.20326144248247147 [0m(-0.0006939992308616638)
     | > avg_loss_dur:[92m 0.2298531625419855 [0m(-0.0006432635709643364)


[4m[1m > EPOCH: 6/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-22 23:30:58) [0m

[1m   --> TIME: 2025-02-22 23:31:03 -- STEP: 9/406 -- GLOBAL_STEP: 25175[0m
     | > loss: 0.04912281036376953  (0.0377556367052926)
     | > log_mle: -0.18724000453948975  (-0.17943898836771646)
     | > loss_dur: 0.23636281490325928  (0.21719462507300907)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(2.5075, device='cuda:0')  (tensor(2.4870, device='cuda:0'))
     | > current_lr: 1.5e-06 
     | > step_time: 0.3612  (0.374241484536065)
     | > loader_time: 0.0026  (0.007754431830512153)


[1m   --> TIME: 202




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0033722668886184692 [0m(-0.001199871301651001)
     | > avg_loss:[92m 0.02618110552430153 [0m(-0.0004106145352125168)
     | > avg_log_mle:[91m -0.2027028277516365 [0m(+0.0005586147308349609)
     | > avg_loss_dur:[92m 0.22888393327593803 [0m(-0.0009692292660474777)


[4m[1m > EPOCH: 7/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-22 23:35:50) [0m

[1m   --> TIME: 2025-02-22 23:35:53 -- STEP: 3/406 -- GLOBAL_STEP: 25575[0m
     | > loss: 0.04951350390911102  (0.040132299065589905)
     | > log_mle: -0.18557071685791016  (-0.1857469081878662)
     | > loss_dur: 0.23508422076702118  (0.22587920725345612)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(3.1364, device='cuda:0')  (tensor(4.8230, device='cuda:0'))
     | > current_lr: 1.75e-06 
     | > step_time: 0.3555  (0.4495578606923421)
     | > loader_time: 0.0148  (0.009146690368652344)


[1m   --> TIME: 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0032906383275985718 [0m(-0.00032761693000793457)
     | > avg_loss:[92m -0.005326342768967152 [0m(-0.0007206536829471588)
     | > avg_log_mle:[91m -0.22326771169900894 [0m(+0.0005528926849365234)
     | > avg_loss_dur:[92m 0.2179413689300418 [0m(-0.0012735463678836823)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_30038.pth

[4m[1m > EPOCH: 18/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 00:32:01) [0m

[1m   --> TIME: 2025-02-23 00:32:08 -- STEP: 12/406 -- GLOBAL_STEP: 30050[0m
     | > loss: 0.016749680042266846  (0.009250263373057047)
     | > log_mle: -0.20702111721038818  (-0.19670350352923074)
     | > loss_dur: 0.22377079725265503  (0.2059537669022878)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(9.5191, device='cuda:0')  (tensor(7.7704, device='cuda:0'))
     | > current_lr: 4.5e-06 
     | > step_time: 0.2888  (0.3865




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00309617817401886 [0m(-0.0008428543806076041)
     | > avg_loss:[92m -0.0115852952003479 [0m(-0.003726538270711899)
     | > avg_log_mle:[92m -0.2279633954167366 [0m(-0.003408394753932953)
     | > avg_loss_dur:[92m 0.2163781002163887 [0m(-0.0003181435167789459)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_30850.pth

[4m[1m > EPOCH: 20/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 00:42:57) [0m

[1m   --> TIME: 2025-02-23 00:42:59 -- STEP: 0/406 -- GLOBAL_STEP: 30850[0m
     | > loss: 0.003916293382644653  (0.003916293382644653)
     | > log_mle: -0.19055068492889404  (-0.19055068492889404)
     | > loss_dur: 0.1944669783115387  (0.1944669783115387)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(2.6573, device='cuda:0')  (tensor(2.6573, device='cuda:0'))
     | > current_lr: 4.9999999999999996e-06 
     | > step_time: 0.9261  (0




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.004880979657173157 [0m(-0.0017853230237960815)
     | > avg_loss:[92m -0.020757511258125305 [0m(-0.0018734214827418327)
     | > avg_log_mle:[92m -0.2342015579342842 [0m(-0.002094157040119171)
     | > avg_loss_dur:[91m 0.2134440466761589 [0m(+0.0002207355573773384)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_32068.pth

[4m[1m > EPOCH: 23/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 00:58:29) [0m

[1m   --> TIME: 2025-02-23 00:58:34 -- STEP: 7/406 -- GLOBAL_STEP: 32075[0m
     | > loss: 0.0048448145389556885  (-0.0017382694142205374)
     | > log_mle: -0.19839203357696533  (-0.20300938401903426)
     | > loss_dur: 0.20323684811592102  (0.2012711146048137)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(6.5163, device='cuda:0')  (tensor(6.0754, device='cuda:0'))
     | > current_lr: 5.75e-06 
     | > step_time: 0.3555  (0.44519




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.005126446485519409 [0m(-0.000677749514579773)
     | > avg_loss:[91m -0.05719295609742403 [0m(+0.0009072916582226753)
     | > avg_log_mle:[91m -0.25602559745311737 [0m(+0.0010546445846557617)
     | > avg_loss_dur:[92m 0.19883264135569334 [0m(-0.0001473529264330864)


[4m[1m > EPOCH: 35/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:04:59) [0m

[1m   --> TIME: 2025-02-23 02:05:06 -- STEP: 10/406 -- GLOBAL_STEP: 36950[0m
     | > loss: -0.06513884663581848  (-0.047914940118789676)
     | > log_mle: -0.23391449451446533  (-0.23076537847518921)
     | > loss_dur: 0.16877564787864685  (0.18285043835639953)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(9.0089, device='cuda:0')  (tensor(12.9302, device='cuda:0'))
     | > current_lr: 8.750000000000001e-06 
     | > step_time: 0.434  (0.47193691730499265)
     | > loader_time: 0.0027  (0.0048800230026245115




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.005266949534416199 [0m(+0.0016700327396392822)
     | > avg_loss:[92m -0.07430402934551239 [0m(-0.008836180903017521)
     | > avg_log_mle:[92m -0.26974084228277206 [0m(-0.006199218332767487)
     | > avg_loss_dur:[92m 0.19543681293725967 [0m(-0.0026369625702500343)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_38158.pth

[4m[1m > EPOCH: 38/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:21:26) [0m

[1m   --> TIME: 2025-02-23 02:21:36 -- STEP: 17/406 -- GLOBAL_STEP: 38175[0m
     | > loss: -0.051693692803382874  (-0.05635802886065315)
     | > log_mle: -0.23795068264007568  (-0.23853862986845129)
     | > loss_dur: 0.1862569898366928  (0.18218060100779815)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(15.7523, device='cuda:0')  (tensor(17.4260, device='cuda:0'))
     | > current_lr: 9.499999999999999e-06 
     | > step_time: 0.




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0042645931243896484 [0m(-0.0010023564100265503)
     | > avg_loss:[91m -0.07322606164962056 [0m(+0.0010779676958918294)
     | > avg_log_mle:[91m -0.2686977013945579 [0m(+0.0010431408882141668)
     | > avg_loss_dur:[91m 0.19547163974493742 [0m(+3.482680767774582e-05)


[4m[1m > EPOCH: 39/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:26:43) [0m

[1m   --> TIME: 2025-02-23 02:26:49 -- STEP: 11/406 -- GLOBAL_STEP: 38575[0m
     | > loss: -0.07354700565338135  (-0.0620842995968732)
     | > log_mle: -0.2336806058883667  (-0.2379410591992465)
     | > loss_dur: 0.16013360023498535  (0.1758567596023733)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(14.5432, device='cuda:0')  (tensor(11.9389, device='cuda:0'))
     | > current_lr: 9.75e-06 
     | > step_time: 0.2769  (0.3687615828080611)
     | > loader_time: 0.0037  (0.006036281585693359)


[1m   --> TI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.021439328789711 [0m(+0.01717473566532135)
     | > avg_loss:[91m -0.07319615222513676 [0m(+2.9909424483803848e-05)
     | > avg_log_mle:[91m -0.26799528300762177 [0m(+0.0007024183869361322)
     | > avg_loss_dur:[92m 0.194799130782485 [0m(-0.0006725089624524117)


[4m[1m > EPOCH: 40/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:31:41) [0m

[1m   --> TIME: 2025-02-23 02:31:45 -- STEP: 5/406 -- GLOBAL_STEP: 38975[0m
     | > loss: -0.04679243266582489  (-0.05797266960144043)
     | > log_mle: -0.2326500415802002  (-0.23963398933410646)
     | > loss_dur: 0.1858576089143753  (0.18166131973266603)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(3.5140, device='cuda:0')  (tensor(12.5728, device='cuda:0'))
     | > current_lr: 9.999999999999999e-06 
     | > step_time: 0.311  (0.4201423168182373)
     | > loader_time: 0.0043  (0.012240266799926758)


[1m   




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0045633018016815186 [0m(-0.01687602698802948)
     | > avg_loss:[92m -0.07536557782441378 [0m(-0.0021694255992770195)
     | > avg_log_mle:[92m -0.26929037272930145 [0m(-0.0012950897216796875)
     | > avg_loss_dur:[92m 0.19392479490488768 [0m(-0.000874335877597332)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_39376.pth

[4m[1m > EPOCH: 41/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:36:42) [0m

[1m   --> TIME: 2025-02-23 02:36:55 -- STEP: 24/406 -- GLOBAL_STEP: 39400[0m
     | > loss: -0.04767565429210663  (-0.06288984914620717)
     | > log_mle: -0.24422705173492432  (-0.2423894206682841)
     | > loss_dur: 0.1965513974428177  (0.17949957152207693)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(19.1749, device='cuda:0')  (tensor(14.6635, device='cuda:0'))
     | > current_lr: 1.025e-05 
     | > step_time: 0.3526  (0.44339




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.005096882581710815 [0m(+0.0005335807800292969)
     | > avg_loss:[92m -0.08157243020832539 [0m(-0.00620685238391161)
     | > avg_log_mle:[92m -0.27451975643634796 [0m(-0.005229383707046509)
     | > avg_loss_dur:[92m 0.19294732622802258 [0m(-0.0009774686768651009)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_39782.pth

[4m[1m > EPOCH: 42/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:41:46) [0m

[1m   --> TIME: 2025-02-23 02:41:58 -- STEP: 18/406 -- GLOBAL_STEP: 39800[0m
     | > loss: -0.06852826476097107  (-0.0657131274541219)
     | > log_mle: -0.2406095266342163  (-0.2470053964191013)
     | > loss_dur: 0.17208126187324524  (0.18129226896497938)
     | > amp_scaler: 8192.0  (8192.0)
     | > grad_norm: tensor(9.6793, device='cuda:0')  (tensor(14.3663, device='cuda:0'))
     | > current_lr: 1.05e-05 
     | > step_time: 0.4407  (0.520875268




[1m   --> TIME: 2025-02-23 02:44:49 -- STEP: 243/406 -- GLOBAL_STEP: 40025[0m
     | > loss: -0.05778372287750244  (-0.05470762049219736)
     | > log_mle: -0.2859644889831543  (-0.25913612293117816)
     | > loss_dur: 0.22818076610565186  (0.20442850243898084)
     | > amp_scaler: 4096.0  (6438.979423868313)
     | > grad_norm: tensor(37.4610, device='cuda:0')  (tensor(21.8348, device='cuda:0'))
     | > current_lr: 1.05e-05 
     | > step_time: 0.6415  (0.6905226746704353)
     | > loader_time: 0.0123  (0.011484842732119463)


[1m   --> TIME: 2025-02-23 02:45:08 -- STEP: 268/406 -- GLOBAL_STEP: 40050[0m
     | > loss: -0.07414957880973816  (-0.055084471604717314)
     | > log_mle: -0.2809332609176636  (-0.2601917733007402)
     | > loss_dur: 0.20678368210792542  (0.20510730169602293)
     | > amp_scaler: 4096.0  (6220.417910447762)
     | > grad_norm: tensor(29.1408, device='cuda:0')  (tensor(22.4504, device='cuda:0'))
     | > current_lr: 1.05e-05 
     | > step_time: 0.5916  (




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0033725351095199585 [0m(-0.001724347472190857)
     | > avg_loss:[91m -0.07594362087547779 [0m(+0.005628809332847595)
     | > avg_log_mle:[91m -0.2703799679875374 [0m(+0.004139788448810577)
     | > avg_loss_dur:[91m 0.1944363471120596 [0m(+0.0014890208840370178)


[4m[1m > EPOCH: 43/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:47:08) [0m

[1m   --> TIME: 2025-02-23 02:47:15 -- STEP: 12/406 -- GLOBAL_STEP: 40200[0m
     | > loss: -0.0673997700214386  (-0.07333939149975777)
     | > log_mle: -0.2575697898864746  (-0.2480644186337789)
     | > loss_dur: 0.190170019865036  (0.17472502713402113)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(23.5822, device='cuda:0')  (tensor(14.0775, device='cuda:0'))
     | > current_lr: 1.075e-05 
     | > step_time: 0.295  (0.3607800006866455)
     | > loader_time: 0.0047  (0.0068560441335042315)


[1m   --> TIME: 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0028672069311141968 [0m(-0.0005053281784057617)
     | > avg_loss:[92m -0.08857961371541023 [0m(-0.012635992839932442)
     | > avg_log_mle:[92m -0.27976731956005096 [0m(-0.00938735157251358)
     | > avg_loss_dur:[92m 0.19118770584464073 [0m(-0.0032486412674188614)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_40594.pth

[4m[1m > EPOCH: 44/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:52:04) [0m

[1m   --> TIME: 2025-02-23 02:52:08 -- STEP: 6/406 -- GLOBAL_STEP: 40600[0m
     | > loss: -0.09811937808990479  (-0.07701095938682556)
     | > log_mle: -0.25066614151000977  (-0.24955010414123535)
     | > loss_dur: 0.15254676342010498  (0.1725391447544098)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(4.3670, device='cuda:0')  (tensor(15.4000, device='cuda:0'))
     | > current_lr: 1.1e-05 
     | > step_time: 0.2798  (0.49390010




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.0033174902200698853 [0m(+0.0004502832889556885)
     | > avg_loss:[92m -0.09272736962884665 [0m(-0.004147755913436413)
     | > avg_log_mle:[92m -0.28191196173429495 [0m(-0.0021446421742439825)
     | > avg_loss_dur:[92m 0.18918459210544825 [0m(-0.002003113739192486)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_41000.pth

[4m[1m > EPOCH: 45/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 02:57:21) [0m

[1m   --> TIME: 2025-02-23 02:57:22 -- STEP: 0/406 -- GLOBAL_STEP: 41000[0m
     | > loss: -0.08463898301124573  (-0.08463898301124573)
     | > log_mle: -0.24691283702850342  (-0.24691283702850342)
     | > loss_dur: 0.1622738540172577  (0.1622738540172577)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(5.3383, device='cuda:0')  (tensor(5.3383, device='cuda:0'))
     | > current_lr: 1.125e-05 
     | > step_time: 0.7957  (0.7956683




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.002970770001411438 [0m(-0.00034672021865844727)
     | > avg_loss:[91m -0.08316662535071373 [0m(+0.009560744278132915)
     | > avg_log_mle:[91m -0.27504233270883566 [0m(+0.0068696290254592896)
     | > avg_loss_dur:[91m 0.19187570735812187 [0m(+0.002691115252673626)


[4m[1m > EPOCH: 46/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:02:15) [0m

[1m   --> TIME: 2025-02-23 03:02:26 -- STEP: 19/406 -- GLOBAL_STEP: 41425[0m
     | > loss: -0.0822472870349884  (-0.0805565008991643)
     | > log_mle: -0.23939871788024902  (-0.2540744103883442)
     | > loss_dur: 0.15715143084526062  (0.17351790948917992)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(11.7295, device='cuda:0')  (tensor(17.0156, device='cuda:0'))
     | > current_lr: 1.15e-05 
     | > step_time: 0.3691  (0.4618287964871055)
     | > loader_time: 0.0047  (0.006894701405575401)


[1m   --> TI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004098325967788696 [0m(+0.0011275559663772583)
     | > avg_loss:[92m -0.09874147083610296 [0m(-0.015574845485389233)
     | > avg_log_mle:[92m -0.28683875501155853 [0m(-0.011796422302722875)
     | > avg_loss_dur:[92m 0.18809728417545557 [0m(-0.0037784231826663017)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_41812.pth

[4m[1m > EPOCH: 47/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:07:11) [0m

[1m   --> TIME: 2025-02-23 03:07:19 -- STEP: 13/406 -- GLOBAL_STEP: 41825[0m
     | > loss: -0.08664408326148987  (-0.09024222424397102)
     | > log_mle: -0.2589918375015259  (-0.25707873931297887)
     | > loss_dur: 0.172347754240036  (0.16683651506900787)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(10.5832, device='cuda:0')  (tensor(19.5916, device='cuda:0'))
     | > current_lr: 1.1750000000000001e-05 
     | > step_time: 0.61




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003452807664871216 [0m(-0.0006455183029174805)
     | > avg_loss:[91m -0.0955754891037941 [0m(+0.0031659817323088646)
     | > avg_log_mle:[91m -0.2853529378771782 [0m(+0.0014858171343803406)
     | > avg_loss_dur:[91m 0.1897774487733841 [0m(+0.001680164597928524)


[4m[1m > EPOCH: 48/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:12:03) [0m

[1m   --> TIME: 2025-02-23 03:12:08 -- STEP: 7/406 -- GLOBAL_STEP: 42225[0m
     | > loss: -0.0899558961391449  (-0.08967736576284681)
     | > log_mle: -0.2526731491088867  (-0.2571960176740374)
     | > loss_dur: 0.16271725296974182  (0.16751865191119059)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(20.7638, device='cuda:0')  (tensor(16.6780, device='cuda:0'))
     | > current_lr: 1.2e-05 
     | > step_time: 0.2588  (0.37272279603140696)
     | > loader_time: 0.0041  (0.006477015359061105)


[1m   --> TIME: 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.005610853433609009 [0m(+0.002158045768737793)
     | > avg_loss:[92m -0.09924364555627108 [0m(-0.0036681564524769783)
     | > avg_log_mle:[92m -0.2893090844154358 [0m(-0.003956146538257599)
     | > avg_loss_dur:[91m 0.19006543885916471 [0m(+0.0002879900857806206)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_42624.pth

[4m[1m > EPOCH: 49/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:16:59) [0m

[1m   --> TIME: 2025-02-23 03:17:02 -- STEP: 1/406 -- GLOBAL_STEP: 42625[0m
     | > loss: -0.1002843827009201  (-0.1002843827009201)
     | > log_mle: -0.2576467990875244  (-0.2576467990875244)
     | > loss_dur: 0.1573624163866043  (0.1573624163866043)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(23.3471, device='cuda:0')  (tensor(23.3471, device='cuda:0'))
     | > current_lr: 1.225e-05 
     | > step_time: 0.9626  (0.96264553070




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00340406596660614 [0m(-0.0022067874670028687)
     | > avg_loss:[92m -0.10881219245493412 [0m(-0.009568546898663044)
     | > avg_log_mle:[92m -0.2967046797275543 [0m(-0.00739559531211853)
     | > avg_loss_dur:[92m 0.1878924872726202 [0m(-0.0021729515865445137)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_43030.pth

[4m[1m > EPOCH: 50/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:21:53) [0m

[1m   --> TIME: 2025-02-23 03:22:04 -- STEP: 20/406 -- GLOBAL_STEP: 43050[0m
     | > loss: -0.08778505027294159  (-0.09222386851906776)
     | > log_mle: -0.24424207210540771  (-0.26204934120178225)
     | > loss_dur: 0.15645702183246613  (0.16982547268271447)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(4.6692, device='cuda:0')  (tensor(17.9533, device='cuda:0'))
     | > current_lr: 1.2499999999999999e-05 
     | > step_time: 0.9264




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.003821849822998047 [0m(+0.00041778385639190674)
     | > avg_loss:[92m -0.10937992762774229 [0m(-0.0005677351728081703)
     | > avg_log_mle:[92m -0.2969270274043083 [0m(-0.0002223476767539978)
     | > avg_loss_dur:[92m 0.18754709977656603 [0m(-0.0003453874960541725)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_43436.pth

[4m[1m > EPOCH: 51/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:26:48) [0m

[1m   --> TIME: 2025-02-23 03:26:55 -- STEP: 14/406 -- GLOBAL_STEP: 43450[0m
     | > loss: -0.07358141243457794  (-0.09779856247561318)
     | > log_mle: -0.27536606788635254  (-0.2659557972635542)
     | > loss_dur: 0.2017846554517746  (0.16815723478794098)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(24.4697, device='cuda:0')  (tensor(17.0584, device='cuda:0'))
     | > current_lr: 1.275e-05 
     | > step_time: 0.5378  (0.382




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0033137500286102295 [0m(-0.0005080997943878174)
     | > avg_loss:[92m -0.11170761100947857 [0m(-0.0023276833817362785)
     | > avg_log_mle:[92m -0.2980780825018883 [0m(-0.001151055097579956)
     | > avg_loss_dur:[92m 0.1863704714924097 [0m(-0.0011766282841563225)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_43842.pth

[4m[1m > EPOCH: 52/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:31:47) [0m

[1m   --> TIME: 2025-02-23 03:31:52 -- STEP: 8/406 -- GLOBAL_STEP: 43850[0m
     | > loss: -0.12580673396587372  (-0.1020219624042511)
     | > log_mle: -0.27539122104644775  (-0.2662690132856369)
     | > loss_dur: 0.14958448708057404  (0.1642470508813858)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(26.9448, device='cuda:0')  (tensor(23.9821, device='cuda:0'))
     | > current_lr: 1.3e-05 
     | > step_time: 0.4828  (0.389728754




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.003899648785591127 [0m(+0.0005858987569808973)
     | > avg_loss:[92m -0.11291216406971216 [0m(-0.001204553060233593)
     | > avg_log_mle:[92m -0.2984122186899185 [0m(-0.0003341361880302429)
     | > avg_loss_dur:[92m 0.18550005462020636 [0m(-0.0008704168722033501)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_44248.pth

[4m[1m > EPOCH: 53/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:36:42) [0m

[1m   --> TIME: 2025-02-23 03:36:45 -- STEP: 2/406 -- GLOBAL_STEP: 44250[0m
     | > loss: -0.12613557279109955  (-0.12309597432613373)
     | > log_mle: -0.2809901237487793  (-0.2736174464225769)
     | > loss_dur: 0.15485455095767975  (0.15052147209644318)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(25.4321, device='cuda:0')  (tensor(21.1775, device='cuda:0'))
     | > current_lr: 1.325e-05 
     | > step_time: 0.3691  (0.510987




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003232300281524658 [0m(-0.0006673485040664686)
     | > avg_loss:[92m -0.11511114332824945 [0m(-0.0021989792585372925)
     | > avg_log_mle:[92m -0.2990977391600609 [0m(-0.0006855204701423645)
     | > avg_loss_dur:[92m 0.18398659583181143 [0m(-0.001513458788394928)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_44654.pth

[4m[1m > EPOCH: 54/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:41:34) [0m

[1m   --> TIME: 2025-02-23 03:41:47 -- STEP: 21/406 -- GLOBAL_STEP: 44675[0m
     | > loss: -0.09430044889450073  (-0.10144783556461334)
     | > log_mle: -0.27129650115966797  (-0.268403104373387)
     | > loss_dur: 0.17699605226516724  (0.1669552688087736)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(16.6156, device='cuda:0')  (tensor(22.1678, device='cuda:0'))
     | > current_lr: 1.35e-05 
     | > step_time: 0.5364  (0.4482366




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.003908157348632813 [0m(+0.0006758570671081552)
     | > avg_loss:[92m -0.11694339383393526 [0m(-0.0018322505056858063)
     | > avg_log_mle:[92m -0.3013983592391014 [0m(-0.0023006200790405273)
     | > avg_loss_dur:[91m 0.18445496540516615 [0m(+0.00046836957335472107)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_45060.pth

[4m[1m > EPOCH: 55/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:46:38) [0m

[1m   --> TIME: 2025-02-23 03:46:45 -- STEP: 15/406 -- GLOBAL_STEP: 45075[0m
     | > loss: -0.09101343154907227  (-0.10585810343424479)
     | > log_mle: -0.2725414037704468  (-0.27295780976613365)
     | > loss_dur: 0.1815279722213745  (0.16709970633188884)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(31.1442, device='cuda:0')  (tensor(26.0253, device='cuda:0'))
     | > current_lr: 1.375e-05 
     | > step_time: 0.519  (0.3675




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003559216856956482 [0m(-0.00034894049167633143)
     | > avg_loss:[92m -0.11976622324436903 [0m(-0.0028228294104337692)
     | > avg_log_mle:[92m -0.30519184470176697 [0m(-0.003793485462665558)
     | > avg_loss_dur:[91m 0.18542562145739794 [0m(+0.0009706560522317886)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_45466.pth

[4m[1m > EPOCH: 56/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:51:29) [0m

[1m   --> TIME: 2025-02-23 03:51:34 -- STEP: 9/406 -- GLOBAL_STEP: 45475[0m
     | > loss: -0.0907905101776123  (-0.11149569849173228)
     | > log_mle: -0.2765108346939087  (-0.27383266554938424)
     | > loss_dur: 0.1857203245162964  (0.16233696705765194)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(25.7537, device='cuda:0')  (tensor(21.1825, device='cuda:0'))
     | > current_lr: 1.4e-05 
     | > step_time: 0.2744  (0.3499859




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.003877848386764526 [0m(+0.000318631529808044)
     | > avg_loss:[92m -0.12581003084778783 [0m(-0.006043807603418799)
     | > avg_log_mle:[92m -0.30868804454803467 [0m(-0.0034961998462677)
     | > avg_loss_dur:[92m 0.1828780137002468 [0m(-0.002547607757151127)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_45872.pth

[4m[1m > EPOCH: 57/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 03:56:21) [0m

[1m   --> TIME: 2025-02-23 03:56:25 -- STEP: 3/406 -- GLOBAL_STEP: 45875[0m
     | > loss: -0.10669612884521484  (-0.11986365914344788)
     | > log_mle: -0.2815312147140503  (-0.28273101647694904)
     | > loss_dur: 0.17483508586883545  (0.1628673573335012)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(34.8021, device='cuda:0')  (tensor(30.6327, device='cuda:0'))
     | > current_lr: 1.425e-05 
     | > step_time: 0.2917  (0.42196695009




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0034363120794296265 [0m(-0.00044153630733489947)
     | > avg_loss:[92m -0.13074125628918412 [0m(-0.004931225441396292)
     | > avg_log_mle:[92m -0.310304656624794 [0m(-0.0016166120767593384)
     | > avg_loss_dur:[92m 0.1795634003356099 [0m(-0.003314613364636898)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_46278.pth

[4m[1m > EPOCH: 58/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:01:13) [0m

[1m   --> TIME: 2025-02-23 04:01:25 -- STEP: 22/406 -- GLOBAL_STEP: 46300[0m
     | > loss: -0.08626747131347656  (-0.11233527687462894)
     | > log_mle: -0.26568603515625  (-0.2761745344508778)
     | > loss_dur: 0.17941856384277344  (0.16383925757624887)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(12.4685, device='cuda:0')  (tensor(21.8591, device='cuda:0'))
     | > current_lr: 1.45e-05 
     | > step_time: 0.4004  (0.442608670




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004582032561302185 [0m(+0.0011457204818725586)
     | > avg_loss:[92m -0.13325390592217445 [0m(-0.0025126496329903325)
     | > avg_log_mle:[92m -0.3126504570245743 [0m(-0.0023458003997802734)
     | > avg_loss_dur:[92m 0.17939655110239983 [0m(-0.00016684923321008682)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_46684.pth

[4m[1m > EPOCH: 59/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:06:07) [0m

[1m   --> TIME: 2025-02-23 04:06:15 -- STEP: 16/406 -- GLOBAL_STEP: 46700[0m
     | > loss: -0.11629067361354828  (-0.12176601402461529)
     | > log_mle: -0.2811928987503052  (-0.28138429671525955)
     | > loss_dur: 0.1649022251367569  (0.15961828269064426)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(20.6439, device='cuda:0')  (tensor(23.2995, device='cuda:0'))
     | > current_lr: 1.475e-05 
     | > step_time: 0.4529  (0.382




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0034376829862594604 [0m(-0.0011443495750427246)
     | > avg_loss:[91m -0.12770349159836772 [0m(+0.005550414323806735)
     | > avg_log_mle:[91m -0.30929598957300186 [0m(+0.003354467451572418)
     | > avg_loss_dur:[91m 0.18159249797463417 [0m(+0.0021959468722343445)


[4m[1m > EPOCH: 60/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:10:58) [0m

[1m   --> TIME: 2025-02-23 04:11:04 -- STEP: 10/406 -- GLOBAL_STEP: 47100[0m
     | > loss: -0.14001287519931793  (-0.12425225377082824)
     | > log_mle: -0.2858097553253174  (-0.28134191036224365)
     | > loss_dur: 0.14579688012599945  (0.1570896565914154)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(17.9096, device='cuda:0')  (tensor(21.8384, device='cuda:0'))
     | > current_lr: 1.4999999999999999e-05 
     | > step_time: 0.3459  (0.41073424816131593)
     | > loader_time: 0.0028  (0.007928180694580077)




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.0035121142864227295 [0m(+7.443130016326904e-05)
     | > avg_loss:[92m -0.1313770730048418 [0m(-0.0036735814064740857)
     | > avg_log_mle:[92m -0.31334853917360306 [0m(-0.004052549600601196)
     | > avg_loss_dur:[91m 0.18197146616876125 [0m(+0.0003789681941270828)


[4m[1m > EPOCH: 61/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:15:50) [0m

[1m   --> TIME: 2025-02-23 04:15:54 -- STEP: 4/406 -- GLOBAL_STEP: 47500[0m
     | > loss: -0.10970067977905273  (-0.1257842518389225)
     | > log_mle: -0.2678046226501465  (-0.2840851843357086)
     | > loss_dur: 0.15810394287109375  (0.15830093249678612)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(18.2369, device='cuda:0')  (tensor(32.6531, device='cuda:0'))
     | > current_lr: 1.525e-05 
     | > step_time: 0.2985  (0.46642202138900757)
     | > loader_time: 0.0038  (0.00969153642654419)


[1m   --> TI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.003980562090873718 [0m(+0.00046844780445098877)
     | > avg_loss:[92m -0.1365936230868101 [0m(-0.0052165500819683075)
     | > avg_log_mle:[92m -0.31372776627540583 [0m(-0.0003792271018027704)
     | > avg_loss_dur:[92m 0.17713414318859577 [0m(-0.004837322980165482)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_47902.pth

[4m[1m > EPOCH: 62/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:20:43) [0m

[1m   --> TIME: 2025-02-23 04:20:56 -- STEP: 23/406 -- GLOBAL_STEP: 47925[0m
     | > loss: -0.1062772274017334  (-0.12289174484170004)
     | > log_mle: -0.28684568405151367  (-0.28463765849237854)
     | > loss_dur: 0.18056845664978027  (0.16174591365067856)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(29.7224, device='cuda:0')  (tensor(33.2346, device='cuda:0'))
     | > current_lr: 1.55e-05 
     | > step_time: 0.4596  (0.4542




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.00367186963558197 [0m(-0.00030869245529174805)
     | > avg_loss:[92m -0.13973656576126814 [0m(-0.003142942674458027)
     | > avg_log_mle:[92m -0.31584708392620087 [0m(-0.0021193176507950384)
     | > avg_loss_dur:[92m 0.17611051816493273 [0m(-0.001023625023663044)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_48308.pth

[4m[1m > EPOCH: 63/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:25:33) [0m

[1m   --> TIME: 2025-02-23 04:25:42 -- STEP: 17/406 -- GLOBAL_STEP: 48325[0m
     | > loss: -0.12055826187133789  (-0.1276967762147679)
     | > log_mle: -0.2866630554199219  (-0.28648902388180003)
     | > loss_dur: 0.16610479354858398  (0.15879224766703212)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(10.8390, device='cuda:0')  (tensor(33.4214, device='cuda:0'))
     | > current_lr: 1.575e-05 
     | > step_time: 0.5938  (0.40905




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0033358782529830933 [0m(-0.00033599138259887695)
     | > avg_loss:[92m -0.14280259143561125 [0m(-0.003066025674343109)
     | > avg_log_mle:[92m -0.32012833654880524 [0m(-0.00428125262260437)
     | > avg_loss_dur:[91m 0.177325745113194 [0m(+0.001215226948261261)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_48714.pth

[4m[1m > EPOCH: 64/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:30:27) [0m

[1m   --> TIME: 2025-02-23 04:30:34 -- STEP: 11/406 -- GLOBAL_STEP: 48725[0m
     | > loss: -0.13883136212825775  (-0.13486190004782242)
     | > log_mle: -0.2859393358230591  (-0.2899595824154941)
     | > loss_dur: 0.14710797369480133  (0.15509768236767163)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(20.8893, device='cuda:0')  (tensor(25.5128, device='cuda:0'))
     | > current_lr: 1.6e-05 
     | > step_time: 0.422  (0.3786639733




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.007779508829116821 [0m(+0.004443630576133728)
     | > avg_loss:[92m -0.14717913139611483 [0m(-0.004376539960503578)
     | > avg_log_mle:[92m -0.32262784242630005 [0m(-0.002499505877494812)
     | > avg_loss_dur:[92m 0.17544871103018522 [0m(-0.0018770340830087662)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_49120.pth

[4m[1m > EPOCH: 65/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:35:22) [0m

[1m   --> TIME: 2025-02-23 04:35:26 -- STEP: 5/406 -- GLOBAL_STEP: 49125[0m
     | > loss: -0.11874695122241974  (-0.1381382405757904)
     | > log_mle: -0.28186511993408203  (-0.28751091957092284)
     | > loss_dur: 0.1631181687116623  (0.14937267899513246)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(15.8163, device='cuda:0')  (tensor(14.7797, device='cuda:0'))
     | > current_lr: 1.625e-05 
     | > step_time: 0.2496  (0.4078331




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.004674792289733887 [0m(-0.0031047165393829346)
     | > avg_loss:[91m -0.1414103228598833 [0m(+0.005768808536231518)
     | > avg_log_mle:[91m -0.3193267658352852 [0m(+0.003301076591014862)
     | > avg_loss_dur:[91m 0.17791644297540188 [0m(+0.0024677319452166557)


[4m[1m > EPOCH: 66/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:40:17) [0m

[1m   --> TIME: 2025-02-23 04:40:29 -- STEP: 24/406 -- GLOBAL_STEP: 49550[0m
     | > loss: -0.13413631916046143  (-0.13334787885348)
     | > log_mle: -0.2921711206436157  (-0.29098385075728095)
     | > loss_dur: 0.1580348014831543  (0.15763597190380096)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(12.7755, device='cuda:0')  (tensor(23.6069, device='cuda:0'))
     | > current_lr: 1.65e-05 
     | > step_time: 0.624  (0.4330865740776062)
     | > loader_time: 0.0056  (0.0071305930614471436)


[1m   --> TIME: 2




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003737837076187134 [0m(-0.0009369552135467529)
     | > avg_loss:[92m -0.1489560566842556 [0m(-0.0075457338243722916)
     | > avg_log_mle:[92m -0.32388734817504883 [0m(-0.004560582339763641)
     | > avg_loss_dur:[92m 0.17493129149079323 [0m(-0.00298515148460865)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_49932.pth

[4m[1m > EPOCH: 67/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:45:17) [0m

[1m   --> TIME: 2025-02-23 04:45:27 -- STEP: 18/406 -- GLOBAL_STEP: 49950[0m
     | > loss: -0.13473673164844513  (-0.13975533677472007)
     | > log_mle: -0.2887082099914551  (-0.2961366375287374)
     | > loss_dur: 0.15397147834300995  (0.1563813007540173)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(14.4891, device='cuda:0')  (tensor(23.7206, device='cuda:0'))
     | > current_lr: 1.675e-05 
     | > step_time: 0.5152  (0.43677042




[1m   --> TIME: 2025-02-23 04:46:16 -- STEP: 93/406 -- GLOBAL_STEP: 50025[0m
     | > loss: -0.12235534191131592  (-0.12775209081429306)
     | > log_mle: -0.324285626411438  (-0.29837177261229486)
     | > loss_dur: 0.20193028450012207  (0.17061968179800174)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(27.8795, device='cuda:0')  (tensor(26.0891, device='cuda:0'))
     | > current_lr: 1.675e-05 
     | > step_time: 0.8672  (0.5621722795630012)
     | > loader_time: 0.0063  (0.01027088011464765)


[1m   --> TIME: 2025-02-23 04:46:32 -- STEP: 118/406 -- GLOBAL_STEP: 50050[0m
     | > loss: -0.1150558739900589  (-0.12672268599271777)
     | > log_mle: -0.31478452682495117  (-0.30039820933746075)
     | > loss_dur: 0.19972865283489227  (0.17367552334474304)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(29.7210, device='cuda:0')  (tensor(25.9197, device='cuda:0'))
     | > current_lr: 1.675e-05 
     | > step_time: 0.5481  (0.57538101228617)
     




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0031577497720718384 [0m(-0.0005800873041152954)
     | > avg_loss:[92m -0.1537669114768505 [0m(-0.00481085479259491)
     | > avg_log_mle:[92m -0.3274060934782028 [0m(-0.0035187453031539917)
     | > avg_loss_dur:[92m 0.1736391820013523 [0m(-0.001292109489440918)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_50338.pth

[4m[1m > EPOCH: 68/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:50:28) [0m

[1m   --> TIME: 2025-02-23 04:50:35 -- STEP: 12/406 -- GLOBAL_STEP: 50350[0m
     | > loss: -0.14496257901191711  (-0.1457968627413114)
     | > log_mle: -0.3049297332763672  (-0.2962477008501689)
     | > loss_dur: 0.15996715426445007  (0.15045083810885748)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(27.4425, device='cuda:0')  (tensor(36.0976, device='cuda:0'))
     | > current_lr: 1.7e-05 
     | > step_time: 0.298  (0.346721212069




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.0038919895887374887 [0m(+0.0007342398166656503)
     | > avg_loss:[92m -0.15485240053385496 [0m(-0.0010854890570044518)
     | > avg_log_mle:[92m -0.32890549302101135 [0m(-0.0014993995428085327)
     | > avg_loss_dur:[91m 0.1740530924871564 [0m(+0.00041391048580408096)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_50744.pth

[4m[1m > EPOCH: 69/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 04:55:41) [0m

[1m   --> TIME: 2025-02-23 04:55:46 -- STEP: 6/406 -- GLOBAL_STEP: 50750[0m
     | > loss: -0.1650433987379074  (-0.14810416350762048)
     | > log_mle: -0.2978832721710205  (-0.2985902825991313)
     | > loss_dur: 0.1328398734331131  (0.15048611909151077)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(23.1818, device='cuda:0')  (tensor(27.2755, device='cuda:0'))
     | > current_lr: 1.725e-05 
     | > step_time: 0.3925  (0.41896




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0029001235961914062 [0m(-0.0009918659925460824)
     | > avg_loss:[92m -0.15875046607106924 [0m(-0.003898065537214279)
     | > avg_log_mle:[92m -0.33180759847164154 [0m(-0.002902105450630188)
     | > avg_loss_dur:[92m 0.1730571324005723 [0m(-0.0009959600865840912)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_51150.pth

[4m[1m > EPOCH: 70/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:00:51) [0m

[1m   --> TIME: 2025-02-23 05:00:53 -- STEP: 0/406 -- GLOBAL_STEP: 51150[0m
     | > loss: -0.15336504578590393  (-0.15336504578590393)
     | > log_mle: -0.30114877223968506  (-0.30114877223968506)
     | > loss_dur: 0.14778372645378113  (0.14778372645378113)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(12.0716, device='cuda:0')  (tensor(12.0716, device='cuda:0'))
     | > current_lr: 1.7500000000000002e-05 
     | > step_time: 1.




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.005543708801269531 [0m(+0.002643585205078125)
     | > avg_loss:[92m -0.16203115973621607 [0m(-0.0032806936651468277)
     | > avg_log_mle:[92m -0.3339210972189903 [0m(-0.0021134987473487854)
     | > avg_loss_dur:[92m 0.17188993748277426 [0m(-0.0011671949177980423)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_51556.pth

[4m[1m > EPOCH: 71/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:06:11) [0m

[1m   --> TIME: 2025-02-23 05:06:23 -- STEP: 19/406 -- GLOBAL_STEP: 51575[0m
     | > loss: -0.13566072285175323  (-0.1491203331633618)
     | > log_mle: -0.28561830520629883  (-0.3015897462242528)
     | > loss_dur: 0.1499575823545456  (0.152469413060891)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(37.9091, device='cuda:0')  (tensor(24.7452, device='cuda:0'))
     | > current_lr: 1.775e-05 
     | > step_time: 0.7626  (0.47432213




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003611251711845398 [0m(-0.0019324570894241333)
     | > avg_loss:[91m -0.16102516278624535 [0m(+0.0010059969499707222)
     | > avg_log_mle:[91m -0.3325834199786186 [0m(+0.001337677240371704)
     | > avg_loss_dur:[92m 0.17155825719237328 [0m(-0.0003316802904009819)


[4m[1m > EPOCH: 72/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:11:25) [0m

[1m   --> TIME: 2025-02-23 05:11:33 -- STEP: 13/406 -- GLOBAL_STEP: 51975[0m
     | > loss: -0.12792937457561493  (-0.15318731619761541)
     | > log_mle: -0.3030017614364624  (-0.303181529045105)
     | > loss_dur: 0.17507238686084747  (0.14999421284748957)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(50.1143, device='cuda:0')  (tensor(31.3990, device='cuda:0'))
     | > current_lr: 1.8e-05 
     | > step_time: 0.4989  (0.3791739023648776)
     | > loader_time: 0.0043  (0.0066986083984375)


[1m   --> TIME: 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004387393593788147 [0m(+0.000776141881942749)
     | > avg_loss:[91m -0.15850598085671663 [0m(+0.0025191819295287132)
     | > avg_log_mle:[91m -0.32785723358392715 [0m(+0.004726186394691467)
     | > avg_loss_dur:[92m 0.16935125272721052 [0m(-0.002207004465162754)


[4m[1m > EPOCH: 73/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:16:35) [0m

[1m   --> TIME: 2025-02-23 05:16:40 -- STEP: 7/406 -- GLOBAL_STEP: 52375[0m
     | > loss: -0.16361106932163239  (-0.1635994975055967)
     | > log_mle: -0.29872190952301025  (-0.3040471758161272)
     | > loss_dur: 0.13511084020137787  (0.14044767831053054)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(27.7337, device='cuda:0')  (tensor(31.1818, device='cuda:0'))
     | > current_lr: 1.825e-05 
     | > step_time: 0.3416  (0.40532915932791574)
     | > loader_time: 0.0031  (0.009284734725952148)


[1m   --> TI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004943251609802246 [0m(+0.0005558580160140991)
     | > avg_loss:[92m -0.1641017161309719 [0m(-0.005595735274255276)
     | > avg_log_mle:[92m -0.3352867290377617 [0m(-0.007429495453834534)
     | > avg_loss_dur:[91m 0.17118501290678978 [0m(+0.001833760179579258)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_52774.pth

[4m[1m > EPOCH: 74/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:21:49) [0m

[1m   --> TIME: 2025-02-23 05:21:52 -- STEP: 1/406 -- GLOBAL_STEP: 52775[0m
     | > loss: -0.16468513011932373  (-0.16468513011932373)
     | > log_mle: -0.3067495822906494  (-0.3067495822906494)
     | > loss_dur: 0.14206445217132568  (0.14206445217132568)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(16.7525, device='cuda:0')  (tensor(16.7525, device='cuda:0'))
     | > current_lr: 1.8500000000000002e-05 
     | > step_time: 0.4681 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.004894331097602844 [0m(-4.8920512199401855e-05)
     | > avg_loss:[91m -0.1632862789556384 [0m(+0.0008154371753334999)
     | > avg_log_mle:[91m -0.33418862521648407 [0m(+0.0010981038212776184)
     | > avg_loss_dur:[92m 0.17090234626084566 [0m(-0.0002826666459441185)


[4m[1m > EPOCH: 75/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:27:13) [0m

[1m   --> TIME: 2025-02-23 05:27:24 -- STEP: 20/406 -- GLOBAL_STEP: 53200[0m
     | > loss: -0.146598681807518  (-0.15489529371261596)
     | > log_mle: -0.2902425527572632  (-0.30665205121040345)
     | > loss_dur: 0.14364387094974518  (0.1517567578703165)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(12.9080, device='cuda:0')  (tensor(39.1838, device='cuda:0'))
     | > current_lr: 1.875e-05 
     | > step_time: 0.8156  (0.45069730281829834)
     | > loader_time: 0.0063  (0.006329643726348877)


[1m   --> 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003397807478904724 [0m(-0.0014965236186981201)
     | > avg_loss:[92m -0.17172334995120764 [0m(-0.008437070995569229)
     | > avg_log_mle:[92m -0.3407226726412773 [0m(-0.006534047424793243)
     | > avg_loss_dur:[92m 0.16899932269006968 [0m(-0.0019030235707759857)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_53586.pth

[4m[1m > EPOCH: 76/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:32:26) [0m

[1m   --> TIME: 2025-02-23 05:32:34 -- STEP: 14/406 -- GLOBAL_STEP: 53600[0m
     | > loss: -0.13623479008674622  (-0.16303448379039762)
     | > log_mle: -0.31828927993774414  (-0.3105431539671762)
     | > loss_dur: 0.18205448985099792  (0.14750867017677852)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(15.3273, device='cuda:0')  (tensor(30.6104, device='cuda:0'))
     | > current_lr: 1.8999999999999998e-05 
     | > step_time: 0.7




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004057079553604126 [0m(+0.0006592720746994019)
     | > avg_loss:[91m -0.16365679074078798 [0m(+0.008066559210419655)
     | > avg_log_mle:[91m -0.3403759300708771 [0m(+0.00034674257040023804)
     | > avg_loss_dur:[91m 0.17671913979575038 [0m(+0.007719817105680704)


[4m[1m > EPOCH: 77/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:37:38) [0m

[1m   --> TIME: 2025-02-23 05:37:43 -- STEP: 8/406 -- GLOBAL_STEP: 54000[0m
     | > loss: -0.17183007299900055  (-0.1701611652970314)
     | > log_mle: -0.3179757595062256  (-0.31081423163414)
     | > loss_dur: 0.14614568650722504  (0.1406530663371086)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(56.3408, device='cuda:0')  (tensor(34.2547, device='cuda:0'))
     | > current_lr: 1.925e-05 
     | > step_time: 0.3555  (0.3952004015445709)
     | > loader_time: 0.007  (0.005542576313018799)


[1m   --> TIME: 2




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0036727339029312134 [0m(-0.0003843456506729126)
     | > avg_loss:[92m -0.1731868525967002 [0m(-0.009530061855912209)
     | > avg_log_mle:[92m -0.342694416642189 [0m(-0.0023184865713119507)
     | > avg_loss_dur:[92m 0.16950756404548883 [0m(-0.007211575750261545)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_54398.pth

[4m[1m > EPOCH: 78/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:42:48) [0m

[1m   --> TIME: 2025-02-23 05:42:51 -- STEP: 2/406 -- GLOBAL_STEP: 54400[0m
     | > loss: -0.20585092902183533  (-0.1959747076034546)
     | > log_mle: -0.3293813467025757  (-0.321320116519928)
     | > loss_dur: 0.12353041023015976  (0.1253454051911831)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(39.9396, device='cuda:0')  (tensor(33.4641, device='cuda:0'))
     | > current_lr: 1.95e-05 
     | > step_time: 0.4704  (0.503565907478




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.0046873539686203 [0m(+0.001014620065689087)
     | > avg_loss:[91m -0.17214983236044645 [0m(+0.0010370202362537384)
     | > avg_log_mle:[92m -0.3433757424354553 [0m(-0.0006813257932662964)
     | > avg_loss_dur:[91m 0.17122591054067016 [0m(+0.001718346495181322)


[4m[1m > EPOCH: 79/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:48:06) [0m

[1m   --> TIME: 2025-02-23 05:48:18 -- STEP: 21/406 -- GLOBAL_STEP: 54825[0m
     | > loss: -0.15756218135356903  (-0.16632794766199022)
     | > log_mle: -0.3117849826812744  (-0.3135868197395688)
     | > loss_dur: 0.15422280132770538  (0.14725887243236815)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(37.9866, device='cuda:0')  (tensor(37.4502, device='cuda:0'))
     | > current_lr: 1.975e-05 
     | > step_time: 0.4787  (0.4539408116113572)
     | > loader_time: 0.0121  (0.010563907169160388)


[1m   --> TIME




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003938347101211548 [0m(-0.0007490068674087524)
     | > avg_loss:[92m -0.17602885514497757 [0m(-0.0038790227845311165)
     | > avg_log_mle:[92m -0.34553591907024384 [0m(-0.002160176634788513)
     | > avg_loss_dur:[92m 0.16950706439092755 [0m(-0.0017188461497426033)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_55210.pth

[4m[1m > EPOCH: 80/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:53:18) [0m

[1m   --> TIME: 2025-02-23 05:53:26 -- STEP: 15/406 -- GLOBAL_STEP: 55225[0m
     | > loss: -0.15588431060314178  (-0.17403334975242615)
     | > log_mle: -0.31775975227355957  (-0.318018118540446)
     | > loss_dur: 0.16187544167041779  (0.14398476878801986)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(40.5282, device='cuda:0')  (tensor(41.1512, device='cuda:0'))
     | > current_lr: 1.9999999999999998e-05 
     | > step_time: 0.




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0035561025142669678 [0m(-0.0003822445869445801)
     | > avg_loss:[92m -0.17981432378292084 [0m(-0.003785468637943268)
     | > avg_log_mle:[92m -0.3466491997241974 [0m(-0.0011132806539535522)
     | > avg_loss_dur:[92m 0.16683487594127655 [0m(-0.002672188449651003)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_55616.pth

[4m[1m > EPOCH: 81/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 05:58:31) [0m

[1m   --> TIME: 2025-02-23 05:58:38 -- STEP: 9/406 -- GLOBAL_STEP: 55625[0m
     | > loss: -0.15091855823993683  (-0.17899363570743138)
     | > log_mle: -0.320245623588562  (-0.3178691864013672)
     | > loss_dur: 0.16932706534862518  (0.13887554903825125)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(20.4426, device='cuda:0')  (tensor(32.7755, device='cuda:0'))
     | > current_lr: 2.025e-05 
     | > step_time: 0.2924  (0.4195710




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004763677716255188 [0m(+0.0012075752019882202)
     | > avg_loss:[92m -0.18254298716783524 [0m(-0.002728663384914398)
     | > avg_log_mle:[92m -0.34952379763126373 [0m(-0.002874597907066345)
     | > avg_loss_dur:[91m 0.16698081046342847 [0m(+0.00014593452215191927)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_56022.pth

[4m[1m > EPOCH: 82/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:03:52) [0m

[1m   --> TIME: 2025-02-23 06:03:56 -- STEP: 3/406 -- GLOBAL_STEP: 56025[0m
     | > loss: -0.17343711853027344  (-0.1892613818248113)
     | > log_mle: -0.32447731494903564  (-0.3258732557296753)
     | > loss_dur: 0.1510401964187622  (0.13661187390486398)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(55.6301, device='cuda:0')  (tensor(35.9005, device='cuda:0'))
     | > current_lr: 2.05e-05 
     | > step_time: 0.3659  (0.4783401




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.004628106951713562 [0m(-0.00013557076454162598)
     | > avg_loss:[92m -0.18405008595436811 [0m(-0.0015070987865328789)
     | > avg_log_mle:[92m -0.3505837172269821 [0m(-0.0010599195957183838)
     | > avg_loss_dur:[92m 0.166533631272614 [0m(-0.00044717919081446733)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_56428.pth

[4m[1m > EPOCH: 83/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:09:04) [0m

[1m   --> TIME: 2025-02-23 06:09:17 -- STEP: 22/406 -- GLOBAL_STEP: 56450[0m
     | > loss: -0.15524403750896454  (-0.17300861396572806)
     | > log_mle: -0.3062649965286255  (-0.3187147704037754)
     | > loss_dur: 0.15102095901966095  (0.14570615609938448)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(56.8284, device='cuda:0')  (tensor(30.0324, device='cuda:0'))
     | > current_lr: 2.075e-05 
     | > step_time: 0.8294  (0.4773




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.004302874207496644 [0m(-0.0003252327442169181)
     | > avg_loss:[92m -0.19241503067314625 [0m(-0.008364944718778133)
     | > avg_log_mle:[92m -0.3565824553370476 [0m(-0.00599873811006546)
     | > avg_loss_dur:[92m 0.16416742512956262 [0m(-0.002366206143051386)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_56834.pth

[4m[1m > EPOCH: 84/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:14:22) [0m

[1m   --> TIME: 2025-02-23 06:14:31 -- STEP: 16/406 -- GLOBAL_STEP: 56850[0m
     | > loss: -0.16377857327461243  (-0.18056170269846916)
     | > log_mle: -0.32369303703308105  (-0.3236074820160866)
     | > loss_dur: 0.15991446375846863  (0.14304577931761742)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(18.6698, device='cuda:0')  (tensor(34.4568, device='cuda:0'))
     | > current_lr: 2.1e-05 
     | > step_time: 0.8184  (0.446867719




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0038348138332366943 [0m(-0.0004680603742599496)
     | > avg_loss:[91m -0.18910446669906378 [0m(+0.00331056397408247)
     | > avg_log_mle:[91m -0.35334593057632446 [0m(+0.003236524760723114)
     | > avg_loss_dur:[91m 0.16424146434292197 [0m(+7.403921335935593e-05)


[4m[1m > EPOCH: 85/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:19:29) [0m

[1m   --> TIME: 2025-02-23 06:19:37 -- STEP: 10/406 -- GLOBAL_STEP: 57250[0m
     | > loss: -0.19485749304294586  (-0.1892242342233658)
     | > log_mle: -0.33021867275238037  (-0.32445459365844725)
     | > loss_dur: 0.1353611797094345  (0.13523036018013954)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(35.6709, device='cuda:0')  (tensor(42.8134, device='cuda:0'))
     | > current_lr: 2.125e-05 
     | > step_time: 0.3969  (0.4345935583114624)
     | > loader_time: 0.0158  (0.00798494815826416)


[1m   --> TI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.003972500562667847 [0m(+0.00013768672943115234)
     | > avg_loss:[92m -0.19540230091661215 [0m(-0.00629783421754837)
     | > avg_log_mle:[92m -0.3601209223270416 [0m(-0.006774991750717163)
     | > avg_loss_dur:[91m 0.16471862141042948 [0m(+0.0004771570675075054)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_57646.pth

[4m[1m > EPOCH: 86/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:24:43) [0m

[1m   --> TIME: 2025-02-23 06:24:47 -- STEP: 4/406 -- GLOBAL_STEP: 57650[0m
     | > loss: -0.17283087968826294  (-0.1907222792506218)
     | > log_mle: -0.3068023920059204  (-0.3260882794857025)
     | > loss_dur: 0.13397151231765747  (0.13536600023508072)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(12.0253, device='cuda:0')  (tensor(18.5789, device='cuda:0'))
     | > current_lr: 2.15e-05 
     | > step_time: 0.3358  (0.456088125




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.00601997971534729 [0m(+0.0020474791526794434)
     | > avg_loss:[91m -0.19063615892082453 [0m(+0.0047661419957876205)
     | > avg_log_mle:[91m -0.3556227460503578 [0m(+0.004498176276683807)
     | > avg_loss_dur:[91m 0.1649865871295333 [0m(+0.00026796571910381317)


[4m[1m > EPOCH: 87/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:29:51) [0m

[1m   --> TIME: 2025-02-23 06:30:04 -- STEP: 23/406 -- GLOBAL_STEP: 58075[0m
     | > loss: -0.1655040681362152  (-0.17871891091699182)
     | > log_mle: -0.3228820562362671  (-0.32242045195206354)
     | > loss_dur: 0.15737798810005188  (0.14370154103507166)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(24.7883, device='cuda:0')  (tensor(26.1379, device='cuda:0'))
     | > current_lr: 2.175e-05 
     | > step_time: 0.4988  (0.47400788638902747)
     | > loader_time: 0.0046  (0.010643534038377846)


[1m   --> T




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.004264920949935913 [0m(-0.001755058765411377)
     | > avg_loss:[92m -0.19360852614045143 [0m(-0.0029723672196269035)
     | > avg_log_mle:[92m -0.3583218976855278 [0m(-0.002699151635169983)
     | > avg_loss_dur:[92m 0.16471337154507637 [0m(-0.0002732155844569206)


[4m[1m > EPOCH: 88/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:35:01) [0m

[1m   --> TIME: 2025-02-23 06:35:10 -- STEP: 17/406 -- GLOBAL_STEP: 58475[0m
     | > loss: -0.16938742995262146  (-0.18674863699604483)
     | > log_mle: -0.3288954496383667  (-0.3301240416134105)
     | > loss_dur: 0.15950801968574524  (0.14337540417909622)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(17.7008, device='cuda:0')  (tensor(30.1697, device='cuda:0'))
     | > current_lr: 2.2e-05 
     | > step_time: 0.4378  (0.4373334435855641)
     | > loader_time: 0.0149  (0.00760342093075023)


[1m   --> TIME:




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004420936107635498 [0m(+0.00015601515769958496)
     | > avg_loss:[92m -0.19620532356202602 [0m(-0.0025967974215745926)
     | > avg_log_mle:[92m -0.36119935661554337 [0m(-0.002877458930015564)
     | > avg_loss_dur:[91m 0.16499403305351734 [0m(+0.0002806615084409714)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_58864.pth

[4m[1m > EPOCH: 89/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:40:09) [0m

[1m   --> TIME: 2025-02-23 06:40:16 -- STEP: 11/406 -- GLOBAL_STEP: 58875[0m
     | > loss: -0.19856199622154236  (-0.19395092536102643)
     | > log_mle: -0.32181215286254883  (-0.3291724811900746)
     | > loss_dur: 0.12325016409158707  (0.13522155650637366)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(21.7581, device='cuda:0')  (tensor(42.7826, device='cuda:0'))
     | > current_lr: 2.2250000000000002e-05 
     | > step_time: 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.0046539753675460815 [0m(+0.0002330392599105835)
     | > avg_loss:[92m -0.1996511546894908 [0m(-0.0034458311274647713)
     | > avg_log_mle:[92m -0.3638685345649719 [0m(-0.0026691779494285583)
     | > avg_loss_dur:[92m 0.16421737987548113 [0m(-0.0007766531780362129)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_59270.pth

[4m[1m > EPOCH: 90/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:45:19) [0m

[1m   --> TIME: 2025-02-23 06:45:23 -- STEP: 5/406 -- GLOBAL_STEP: 59275[0m
     | > loss: -0.17408619821071625  (-0.19262251257896423)
     | > log_mle: -0.32415246963500977  (-0.3316798210144043)
     | > loss_dur: 0.15006627142429352  (0.13905730843544006)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(32.7224, device='cuda:0')  (tensor(40.0314, device='cuda:0'))
     | > current_lr: 2.25e-05 
     | > step_time: 0.3689  (0.44974




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003537014126777649 [0m(-0.0011169612407684326)
     | > avg_loss:[91m -0.19715501926839352 [0m(+0.0024961354210972786)
     | > avg_log_mle:[91m -0.36320216208696365 [0m(+0.0006663724780082703)
     | > avg_loss_dur:[91m 0.16604714281857014 [0m(+0.0018297629430890083)


[4m[1m > EPOCH: 91/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:50:28) [0m

[1m   --> TIME: 2025-02-23 06:50:42 -- STEP: 24/406 -- GLOBAL_STEP: 59700[0m
     | > loss: -0.18695296347141266  (-0.19307338508466879)
     | > log_mle: -0.3324861526489258  (-0.33325400948524475)
     | > loss_dur: 0.14553318917751312  (0.1401806247110168)
     | > amp_scaler: 4096.0  (4096.0)
     | > grad_norm: tensor(48.5414, device='cuda:0')  (tensor(33.9609, device='cuda:0'))
     | > current_lr: 2.275e-05 
     | > step_time: 0.7194  (0.48923184474309284)
     | > loader_time: 0.0132  (0.007114768028259277)


[1m   --




[1m   --> TIME: 2025-02-23 06:54:41 -- STEP: 349/406 -- GLOBAL_STEP: 60025[0m
     | > loss: -0.15420502424240112  (-0.17427368897592438)
     | > log_mle: -0.34699976444244385  (-0.3468902753212391)
     | > loss_dur: 0.19279474020004272  (0.1726165863666629)
     | > amp_scaler: 2048.0  (3673.489971346705)
     | > grad_norm: tensor(43.5939, device='cuda:0')  (tensor(37.9899, device='cuda:0'))
     | > current_lr: 2.275e-05 
     | > step_time: 0.8944  (0.6835982102719281)
     | > loader_time: 0.0148  (0.013005349560931618)


[1m   --> TIME: 2025-02-23 06:55:03 -- STEP: 374/406 -- GLOBAL_STEP: 60050[0m
     | > loss: -0.17287756502628326  (-0.1744540827000205)
     | > log_mle: -0.34990692138671875  (-0.3476688294487205)
     | > loss_dur: 0.17702935636043549  (0.17321474676862106)
     | > amp_scaler: 2048.0  (3564.8342245989306)
     | > grad_norm: tensor(46.0877, device='cuda:0')  (tensor(37.7929, device='cuda:0'))
     | > current_lr: 2.275e-05 
     | > step_time: 0.8017  




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.006068810820579529 [0m(+0.00253179669380188)
     | > avg_loss:[91m -0.19466667156666517 [0m(+0.002488347701728344)
     | > avg_log_mle:[91m -0.35714656114578247 [0m(+0.006055600941181183)
     | > avg_loss_dur:[92m 0.1624798895791173 [0m(-0.003567253239452839)


[4m[1m > EPOCH: 92/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 06:55:42) [0m

[1m   --> TIME: 2025-02-23 06:55:53 -- STEP: 18/406 -- GLOBAL_STEP: 60100[0m
     | > loss: -0.1852129101753235  (-0.1959581118490961)
     | > log_mle: -0.3258039951324463  (-0.33623164892196655)
     | > loss_dur: 0.1405910849571228  (0.14027353665894932)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(19.0235, device='cuda:0')  (tensor(25.6767, device='cuda:0'))
     | > current_lr: 2.3e-05 
     | > step_time: 0.871  (0.45343661308288574)
     | > loader_time: 0.0178  (0.008093661732143827)


[1m   --> TIME: 202




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0036029964685440063 [0m(-0.0024658143520355225)
     | > avg_loss:[92m -0.2037859419360757 [0m(-0.009119270369410515)
     | > avg_log_mle:[92m -0.36741502583026886 [0m(-0.01026846468448639)
     | > avg_loss_dur:[91m 0.16362908389419317 [0m(+0.0011491943150758743)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_60488.pth

[4m[1m > EPOCH: 93/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 07:00:54) [0m

[1m   --> TIME: 2025-02-23 07:01:01 -- STEP: 12/406 -- GLOBAL_STEP: 60500[0m
     | > loss: -0.2035200148820877  (-0.20739859342575073)
     | > log_mle: -0.3479008674621582  (-0.3386295934518178)
     | > loss_dur: 0.1443808525800705  (0.13123100188871226)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(32.7145, device='cuda:0')  (tensor(35.6641, device='cuda:0'))
     | > current_lr: 2.3250000000000003e-05 
     | > step_time: 0.4019




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0034188926219940186 [0m(-0.0001841038465499878)
     | > avg_loss:[92m -0.21135957818478346 [0m(-0.007573636248707771)
     | > avg_log_mle:[92m -0.3729463294148445 [0m(-0.005531303584575653)
     | > avg_loss_dur:[92m 0.16158675123006105 [0m(-0.0020423326641321182)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_60894.pth

[4m[1m > EPOCH: 94/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 07:06:05) [0m

[1m   --> TIME: 2025-02-23 07:06:10 -- STEP: 6/406 -- GLOBAL_STEP: 60900[0m
     | > loss: -0.21971388161182404  (-0.20475703477859497)
     | > log_mle: -0.333926796913147  (-0.33558541536331177)
     | > loss_dur: 0.11421291530132294  (0.1308283805847168)
     | > amp_scaler: 2048.0  (2048.0)
     | > grad_norm: tensor(92.3885, device='cuda:0')  (tensor(36.6432, device='cuda:0'))
     | > current_lr: 2.3500000000000002e-05 
     | > step_time: 0.302




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004133179783821106 [0m(+0.0007142871618270874)
     | > avg_loss:[91m -0.21044709719717503 [0m(+0.0009124809876084328)
     | > avg_log_mle:[91m -0.37212832272052765 [0m(+0.000818006694316864)
     | > avg_loss_dur:[91m 0.16168122505769134 [0m(+9.447382763028145e-05)


[4m[1m > EPOCH: 95/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 07:11:20) [0m

[1m   --> TIME: 2025-02-23 07:11:23 -- STEP: 0/406 -- GLOBAL_STEP: 61300[0m
     | > loss: -0.2112756073474884  (-0.2112756073474884)
     | > log_mle: -0.3357229232788086  (-0.3357229232788086)
     | > loss_dur: 0.1244473084807396  (0.1244473084807396)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(29.8428, device='cuda:0')  (tensor(29.8428, device='cuda:0'))
     | > current_lr: 2.375e-05 
     | > step_time: 1.2973  (1.2973182201385498)
     | > loader_time: 0.9576  (0.9575793743133545)


[1m   --> TIME: 2




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0034085065126419067 [0m(-0.0007246732711791992)
     | > avg_loss:[91m -0.20419018808752298 [0m(+0.006256909109652042)
     | > avg_log_mle:[91m -0.36712752282619476 [0m(+0.005000799894332886)
     | > avg_loss_dur:[91m 0.1629373342730105 [0m(+0.0012561092153191566)


[4m[1m > EPOCH: 96/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 07:16:28) [0m

[1m   --> TIME: 2025-02-23 07:16:39 -- STEP: 19/406 -- GLOBAL_STEP: 61725[0m
     | > loss: -0.19080083072185516  (-0.19620829585351443)
     | > log_mle: -0.323199987411499  (-0.3336028111608405)
     | > loss_dur: 0.13239915668964386  (0.13739451530732608)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(8.1888, device='cuda:0')  (tensor(30.2034, device='cuda:0'))
     | > current_lr: 2.4e-05 
     | > step_time: 0.4284  (0.44176976304305227)
     | > loader_time: 0.0036  (0.006231082113165604)


[1m   --> TIME




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004503026604652405 [0m(+0.001094520092010498)
     | > avg_loss:[92m -0.21025425102561712 [0m(-0.006064062938094139)
     | > avg_log_mle:[92m -0.37209565192461014 [0m(-0.004968129098415375)
     | > avg_loss_dur:[92m 0.16184140089899302 [0m(-0.001095933374017477)


[4m[1m > EPOCH: 97/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 07:21:37) [0m

[1m   --> TIME: 2025-02-23 07:21:44 -- STEP: 13/406 -- GLOBAL_STEP: 62125[0m
     | > loss: -0.18137867748737335  (-0.19979934508983904)
     | > log_mle: -0.34130656719207764  (-0.33697417149176967)
     | > loss_dur: 0.15992788970470428  (0.1371748264019306)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(19.2827, device='cuda:0')  (tensor(72.6781, device='cuda:0'))
     | > current_lr: 2.425e-05 
     | > step_time: 0.3643  (0.3703913872058575)
     | > loader_time: 0.0057  (0.006899631940401518)


[1m   --> TI




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.003743469715118408 [0m(-0.0007595568895339966)
     | > avg_loss:[92m -0.21035286039113998 [0m(-9.860936552286148e-05)
     | > avg_log_mle:[91m -0.36782292276620865 [0m(+0.004272729158401489)
     | > avg_loss_dur:[92m 0.1574700623750687 [0m(-0.004371338523924323)


[4m[1m > EPOCH: 98/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 07:26:47) [0m

[1m   --> TIME: 2025-02-23 07:26:52 -- STEP: 7/406 -- GLOBAL_STEP: 62525[0m
     | > loss: -0.20265570282936096  (-0.2127133309841156)
     | > log_mle: -0.33571887016296387  (-0.3417069741657802)
     | > loss_dur: 0.1330631673336029  (0.1289936431816646)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(43.1438, device='cuda:0')  (tensor(30.6528, device='cuda:0'))
     | > current_lr: 2.45e-05 
     | > step_time: 0.3984  (0.44710779190063477)
     | > loader_time: 0.005  (0.006742783955165318)


[1m   --> TIME: 




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.004013121128082275 [0m(+0.0002696514129638672)
     | > avg_loss:[92m -0.21864322666078806 [0m(-0.008290366269648075)
     | > avg_log_mle:[92m -0.37796231359243393 [0m(-0.01013939082622528)
     | > avg_loss_dur:[91m 0.15931908693164587 [0m(+0.001849024556577178)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_62924.pth

[4m[1m > EPOCH: 99/100[0m
 --> train/run-February-22-2025_10+59PM-fa84af3

[1m > TRAINING (2025-02-23 07:31:54) [0m

[1m   --> TIME: 2025-02-23 07:31:57 -- STEP: 1/406 -- GLOBAL_STEP: 62925[0m
     | > loss: -0.22806298732757568  (-0.22806298732757568)
     | > log_mle: -0.3431999683380127  (-0.3431999683380127)
     | > loss_dur: 0.11513698101043701  (0.11513698101043701)
     | > amp_scaler: 1024.0  (1024.0)
     | > grad_norm: tensor(39.6526, device='cuda:0')  (tensor(39.6526, device='cuda:0'))
     | > current_lr: 2.475e-05 
     | > step_time: 0.4753  (0.47525715




  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0034129321575164795 [0m(-0.0006001889705657959)
     | > avg_loss:[92m -0.22337066661566496 [0m(-0.0047274399548769)
     | > avg_log_mle:[92m -0.381663553416729 [0m(-0.003701239824295044)
     | > avg_loss_dur:[92m 0.15829288680106401 [0m(-0.0010262001305818558)

 > BEST MODEL : train/run-February-22-2025_10+59PM-fa84af3/best_model_63330.pth
