## Imports

In [1]:
import os
import sys
sys.path.append(os.path.join(".."))

In [2]:
import argparse
import time
import math
import os
import itertools
import torch
import torch.optim as optim
import numpy as np
import dataclasses
from dataclasses import dataclass
from tqdm import tqdm

from configs.xlmodelconfig import XlModelConfig
from configs.fnetarmodelconfig import FnetarModelConfig
from configs.feedbackmodelconfig import FeedbackModelConfig

from configs.xladaptiveconfig import XlAdaptiveConfig
from configs.feedbackadaptiveconfig import FeedbackAdaptiveConfig

from configs.xldataconfig import XlDataConfig
from configs.feedbackdataconfig import FeedbackDataConfig

from configs.runconfig import RunConfig
from configs.optimizerconfig import OptimizerConfig

from blur import Blur

from models.xl import Xl
from models.fnetar import Fnetar
from models.feedback import Feedback

from modules.xlmemories import XlMemories
from modules.feedbackmemories import FeedbackMemories

from modules.adaptiveinput import AdaptiveInput
from modules.adaptivelogsoftmax import AdaptiveLogSoftmax

from utils.data_utils import get_lm_corpus
from utils.exp_utils import create_exp_dir

from models.utils.normaluniforminitializer import NormalUniformInitializer

## Model and data arguments

In [3]:
@dataclass
class Arguments:
    """Notebook-level run arguments: which model to train and where its data lives.

    Raises:
        ValueError: if ``model_name`` is not one of ``SUPPORTED_MODELS``. Failing here
            reports a typo immediately instead of after the corpus has been loaded.
    """

    # Model names the later cells dispatch on. Deliberately un-annotated so the
    # dataclass machinery treats it as a plain class attribute, not a field
    # (it therefore does not appear in repr() or in the instance __dict__).
    SUPPORTED_MODELS = ('xl', 'fnetar', 'feedback')

    model_name: str  # one of SUPPORTED_MODELS
    dataset: str = 'wt103'  # dataset identifier handed to get_lm_corpus
    data: str = '../../data/wikitext-103'  # dataset location handed to get_lm_corpus
    cuda_device: str = 'cuda'  # device string used for torch.device when CUDA is enabled

    def __post_init__(self):
        if self.model_name not in self.SUPPORTED_MODELS:
            raise ValueError(
                'Unknown model_name {!r}; expected one of {}'.format(
                    self.model_name, list(self.SUPPORTED_MODELS)))

### Choose which model to train from ['xl', 'fnetar', 'feedback']

In [4]:
# Pick the architecture to train; every other field keeps its default.
args = Arguments(model_name='xl')

print(args)

Arguments(model_name='xl', dataset='wt103', data='../../data/wikitext-103', cuda_device='cuda')


## Setup checkpoint and device

In [5]:
run_config = RunConfig()
optimizer_config = OptimizerConfig()

# Give this run its own timestamped experiment directory under the configured work dir.
run_stamp = time.strftime('%Y%m%d-%H%M%S')
run_config.work_dir = os.path.join(run_config.work_dir, run_stamp)
logging = create_exp_dir(
    run_config.work_dir,
    scripts_to_save=['../train.py', '../blur.py'],
    debug=run_config.debug,
)

# Seed numpy and torch so runs are repeatable.
np.random.seed(run_config.seed)
torch.manual_seed(run_config.seed)

# Default to the CPU; switch to the requested CUDA device only when one is
# available and the run config asks for it.
device = torch.device('cpu')
if torch.cuda.is_available():
    if run_config.cuda:
        device = torch.device(args.cuda_device)
        torch.cuda.manual_seed_all(run_config.seed)
    else:
        print('WARNING: You have a CUDA device, so you should probably run with --cuda')

print(run_config)
print(optimizer_config)

Experiment dir : LM-TFM\20211028-113433
RunConfig(work_dir='LM-TFM\\20211028-113433', cuda=True, seed=1111, log_interval=200, eval_interval=1000, debug=False, max_eval_steps=-1)
OptimizerConfig(max_step=200000, eta_min=0.0, clip=0.25, lr_min=0.0, decay_rate=0.5, warmup_step=0, scheduler='cosine', lr=0.00025, optim='adam')


## Load corpus and config files

In [6]:
# (data config, adaptive config, model config) classes for each supported model name.
# 'fnetar' reuses the Xl data and adaptive configs with its own model config.
config_classes = {
    'xl': (XlDataConfig, XlAdaptiveConfig, XlModelConfig),
    'fnetar': (XlDataConfig, XlAdaptiveConfig, FnetarModelConfig),
    'feedback': (FeedbackDataConfig, FeedbackAdaptiveConfig, FeedbackModelConfig),
}

# Validate the model name before the (slow) corpus load so a typo fails immediately
# and with a message that says what was expected.
if args.model_name not in config_classes:
    raise ValueError(
        'Unknown model_name {!r}; expected one of {}'.format(
            args.model_name, sorted(config_classes)))

corpus = get_lm_corpus(args.data, args.dataset)

data_config_cls, adaptive_config_cls, model_config_cls = config_classes[args.model_name]
data_config = data_config_cls()
# The adaptive input/softmax layers need the vocabulary size as their class count.
adaptive_config = adaptive_config_cls(n_classes=len(corpus.vocab))
model_config = model_config_cls()

# The training loop splits every batch into batch_chunk equal pieces.
assert data_config.batch_size % data_config.batch_chunk == 0, (
    'batch_size ({}) must be divisible by batch_chunk ({})'.format(
        data_config.batch_size, data_config.batch_chunk))

print(data_config)
print(adaptive_config)
print(model_config)

Loading cached dataset...
XlDataConfig(data='../data/wikitext-103', dataset='wt103', tgt_len=150, mem_len=150, batch_size=60, batch_chunk=10, eval_tgt_len=150, eval_mem_len=150, eval_batch_size=10, n_layer=16)
XlAdaptiveConfig(d_model=410, n_classes=267735, cutoffs=[20000, 40000, 200000], div_value=1.0)
XlModelConfig(n_layer=16, d_model=410, n_head=10, d_head=41, d_inner=2100, drop_out=0.1, drop_att=0.0, tgt_len=150, mem_len=150, same_length=False, clamp_len=-1)


## Load data and construct model

In [7]:
# Batch iterators for the three corpus splits; validation and test both use the
# evaluation batch size and target length.
tr_iter = corpus.get_iterator(
    'train', data_config.batch_size, data_config.tgt_len, device=device, ext_len=0)
va_iter = corpus.get_iterator(
    'valid', data_config.eval_batch_size, data_config.eval_tgt_len, device=device, ext_len=0)
te_iter = corpus.get_iterator(
    'test', data_config.eval_batch_size, data_config.eval_tgt_len, device=device, ext_len=0)

# Input and output layers are both built from the adaptive config. asdict() is called
# once per module so each one receives its own copy of the settings.
encoder = AdaptiveInput(**dataclasses.asdict(adaptive_config))
decoder = AdaptiveLogSoftmax(**dataclasses.asdict(adaptive_config))

# Map the chosen model name to its transformer class.
transformer_classes = {'xl': Xl, 'fnetar': Fnetar, 'feedback': Feedback}
if args.model_name not in transformer_classes:
    raise ValueError
transformer = transformer_classes[args.model_name](**dataclasses.asdict(model_config))

model = Blur(encoder=encoder, transformer=transformer, decoder=decoder, tie_weight=True)

In [8]:
def _count_params(module):
    """Return the total number of elements across all parameters of ``module``."""
    return sum(p.nelement() for p in module.parameters())


initializer = NormalUniformInitializer()
model.apply(initializer)
# Re-apply to the encoder so the embedding init is not overridden by the
# output layer's init when weights are shared.
model.encoder.apply(initializer)

# Record parameter counts on args so they are logged with the other run arguments below.
args.n_all_param = _count_params(model)
args.n_nonemb_param = _count_params(model.transformer)
args.n_encoder_param = _count_params(model.encoder)
args.n_decoder_param = _count_params(model.decoder)

para_model = model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=optimizer_config.lr)

# Scheduler: cosine annealing over max_step steps. eta_min comes from its own config
# field rather than lr_min, to stay backward compatible with earlier versions where
# eta_min defaulted to 0 instead of lr_min's default of 1e-6.
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    optimizer_config.max_step,
    eta_min=optimizer_config.eta_min,
)

# Log every run argument (including the parameter counts set above) between two rules.
banner = '=' * 100
logging(banner)
for arg_name, arg_value in vars(args).items():
    logging('    - {} : {}'.format(arg_name, arg_value))
logging(banner)

    - model_name : xl
    - dataset : wt103
    - data : ../../data/wikitext-103
    - cuda_device : cuda
    - n_all_param : 151107538
    - n_nonemb_param : 41067220
    - n_encoder_param : 109771350
    - n_decoder_param : 110040318


## Define training and evaluation functions

In [11]:
def train():
    """Run one pass over the training iterator, logging and evaluating periodically.

    Each batch is split into ``data_config.batch_chunk`` chunks; gradients from all
    chunks are accumulated before a single clipped optimizer step. Every chunk index
    has its own memory stream, which is replaced by the memory the model returns.

    Progress lives in the module-level variables ``train_step``, ``train_loss``,
    ``best_val_loss``, ``eval_start_time`` and ``log_start_time``; the module-level
    ``epoch`` is only read for the log line. Every ``run_config.eval_interval`` steps
    the model is evaluated on ``va_iter`` and, unless ``run_config.debug`` is set, the
    model and optimizer state are written to ``run_config.work_dir`` whenever the
    validation loss improves. The loop stops early once ``train_step`` equals
    ``optimizer_config.max_step``.
    """
    global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
    # Turn on training mode which enables dropout.
    model.train()

    # Fresh memories for this pass: one stream per batch chunk.
    if args.model_name == 'xl' or args.model_name == 'fnetar':
        memories = XlMemories(
            n_stream=data_config.batch_chunk,
            n_layer=data_config.n_layer,
            tgt_len=data_config.tgt_len,
            mem_len=data_config.mem_len,
            ext_len=0,
            dtype=next(model.parameters()).dtype
        )
    else:
        memories = FeedbackMemories(n_stream=data_config.batch_chunk)

    train_iter = tr_iter
    for batch, (data, target, seq_len) in tqdm(
        enumerate(train_iter),
        total=train_iter.n_batch
    ):
        model.zero_grad()

        data_chunks = torch.chunk(data, data_config.batch_chunk, 0)
        target_chunks = torch.chunk(target, data_config.batch_chunk, 0)
        for i in range(data_config.batch_chunk):
            data_i = data_chunks[i]
            target_i = target_chunks[i]
            memory_i = memories[i]
            loss, new_memory_i = para_model(data_i, target_i, memory_i)
            memories.update_memory_stream(stream_index=i, memory=new_memory_i)

            # Scale by the chunk count so the accumulated gradient matches a full-batch mean.
            loss = loss.float().mean().type_as(loss) / data_config.batch_chunk
            loss.backward()
            train_loss += loss.float().item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), optimizer_config.clip)
        optimizer.step()

        # step-wise learning rate annealing
        train_step += 1

        # linear warmup stage
        # NOTE(review): scheduler.step() is only called once warmup is over, so with
        # warmup_step > 0 the cosine schedule starts counting from the end of warmup
        # rather than from step 0 -- confirm this is intended.
        if train_step < optimizer_config.warmup_step:
            curr_lr = optimizer_config.lr * train_step / optimizer_config.warmup_step
            optimizer.param_groups[0]['lr'] = curr_lr
        else:
            scheduler.step()

        if train_step % run_config.log_interval == 0:
            # train_loss has accumulated one per-step mean loss for each step since the last log.
            cur_loss = train_loss / run_config.log_interval
            elapsed = time.time() - log_start_time
            log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
                      '| ms/batch {:5.2f} | loss {:5.2f}'.format(
                epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / run_config.log_interval, cur_loss)
            log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
            logging(log_str)
            train_loss = 0
            log_start_time = time.time()

        if train_step % run_config.eval_interval == 0:
            val_loss = evaluate(va_iter)
            logging('-' * 100)
            log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
                      '| valid loss {:5.2f}'.format(
                train_step // run_config.eval_interval, train_step,
                (time.time() - eval_start_time), val_loss)
            log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
            logging(log_str)
            logging('-' * 100)
            # Save the model if the validation loss is the best we've seen so far.
            # Compare against None explicitly: a truthiness test would treat a best
            # loss of exactly 0.0 as "no best yet".
            if best_val_loss is None or val_loss < best_val_loss:
                if not run_config.debug:
                    with open(os.path.join(run_config.work_dir, 'model.pt'), 'wb') as f:
                        torch.save(model, f)
                    with open(os.path.join(run_config.work_dir, 'optimizer.pt'), 'wb') as f:
                        torch.save(optimizer.state_dict(), f)
                best_val_loss = val_loss

            eval_start_time = time.time()

        if train_step == optimizer_config.max_step:
            break

In [12]:
def evaluate(eval_iter):
    """Return the length-weighted mean loss of ``model`` over ``eval_iter``.

    The model is put in evaluation mode for the duration and switched back to
    training mode before returning. A single memory stream is carried across
    batches. When ``run_config.max_eval_steps`` is positive, at most that many
    batches are evaluated.
    """
    # Evaluation mode disables dropout.
    model.eval()

    # One memory stream is enough: evaluation batches are not split into chunks.
    if args.model_name in ('xl', 'fnetar'):
        eval_memories = XlMemories(
            n_stream=1,
            n_layer=data_config.n_layer,
            tgt_len=data_config.eval_tgt_len,
            mem_len=data_config.eval_mem_len,
            ext_len=0,
            dtype=next(model.parameters()).dtype,
        )
    else:
        eval_memories = FeedbackMemories(n_stream=1)

    length_sum = 0
    weighted_loss_sum = 0.
    step_limit = run_config.max_eval_steps

    with torch.no_grad():
        for step, (data, target, seq_len) in enumerate(eval_iter):
            # A non-positive limit means "evaluate the whole iterator".
            if 0 < step_limit <= step:
                break

            batch_loss, updated_memory = model(data, target, eval_memories[0])
            eval_memories.update_memory_stream(stream_index=0, memory=updated_memory)

            # Weight each batch's mean loss by its sequence length.
            weighted_loss_sum += seq_len * batch_loss.mean().float().item()
            length_sum += seq_len

    # Restore training mode for the caller.
    model.train()

    return weighted_loss_sum / length_sum

## Train Model

In [None]:
# Training state shared with train() through module-level globals.
train_step = 0
train_loss = 0
best_val_loss = None

log_start_time = time.time()
eval_start_time = time.time()

# Loop over epochs until train() has taken optimizer_config.max_step steps.
# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in itertools.count(start=1):
        train()
        if train_step == optimizer_config.max_step:
            logging('-' * 100)
            logging('End of training')
            break
except KeyboardInterrupt:
    logging('-' * 100)
    logging('Exiting from training early')

# Load the best saved model. No checkpoint exists when run_config.debug is set or when
# training stopped before the first improving evaluation; in that case keep the
# in-memory model instead of failing with FileNotFoundError.
# NOTE(review): the checkpoint is a fully pickled model; PyTorch releases where
# torch.load defaults to weights_only=True need weights_only=False here -- confirm
# against the installed version.
best_model_path = os.path.join(run_config.work_dir, 'model.pt')
if os.path.exists(best_model_path):
    with open(best_model_path, 'rb') as f:
        model = torch.load(f)
    para_model = model.to(device)
else:
    logging('No checkpoint found at {}; evaluating the in-memory model'.format(best_model_path))

# Run on test data.
test_loss = evaluate(te_iter)
logging('=' * 100)

logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format(
    test_loss, math.exp(test_loss)))
logging('=' * 100)

  2%|█▎                                                                          | 200/11470 [03:59<3:48:02,  1.21s/it]

| epoch   1 step      200 |    200 batches | lr 0.00025 | ms/batch 1198.48 | loss  6.94 | ppl  1029.021


  3%|██▋                                                                         | 400/11470 [08:04<3:50:58,  1.25s/it]

| epoch   1 step      400 |    400 batches | lr 0.00025 | ms/batch 1222.04 | loss  6.01 | ppl   409.227


  5%|███▉                                                                        | 600/11470 [12:09<3:43:31,  1.23s/it]

| epoch   1 step      600 |    600 batches | lr 0.00025 | ms/batch 1224.87 | loss  5.68 | ppl   292.134


  7%|█████▎                                                                      | 800/11470 [16:16<3:40:55,  1.24s/it]

| epoch   1 step      800 |    800 batches | lr 0.00025 | ms/batch 1238.06 | loss  5.47 | ppl   237.225


  9%|██████▌                                                                     | 999/11470 [20:22<3:34:59,  1.23s/it]

| epoch   1 step     1000 |   1000 batches | lr 0.00025 | ms/batch 1237.11 | loss  5.28 | ppl   196.662
----------------------------------------------------------------------------------------------------
| Eval   1 at step     1000 | time: 1232.12s | valid loss  5.17 | valid ppl   176.445
----------------------------------------------------------------------------------------------------


 10%|███████▊                                                                   | 1200/11470 [24:40<3:28:16,  1.22s/it]

| epoch   1 step     1200 |   1200 batches | lr 0.00025 | ms/batch 1281.07 | loss  5.16 | ppl   174.434


 12%|█████████▏                                                                 | 1400/11470 [28:46<3:25:57,  1.23s/it]

| epoch   1 step     1400 |   1400 batches | lr 0.00025 | ms/batch 1231.68 | loss  5.06 | ppl   157.592


 14%|██████████▍                                                                | 1600/11470 [32:52<3:20:03,  1.22s/it]

| epoch   1 step     1600 |   1600 batches | lr 0.00025 | ms/batch 1231.21 | loss  4.97 | ppl   144.117


 16%|███████████▊                                                               | 1800/11470 [36:53<3:11:37,  1.19s/it]

| epoch   1 step     1800 |   1800 batches | lr 0.00025 | ms/batch 1204.60 | loss  4.91 | ppl   135.038


 17%|█████████████                                                              | 1999/11470 [40:53<3:10:47,  1.21s/it]

| epoch   1 step     2000 |   2000 batches | lr 0.00025 | ms/batch 1202.14 | loss  4.84 | ppl   126.516
----------------------------------------------------------------------------------------------------
| Eval   2 at step     2000 | time: 1227.78s | valid loss  4.69 | valid ppl   108.408
----------------------------------------------------------------------------------------------------


 19%|██████████████▍                                                            | 2200/11470 [45:07<3:09:16,  1.23s/it]

| epoch   1 step     2200 |   2200 batches | lr 0.00025 | ms/batch 1266.77 | loss  4.77 | ppl   118.405


 21%|███████████████▋                                                           | 2400/11470 [49:12<3:05:51,  1.23s/it]

| epoch   1 step     2400 |   2400 batches | lr 0.00025 | ms/batch 1225.98 | loss  4.72 | ppl   112.689


 23%|█████████████████                                                          | 2600/11470 [53:17<3:04:21,  1.25s/it]

| epoch   1 step     2600 |   2600 batches | lr 0.00025 | ms/batch 1223.53 | loss  4.67 | ppl   106.885


 24%|██████████████████▎                                                        | 2800/11470 [57:23<2:55:00,  1.21s/it]

| epoch   1 step     2800 |   2800 batches | lr 0.00025 | ms/batch 1231.11 | loss  4.62 | ppl   101.035


 26%|███████████████████                                                      | 2999/11470 [1:01:23<2:49:34,  1.20s/it]

| epoch   1 step     3000 |   3000 batches | lr 0.00025 | ms/batch 1206.87 | loss  4.59 | ppl    98.193
----------------------------------------------------------------------------------------------------
| Eval   3 at step     3000 | time: 1228.41s | valid loss  4.44 | valid ppl    84.970
----------------------------------------------------------------------------------------------------


 28%|████████████████████▎                                                    | 3200/11470 [1:05:34<2:44:17,  1.19s/it]

| epoch   1 step     3200 |   3200 batches | lr 0.00025 | ms/batch 1244.28 | loss  4.54 | ppl    93.753


 30%|█████████████████████▋                                                   | 3400/11470 [1:09:37<2:45:37,  1.23s/it]

| epoch   1 step     3400 |   3400 batches | lr 0.00025 | ms/batch 1217.54 | loss  4.51 | ppl    90.795


 31%|██████████████████████▉                                                  | 3600/11470 [1:13:43<2:43:06,  1.24s/it]

| epoch   1 step     3600 |   3600 batches | lr 0.00025 | ms/batch 1231.90 | loss  4.44 | ppl    84.511


 33%|████████████████████████▏                                                | 3800/11470 [1:17:51<2:38:40,  1.24s/it]

| epoch   1 step     3800 |   3800 batches | lr 0.00025 | ms/batch 1238.34 | loss  4.48 | ppl    88.145


 35%|█████████████████████████▍                                               | 3999/11470 [1:21:57<2:34:39,  1.24s/it]

| epoch   1 step     4000 |   4000 batches | lr 0.00025 | ms/batch 1236.48 | loss  4.44 | ppl    84.783
----------------------------------------------------------------------------------------------------
| Eval   4 at step     4000 | time: 1231.46s | valid loss  4.28 | valid ppl    71.903
----------------------------------------------------------------------------------------------------


 37%|██████████████████████████▋                                              | 4200/11470 [1:26:17<2:31:03,  1.25s/it]

| epoch   1 step     4200 |   4200 batches | lr 0.00025 | ms/batch 1292.52 | loss  4.39 | ppl    80.720


 38%|████████████████████████████                                             | 4400/11470 [1:30:25<2:25:11,  1.23s/it]

| epoch   1 step     4400 |   4400 batches | lr 0.00025 | ms/batch 1239.17 | loss  4.37 | ppl    79.354


 40%|█████████████████████████████▎                                           | 4600/11470 [1:34:32<2:23:16,  1.25s/it]

| epoch   1 step     4600 |   4600 batches | lr 0.00025 | ms/batch 1236.75 | loss  4.35 | ppl    77.739


 42%|██████████████████████████████▌                                          | 4800/11470 [1:38:40<2:19:00,  1.25s/it]

| epoch   1 step     4800 |   4800 batches | lr 0.00025 | ms/batch 1238.45 | loss  4.30 | ppl    73.971


 44%|███████████████████████████████▊                                         | 4999/11470 [1:42:47<2:13:17,  1.24s/it]

| epoch   1 step     5000 |   5000 batches | lr 0.00025 | ms/batch 1240.76 | loss  4.34 | ppl    76.575
----------------------------------------------------------------------------------------------------
| Eval   5 at step     5000 | time: 1247.19s | valid loss  4.15 | valid ppl    63.656
----------------------------------------------------------------------------------------------------


 45%|█████████████████████████████████                                        | 5200/11470 [1:47:06<2:08:25,  1.23s/it]

| epoch   1 step     5200 |   5200 batches | lr 0.00025 | ms/batch 1291.68 | loss  4.28 | ppl    72.418


 47%|██████████████████████████████████▎                                      | 5400/11470 [1:51:14<2:07:24,  1.26s/it]

| epoch   1 step     5400 |   5400 batches | lr 0.00025 | ms/batch 1237.97 | loss  4.23 | ppl    68.616


 49%|███████████████████████████████████▋                                     | 5600/11470 [1:55:22<2:02:27,  1.25s/it]

| epoch   1 step     5600 |   5600 batches | lr 0.00025 | ms/batch 1241.39 | loss  4.25 | ppl    70.133


 51%|████████████████████████████████████▉                                    | 5800/11470 [1:59:30<1:57:43,  1.25s/it]

| epoch   1 step     5800 |   5800 batches | lr 0.000249 | ms/batch 1239.87 | loss  4.25 | ppl    69.842


 52%|██████████████████████████████████████▏                                  | 5999/11470 [2:03:37<1:53:42,  1.25s/it]

| epoch   1 step     6000 |   6000 batches | lr 0.000249 | ms/batch 1240.75 | loss  4.20 | ppl    66.684
----------------------------------------------------------------------------------------------------
| Eval   6 at step     6000 | time: 1247.86s | valid loss  4.08 | valid ppl    59.082
----------------------------------------------------------------------------------------------------


 54%|███████████████████████████████████████▍                                 | 6200/11470 [2:07:56<1:49:13,  1.24s/it]

| epoch   1 step     6200 |   6200 batches | lr 0.000249 | ms/batch 1289.58 | loss  4.18 | ppl    65.129


 56%|████████████████████████████████████████▋                                | 6400/11470 [2:12:05<1:44:56,  1.24s/it]

| epoch   1 step     6400 |   6400 batches | lr 0.000249 | ms/batch 1242.40 | loss  4.21 | ppl    67.488


 58%|██████████████████████████████████████████                               | 6600/11470 [2:16:13<1:40:50,  1.24s/it]

| epoch   1 step     6600 |   6600 batches | lr 0.000249 | ms/batch 1241.47 | loss  4.15 | ppl    63.173


 59%|███████████████████████████████████████████▎                             | 6800/11470 [2:20:22<1:36:22,  1.24s/it]

| epoch   1 step     6800 |   6800 batches | lr 0.000249 | ms/batch 1242.86 | loss  4.14 | ppl    63.062


 61%|████████████████████████████████████████████▌                            | 6999/11470 [2:24:28<1:31:47,  1.23s/it]

| epoch   1 step     7000 |   7000 batches | lr 0.000249 | ms/batch 1238.13 | loss  4.14 | ppl    63.082
----------------------------------------------------------------------------------------------------
| Eval   7 at step     7000 | time: 1248.45s | valid loss  3.99 | valid ppl    54.021
----------------------------------------------------------------------------------------------------


 63%|█████████████████████████████████████████████▊                           | 7200/11470 [2:28:49<1:29:00,  1.25s/it]

| epoch   1 step     7200 |   7200 batches | lr 0.000249 | ms/batch 1296.32 | loss  4.10 | ppl    60.115


 65%|███████████████████████████████████████████████                          | 7400/11470 [2:32:57<1:23:00,  1.22s/it]

| epoch   1 step     7400 |   7400 batches | lr 0.000249 | ms/batch 1243.95 | loss  4.09 | ppl    59.721


 66%|████████████████████████████████████████████████▎                        | 7600/11470 [2:37:04<1:19:21,  1.23s/it]

| epoch   1 step     7600 |   7600 batches | lr 0.000249 | ms/batch 1234.83 | loss  4.07 | ppl    58.597


 68%|█████████████████████████████████████████████████▋                       | 7800/11470 [2:41:12<1:15:47,  1.24s/it]

| epoch   1 step     7800 |   7800 batches | lr 0.000249 | ms/batch 1238.51 | loss  4.09 | ppl    60.012


 70%|██████████████████████████████████████████████████▉                      | 7999/11470 [2:45:19<1:11:42,  1.24s/it]

| epoch   1 step     8000 |   8000 batches | lr 0.000249 | ms/batch 1240.17 | loss  4.09 | ppl    59.632
----------------------------------------------------------------------------------------------------
| Eval   8 at step     8000 | time: 1248.41s | valid loss  3.92 | valid ppl    50.269
----------------------------------------------------------------------------------------------------


 71%|████████████████████████████████████████████████████▏                    | 8200/11470 [2:49:38<1:08:33,  1.26s/it]

| epoch   1 step     8200 |   8200 batches | lr 0.000249 | ms/batch 1289.01 | loss  4.06 | ppl    58.119


 73%|█████████████████████████████████████████████████████▍                   | 8400/11470 [2:53:45<1:03:21,  1.24s/it]

| epoch   1 step     8400 |   8400 batches | lr 0.000249 | ms/batch 1237.19 | loss  4.07 | ppl    58.740


 75%|████████████████████████████████████████████████████████▏                  | 8600/11470 [2:57:53<59:07,  1.24s/it]

| epoch   1 step     8600 |   8600 batches | lr 0.000249 | ms/batch 1236.24 | loss  4.05 | ppl    57.607


 77%|█████████████████████████████████████████████████████████▌                 | 8800/11470 [3:01:59<55:09,  1.24s/it]

| epoch   1 step     8800 |   8800 batches | lr 0.000249 | ms/batch 1231.28 | loss  4.06 | ppl    58.161


 78%|██████████████████████████████████████████████████████████▊                | 8999/11470 [3:06:04<50:41,  1.23s/it]

| epoch   1 step     9000 |   9000 batches | lr 0.000249 | ms/batch 1232.22 | loss  4.03 | ppl    56.267
----------------------------------------------------------------------------------------------------
| Eval   9 at step     9000 | time: 1242.26s | valid loss  3.87 | valid ppl    47.772
----------------------------------------------------------------------------------------------------


 80%|████████████████████████████████████████████████████████████▏              | 9200/11470 [3:10:22<46:47,  1.24s/it]

| epoch   1 step     9200 |   9200 batches | lr 0.000249 | ms/batch 1284.73 | loss  4.02 | ppl    55.687


 82%|█████████████████████████████████████████████████████████████▍             | 9400/11470 [3:14:29<43:56,  1.27s/it]

| epoch   1 step     9400 |   9400 batches | lr 0.000249 | ms/batch 1233.31 | loss  4.03 | ppl    55.999


 84%|██████████████████████████████████████████████████████████████▊            | 9600/11470 [3:18:36<39:23,  1.26s/it]

| epoch   1 step     9600 |   9600 batches | lr 0.000249 | ms/batch 1236.20 | loss  4.04 | ppl    56.866


 85%|████████████████████████████████████████████████████████████████           | 9800/11470 [3:22:42<35:09,  1.26s/it]

| epoch   1 step     9800 |   9800 batches | lr 0.000249 | ms/batch 1228.60 | loss  3.99 | ppl    54.266


 87%|█████████████████████████████████████████████████████████████████▍         | 9999/11470 [3:26:47<29:50,  1.22s/it]

| epoch   1 step    10000 |  10000 batches | lr 0.000248 | ms/batch 1231.71 | loss  4.01 | ppl    55.012
----------------------------------------------------------------------------------------------------
| Eval  10 at step    10000 | time: 1240.48s | valid loss  3.81 | valid ppl    45.226
----------------------------------------------------------------------------------------------------


 89%|█████████████████████████████████████████████████████████████████▊        | 10200/11470 [3:31:05<26:19,  1.24s/it]

| epoch   1 step    10200 |  10200 batches | lr 0.000248 | ms/batch 1283.45 | loss  3.97 | ppl    53.164


 91%|███████████████████████████████████████████████████████████████████       | 10400/11470 [3:35:12<21:54,  1.23s/it]

| epoch   1 step    10400 |  10400 batches | lr 0.000248 | ms/batch 1233.72 | loss  3.97 | ppl    52.938


 92%|████████████████████████████████████████████████████████████████████▍     | 10600/11470 [3:39:18<18:08,  1.25s/it]

| epoch   1 step    10600 |  10600 batches | lr 0.000248 | ms/batch 1230.39 | loss  3.99 | ppl    54.155


 94%|█████████████████████████████████████████████████████████████████████▋    | 10800/11470 [3:43:24<13:36,  1.22s/it]

| epoch   1 step    10800 |  10800 batches | lr 0.000248 | ms/batch 1232.08 | loss  3.95 | ppl    51.921


 96%|██████████████████████████████████████████████████████████████████████▉   | 10999/11470 [3:47:31<09:44,  1.24s/it]

| epoch   1 step    11000 |  11000 batches | lr 0.000248 | ms/batch 1239.84 | loss  3.99 | ppl    54.028
----------------------------------------------------------------------------------------------------
| Eval  11 at step    11000 | time: 1241.59s | valid loss  3.79 | valid ppl    44.171
----------------------------------------------------------------------------------------------------


 98%|████████████████████████████████████████████████████████████████████████▎ | 11200/11470 [3:51:50<05:34,  1.24s/it]

| epoch   1 step    11200 |  11200 batches | lr 0.000248 | ms/batch 1289.53 | loss  3.96 | ppl    52.693


 99%|█████████████████████████████████████████████████████████████████████████▌| 11400/11470 [3:55:57<01:26,  1.23s/it]

| epoch   1 step    11400 |  11400 batches | lr 0.000248 | ms/batch 1235.11 | loss  3.96 | ppl    52.597


100%|██████████████████████████████████████████████████████████████████████████| 11470/11470 [3:57:23<00:00,  1.24s/it]
  1%|▊                                                                           | 130/11470 [02:40<3:55:23,  1.25s/it]

| epoch   2 step    11600 |    130 batches | lr 0.000248 | ms/batch 1234.71 | loss  3.92 | ppl    50.401


  3%|██▏                                                                         | 330/11470 [06:48<3:50:14,  1.24s/it]

| epoch   2 step    11800 |    330 batches | lr 0.000248 | ms/batch 1237.96 | loss  3.89 | ppl    48.761


  5%|███▌                                                                        | 529/11470 [10:54<3:42:35,  1.22s/it]

| epoch   2 step    12000 |    530 batches | lr 0.000248 | ms/batch 1235.01 | loss  3.91 | ppl    49.771
----------------------------------------------------------------------------------------------------
| Eval  12 at step    12000 | time: 1244.20s | valid loss  3.78 | valid ppl    43.701
----------------------------------------------------------------------------------------------------


  6%|████▊                                                                       | 730/11470 [15:12<3:42:24,  1.24s/it]

| epoch   2 step    12200 |    730 batches | lr 0.000248 | ms/batch 1283.18 | loss  3.88 | ppl    48.235


  8%|██████▏                                                                     | 930/11470 [19:18<3:37:40,  1.24s/it]

| epoch   2 step    12400 |    930 batches | lr 0.000248 | ms/batch 1231.02 | loss  3.88 | ppl    48.224


 10%|███████▍                                                                   | 1130/11470 [23:24<3:32:19,  1.23s/it]

| epoch   2 step    12600 |   1130 batches | lr 0.000248 | ms/batch 1229.95 | loss  3.90 | ppl    49.384


 12%|████████▋                                                                  | 1330/11470 [27:31<3:27:36,  1.23s/it]

| epoch   2 step    12800 |   1330 batches | lr 0.000247 | ms/batch 1234.54 | loss  3.87 | ppl    47.849


 13%|█████████▉                                                                 | 1529/11470 [31:35<3:21:38,  1.22s/it]

| epoch   2 step    13000 |   1530 batches | lr 0.000247 | ms/batch 1227.96 | loss  3.86 | ppl    47.389
----------------------------------------------------------------------------------------------------
| Eval  13 at step    13000 | time: 1238.96s | valid loss  3.74 | valid ppl    41.914
----------------------------------------------------------------------------------------------------


 15%|███████████▎                                                               | 1730/11470 [35:53<3:20:33,  1.24s/it]

| epoch   2 step    13200 |   1730 batches | lr 0.000247 | ms/batch 1282.17 | loss  3.85 | ppl    47.111


 17%|████████████▌                                                              | 1930/11470 [39:59<3:16:54,  1.24s/it]

| epoch   2 step    13400 |   1930 batches | lr 0.000247 | ms/batch 1230.35 | loss  3.86 | ppl    47.336


 19%|█████████████▉                                                             | 2130/11470 [44:06<3:12:00,  1.23s/it]

| epoch   2 step    13600 |   2130 batches | lr 0.000247 | ms/batch 1236.84 | loss  3.88 | ppl    48.309


 20%|███████████████▏                                                           | 2330/11470 [48:14<3:07:57,  1.23s/it]

| epoch   2 step    13800 |   2330 batches | lr 0.000247 | ms/batch 1238.97 | loss  3.85 | ppl    47.136


 22%|████████████████▌                                                          | 2529/11470 [52:20<3:06:49,  1.25s/it]

| epoch   2 step    14000 |   2530 batches | lr 0.000247 | ms/batch 1234.12 | loss  3.84 | ppl    46.561
----------------------------------------------------------------------------------------------------
| Eval  14 at step    14000 | time: 1241.98s | valid loss  3.70 | valid ppl    40.389
----------------------------------------------------------------------------------------------------


 24%|█████████████████▊                                                         | 2730/11470 [56:37<2:59:45,  1.23s/it]

| epoch   2 step    14200 |   2730 batches | lr 0.000247 | ms/batch 1282.53 | loss  3.82 | ppl    45.681


 26%|██████████████████▋                                                      | 2930/11470 [1:00:44<2:59:36,  1.26s/it]

| epoch   2 step    14400 |   2930 batches | lr 0.000247 | ms/batch 1234.29 | loss  3.81 | ppl    45.035


 27%|███████████████████▉                                                     | 3130/11470 [1:04:52<2:50:14,  1.22s/it]

| epoch   2 step    14600 |   3130 batches | lr 0.000247 | ms/batch 1236.52 | loss  3.82 | ppl    45.432


 29%|█████████████████████▏                                                   | 3330/11470 [1:08:58<2:51:05,  1.26s/it]

| epoch   2 step    14800 |   3330 batches | lr 0.000247 | ms/batch 1231.96 | loss  3.82 | ppl    45.687


 31%|██████████████████████▍                                                  | 3529/11470 [1:13:05<2:43:41,  1.24s/it]

| epoch   2 step    15000 |   3530 batches | lr 0.000247 | ms/batch 1239.93 | loss  3.78 | ppl    43.977
----------------------------------------------------------------------------------------------------
| Eval  15 at step    15000 | time: 1242.78s | valid loss  3.67 | valid ppl    39.400
----------------------------------------------------------------------------------------------------


 33%|███████████████████████▋                                                 | 3730/11470 [1:17:24<2:42:11,  1.26s/it]

| epoch   2 step    15200 |   3730 batches | lr 0.000246 | ms/batch 1291.41 | loss  3.81 | ppl    45.359


 34%|█████████████████████████                                                | 3930/11470 [1:21:31<2:35:05,  1.23s/it]

| epoch   2 step    15400 |   3930 batches | lr 0.000246 | ms/batch 1233.62 | loss  3.80 | ppl    44.880


 36%|██████████████████████████▎                                              | 4130/11470 [1:25:40<2:31:49,  1.24s/it]

| epoch   2 step    15600 |   4130 batches | lr 0.000246 | ms/batch 1242.78 | loss  3.79 | ppl    44.384


 38%|███████████████████████████▌                                             | 4330/11470 [1:29:47<2:27:49,  1.24s/it]

| epoch   2 step    15800 |   4330 batches | lr 0.000246 | ms/batch 1234.80 | loss  3.80 | ppl    44.693


 39%|████████████████████████████▊                                            | 4529/11470 [1:33:53<2:20:26,  1.21s/it]

| epoch   2 step    16000 |   4530 batches | lr 0.000246 | ms/batch 1237.52 | loss  3.80 | ppl    44.757
----------------------------------------------------------------------------------------------------
| Eval  16 at step    16000 | time: 1245.52s | valid loss  3.66 | valid ppl    38.963
----------------------------------------------------------------------------------------------------


 41%|██████████████████████████████                                           | 4730/11470 [1:38:06<2:13:40,  1.19s/it]

| epoch   2 step    16200 |   4730 batches | lr 0.000246 | ms/batch 1259.19 | loss  3.76 | ppl    42.816


 43%|███████████████████████████████▍                                         | 4930/11470 [1:42:07<2:12:53,  1.22s/it]

| epoch   2 step    16400 |   4930 batches | lr 0.000246 | ms/batch 1206.17 | loss  3.78 | ppl    43.777


 45%|████████████████████████████████▋                                        | 5130/11470 [1:46:12<2:08:08,  1.21s/it]

| epoch   2 step    16600 |   5130 batches | lr 0.000246 | ms/batch 1222.29 | loss  3.77 | ppl    43.391


 46%|█████████████████████████████████▉                                       | 5330/11470 [1:50:17<2:05:01,  1.22s/it]

| epoch   2 step    16800 |   5330 batches | lr 0.000246 | ms/batch 1226.78 | loss  3.76 | ppl    42.918


 48%|███████████████████████████████████▏                                     | 5529/11470 [1:54:23<2:01:48,  1.23s/it]

| epoch   2 step    17000 |   5530 batches | lr 0.000246 | ms/batch 1238.65 | loss  3.74 | ppl    42.309
----------------------------------------------------------------------------------------------------
| Eval  17 at step    17000 | time: 1228.17s | valid loss  3.64 | valid ppl    38.153
----------------------------------------------------------------------------------------------------


 50%|████████████████████████████████████▍                                    | 5730/11470 [1:58:43<1:56:44,  1.22s/it]

| epoch   2 step    17200 |   5730 batches | lr 0.000245 | ms/batch 1290.79 | loss  3.76 | ppl    43.154


 52%|█████████████████████████████████████▋                                   | 5930/11470 [2:02:51<1:53:38,  1.23s/it]

| epoch   2 step    17400 |   5930 batches | lr 0.000245 | ms/batch 1238.37 | loss  3.75 | ppl    42.393


 52%|█████████████████████████████████████▉                                   | 5970/11470 [2:03:39<1:51:39,  1.22s/it]