## Imports

In [1]:
import os
import sys
sys.path.append(os.path.join(".."))

In [2]:
import argparse
import time
import math
import os
import itertools
import torch
import torch.optim as optim
import numpy as np
import dataclasses
from dataclasses import dataclass
from tqdm import tqdm

from configs.xlmodelconfig import XlModelConfig
from configs.fnetarmodelconfig import FnetarModelConfig
from configs.feedbackmodelconfig import FeedbackModelConfig

from configs.xladaptiveconfig import XlAdaptiveConfig
from configs.feedbackadaptiveconfig import FeedbackAdaptiveConfig

from configs.xldataconfig import XlDataConfig
from configs.feedbackdataconfig import FeedbackDataConfig

from configs.xloptimizerconfig import XlOptimizerConfig
from configs.feedbackoptimizerconfig import FeedbackOptimizerConfig

from configs.runconfig import RunConfig

from blur import Blur

from models.xl import Xl
from models.fnetar import Fnetar
from models.feedback import Feedback

from modules.xlmemories import XlMemories
from modules.feedbackmemories import FeedbackMemories

from modules.adaptiveinput import AdaptiveInput
from modules.adaptivelogsoftmax import AdaptiveLogSoftmax

from utils.data_utils import get_lm_corpus
from utils.exp_utils import create_exp_dir

from models.utils.normaluniforminitializer import NormalUniformInitializer

## Model and data arguments

In [3]:
@dataclass
class Arguments:
    model_name: str
    dataset: str = 'wt103'
    data: str = '../../data/wikitext-103'
    cuda_device: str = 'cuda:0'

### Choose which model to train from ['xl', 'fnetar', 'feedback']

In [4]:
args = Arguments(model_name = 'feedback')

print(args)

Arguments(model_name='feedback', dataset='wt103', data='../../data/wikitext-103', cuda_device='cuda:0')


## Setup checkpoint and device

In [5]:
run_config = RunConfig()


run_config.work_dir = os.path.join(run_config.work_dir, time.strftime('%Y%m%d-%H%M%S'))
logging = create_exp_dir(run_config.work_dir, scripts_to_save=['../train.py', '../blur.py'], debug=run_config.debug)

# Set the random seed manually for reproducibility.
np.random.seed(run_config.seed)
torch.manual_seed(run_config.seed)

if torch.cuda.is_available():
    if not run_config.cuda:
        device = torch.device('cpu')
        print('WARNING: You have a CUDA device, so you should probably run with --cuda')
    else:
        device = torch.device(args.cuda_device)
        torch.cuda.manual_seed_all(run_config.seed)
else:
    device = torch.device('cpu')
    
print(run_config)


Experiment dir : LM-TFM\20211028-175936
RunConfig(work_dir='LM-TFM\\20211028-175936', cuda=True, seed=1111, log_interval=200, eval_interval=1000, debug=False, max_eval_steps=-1)


## Load corpus and config files

In [6]:
corpus = get_lm_corpus(args.data, args.dataset)

if args.model_name == 'xl':
    optimizer_config = XlOptimizerConfig()
    data_config = XlDataConfig()
    adaptive_config = XlAdaptiveConfig(n_classes=len(corpus.vocab))
    model_config = XlModelConfig()
elif args.model_name == 'fnetar':
    optimizer_config = XlOptimizerConfig()
    data_config = XlDataConfig()
    adaptive_config = XlAdaptiveConfig(n_classes=len(corpus.vocab))
    model_config = FnetarModelConfig()
elif args.model_name == 'feedback':
    optimizer_config = FeedbackOptimizerConfig()
    data_config = FeedbackDataConfig()
    adaptive_config = FeedbackAdaptiveConfig(n_classes=len(corpus.vocab))
    model_config = FeedbackModelConfig()
else:
    raise ValueError
    
assert data_config.batch_size % data_config.batch_chunk == 0

print(optimizer_config)
print(data_config)
print(adaptive_config)
print(model_config)

Loading cached dataset...
FeedbackOptimizerConfig(max_step=200000, eta_min=0.0, clip=0.1, lr_min=0.0, decay_rate=0.5, warmup_step=8000, scheduler='inv_sqrt', lr=0.0007, optim='adam')
FeedbackDataConfig(data='../data/wikitext-103', dataset='wt103', tgt_len=150, mem_len=None, batch_size=60, batch_chunk=15, eval_tgt_len=256, eval_mem_len=256, eval_batch_size=10, n_layer=8)
FeedbackAdaptiveConfig(d_model=512, n_classes=267735, cutoffs=[20000, 40000, 200000], div_value=1.0)
FeedbackModelConfig(n_layer=8, d_model=512, n_head=8, d_head=128, d_inner=2048, drop_out=0.1, drop_att=0.0, tgt_len=32, mem_len=256)


## Load data and construct model

In [7]:
tr_iter = corpus.get_iterator('train', data_config.batch_size, data_config.tgt_len,
    device=device, ext_len=0)
va_iter = corpus.get_iterator('valid', data_config.eval_batch_size, data_config.eval_tgt_len,
    device=device, ext_len=0)
te_iter = corpus.get_iterator('test', data_config.eval_batch_size, data_config.eval_tgt_len,
    device=device, ext_len=0)

encoder = AdaptiveInput(**dataclasses.asdict(adaptive_config))
decoder = AdaptiveLogSoftmax(**dataclasses.asdict(adaptive_config))

if args.model_name == 'xl':
    transformer = Xl(**dataclasses.asdict(model_config))
elif args.model_name == 'fnetar':
    transformer = Fnetar(**dataclasses.asdict(model_config))
elif args.model_name == 'feedback':
    transformer = Feedback(**dataclasses.asdict(model_config))
else:
    raise ValueError

model = Blur(encoder=encoder, transformer=transformer, decoder=decoder, tie_weight=True)

In [8]:
initializer = NormalUniformInitializer()
model.apply(initializer)
model.encoder.apply(initializer) # ensure embedding init is not overridden by out_layer in case of weight sharing

args.n_all_param = sum([p.nelement() for p in model.parameters()])
args.n_nonemb_param = sum([p.nelement() for p in model.transformer.parameters()])
args.n_encoder_param = sum([p.nelement() for p in model.encoder.parameters()])
args.n_decoder_param = sum([p.nelement() for p in model.decoder.parameters()])

para_model = model.to(device)

#### optimizer
optimizer = optim.Adam(model.parameters(), lr=optimizer_config.lr)

#### scheduler
# here we do not set eta_min to lr_min to be backward compatible
# because in previous versions eta_min is default to 0
# rather than the default value of lr_min 1e-6
if optimizer_config.scheduler == 'cosine':
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
        optimizer_config.max_step, eta_min=optimizer_config.eta_min) # should use eta_min arg
elif optimizer_config.scheduler == 'inv_sqrt':
    def lr_lambda(step):
        # return a multiplier instead of a learning rate
        if step == 0 and optimizer_config.warmup_step == 0:
            return 1.
        else:
            return 1. / (step ** 0.5) if step > optimizer_config.warmup_step \
                   else step / (optimizer_config.warmup_step ** 1.5)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

logging('=' * 100)
for k, v in args.__dict__.items():
    logging('    - {} : {}'.format(k, v))
logging('=' * 100)
# logging('#params = {}'.format(args.n_all_param))
# logging('#non emb params = {}'.format(args.n_nonemb_param))
# logging('#encoder params = {}'.format(args.n_encoder_param))
# logging('#decoder params = {}'.format(args.n_decoder_param))

    - model_name : feedback
    - dataset : wt103
    - data : ../../data/wikitext-103
    - cuda_device : cuda:0
    - n_all_param : 172010211
    - n_nonemb_param : 34660617
    - n_encoder_param : 137080320
    - n_decoder_param : 137349594


## Define training and evaluation functions

In [9]:
def train():
    # Turn on training mode which enables dropout.
    global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
    model.train()


    if args.model_name == 'xl' or args.model_name == 'fnetar':
        memories = XlMemories(
            n_stream=data_config.batch_chunk,
            n_layer=data_config.n_layer,
            tgt_len=data_config.tgt_len,
            mem_len=data_config.mem_len,
            ext_len=0,
            dtype=next(model.parameters()).dtype
        )
    else:
        memories = FeedbackMemories(n_stream=data_config.batch_chunk)

    train_iter = tr_iter
    for batch, (data, target, seq_len) in tqdm(
        enumerate(train_iter), 
        total = train_iter.n_batch
#         total=len(train_iter) // (data_config.batch_chunk * data_config.batch_size)
    ):
        model.zero_grad()

        data_chunks = torch.chunk(data, data_config.batch_chunk, 0)
        target_chunks = torch.chunk(target, data_config.batch_chunk, 0)
        for i in range(data_config.batch_chunk):
            data_i = data_chunks[i]
            target_i = target_chunks[i]
            memory_i = memories[i]
            loss, new_memory_i = para_model(data_i, target_i, memory_i)
            memories.update_memory_stream(stream_index=i, memory=new_memory_i)

            loss = loss.float().mean().type_as(loss) / data_config.batch_chunk
            loss.backward()
            train_loss += loss.float().item()


        torch.nn.utils.clip_grad_norm_(model.parameters(), optimizer_config.clip)
        optimizer.step()

        # step-wise learning rate annealing
        train_step += 1

        # linear warmup stage
        if train_step < optimizer_config.warmup_step:
            curr_lr = optimizer_config.lr * train_step / optimizer_config.warmup_step
            optimizer.param_groups[0]['lr'] = curr_lr

        else:
            scheduler.step()


        if train_step % run_config.log_interval == 0:
            cur_loss = train_loss / run_config.log_interval
            elapsed = time.time() - log_start_time
            log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
                      '| ms/batch {:5.2f} | loss {:5.2f}'.format(
                epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / run_config.log_interval, cur_loss)
            log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
            logging(log_str)
            train_loss = 0
            log_start_time = time.time()

        if train_step % run_config.eval_interval == 0:
            val_loss = evaluate(va_iter)
            logging('-' * 100)
            log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
                      '| valid loss {:5.2f}'.format(
                train_step // run_config.eval_interval, train_step,
                (time.time() - eval_start_time), val_loss)
            log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
            logging(log_str)
            logging('-' * 100)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                if not run_config.debug:
                    with open(os.path.join(run_config.work_dir, 'model.pt'), 'wb') as f:
                        torch.save(model, f)
                    with open(os.path.join(run_config.work_dir, 'optimizer.pt'), 'wb') as f:
                        torch.save(optimizer.state_dict(), f)
                best_val_loss = val_loss

            eval_start_time = time.time()

        if train_step == optimizer_config.max_step:
            break

In [10]:
def evaluate(eval_iter):
    # Turn on evaluation mode which disables dropout.
    model.eval()

    # Evaluation
    total_len, total_loss = 0, 0.

    if args.model_name == 'xl' or args.model_name == 'fnetar':
        eval_memories = XlMemories(
            n_stream=1,
            n_layer=data_config.n_layer,
            tgt_len=data_config.eval_tgt_len,
            mem_len=data_config.eval_mem_len,
            ext_len=0,
            dtype=next(model.parameters()).dtype
        )
    else:
        eval_memories = FeedbackMemories(n_stream=1)


    with torch.no_grad():

        for i, (data, target, seq_len) in enumerate(eval_iter):
            if run_config.max_eval_steps > 0 and i >= run_config.max_eval_steps:
                break
            loss, new_eval_memory = model(data, target, eval_memories[0])
            eval_memories.update_memory_stream(stream_index=0, memory=new_eval_memory)

            loss = loss.mean()
            total_loss += seq_len * loss.float().item()
            total_len += seq_len

    # Switch back to the training mode
    model.train()

    return total_loss / total_len

## Train Model

In [None]:
# Loop over epochs.
train_step = 0
train_loss = 0
best_val_loss = None

log_start_time = time.time()
eval_start_time = time.time()

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in itertools.count(start=1):
        train()
        if train_step == optimizer_config.max_step:
            logging('-' * 100)
            logging('End of training')
            break
except KeyboardInterrupt:
    logging('-' * 100)
    logging('Exiting from training early')

# Load the best saved model.
with open(os.path.join(run_config.work_dir, 'model.pt'), 'rb') as f:
    model = torch.load(f)
para_model = model.to(device)

# Run on test data.
test_loss = evaluate(te_iter)
logging('=' * 100)

logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format(
    test_loss, math.exp(test_loss)))
logging('=' * 100)

  2%|█▎                                                                          | 200/11470 [07:46<7:10:01,  2.29s/it]

| epoch   1 step      200 |    200 batches | lr 1.75e-05 | ms/batch 2332.64 | loss  9.59 | ppl 14630.030


  3%|██▋                                                                         | 400/11470 [15:29<7:07:13,  2.32s/it]

| epoch   1 step      400 |    400 batches | lr 3.5e-05 | ms/batch 2316.00 | loss  7.16 | ppl  1286.626


  5%|███▉                                                                        | 600/11470 [23:14<7:02:54,  2.33s/it]

| epoch   1 step      600 |    600 batches | lr 5.25e-05 | ms/batch 2324.51 | loss  6.78 | ppl   877.999


  7%|█████▎                                                                      | 800/11470 [30:58<6:54:02,  2.33s/it]

| epoch   1 step      800 |    800 batches | lr 7e-05 | ms/batch 2318.19 | loss  6.48 | ppl   654.381


  9%|██████▌                                                                     | 999/11470 [38:38<6:38:48,  2.29s/it]

| epoch   1 step     1000 |   1000 batches | lr 8.75e-05 | ms/batch 2314.99 | loss  6.25 | ppl   517.674
----------------------------------------------------------------------------------------------------
| Eval   1 at step     1000 | time: 2330.06s | valid loss  6.15 | valid ppl   469.426
----------------------------------------------------------------------------------------------------


 10%|███████▊                                                                   | 1200/11470 [46:34<6:33:50,  2.30s/it]

| epoch   1 step     1200 |   1200 batches | lr 0.000105 | ms/batch 2364.28 | loss  6.07 | ppl   433.725


 12%|█████████▏                                                                 | 1400/11470 [54:18<6:31:43,  2.33s/it]

| epoch   1 step     1400 |   1400 batches | lr 0.000122 | ms/batch 2323.55 | loss  5.89 | ppl   363.032


 14%|██████████▏                                                              | 1600/11470 [1:02:04<6:28:58,  2.36s/it]

| epoch   1 step     1600 |   1600 batches | lr 0.00014 | ms/batch 2326.36 | loss  5.72 | ppl   304.419


 16%|███████████▍                                                             | 1800/11470 [1:09:49<6:08:42,  2.29s/it]

| epoch   1 step     1800 |   1800 batches | lr 0.000158 | ms/batch 2326.33 | loss  5.56 | ppl   260.873


 17%|████████████▋                                                            | 1999/11470 [1:17:31<6:04:39,  2.31s/it]

| epoch   1 step     2000 |   2000 batches | lr 0.000175 | ms/batch 2321.14 | loss  5.43 | ppl   227.286
----------------------------------------------------------------------------------------------------
| Eval   2 at step     2000 | time: 2329.44s | valid loss  5.35 | valid ppl   209.658
----------------------------------------------------------------------------------------------------


 19%|██████████████                                                           | 2200/11470 [1:25:30<5:55:55,  2.30s/it]

| epoch   1 step     2200 |   2200 batches | lr 0.000193 | ms/batch 2383.71 | loss  5.30 | ppl   200.458


 21%|███████████████▎                                                         | 2400/11470 [1:33:16<5:57:26,  2.36s/it]

| epoch   1 step     2400 |   2400 batches | lr 0.00021 | ms/batch 2330.35 | loss  5.20 | ppl   181.631


 23%|████████████████▌                                                        | 2600/11470 [1:41:02<5:39:09,  2.29s/it]

| epoch   1 step     2600 |   2600 batches | lr 0.000228 | ms/batch 2329.11 | loss  5.10 | ppl   163.270


 24%|█████████████████▊                                                       | 2800/11470 [1:48:47<5:35:08,  2.32s/it]

| epoch   1 step     2800 |   2800 batches | lr 0.000245 | ms/batch 2326.67 | loss  5.01 | ppl   149.559


 26%|███████████████████                                                      | 2999/11470 [1:56:31<5:29:31,  2.33s/it]

| epoch   1 step     3000 |   3000 batches | lr 0.000263 | ms/batch 2331.35 | loss  4.95 | ppl   141.618
----------------------------------------------------------------------------------------------------
| Eval   3 at step     3000 | time: 2337.41s | valid loss  4.90 | valid ppl   134.841
----------------------------------------------------------------------------------------------------


 28%|████████████████████▎                                                    | 3200/11470 [2:04:29<5:15:00,  2.29s/it]

| epoch   1 step     3200 |   3200 batches | lr 0.00028 | ms/batch 2377.04 | loss  4.88 | ppl   131.129


 30%|█████████████████████▋                                                   | 3400/11470 [2:12:08<5:10:16,  2.31s/it]

| epoch   1 step     3400 |   3400 batches | lr 0.000297 | ms/batch 2294.54 | loss  4.82 | ppl   124.552


 31%|██████████████████████▉                                                  | 3600/11470 [2:19:46<5:01:53,  2.30s/it]

| epoch   1 step     3600 |   3600 batches | lr 0.000315 | ms/batch 2293.23 | loss  4.74 | ppl   114.388


 33%|████████████████████████▏                                                | 3800/11470 [2:27:26<4:55:53,  2.31s/it]

| epoch   1 step     3800 |   3800 batches | lr 0.000333 | ms/batch 2297.41 | loss  4.76 | ppl   117.254


 35%|█████████████████████████▍                                               | 3999/11470 [2:35:03<4:45:34,  2.29s/it]

| epoch   1 step     4000 |   4000 batches | lr 0.00035 | ms/batch 2297.43 | loss  4.71 | ppl   111.305
----------------------------------------------------------------------------------------------------
| Eval   4 at step     4000 | time: 2309.04s | valid loss  4.69 | valid ppl   109.240
----------------------------------------------------------------------------------------------------


 37%|██████████████████████████▋                                              | 4200/11470 [2:42:55<4:38:43,  2.30s/it]

| epoch   1 step     4200 |   4200 batches | lr 0.000367 | ms/batch 2348.90 | loss  4.66 | ppl   105.460


 38%|████████████████████████████                                             | 4400/11470 [2:50:33<4:29:54,  2.29s/it]

| epoch   1 step     4400 |   4400 batches | lr 0.000385 | ms/batch 2290.89 | loss  4.64 | ppl   103.283


 40%|█████████████████████████████▎                                           | 4600/11470 [2:58:20<4:32:01,  2.38s/it]

| epoch   1 step     4600 |   4600 batches | lr 0.000402 | ms/batch 2332.24 | loss  4.61 | ppl   100.569


 42%|██████████████████████████████▌                                          | 4800/11470 [3:06:26<4:34:14,  2.47s/it]

| epoch   1 step     4800 |   4800 batches | lr 0.00042 | ms/batch 2431.77 | loss  4.56 | ppl    95.355


 44%|███████████████████████████████▊                                         | 4999/11470 [3:14:31<4:23:59,  2.45s/it]

| epoch   1 step     5000 |   5000 batches | lr 0.000438 | ms/batch 2436.90 | loss  4.59 | ppl    98.631
----------------------------------------------------------------------------------------------------
| Eval   5 at step     5000 | time: 2365.77s | valid loss  4.57 | valid ppl    96.294
----------------------------------------------------------------------------------------------------


 45%|█████████████████████████████████                                        | 5200/11470 [3:22:54<4:12:58,  2.42s/it]

| epoch   1 step     5200 |   5200 batches | lr 0.000455 | ms/batch 2504.01 | loss  4.53 | ppl    93.164


 47%|██████████████████████████████████▎                                      | 5400/11470 [3:31:04<4:04:09,  2.41s/it]

| epoch   1 step     5400 |   5400 batches | lr 0.000472 | ms/batch 2449.98 | loss  4.48 | ppl    88.613


 49%|███████████████████████████████████▋                                     | 5600/11470 [3:39:13<4:02:27,  2.48s/it]

| epoch   1 step     5600 |   5600 batches | lr 0.00049 | ms/batch 2440.97 | loss  4.50 | ppl    90.449


 51%|████████████████████████████████████▉                                    | 5800/11470 [3:47:22<3:50:56,  2.44s/it]

| epoch   1 step     5800 |   5800 batches | lr 0.000507 | ms/batch 2447.44 | loss  4.50 | ppl    89.831


 52%|██████████████████████████████████████▏                                  | 5999/11470 [3:55:29<3:40:52,  2.42s/it]

| epoch   1 step     6000 |   6000 batches | lr 0.000525 | ms/batch 2446.29 | loss  4.46 | ppl    86.379
----------------------------------------------------------------------------------------------------
| Eval   6 at step     6000 | time: 2454.76s | valid loss  4.50 | valid ppl    89.965
----------------------------------------------------------------------------------------------------


 54%|███████████████████████████████████████▍                                 | 6200/11470 [4:03:50<3:35:27,  2.45s/it]

| epoch   1 step     6200 |   6200 batches | lr 0.000543 | ms/batch 2490.93 | loss  4.43 | ppl    84.287


 56%|████████████████████████████████████████▋                                | 6400/11470 [4:11:58<3:25:20,  2.43s/it]

| epoch   1 step     6400 |   6400 batches | lr 0.00056 | ms/batch 2440.11 | loss  4.47 | ppl    87.162


 58%|██████████████████████████████████████████                               | 6600/11470 [4:20:06<3:18:59,  2.45s/it]

| epoch   1 step     6600 |   6600 batches | lr 0.000577 | ms/batch 2443.82 | loss  4.41 | ppl    82.154


 59%|███████████████████████████████████████████▎                             | 6800/11470 [4:28:16<3:09:19,  2.43s/it]

| epoch   1 step     6800 |   6800 batches | lr 0.000595 | ms/batch 2447.82 | loss  4.41 | ppl    82.138


 61%|████████████████████████████████████████████▌                            | 6999/11470 [4:36:21<3:04:22,  2.47s/it]

| epoch   1 step     7000 |   7000 batches | lr 0.000613 | ms/batch 2440.43 | loss  4.42 | ppl    83.016
----------------------------------------------------------------------------------------------------
| Eval   7 at step     7000 | time: 2449.88s | valid loss  4.42 | valid ppl    83.347
----------------------------------------------------------------------------------------------------


 63%|█████████████████████████████████████████████▊                           | 7200/11470 [4:44:44<2:56:00,  2.47s/it]

| epoch   1 step     7200 |   7200 batches | lr 0.00063 | ms/batch 2497.68 | loss  4.37 | ppl    78.976


 65%|███████████████████████████████████████████████                          | 7400/11470 [4:52:52<2:46:40,  2.46s/it]

| epoch   1 step     7400 |   7400 batches | lr 0.000647 | ms/batch 2442.89 | loss  4.37 | ppl    78.745


 66%|████████████████████████████████████████████████▎                        | 7600/11470 [5:01:00<2:39:17,  2.47s/it]

| epoch   1 step     7600 |   7600 batches | lr 0.000665 | ms/batch 2440.68 | loss  4.36 | ppl    77.894


 68%|█████████████████████████████████████████████████▋                       | 7800/11470 [5:09:10<2:29:43,  2.45s/it]

| epoch   1 step     7800 |   7800 batches | lr 0.000682 | ms/batch 2446.92 | loss  4.38 | ppl    79.691


 70%|██████████████████████████████████████████████████▉                      | 7999/11470 [5:17:15<2:21:02,  2.44s/it]

| epoch   1 step     8000 |   8000 batches | lr 9.78e-10 | ms/batch 2439.12 | loss  4.37 | ppl    79.411
----------------------------------------------------------------------------------------------------
| Eval   8 at step     8000 | time: 2450.53s | valid loss  4.39 | valid ppl    80.739
----------------------------------------------------------------------------------------------------


 71%|████████████████████████████████████████████████████▏                    | 8200/11470 [5:25:36<2:11:49,  2.42s/it]

| epoch   1 step     8200 |   8200 batches | lr 1.97e-07 | ms/batch 2494.82 | loss  4.55 | ppl    95.073


 73%|█████████████████████████████████████████████████████▍                   | 8400/11470 [5:33:44<2:05:16,  2.45s/it]

| epoch   1 step     8400 |   8400 batches | lr 3.92e-07 | ms/batch 2437.68 | loss  4.57 | ppl    96.914


 75%|██████████████████████████████████████████████████████▋                  | 8600/11470 [5:41:52<1:55:45,  2.42s/it]

| epoch   1 step     8600 |   8600 batches | lr 5.88e-07 | ms/batch 2442.31 | loss  4.55 | ppl    94.254


 77%|████████████████████████████████████████████████████████                 | 8800/11470 [5:50:01<1:48:15,  2.43s/it]

| epoch   1 step     8800 |   8800 batches | lr 7.84e-07 | ms/batch 2440.29 | loss  4.56 | ppl    96.011


 78%|█████████████████████████████████████████████████████████▎               | 8999/11470 [5:58:07<1:42:23,  2.49s/it]

| epoch   1 step     9000 |   9000 batches | lr 9.79e-07 | ms/batch 2445.84 | loss  4.52 | ppl    92.069
----------------------------------------------------------------------------------------------------
| Eval   9 at step     9000 | time: 2449.41s | valid loss  4.34 | valid ppl    76.445
----------------------------------------------------------------------------------------------------


 80%|██████████████████████████████████████████████████████████▌              | 9200/11470 [6:06:28<1:31:00,  2.41s/it]

| epoch   1 step     9200 |   9200 batches | lr 1.17e-06 | ms/batch 2491.85 | loss  4.51 | ppl    91.226


 82%|███████████████████████████████████████████████████████████▊             | 9400/11470 [6:14:35<1:23:31,  2.42s/it]

| epoch   1 step     9400 |   9400 batches | lr 1.37e-06 | ms/batch 2434.62 | loss  4.52 | ppl    91.735


 84%|█████████████████████████████████████████████████████████████            | 9600/11470 [6:22:42<1:16:54,  2.47s/it]

| epoch   1 step     9600 |   9600 batches | lr 1.57e-06 | ms/batch 2436.60 | loss  4.52 | ppl    92.167


 85%|██████████████████████████████████████████████████████████████▎          | 9800/11470 [6:30:51<1:08:26,  2.46s/it]

| epoch   1 step     9800 |   9800 batches | lr 1.76e-06 | ms/batch 2441.50 | loss  4.48 | ppl    87.883


 87%|█████████████████████████████████████████████████████████████████▍         | 9999/11470 [6:38:55<58:45,  2.40s/it]

| epoch   1 step    10000 |  10000 batches | lr 1.96e-06 | ms/batch 2435.65 | loss  4.49 | ppl    88.830
----------------------------------------------------------------------------------------------------
| Eval  10 at step    10000 | time: 2445.01s | valid loss  4.29 | valid ppl    72.854
----------------------------------------------------------------------------------------------------


 89%|█████████████████████████████████████████████████████████████████▊        | 10200/11470 [6:47:15<51:58,  2.46s/it]

| epoch   1 step    10200 |  10200 batches | lr 2.15e-06 | ms/batch 2488.36 | loss  4.47 | ppl    87.126


 91%|███████████████████████████████████████████████████████████████████       | 10400/11470 [6:55:22<43:45,  2.45s/it]

| epoch   1 step    10400 |  10400 batches | lr 2.35e-06 | ms/batch 2435.08 | loss  4.44 | ppl    84.789


 92%|████████████████████████████████████████████████████████████████████▍     | 10600/11470 [7:03:30<35:58,  2.48s/it]

| epoch   1 step    10600 |  10600 batches | lr 2.54e-06 | ms/batch 2436.98 | loss  4.46 | ppl    86.746


 94%|█████████████████████████████████████████████████████████████████████▋    | 10800/11470 [7:11:39<26:48,  2.40s/it]

| epoch   1 step    10800 |  10800 batches | lr 2.74e-06 | ms/batch 2443.96 | loss  4.43 | ppl    83.550


 96%|██████████████████████████████████████████████████████████████████████▉   | 10999/11470 [7:19:45<19:19,  2.46s/it]

| epoch   1 step    11000 |  11000 batches | lr 2.94e-06 | ms/batch 2443.27 | loss  4.47 | ppl    87.279
----------------------------------------------------------------------------------------------------
| Eval  11 at step    11000 | time: 2446.76s | valid loss  4.25 | valid ppl    69.796
----------------------------------------------------------------------------------------------------


 98%|████████████████████████████████████████████████████████████████████████▎ | 11200/11470 [7:28:07<11:10,  2.48s/it]

| epoch   1 step    11200 |  11200 batches | lr 3.13e-06 | ms/batch 2499.17 | loss  4.44 | ppl    84.645


 99%|█████████████████████████████████████████████████████████████████████████▌| 11400/11470 [7:36:17<02:50,  2.43s/it]

| epoch   1 step    11400 |  11400 batches | lr 3.33e-06 | ms/batch 2447.83 | loss  4.44 | ppl    84.713


100%|██████████████████████████████████████████████████████████████████████████| 11470/11470 [7:39:07<00:00,  2.40s/it]
  1%|▊                                                                           | 130/11470 [05:17<7:42:21,  2.45s/it]

| epoch   2 step    11600 |    130 batches | lr 3.52e-06 | ms/batch 2437.07 | loss  4.43 | ppl    83.754


  3%|██▏                                                                         | 330/11470 [13:25<7:33:27,  2.44s/it]

| epoch   2 step    11800 |    330 batches | lr 3.72e-06 | ms/batch 2441.07 | loss  4.39 | ppl    80.585


  5%|███▌                                                                        | 529/11470 [21:30<7:20:55,  2.42s/it]

| epoch   2 step    12000 |    530 batches | lr 3.91e-06 | ms/batch 2438.94 | loss  4.40 | ppl    81.723
----------------------------------------------------------------------------------------------------
| Eval  12 at step    12000 | time: 2449.95s | valid loss  4.21 | valid ppl    67.322
----------------------------------------------------------------------------------------------------


  6%|████▊                                                                       | 730/11470 [29:53<7:15:52,  2.44s/it]

| epoch   2 step    12200 |    730 batches | lr 4.11e-06 | ms/batch 2499.19 | loss  4.38 | ppl    79.583


  8%|██████▏                                                                     | 930/11470 [38:01<7:06:42,  2.43s/it]

| epoch   2 step    12400 |    930 batches | lr 4.31e-06 | ms/batch 2439.36 | loss  4.38 | ppl    79.578


 10%|███████▍                                                                   | 1130/11470 [46:09<6:57:29,  2.42s/it]

| epoch   2 step    12600 |   1130 batches | lr 4.5e-06 | ms/batch 2441.14 | loss  4.38 | ppl    79.828


 12%|████████▋                                                                  | 1330/11470 [54:18<6:46:55,  2.41s/it]

| epoch   2 step    12800 |   1330 batches | lr 4.7e-06 | ms/batch 2444.96 | loss  4.36 | ppl    78.024


 13%|█████████▋                                                               | 1529/11470 [1:02:22<6:43:37,  2.44s/it]

| epoch   2 step    13000 |   1530 batches | lr 4.89e-06 | ms/batch 2433.17 | loss  4.35 | ppl    77.108
----------------------------------------------------------------------------------------------------
| Eval  13 at step    13000 | time: 2448.82s | valid loss  4.18 | valid ppl    65.066
----------------------------------------------------------------------------------------------------


 15%|███████████                                                              | 1730/11470 [1:10:43<6:31:55,  2.41s/it]

| epoch   2 step    13200 |   1730 batches | lr 5.09e-06 | ms/batch 2491.77 | loss  4.34 | ppl    76.711


 17%|████████████▎                                                            | 1930/11470 [1:18:52<6:35:10,  2.49s/it]

| epoch   2 step    13400 |   1930 batches | lr 5.28e-06 | ms/batch 2443.89 | loss  4.34 | ppl    76.972


 19%|█████████████▌                                                           | 2130/11470 [1:27:00<6:17:51,  2.43s/it]

| epoch   2 step    13600 |   2130 batches | lr 5.48e-06 | ms/batch 2442.42 | loss  4.37 | ppl    78.785


 20%|██████████████▊                                                          | 2330/11470 [1:35:08<6:13:16,  2.45s/it]

| epoch   2 step    13800 |   2330 batches | lr 5.68e-06 | ms/batch 2440.77 | loss  4.32 | ppl    75.213


 22%|████████████████                                                         | 2529/11470 [1:43:14<6:03:26,  2.44s/it]

| epoch   2 step    14000 |   2530 batches | lr 5.87e-06 | ms/batch 2439.19 | loss  4.32 | ppl    75.275
----------------------------------------------------------------------------------------------------
| Eval  14 at step    14000 | time: 2448.70s | valid loss  4.14 | valid ppl    63.094
----------------------------------------------------------------------------------------------------


 24%|█████████████████▎                                                       | 2730/11470 [1:51:34<5:51:08,  2.41s/it]

| epoch   2 step    14200 |   2730 batches | lr 6.07e-06 | ms/batch 2490.09 | loss  4.29 | ppl    73.187


 26%|██████████████████▋                                                      | 2930/11470 [2:00:09<5:47:09,  2.44s/it]

| epoch   2 step    14400 |   2930 batches | lr 6.26e-06 | ms/batch 2575.22 | loss  4.27 | ppl    71.851


 27%|███████████████████▉                                                     | 3130/11470 [2:08:18<5:31:40,  2.39s/it]

| epoch   2 step    14600 |   3130 batches | lr 6.46e-06 | ms/batch 2441.41 | loss  4.26 | ppl    70.982


 29%|█████████████████████▏                                                   | 3330/11470 [2:16:26<5:32:55,  2.45s/it]

| epoch   2 step    14800 |   3330 batches | lr 6.65e-06 | ms/batch 2442.35 | loss  4.27 | ppl    71.767


 31%|██████████████████████▍                                                  | 3529/11470 [2:24:31<5:19:56,  2.42s/it]

| epoch   2 step    15000 |   3530 batches | lr 6.85e-06 | ms/batch 2436.99 | loss  4.22 | ppl    68.148
----------------------------------------------------------------------------------------------------
| Eval  15 at step    15000 | time: 2474.42s | valid loss  4.12 | valid ppl    61.436
----------------------------------------------------------------------------------------------------


 33%|███████████████████████▋                                                 | 3730/11470 [2:32:52<5:12:50,  2.43s/it]

| epoch   2 step    15200 |   3730 batches | lr 7.04e-06 | ms/batch 2491.35 | loss  4.25 | ppl    69.909


 34%|█████████████████████████                                                | 3930/11470 [2:41:02<5:09:42,  2.46s/it]

| epoch   2 step    15400 |   3930 batches | lr 7.24e-06 | ms/batch 2451.03 | loss  4.24 | ppl    69.519


 36%|██████████████████████████▎                                              | 4130/11470 [2:49:11<5:04:35,  2.49s/it]

| epoch   2 step    15600 |   4130 batches | lr 7.44e-06 | ms/batch 2443.44 | loss  4.22 | ppl    67.788


 38%|███████████████████████████▌                                             | 4330/11470 [2:57:19<4:51:32,  2.45s/it]

| epoch   2 step    15800 |   4330 batches | lr 7.63e-06 | ms/batch 2442.59 | loss  4.21 | ppl    67.381


 39%|████████████████████████████▊                                            | 4529/11470 [3:05:24<4:44:22,  2.46s/it]

| epoch   2 step    16000 |   4530 batches | lr 7.83e-06 | ms/batch 2439.00 | loss  4.21 | ppl    67.101
----------------------------------------------------------------------------------------------------
| Eval  16 at step    16000 | time: 2450.55s | valid loss  4.10 | valid ppl    60.129
----------------------------------------------------------------------------------------------------


 41%|██████████████████████████████                                           | 4730/11470 [3:13:46<4:37:12,  2.47s/it]

| epoch   2 step    16200 |   4730 batches | lr 7.73e-06 | ms/batch 2495.48 | loss  4.15 | ppl    63.679


 43%|███████████████████████████████▍                                         | 4930/11470 [3:21:54<4:27:27,  2.45s/it]

| epoch   2 step    16400 |   4930 batches | lr 7.64e-06 | ms/batch 2438.42 | loss  4.16 | ppl    63.906


 45%|████████████████████████████████▋                                        | 5130/11470 [3:30:03<4:15:31,  2.42s/it]

| epoch   2 step    16600 |   5130 batches | lr 7.55e-06 | ms/batch 2447.39 | loss  4.15 | ppl    63.202


 46%|█████████████████████████████████▉                                       | 5330/11470 [3:38:12<4:08:12,  2.43s/it]

| epoch   2 step    16800 |   5330 batches | lr 7.46e-06 | ms/batch 2441.97 | loss  4.12 | ppl    61.559


 48%|███████████████████████████████████▏                                     | 5529/11470 [3:46:17<3:58:50,  2.41s/it]

| epoch   2 step    17000 |   5530 batches | lr 7.38e-06 | ms/batch 2438.25 | loss  4.11 | ppl    60.694
----------------------------------------------------------------------------------------------------
| Eval  17 at step    17000 | time: 2449.50s | valid loss  4.08 | valid ppl    59.044
----------------------------------------------------------------------------------------------------


 50%|████████████████████████████████████▍                                    | 5730/11470 [3:54:41<3:51:56,  2.42s/it]

| epoch   2 step    17200 |   5730 batches | lr 7.3e-06 | ms/batch 2508.71 | loss  4.10 | ppl    60.490


 52%|█████████████████████████████████████▋                                   | 5930/11470 [4:02:48<3:44:55,  2.44s/it]

| epoch   2 step    17400 |   5930 batches | lr 7.22e-06 | ms/batch 2434.20 | loss  4.07 | ppl    58.365


 53%|███████████████████████████████████████                                  | 6130/11470 [4:10:57<3:40:14,  2.47s/it]

| epoch   2 step    17600 |   6130 batches | lr 7.14e-06 | ms/batch 2442.93 | loss  4.04 | ppl    56.813


 55%|████████████████████████████████████████▎                                | 6330/11470 [4:19:05<3:28:11,  2.43s/it]

| epoch   2 step    17800 |   6330 batches | lr 7.07e-06 | ms/batch 2442.15 | loss  4.05 | ppl    57.594


 57%|█████████████████████████████████████████▌                               | 6529/11470 [4:27:11<3:17:12,  2.39s/it]

| epoch   2 step    18000 |   6530 batches | lr 7e-06 | ms/batch 2441.18 | loss  3.98 | ppl    53.637
----------------------------------------------------------------------------------------------------
| Eval  18 at step    18000 | time: 2450.83s | valid loss  4.06 | valid ppl    58.186
----------------------------------------------------------------------------------------------------


 59%|██████████████████████████████████████████▊                              | 6730/11470 [4:35:33<3:14:30,  2.46s/it]

| epoch   2 step    18200 |   6730 batches | lr 6.93e-06 | ms/batch 2497.97 | loss  3.97 | ppl    52.862


 60%|████████████████████████████████████████████                             | 6930/11470 [4:43:40<3:07:45,  2.48s/it]

| epoch   2 step    18400 |   6930 batches | lr 6.86e-06 | ms/batch 2437.91 | loss  3.96 | ppl    52.295


 62%|█████████████████████████████████████████████▍                           | 7130/11470 [4:51:49<2:55:06,  2.42s/it]

| epoch   2 step    18600 |   7130 batches | lr 6.8e-06 | ms/batch 2440.84 | loss  3.90 | ppl    49.488


 64%|██████████████████████████████████████████████▋                          | 7330/11470 [4:59:56<2:50:17,  2.47s/it]

| epoch   2 step    18800 |   7330 batches | lr 6.74e-06 | ms/batch 2439.13 | loss  3.84 | ppl    46.694


 66%|███████████████████████████████████████████████▉                         | 7529/11470 [5:08:03<2:40:42,  2.45s/it]

| epoch   2 step    19000 |   7530 batches | lr 6.67e-06 | ms/batch 2444.73 | loss  3.82 | ppl    45.457
----------------------------------------------------------------------------------------------------
| Eval  19 at step    19000 | time: 2449.40s | valid loss  4.06 | valid ppl    57.747
----------------------------------------------------------------------------------------------------


 67%|█████████████████████████████████████████████████▏                       | 7730/11470 [5:16:25<2:30:10,  2.41s/it]

| epoch   2 step    19200 |   7730 batches | lr 6.61e-06 | ms/batch 2498.85 | loss  3.74 | ppl    42.223


 69%|██████████████████████████████████████████████████▍                      | 7930/11470 [5:24:33<2:25:44,  2.47s/it]

| epoch   2 step    19400 |   7930 batches | lr 6.56e-06 | ms/batch 2436.83 | loss  3.61 | ppl    36.806


 71%|███████████████████████████████████████████████████▋                     | 8130/11470 [5:32:42<2:16:11,  2.45s/it]

| epoch   2 step    19600 |   8130 batches | lr 6.5e-06 | ms/batch 2446.18 | loss  4.00 | ppl    54.775


 73%|█████████████████████████████████████████████████████                    | 8330/11470 [5:40:51<2:08:29,  2.46s/it]

| epoch   2 step    19800 |   8330 batches | lr 6.44e-06 | ms/batch 2444.23 | loss  4.24 | ppl    69.321


 74%|██████████████████████████████████████████████████████▎                  | 8529/11470 [5:48:58<1:59:51,  2.45s/it]

| epoch   2 step    20000 |   8530 batches | lr 6.39e-06 | ms/batch 2447.98 | loss  4.23 | ppl    68.653
----------------------------------------------------------------------------------------------------
| Eval  20 at step    20000 | time: 2451.87s | valid loss  4.04 | valid ppl    56.723
----------------------------------------------------------------------------------------------------


 76%|███████████████████████████████████████████████████████▌                 | 8730/11470 [5:57:22<1:52:22,  2.46s/it]

| epoch   2 step    20200 |   8730 batches | lr 6.34e-06 | ms/batch 2509.34 | loss  4.25 | ppl    70.410


 78%|████████████████████████████████████████████████████████▊                | 8930/11470 [6:05:29<1:41:22,  2.39s/it]

| epoch   2 step    20400 |   8930 batches | lr 6.29e-06 | ms/batch 2437.05 | loss  4.24 | ppl    69.704


 80%|██████████████████████████████████████████████████████████               | 9130/11470 [6:13:38<1:34:34,  2.42s/it]

| epoch   2 step    20600 |   9130 batches | lr 6.24e-06 | ms/batch 2442.96 | loss  4.24 | ppl    69.144


 81%|███████████████████████████████████████████████████████████▍             | 9330/11470 [6:21:47<1:27:08,  2.44s/it]

| epoch   2 step    20800 |   9330 batches | lr 6.19e-06 | ms/batch 2442.35 | loss  4.23 | ppl    68.385


 83%|████████████████████████████████████████████████████████████▋            | 9529/11470 [6:29:52<1:18:33,  2.43s/it]

| epoch   2 step    21000 |   9530 batches | lr 6.14e-06 | ms/batch 2438.20 | loss  4.27 | ppl    71.708
----------------------------------------------------------------------------------------------------
| Eval  21 at step    21000 | time: 2451.20s | valid loss  4.03 | valid ppl    56.058
----------------------------------------------------------------------------------------------------


 85%|█████████████████████████████████████████████████████████████▉           | 9730/11470 [6:38:14<1:10:13,  2.42s/it]

| epoch   2 step    21200 |   9730 batches | lr 6.09e-06 | ms/batch 2500.53 | loss  4.21 | ppl    67.279


 86%|███████████████████████████████████████████████████████████████          | 9909/11470 [6:45:34<1:03:53,  2.46s/it]


----------------------------------------------------------------------------------------------------
Exiting from training early
