In [1]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({
    "pad_token": "[PAD]",
    "unk_token": "[UNK]",
    "bos_token": "[BOS]",
    "eos_token": "[EOS]",
})



4

In [None]:
from datasets import load_dataset

In [19]:
dataset = load_dataset("ashaba1in/small_openwebtext")

def tokenize_function(examples):
    tokenized = tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=256,  
        return_tensors="pt"
    )
    tokenized['labels'] = tokenized['input_ids'].clone()
    return tokenized

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

tokenized_dataset = tokenized_dataset.with_format('torch')

batch_size = 32

dataloader_egich = DataLoader(
    tokenized_dataset['train'],
    batch_size=batch_size,
    shuffle=True
)


In [6]:
import sys
import os

sys.path.append(os.path.abspath("../model"))

In [7]:
from model import GPT

In [5]:
device ='cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
vocab_size = len(tokenizer)
n_embd = 512
n_head = 8
block_size = 256
n_layer = 16
dropout = 0.3
device ='cuda' if torch.cuda.is_available() else 'cpu'
gpt = GPT(
        vocab_size=vocab_size,
        n_embd=n_embd,
        n_head=n_head,
        block_size=block_size,
        n_layer=n_layer,
        dropout=dropout,
        device=device
    ).to(device)
print(sum(p.numel() for p in gpt.parameters())/1e6, 'M parameters')

101.915733 M parameters


In [6]:
num_epochs = 2

In [7]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir="runs/gpt_experiment_1")


2025-04-23 06:15:30.707405: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-23 06:15:30.748183: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import torch.optim as optim
from tqdm import tqdm
from torch.optim.lr_scheduler import SequentialLR, LambdaLR, CosineAnnealingLR


num_epochs = 1
learning_rate = 3e-4
grad_clip = 1.0
save_every = 250
print_every = 50

optimizer = optim.AdamW(gpt.parameters(), lr=learning_rate, weight_decay=0.01)
scaler = torch.cuda.amp.GradScaler(enabled=device == 'cuda')
warmup_steps = 5000 

# Планировщик с warmup + decay
# scheduler = SequentialLR(
#     optimizer,
#     schedulers=[
#         LambdaLR(optimizer, lambda step: (step + 1) / warmup_steps), 
#         CosineAnnealingLR(optimizer, T_max=5000*2)
#     ],
#     milestones=[warmup_steps]
# )
scheduler = CosineAnnealingLR(optimizer, T_max=20000)  
scaler = torch.cuda.amp.GradScaler(
    enabled=(device == device),
    init_scale=2**16,
    growth_interval=2000
)


  scaler = torch.cuda.amp.GradScaler(enabled=device == 'cuda')
  scaler = torch.cuda.amp.GradScaler(


In [None]:
global_step = 0
best_loss = float('inf')
flag1 = False
flag2 = False

losses = []

for epoch in range(num_epochs):
    gpt.train()
    total_loss = 0.0
    progress_bar = tqdm(dataloader_egich, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

    for batch in progress_bar:
        optimizer.zero_grad(set_to_none=True)

        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()[:, 1:].contiguous()

        with torch.amp.autocast(device_type='cuda', enabled=device == 'cuda'):
            logits, loss = gpt(inputs[:, :-1], targets)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(gpt.parameters(), grad_clip)
        
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        current_lr = optimizer.param_groups[0]['lr']
        writer.add_scalar("Train/Loss", loss.item(), global_step)
        writer.add_scalar("Train/Learning Rate", current_lr, global_step)
        
        with torch.no_grad():
            perplexity = torch.exp(loss.detach()).item()
        writer.add_scalar("Train/Perplexity", perplexity, global_step)

        total_loss += loss.item()
        losses.append(loss.item())
        

        if global_step % print_every == 0:
            avg_loss = total_loss / (print_every if global_step > 0 else 1)
            print(
                f"\nStep {global_step} | "
                f"Loss: {avg_loss:.4f} | "
                f"Perplexity: {perplexity:.2f} | "
                f"LR: {current_lr:.2e} | "
            )
            total_loss = 0.0
            
        if global_step % save_every == 0 and global_step > 0:
            checkpoint = {
                'model_state_dict': gpt.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step,
                'loss': loss.item(),
            }

            if loss.item() < best_loss:
                best_loss = loss.item()
                torch.save(gpt.state_dict(), "gpt_best.pt")

        global_step += 1
        
        if global_step >= 20000:
            flag1 = True
            break
            
    if flag1:
        break
            

torch.save(gpt.state_dict(), "gpt_final.pt")
print("Training completed!")

Epoch 1/1:   0%|          | 1/31250 [00:00<4:42:21,  1.84it/s]


Step 0 | Loss: 4.1400 | Perplexity: 62.81 | LR: 3.00e-04 | 


Epoch 1/1:   0%|          | 51/31250 [00:23<4:01:39,  2.15it/s]


Step 50 | Loss: 4.2903 | Perplexity: 72.94 | LR: 3.00e-04 | 


Epoch 1/1:   0%|          | 101/31250 [00:46<3:58:54,  2.17it/s]


Step 100 | Loss: 4.2807 | Perplexity: 87.46 | LR: 3.00e-04 | 


Epoch 1/1:   0%|          | 151/31250 [01:09<4:00:46,  2.15it/s]


Step 150 | Loss: 4.2561 | Perplexity: 61.65 | LR: 3.00e-04 | 


Epoch 1/1:   1%|          | 201/31250 [01:33<4:00:04,  2.16it/s]


Step 200 | Loss: 4.2442 | Perplexity: 67.75 | LR: 3.00e-04 | 


Epoch 1/1:   1%|          | 250/31250 [01:55<4:00:16,  2.15it/s]


Step 250 | Loss: 4.2671 | Perplexity: 72.75 | LR: 3.00e-04 | 


Epoch 1/1:   1%|          | 301/31250 [02:34<4:00:35,  2.14it/s] 


Step 300 | Loss: 4.2428 | Perplexity: 63.02 | LR: 3.00e-04 | 


Epoch 1/1:   1%|          | 351/31250 [02:57<3:58:38,  2.16it/s]


Step 350 | Loss: 4.2731 | Perplexity: 77.06 | LR: 3.00e-04 | 


Epoch 1/1:   1%|▏         | 401/31250 [03:21<4:21:02,  1.97it/s]


Step 400 | Loss: 4.2461 | Perplexity: 85.69 | LR: 3.00e-04 | 


Epoch 1/1:   1%|▏         | 451/31250 [03:44<4:00:26,  2.13it/s]


Step 450 | Loss: 4.2642 | Perplexity: 63.25 | LR: 3.00e-04 | 


Epoch 1/1:   2%|▏         | 501/31250 [04:08<3:58:33,  2.15it/s]


Step 500 | Loss: 4.2534 | Perplexity: 78.92 | LR: 3.00e-04 | 


Epoch 1/1:   2%|▏         | 551/31250 [04:31<3:58:45,  2.14it/s]


Step 550 | Loss: 4.2455 | Perplexity: 64.08 | LR: 2.99e-04 | 


Epoch 1/1:   2%|▏         | 601/31250 [04:54<3:57:24,  2.15it/s]


Step 600 | Loss: 4.2893 | Perplexity: 83.92 | LR: 2.99e-04 | 


Epoch 1/1:   2%|▏         | 651/31250 [05:18<3:57:34,  2.15it/s]


Step 650 | Loss: 4.2494 | Perplexity: 81.77 | LR: 2.99e-04 | 


Epoch 1/1:   2%|▏         | 701/31250 [05:41<3:59:15,  2.13it/s]


Step 700 | Loss: 4.2490 | Perplexity: 71.43 | LR: 2.99e-04 | 


Epoch 1/1:   2%|▏         | 750/31250 [06:04<3:57:15,  2.14it/s]


Step 750 | Loss: 4.2849 | Perplexity: 102.86 | LR: 2.99e-04 | 


Epoch 1/1:   3%|▎         | 801/31250 [06:28<3:57:38,  2.14it/s]


Step 800 | Loss: 4.2545 | Perplexity: 72.15 | LR: 2.99e-04 | 


Epoch 1/1:   3%|▎         | 851/31250 [06:51<3:57:17,  2.14it/s]


Step 850 | Loss: 4.2612 | Perplexity: 56.21 | LR: 2.99e-04 | 


Epoch 1/1:   3%|▎         | 901/31250 [07:15<3:56:20,  2.14it/s]


Step 900 | Loss: 4.2530 | Perplexity: 63.29 | LR: 2.99e-04 | 


Epoch 1/1:   3%|▎         | 951/31250 [07:38<3:55:27,  2.14it/s]


Step 950 | Loss: 4.2247 | Perplexity: 75.12 | LR: 2.98e-04 | 


Epoch 1/1:   3%|▎         | 1000/31250 [08:01<3:56:17,  2.13it/s]


Step 1000 | Loss: 4.2814 | Perplexity: 70.23 | LR: 2.98e-04 | 


Epoch 1/1:   3%|▎         | 1051/31250 [08:40<3:55:01,  2.14it/s] 


Step 1050 | Loss: 4.2335 | Perplexity: 67.83 | LR: 2.98e-04 | 


Epoch 1/1:   4%|▎         | 1101/31250 [09:03<3:53:44,  2.15it/s]


Step 1100 | Loss: 4.2349 | Perplexity: 74.19 | LR: 2.98e-04 | 


Epoch 1/1:   4%|▎         | 1151/31250 [09:26<3:53:10,  2.15it/s]


Step 1150 | Loss: 4.2352 | Perplexity: 79.14 | LR: 2.98e-04 | 


Epoch 1/1:   4%|▍         | 1201/31250 [09:50<3:53:26,  2.15it/s]


Step 1200 | Loss: 4.2605 | Perplexity: 73.10 | LR: 2.97e-04 | 


Epoch 1/1:   4%|▍         | 1250/31250 [10:13<3:53:01,  2.15it/s]


Step 1250 | Loss: 4.2192 | Perplexity: 63.42 | LR: 2.97e-04 | 


Epoch 1/1:   4%|▍         | 1301/31250 [10:52<3:53:20,  2.14it/s] 


Step 1300 | Loss: 4.2465 | Perplexity: 70.74 | LR: 2.97e-04 | 


Epoch 1/1:   4%|▍         | 1351/31250 [11:15<3:52:12,  2.15it/s]


Step 1350 | Loss: 4.2393 | Perplexity: 54.93 | LR: 2.97e-04 | 


Epoch 1/1:   4%|▍         | 1401/31250 [11:38<3:51:51,  2.15it/s]


Step 1400 | Loss: 4.2298 | Perplexity: 61.57 | LR: 2.96e-04 | 


Epoch 1/1:   5%|▍         | 1451/31250 [12:02<3:52:21,  2.14it/s]


Step 1450 | Loss: 4.2466 | Perplexity: 69.83 | LR: 2.96e-04 | 


Epoch 1/1:   5%|▍         | 1500/31250 [12:25<3:51:29,  2.14it/s]


Step 1500 | Loss: 4.2344 | Perplexity: 63.36 | LR: 2.96e-04 | 


Epoch 1/1:   5%|▍         | 1551/31250 [13:04<3:49:48,  2.15it/s] 


Step 1550 | Loss: 4.2483 | Perplexity: 76.96 | LR: 2.96e-04 | 


Epoch 1/1:   5%|▌         | 1601/31250 [13:27<3:50:25,  2.14it/s]


Step 1600 | Loss: 4.2614 | Perplexity: 62.16 | LR: 2.95e-04 | 


Epoch 1/1:   5%|▌         | 1651/31250 [13:50<3:50:24,  2.14it/s]


Step 1650 | Loss: 4.2547 | Perplexity: 60.92 | LR: 2.95e-04 | 


Epoch 1/1:   5%|▌         | 1701/31250 [14:14<3:49:24,  2.15it/s]


Step 1700 | Loss: 4.2115 | Perplexity: 63.62 | LR: 2.95e-04 | 


Epoch 1/1:   6%|▌         | 1751/31250 [14:37<3:49:26,  2.14it/s]


Step 1750 | Loss: 4.2412 | Perplexity: 66.40 | LR: 2.94e-04 | 


Epoch 1/1:   6%|▌         | 1801/31250 [15:00<3:50:17,  2.13it/s]


Step 1800 | Loss: 4.2267 | Perplexity: 57.36 | LR: 2.94e-04 | 


Epoch 1/1:   6%|▌         | 1851/31250 [15:24<3:51:49,  2.11it/s]


Step 1850 | Loss: 4.2306 | Perplexity: 67.50 | LR: 2.94e-04 | 


Epoch 1/1:   6%|▌         | 1901/31250 [15:47<3:46:46,  2.16it/s]


Step 1900 | Loss: 4.2294 | Perplexity: 57.65 | LR: 2.93e-04 | 


Epoch 1/1:   6%|▌         | 1951/31250 [16:10<3:48:43,  2.13it/s]


Step 1950 | Loss: 4.2215 | Perplexity: 68.61 | LR: 2.93e-04 | 


Epoch 1/1:   6%|▋         | 2001/31250 [16:34<3:47:41,  2.14it/s]


Step 2000 | Loss: 4.2241 | Perplexity: 69.19 | LR: 2.93e-04 | 


Epoch 1/1:   7%|▋         | 2051/31250 [16:57<3:46:09,  2.15it/s]


Step 2050 | Loss: 4.2272 | Perplexity: 63.65 | LR: 2.92e-04 | 


Epoch 1/1:   7%|▋         | 2101/31250 [17:21<3:49:28,  2.12it/s]


Step 2100 | Loss: 4.2127 | Perplexity: 55.54 | LR: 2.92e-04 | 


Epoch 1/1:   7%|▋         | 2151/31250 [17:44<3:48:02,  2.13it/s]


Step 2150 | Loss: 4.1969 | Perplexity: 69.40 | LR: 2.92e-04 | 


Epoch 1/1:   7%|▋         | 2201/31250 [18:07<3:44:30,  2.16it/s]


Step 2200 | Loss: 4.2357 | Perplexity: 59.72 | LR: 2.91e-04 | 


Epoch 1/1:   7%|▋         | 2250/31250 [18:30<3:46:34,  2.13it/s]


Step 2250 | Loss: 4.2206 | Perplexity: 60.76 | LR: 2.91e-04 | 


Epoch 1/1:   7%|▋         | 2301/31250 [19:09<3:44:45,  2.15it/s] 


Step 2300 | Loss: 4.1871 | Perplexity: 59.59 | LR: 2.90e-04 | 


Epoch 1/1:   8%|▊         | 2351/31250 [19:33<3:46:53,  2.12it/s]


Step 2350 | Loss: 4.2046 | Perplexity: 62.23 | LR: 2.90e-04 | 


Epoch 1/1:   8%|▊         | 2401/31250 [19:56<3:46:18,  2.12it/s]


Step 2400 | Loss: 4.2183 | Perplexity: 64.52 | LR: 2.89e-04 | 


Epoch 1/1:   8%|▊         | 2451/31250 [20:20<3:46:27,  2.12it/s]


Step 2450 | Loss: 4.2205 | Perplexity: 71.07 | LR: 2.89e-04 | 


Epoch 1/1:   8%|▊         | 2501/31250 [20:43<3:46:44,  2.11it/s]


Step 2500 | Loss: 4.2389 | Perplexity: 68.24 | LR: 2.89e-04 | 


Epoch 1/1:   8%|▊         | 2551/31250 [21:07<3:43:43,  2.14it/s]


Step 2550 | Loss: 4.2368 | Perplexity: 67.65 | LR: 2.88e-04 | 


Epoch 1/1:   8%|▊         | 2601/31250 [21:30<3:42:16,  2.15it/s]


Step 2600 | Loss: 4.1811 | Perplexity: 83.49 | LR: 2.88e-04 | 


Epoch 1/1:   8%|▊         | 2651/31250 [21:53<3:41:55,  2.15it/s]


Step 2650 | Loss: 4.2180 | Perplexity: 64.20 | LR: 2.87e-04 | 


Epoch 1/1:   9%|▊         | 2701/31250 [22:17<3:41:39,  2.15it/s]


Step 2700 | Loss: 4.1954 | Perplexity: 66.33 | LR: 2.87e-04 | 


Epoch 1/1:   9%|▉         | 2751/31250 [22:40<3:42:40,  2.13it/s]


Step 2750 | Loss: 4.1988 | Perplexity: 68.53 | LR: 2.86e-04 | 


Epoch 1/1:   9%|▉         | 2801/31250 [23:03<3:41:59,  2.14it/s]


Step 2800 | Loss: 4.2315 | Perplexity: 58.73 | LR: 2.86e-04 | 


Epoch 1/1:   9%|▉         | 2851/31250 [23:27<3:40:51,  2.14it/s]


Step 2850 | Loss: 4.2078 | Perplexity: 68.50 | LR: 2.85e-04 | 


Epoch 1/1:   9%|▉         | 2901/31250 [23:50<3:43:17,  2.12it/s]


Step 2900 | Loss: 4.2315 | Perplexity: 73.27 | LR: 2.85e-04 | 


Epoch 1/1:   9%|▉         | 2951/31250 [24:14<3:39:09,  2.15it/s]


Step 2950 | Loss: 4.2053 | Perplexity: 61.30 | LR: 2.84e-04 | 


Epoch 1/1:  10%|▉         | 3001/31250 [24:37<3:39:57,  2.14it/s]


Step 3000 | Loss: 4.2076 | Perplexity: 69.83 | LR: 2.84e-04 | 


Epoch 1/1:  10%|▉         | 3051/31250 [25:00<3:40:04,  2.14it/s]


Step 3050 | Loss: 4.2112 | Perplexity: 57.66 | LR: 2.83e-04 | 


Epoch 1/1:  10%|▉         | 3101/31250 [25:24<3:40:55,  2.12it/s]


Step 3100 | Loss: 4.2006 | Perplexity: 59.37 | LR: 2.83e-04 | 


Epoch 1/1:  10%|█         | 3151/31250 [25:47<3:38:53,  2.14it/s]


Step 3150 | Loss: 4.2023 | Perplexity: 61.07 | LR: 2.82e-04 | 


Epoch 1/1:  10%|█         | 3201/31250 [26:11<3:40:12,  2.12it/s]


Step 3200 | Loss: 4.1812 | Perplexity: 63.05 | LR: 2.81e-04 | 


Epoch 1/1:  10%|█         | 3251/31250 [26:34<3:38:28,  2.14it/s]


Step 3250 | Loss: 4.2105 | Perplexity: 67.72 | LR: 2.81e-04 | 


Epoch 1/1:  11%|█         | 3301/31250 [26:57<3:36:06,  2.16it/s]


Step 3300 | Loss: 4.2012 | Perplexity: 74.96 | LR: 2.80e-04 | 


Epoch 1/1:  11%|█         | 3351/31250 [27:21<3:38:07,  2.13it/s]


Step 3350 | Loss: 4.2218 | Perplexity: 84.62 | LR: 2.80e-04 | 


Epoch 1/1:  11%|█         | 3401/31250 [27:44<3:36:26,  2.14it/s]


Step 3400 | Loss: 4.2022 | Perplexity: 65.21 | LR: 2.79e-04 | 


Epoch 1/1:  11%|█         | 3451/31250 [28:07<3:35:29,  2.15it/s]


Step 3450 | Loss: 4.1980 | Perplexity: 73.23 | LR: 2.78e-04 | 


Epoch 1/1:  11%|█         | 3501/31250 [28:31<3:37:52,  2.12it/s]


Step 3500 | Loss: 4.2262 | Perplexity: 62.22 | LR: 2.78e-04 | 


Epoch 1/1:  11%|█▏        | 3551/31250 [28:54<3:35:19,  2.14it/s]


Step 3550 | Loss: 4.2077 | Perplexity: 63.17 | LR: 2.77e-04 | 


Epoch 1/1:  12%|█▏        | 3601/31250 [29:17<3:34:00,  2.15it/s]


Step 3600 | Loss: 4.1981 | Perplexity: 73.39 | LR: 2.77e-04 | 


Epoch 1/1:  12%|█▏        | 3651/31250 [29:41<3:35:26,  2.14it/s]


Step 3650 | Loss: 4.2113 | Perplexity: 70.93 | LR: 2.76e-04 | 


Epoch 1/1:  12%|█▏        | 3701/31250 [30:04<3:34:58,  2.14it/s]


Step 3700 | Loss: 4.1996 | Perplexity: 69.26 | LR: 2.75e-04 | 


Epoch 1/1:  12%|█▏        | 3751/31250 [30:28<3:34:13,  2.14it/s]


Step 3750 | Loss: 4.2001 | Perplexity: 69.88 | LR: 2.75e-04 | 


Epoch 1/1:  12%|█▏        | 3801/31250 [30:51<3:35:16,  2.13it/s]


Step 3800 | Loss: 4.2192 | Perplexity: 63.69 | LR: 2.74e-04 | 


Epoch 1/1:  12%|█▏        | 3851/31250 [31:14<3:32:30,  2.15it/s]


Step 3850 | Loss: 4.1991 | Perplexity: 56.19 | LR: 2.73e-04 | 


Epoch 1/1:  12%|█▏        | 3901/31250 [31:38<3:32:22,  2.15it/s]


Step 3900 | Loss: 4.1910 | Perplexity: 75.36 | LR: 2.73e-04 | 


Epoch 1/1:  13%|█▎        | 3951/31250 [32:01<3:33:54,  2.13it/s]


Step 3950 | Loss: 4.1923 | Perplexity: 81.86 | LR: 2.72e-04 | 


Epoch 1/1:  13%|█▎        | 4000/31250 [32:24<3:34:40,  2.12it/s]


Step 4000 | Loss: 4.1825 | Perplexity: 63.45 | LR: 2.71e-04 | 


Epoch 1/1:  13%|█▎        | 4051/31250 [32:48<3:31:02,  2.15it/s]


Step 4050 | Loss: 4.1988 | Perplexity: 67.25 | LR: 2.71e-04 | 


Epoch 1/1:  13%|█▎        | 4101/31250 [33:11<3:33:30,  2.12it/s]


Step 4100 | Loss: 4.1918 | Perplexity: 73.53 | LR: 2.70e-04 | 


Epoch 1/1:  13%|█▎        | 4151/31250 [33:35<3:32:44,  2.12it/s]


Step 4150 | Loss: 4.2098 | Perplexity: 66.21 | LR: 2.69e-04 | 


Epoch 1/1:  13%|█▎        | 4201/31250 [33:58<3:30:20,  2.14it/s]


Step 4200 | Loss: 4.1678 | Perplexity: 61.70 | LR: 2.69e-04 | 


Epoch 1/1:  14%|█▎        | 4251/31250 [34:22<3:31:55,  2.12it/s]


Step 4250 | Loss: 4.1926 | Perplexity: 66.59 | LR: 2.68e-04 | 


Epoch 1/1:  14%|█▍        | 4301/31250 [34:45<3:30:13,  2.14it/s]


Step 4300 | Loss: 4.1764 | Perplexity: 64.26 | LR: 2.67e-04 | 


Epoch 1/1:  14%|█▍        | 4351/31250 [35:09<3:29:30,  2.14it/s]


Step 4350 | Loss: 4.2306 | Perplexity: 68.64 | LR: 2.66e-04 | 


Epoch 1/1:  14%|█▍        | 4401/31250 [35:32<3:31:30,  2.12it/s]


Step 4400 | Loss: 4.1811 | Perplexity: 67.02 | LR: 2.66e-04 | 


Epoch 1/1:  14%|█▍        | 4451/31250 [35:56<3:30:14,  2.12it/s]


Step 4450 | Loss: 4.1861 | Perplexity: 62.05 | LR: 2.65e-04 | 


Epoch 1/1:  14%|█▍        | 4501/31250 [36:20<3:31:42,  2.11it/s]


Step 4500 | Loss: 4.1751 | Perplexity: 66.64 | LR: 2.64e-04 | 


Epoch 1/1:  15%|█▍        | 4551/31250 [36:43<3:30:24,  2.11it/s]


Step 4550 | Loss: 4.2018 | Perplexity: 72.04 | LR: 2.63e-04 | 


Epoch 1/1:  15%|█▍        | 4601/31250 [37:07<3:28:05,  2.13it/s]


Step 4600 | Loss: 4.1978 | Perplexity: 67.59 | LR: 2.63e-04 | 


Epoch 1/1:  15%|█▍        | 4651/31250 [37:30<3:28:19,  2.13it/s]


Step 4650 | Loss: 4.1843 | Perplexity: 61.02 | LR: 2.62e-04 | 


Epoch 1/1:  15%|█▌        | 4701/31250 [37:54<3:28:15,  2.12it/s]


Step 4700 | Loss: 4.2161 | Perplexity: 75.36 | LR: 2.61e-04 | 


Epoch 1/1:  15%|█▌        | 4751/31250 [38:17<3:26:37,  2.14it/s]


Step 4750 | Loss: 4.1840 | Perplexity: 77.84 | LR: 2.60e-04 | 


Epoch 1/1:  15%|█▌        | 4801/31250 [38:40<3:27:17,  2.13it/s]


Step 4800 | Loss: 4.1983 | Perplexity: 70.11 | LR: 2.59e-04 | 


Epoch 1/1:  16%|█▌        | 4851/31250 [39:04<3:26:31,  2.13it/s]


Step 4850 | Loss: 4.2093 | Perplexity: 79.80 | LR: 2.59e-04 | 


Epoch 1/1:  16%|█▌        | 4901/31250 [39:27<3:25:26,  2.14it/s]


Step 4900 | Loss: 4.1930 | Perplexity: 75.21 | LR: 2.58e-04 | 


Epoch 1/1:  16%|█▌        | 4951/31250 [39:51<3:25:41,  2.13it/s]


Step 4950 | Loss: 4.2037 | Perplexity: 68.70 | LR: 2.57e-04 | 


Epoch 1/1:  16%|█▌        | 5001/31250 [40:14<3:25:05,  2.13it/s]


Step 5000 | Loss: 4.1830 | Perplexity: 61.01 | LR: 2.56e-04 | 


Epoch 1/1:  16%|█▌        | 5051/31250 [40:38<3:24:09,  2.14it/s]


Step 5050 | Loss: 4.1825 | Perplexity: 64.00 | LR: 2.55e-04 | 


Epoch 1/1:  16%|█▋        | 5101/31250 [41:01<3:24:37,  2.13it/s]


Step 5100 | Loss: 4.1998 | Perplexity: 68.12 | LR: 2.54e-04 | 


Epoch 1/1:  16%|█▋        | 5151/31250 [41:24<3:24:19,  2.13it/s]


Step 5150 | Loss: 4.1726 | Perplexity: 74.02 | LR: 2.54e-04 | 


Epoch 1/1:  17%|█▋        | 5201/31250 [41:48<3:23:39,  2.13it/s]


Step 5200 | Loss: 4.2113 | Perplexity: 70.57 | LR: 2.53e-04 | 


Epoch 1/1:  17%|█▋        | 5250/31250 [42:11<3:23:10,  2.13it/s]


Step 5250 | Loss: 4.2266 | Perplexity: 59.85 | LR: 2.52e-04 | 


Epoch 1/1:  17%|█▋        | 5301/31250 [42:50<3:21:23,  2.15it/s] 


Step 5300 | Loss: 4.2115 | Perplexity: 74.09 | LR: 2.51e-04 | 


Epoch 1/1:  17%|█▋        | 5351/31250 [43:13<3:21:40,  2.14it/s]


Step 5350 | Loss: 4.1745 | Perplexity: 59.93 | LR: 2.50e-04 | 


Epoch 1/1:  17%|█▋        | 5401/31250 [43:36<3:21:01,  2.14it/s]


Step 5400 | Loss: 4.1744 | Perplexity: 78.74 | LR: 2.49e-04 | 


Epoch 1/1:  17%|█▋        | 5451/31250 [44:00<3:20:30,  2.14it/s]


Step 5450 | Loss: 4.1855 | Perplexity: 61.89 | LR: 2.48e-04 | 


Epoch 1/1:  18%|█▊        | 5501/31250 [44:23<3:21:28,  2.13it/s]


Step 5500 | Loss: 4.1881 | Perplexity: 74.15 | LR: 2.47e-04 | 


Epoch 1/1:  18%|█▊        | 5551/31250 [44:46<3:19:40,  2.15it/s]


Step 5550 | Loss: 4.1849 | Perplexity: 64.51 | LR: 2.47e-04 | 


Epoch 1/1:  18%|█▊        | 5601/31250 [45:10<3:19:48,  2.14it/s]


Step 5600 | Loss: 4.1787 | Perplexity: 51.11 | LR: 2.46e-04 | 


Epoch 1/1:  18%|█▊        | 5651/31250 [45:33<3:18:42,  2.15it/s]


Step 5650 | Loss: 4.1824 | Perplexity: 59.48 | LR: 2.45e-04 | 


Epoch 1/1:  18%|█▊        | 5701/31250 [45:57<3:18:11,  2.15it/s]


Step 5700 | Loss: 4.1928 | Perplexity: 73.67 | LR: 2.44e-04 | 


Epoch 1/1:  18%|█▊        | 5751/31250 [46:20<3:19:18,  2.13it/s]


Step 5750 | Loss: 4.1712 | Perplexity: 75.40 | LR: 2.43e-04 | 


Epoch 1/1:  19%|█▊        | 5801/31250 [46:43<3:18:54,  2.13it/s]


Step 5800 | Loss: 4.2080 | Perplexity: 69.31 | LR: 2.42e-04 | 


Epoch 1/1:  19%|█▊        | 5851/31250 [47:07<3:17:10,  2.15it/s]


Step 5850 | Loss: 4.1802 | Perplexity: 74.36 | LR: 2.41e-04 | 


Epoch 1/1:  19%|█▉        | 5901/31250 [47:30<3:18:27,  2.13it/s]


Step 5900 | Loss: 4.1899 | Perplexity: 56.47 | LR: 2.40e-04 | 


Epoch 1/1:  19%|█▉        | 5951/31250 [47:54<3:17:34,  2.13it/s]


Step 5950 | Loss: 4.1772 | Perplexity: 63.77 | LR: 2.39e-04 | 


Epoch 1/1:  19%|█▉        | 6001/31250 [48:17<3:16:12,  2.14it/s]


Step 6000 | Loss: 4.2141 | Perplexity: 71.21 | LR: 2.38e-04 | 


Epoch 1/1:  19%|█▉        | 6051/31250 [48:40<3:16:46,  2.13it/s]


Step 6050 | Loss: 4.1534 | Perplexity: 53.44 | LR: 2.37e-04 | 


Epoch 1/1:  20%|█▉        | 6101/31250 [49:04<3:16:40,  2.13it/s]


Step 6100 | Loss: 4.1695 | Perplexity: 67.22 | LR: 2.36e-04 | 


Epoch 1/1:  20%|█▉        | 6151/31250 [49:27<3:15:13,  2.14it/s]


Step 6150 | Loss: 4.1754 | Perplexity: 59.37 | LR: 2.35e-04 | 


Epoch 1/1:  20%|█▉        | 6201/31250 [49:50<3:15:35,  2.13it/s]


Step 6200 | Loss: 4.1925 | Perplexity: 57.99 | LR: 2.34e-04 | 


Epoch 1/1:  20%|██        | 6250/31250 [50:13<3:14:53,  2.14it/s]


Step 6250 | Loss: 4.1897 | Perplexity: 57.03 | LR: 2.33e-04 | 


Epoch 1/1:  20%|██        | 6301/31250 [50:52<3:14:35,  2.14it/s] 


Step 6300 | Loss: 4.1717 | Perplexity: 65.28 | LR: 2.32e-04 | 


Epoch 1/1:  20%|██        | 6351/31250 [51:15<3:13:46,  2.14it/s]


Step 6350 | Loss: 4.1922 | Perplexity: 62.73 | LR: 2.31e-04 | 


Epoch 1/1:  20%|██        | 6401/31250 [51:39<3:13:39,  2.14it/s]


Step 6400 | Loss: 4.1804 | Perplexity: 73.52 | LR: 2.30e-04 | 


Epoch 1/1:  21%|██        | 6451/31250 [52:02<3:15:16,  2.12it/s]


Step 6450 | Loss: 4.1829 | Perplexity: 70.28 | LR: 2.29e-04 | 


Epoch 1/1:  21%|██        | 6501/31250 [52:26<3:13:29,  2.13it/s]


Step 6500 | Loss: 4.1700 | Perplexity: 73.45 | LR: 2.28e-04 | 


Epoch 1/1:  21%|██        | 6551/31250 [52:49<3:13:05,  2.13it/s]


Step 6550 | Loss: 4.1783 | Perplexity: 84.08 | LR: 2.27e-04 | 


Epoch 1/1:  21%|██        | 6601/31250 [53:12<3:12:05,  2.14it/s]


Step 6600 | Loss: 4.1878 | Perplexity: 73.71 | LR: 2.26e-04 | 


Epoch 1/1:  21%|██▏       | 6651/31250 [53:36<3:10:57,  2.15it/s]


Step 6650 | Loss: 4.1359 | Perplexity: 58.83 | LR: 2.25e-04 | 


Epoch 1/1:  21%|██▏       | 6701/31250 [53:59<3:10:02,  2.15it/s]


Step 6700 | Loss: 4.1570 | Perplexity: 79.02 | LR: 2.24e-04 | 


Epoch 1/1:  22%|██▏       | 6751/31250 [54:22<3:13:18,  2.11it/s]


Step 6750 | Loss: 4.2074 | Perplexity: 70.27 | LR: 2.23e-04 | 


Epoch 1/1:  22%|██▏       | 6801/31250 [54:46<3:10:12,  2.14it/s]


Step 6800 | Loss: 4.1956 | Perplexity: 72.13 | LR: 2.22e-04 | 


Epoch 1/1:  22%|██▏       | 6851/31250 [55:09<3:09:36,  2.14it/s]


Step 6850 | Loss: 4.1788 | Perplexity: 65.83 | LR: 2.21e-04 | 


Epoch 1/1:  22%|██▏       | 6901/31250 [55:33<3:09:42,  2.14it/s]


Step 6900 | Loss: 4.1577 | Perplexity: 62.92 | LR: 2.20e-04 | 


Epoch 1/1:  22%|██▏       | 6951/31250 [55:56<3:09:15,  2.14it/s]


Step 6950 | Loss: 4.2021 | Perplexity: 61.74 | LR: 2.19e-04 | 


Epoch 1/1:  22%|██▏       | 7001/31250 [56:20<3:10:02,  2.13it/s]


Step 7000 | Loss: 4.1873 | Perplexity: 64.02 | LR: 2.18e-04 | 


Epoch 1/1:  23%|██▎       | 7051/31250 [56:43<3:11:02,  2.11it/s]


Step 7050 | Loss: 4.1867 | Perplexity: 69.82 | LR: 2.17e-04 | 


Epoch 1/1:  23%|██▎       | 7101/31250 [57:07<3:07:57,  2.14it/s]


Step 7100 | Loss: 4.1877 | Perplexity: 64.70 | LR: 2.16e-04 | 


Epoch 1/1:  23%|██▎       | 7151/31250 [57:30<3:07:47,  2.14it/s]


Step 7150 | Loss: 4.1765 | Perplexity: 62.24 | LR: 2.15e-04 | 


Epoch 1/1:  23%|██▎       | 7201/31250 [57:53<3:07:13,  2.14it/s]


Step 7200 | Loss: 4.1677 | Perplexity: 58.81 | LR: 2.14e-04 | 


Epoch 1/1:  23%|██▎       | 7251/31250 [58:17<3:07:03,  2.14it/s]


Step 7250 | Loss: 4.1842 | Perplexity: 72.26 | LR: 2.13e-04 | 


Epoch 1/1:  23%|██▎       | 7301/31250 [58:40<3:06:37,  2.14it/s]


Step 7300 | Loss: 4.1793 | Perplexity: 62.96 | LR: 2.12e-04 | 


Epoch 1/1:  24%|██▎       | 7351/31250 [59:03<3:06:30,  2.14it/s]


Step 7350 | Loss: 4.1623 | Perplexity: 68.70 | LR: 2.11e-04 | 


Epoch 1/1:  24%|██▎       | 7401/31250 [59:27<3:05:58,  2.14it/s]


Step 7400 | Loss: 4.1647 | Perplexity: 58.32 | LR: 2.10e-04 | 


Epoch 1/1:  24%|██▍       | 7451/31250 [59:50<3:06:48,  2.12it/s]


Step 7450 | Loss: 4.1907 | Perplexity: 66.80 | LR: 2.08e-04 | 


Epoch 1/1:  24%|██▍       | 7501/31250 [1:00:14<3:05:36,  2.13it/s]


Step 7500 | Loss: 4.1746 | Perplexity: 70.69 | LR: 2.07e-04 | 


Epoch 1/1:  24%|██▍       | 7551/31250 [1:00:37<3:04:09,  2.14it/s]


Step 7550 | Loss: 4.1533 | Perplexity: 61.80 | LR: 2.06e-04 | 


Epoch 1/1:  24%|██▍       | 7601/31250 [1:01:01<3:04:40,  2.13it/s]


Step 7600 | Loss: 4.1796 | Perplexity: 60.06 | LR: 2.05e-04 | 


Epoch 1/1:  24%|██▍       | 7651/31250 [1:01:24<3:04:51,  2.13it/s]


Step 7650 | Loss: 4.1768 | Perplexity: 74.53 | LR: 2.04e-04 | 


Epoch 1/1:  25%|██▍       | 7701/31250 [1:01:47<3:02:41,  2.15it/s]


Step 7700 | Loss: 4.1814 | Perplexity: 61.09 | LR: 2.03e-04 | 


Epoch 1/1:  25%|██▍       | 7751/31250 [1:02:11<3:04:03,  2.13it/s]


Step 7750 | Loss: 4.1929 | Perplexity: 60.40 | LR: 2.02e-04 | 


Epoch 1/1:  25%|██▍       | 7801/31250 [1:02:34<3:03:44,  2.13it/s]


Step 7800 | Loss: 4.1753 | Perplexity: 64.22 | LR: 2.01e-04 | 


Epoch 1/1:  25%|██▌       | 7851/31250 [1:02:58<3:02:45,  2.13it/s]


Step 7850 | Loss: 4.1588 | Perplexity: 63.69 | LR: 2.00e-04 | 


Epoch 1/1:  25%|██▌       | 7901/31250 [1:03:21<3:03:36,  2.12it/s]


Step 7900 | Loss: 4.1656 | Perplexity: 67.27 | LR: 1.99e-04 | 


Epoch 1/1:  25%|██▌       | 7951/31250 [1:03:44<3:01:55,  2.13it/s]


Step 7950 | Loss: 4.1688 | Perplexity: 60.34 | LR: 1.97e-04 | 


Epoch 1/1:  26%|██▌       | 8001/31250 [1:04:08<3:01:06,  2.14it/s]


Step 8000 | Loss: 4.1901 | Perplexity: 74.94 | LR: 1.96e-04 | 


Epoch 1/1:  26%|██▌       | 8051/31250 [1:04:31<3:01:26,  2.13it/s]


Step 8050 | Loss: 4.1672 | Perplexity: 57.33 | LR: 1.95e-04 | 


Epoch 1/1:  26%|██▌       | 8101/31250 [1:04:55<2:59:42,  2.15it/s]


Step 8100 | Loss: 4.1776 | Perplexity: 62.71 | LR: 1.94e-04 | 


Epoch 1/1:  26%|██▌       | 8151/31250 [1:05:18<2:59:40,  2.14it/s]


Step 8150 | Loss: 4.1596 | Perplexity: 53.08 | LR: 1.93e-04 | 


Epoch 1/1:  26%|██▌       | 8201/31250 [1:05:41<3:00:17,  2.13it/s]


Step 8200 | Loss: 4.2118 | Perplexity: 61.10 | LR: 1.92e-04 | 


Epoch 1/1:  26%|██▋       | 8251/31250 [1:06:05<2:59:10,  2.14it/s]


Step 8250 | Loss: 4.1810 | Perplexity: 70.86 | LR: 1.91e-04 | 


Epoch 1/1:  27%|██▋       | 8301/31250 [1:06:28<2:58:39,  2.14it/s]


Step 8300 | Loss: 4.1815 | Perplexity: 66.49 | LR: 1.90e-04 | 


Epoch 1/1:  27%|██▋       | 8351/31250 [1:06:51<2:59:08,  2.13it/s]


Step 8350 | Loss: 4.1889 | Perplexity: 63.79 | LR: 1.88e-04 | 


Epoch 1/1:  27%|██▋       | 8401/31250 [1:07:15<2:57:43,  2.14it/s]


Step 8400 | Loss: 4.1995 | Perplexity: 62.31 | LR: 1.87e-04 | 


Epoch 1/1:  27%|██▋       | 8451/31250 [1:07:38<2:57:45,  2.14it/s]


Step 8450 | Loss: 4.1633 | Perplexity: 67.56 | LR: 1.86e-04 | 


Epoch 1/1:  27%|██▋       | 8501/31250 [1:08:02<2:59:13,  2.12it/s]


Step 8500 | Loss: 4.1663 | Perplexity: 60.52 | LR: 1.85e-04 | 


Epoch 1/1:  27%|██▋       | 8551/31250 [1:08:25<2:57:20,  2.13it/s]


Step 8550 | Loss: 4.1886 | Perplexity: 70.72 | LR: 1.84e-04 | 


Epoch 1/1:  28%|██▊       | 8601/31250 [1:08:48<2:56:01,  2.14it/s]


Step 8600 | Loss: 4.1731 | Perplexity: 66.14 | LR: 1.83e-04 | 


Epoch 1/1:  28%|██▊       | 8651/31250 [1:09:12<2:57:06,  2.13it/s]


Step 8650 | Loss: 4.1615 | Perplexity: 73.31 | LR: 1.82e-04 | 


Epoch 1/1:  28%|██▊       | 8701/31250 [1:09:35<2:54:55,  2.15it/s]


Step 8700 | Loss: 4.1799 | Perplexity: 64.66 | LR: 1.80e-04 | 


Epoch 1/1:  28%|██▊       | 8751/31250 [1:09:59<2:56:24,  2.13it/s]


Step 8750 | Loss: 4.1825 | Perplexity: 64.15 | LR: 1.79e-04 | 


Epoch 1/1:  28%|██▊       | 8801/31250 [1:10:22<2:56:50,  2.12it/s]


Step 8800 | Loss: 4.1643 | Perplexity: 64.38 | LR: 1.78e-04 | 


Epoch 1/1:  28%|██▊       | 8851/31250 [1:10:46<2:53:53,  2.15it/s]


Step 8850 | Loss: 4.1682 | Perplexity: 65.28 | LR: 1.77e-04 | 


Epoch 1/1:  28%|██▊       | 8901/31250 [1:11:09<2:53:56,  2.14it/s]


Step 8900 | Loss: 4.1738 | Perplexity: 63.19 | LR: 1.76e-04 | 


Epoch 1/1:  29%|██▊       | 8951/31250 [1:11:32<2:54:14,  2.13it/s]


Step 8950 | Loss: 4.1576 | Perplexity: 58.77 | LR: 1.75e-04 | 


Epoch 1/1:  29%|██▉       | 9001/31250 [1:11:56<2:54:36,  2.12it/s]


Step 9000 | Loss: 4.1491 | Perplexity: 66.44 | LR: 1.73e-04 | 


Epoch 1/1:  29%|██▉       | 9051/31250 [1:12:19<2:52:32,  2.14it/s]


Step 9050 | Loss: 4.1817 | Perplexity: 70.57 | LR: 1.72e-04 | 


Epoch 1/1:  29%|██▉       | 9101/31250 [1:12:42<2:52:19,  2.14it/s]


Step 9100 | Loss: 4.1568 | Perplexity: 68.34 | LR: 1.71e-04 | 


Epoch 1/1:  29%|██▉       | 9151/31250 [1:13:06<2:51:50,  2.14it/s]


Step 9150 | Loss: 4.1725 | Perplexity: 64.67 | LR: 1.70e-04 | 


Epoch 1/1:  29%|██▉       | 9201/31250 [1:13:29<2:52:31,  2.13it/s]


Step 9200 | Loss: 4.1575 | Perplexity: 61.32 | LR: 1.69e-04 | 


Epoch 1/1:  30%|██▉       | 9251/31250 [1:13:53<2:53:03,  2.12it/s]


Step 9250 | Loss: 4.1684 | Perplexity: 64.77 | LR: 1.68e-04 | 


Epoch 1/1:  30%|██▉       | 9301/31250 [1:14:16<2:50:13,  2.15it/s]


Step 9300 | Loss: 4.1824 | Perplexity: 59.55 | LR: 1.66e-04 | 


Epoch 1/1:  30%|██▉       | 9351/31250 [1:14:40<2:50:06,  2.15it/s]


Step 9350 | Loss: 4.1624 | Perplexity: 62.24 | LR: 1.65e-04 | 


Epoch 1/1:  30%|███       | 9401/31250 [1:15:03<2:50:52,  2.13it/s]


Step 9400 | Loss: 4.1734 | Perplexity: 65.33 | LR: 1.64e-04 | 


Epoch 1/1:  30%|███       | 9451/31250 [1:15:26<2:49:52,  2.14it/s]


Step 9450 | Loss: 4.1464 | Perplexity: 64.59 | LR: 1.63e-04 | 


Epoch 1/1:  30%|███       | 9501/31250 [1:15:50<2:50:00,  2.13it/s]


Step 9500 | Loss: 4.1670 | Perplexity: 66.69 | LR: 1.62e-04 | 


Epoch 1/1:  31%|███       | 9551/31250 [1:16:13<2:48:23,  2.15it/s]


Step 9550 | Loss: 4.1670 | Perplexity: 61.00 | LR: 1.61e-04 | 


Epoch 1/1:  31%|███       | 9601/31250 [1:16:36<2:49:08,  2.13it/s]


Step 9600 | Loss: 4.1840 | Perplexity: 68.92 | LR: 1.59e-04 | 


Epoch 1/1:  31%|███       | 9651/31250 [1:17:00<2:48:58,  2.13it/s]


Step 9650 | Loss: 4.1813 | Perplexity: 75.23 | LR: 1.58e-04 | 


Epoch 1/1:  31%|███       | 9701/31250 [1:17:23<2:48:48,  2.13it/s]


Step 9700 | Loss: 4.1407 | Perplexity: 61.33 | LR: 1.57e-04 | 


Epoch 1/1:  31%|███       | 9751/31250 [1:17:47<2:47:02,  2.15it/s]


Step 9750 | Loss: 4.1690 | Perplexity: 60.34 | LR: 1.56e-04 | 


Epoch 1/1:  31%|███▏      | 9801/31250 [1:18:10<2:47:11,  2.14it/s]


Step 9800 | Loss: 4.1860 | Perplexity: 56.30 | LR: 1.55e-04 | 


Epoch 1/1:  32%|███▏      | 9851/31250 [1:18:33<2:47:15,  2.13it/s]


Step 9850 | Loss: 4.1738 | Perplexity: 63.16 | LR: 1.54e-04 | 


Epoch 1/1:  32%|███▏      | 9901/31250 [1:18:57<2:45:31,  2.15it/s]


Step 9900 | Loss: 4.1495 | Perplexity: 59.57 | LR: 1.52e-04 | 


Epoch 1/1:  32%|███▏      | 9951/31250 [1:19:20<2:47:12,  2.12it/s]


Step 9950 | Loss: 4.1750 | Perplexity: 57.18 | LR: 1.51e-04 | 


Epoch 1/1:  32%|███▏      | 10001/31250 [1:19:44<2:46:29,  2.13it/s]


Step 10000 | Loss: 4.1826 | Perplexity: 67.85 | LR: 1.50e-04 | 


Epoch 1/1:  32%|███▏      | 10051/31250 [1:20:07<2:44:49,  2.14it/s]


Step 10050 | Loss: 4.1765 | Perplexity: 68.28 | LR: 1.49e-04 | 


Epoch 1/1:  32%|███▏      | 10101/31250 [1:20:30<2:45:30,  2.13it/s]


Step 10100 | Loss: 4.1646 | Perplexity: 73.28 | LR: 1.48e-04 | 


Epoch 1/1:  32%|███▏      | 10151/31250 [1:20:54<2:44:01,  2.14it/s]


Step 10150 | Loss: 4.1599 | Perplexity: 64.87 | LR: 1.46e-04 | 


Epoch 1/1:  33%|███▎      | 10201/31250 [1:21:17<2:43:30,  2.15it/s]


Step 10200 | Loss: 4.1838 | Perplexity: 56.85 | LR: 1.45e-04 | 


Epoch 1/1:  33%|███▎      | 10251/31250 [1:21:41<2:44:43,  2.12it/s]


Step 10250 | Loss: 4.1624 | Perplexity: 71.21 | LR: 1.44e-04 | 


Epoch 1/1:  33%|███▎      | 10301/31250 [1:22:04<2:43:22,  2.14it/s]


Step 10300 | Loss: 4.1621 | Perplexity: 63.44 | LR: 1.43e-04 | 


Epoch 1/1:  33%|███▎      | 10351/31250 [1:22:27<2:42:37,  2.14it/s]


Step 10350 | Loss: 4.1624 | Perplexity: 58.96 | LR: 1.42e-04 | 


Epoch 1/1:  33%|███▎      | 10401/31250 [1:22:51<2:43:16,  2.13it/s]


Step 10400 | Loss: 4.1509 | Perplexity: 58.47 | LR: 1.41e-04 | 


Epoch 1/1:  33%|███▎      | 10451/31250 [1:23:14<2:42:28,  2.13it/s]


Step 10450 | Loss: 4.1745 | Perplexity: 79.64 | LR: 1.39e-04 | 


Epoch 1/1:  34%|███▎      | 10501/31250 [1:23:38<2:42:52,  2.12it/s]


Step 10500 | Loss: 4.1295 | Perplexity: 57.46 | LR: 1.38e-04 | 


Epoch 1/1:  34%|███▍      | 10551/31250 [1:24:01<2:42:26,  2.12it/s]


Step 10550 | Loss: 4.1726 | Perplexity: 63.46 | LR: 1.37e-04 | 


Epoch 1/1:  34%|███▍      | 10601/31250 [1:24:25<2:41:03,  2.14it/s]


Step 10600 | Loss: 4.1645 | Perplexity: 63.82 | LR: 1.36e-04 | 


Epoch 1/1:  34%|███▍      | 10651/31250 [1:24:48<2:39:47,  2.15it/s]


Step 10650 | Loss: 4.1716 | Perplexity: 59.98 | LR: 1.35e-04 | 


Epoch 1/1:  34%|███▍      | 10701/31250 [1:25:12<2:40:10,  2.14it/s]


Step 10700 | Loss: 4.1921 | Perplexity: 76.55 | LR: 1.34e-04 | 


Epoch 1/1:  34%|███▍      | 10751/31250 [1:25:35<2:39:59,  2.14it/s]


Step 10750 | Loss: 4.1620 | Perplexity: 66.29 | LR: 1.32e-04 | 


Epoch 1/1:  35%|███▍      | 10801/31250 [1:25:58<2:38:54,  2.14it/s]


Step 10800 | Loss: 4.1822 | Perplexity: 66.10 | LR: 1.31e-04 | 


Epoch 1/1:  35%|███▍      | 10851/31250 [1:26:22<2:40:50,  2.11it/s]


Step 10850 | Loss: 4.1747 | Perplexity: 69.22 | LR: 1.30e-04 | 


Epoch 1/1:  35%|███▍      | 10901/31250 [1:26:45<2:38:30,  2.14it/s]


Step 10900 | Loss: 4.1639 | Perplexity: 60.13 | LR: 1.29e-04 | 


Epoch 1/1:  35%|███▌      | 10951/31250 [1:27:08<2:37:53,  2.14it/s]


Step 10950 | Loss: 4.1610 | Perplexity: 63.86 | LR: 1.28e-04 | 


Epoch 1/1:  35%|███▌      | 11001/31250 [1:27:32<2:39:47,  2.11it/s]


Step 11000 | Loss: 4.1849 | Perplexity: 65.17 | LR: 1.27e-04 | 


Epoch 1/1:  35%|███▌      | 11051/31250 [1:27:55<2:37:05,  2.14it/s]


Step 11050 | Loss: 4.1540 | Perplexity: 73.75 | LR: 1.25e-04 | 


Epoch 1/1:  36%|███▌      | 11101/31250 [1:28:18<2:36:29,  2.15it/s]


Step 11100 | Loss: 4.1527 | Perplexity: 67.03 | LR: 1.24e-04 | 


Epoch 1/1:  36%|███▌      | 11151/31250 [1:28:42<2:38:15,  2.12it/s]


Step 11150 | Loss: 4.1631 | Perplexity: 52.47 | LR: 1.23e-04 | 


Epoch 1/1:  36%|███▌      | 11201/31250 [1:29:05<2:36:06,  2.14it/s]


Step 11200 | Loss: 4.1561 | Perplexity: 60.72 | LR: 1.22e-04 | 


Epoch 1/1:  36%|███▌      | 11251/31250 [1:29:29<2:36:06,  2.14it/s]


Step 11250 | Loss: 4.1659 | Perplexity: 70.88 | LR: 1.21e-04 | 


Epoch 1/1:  36%|███▌      | 11301/31250 [1:29:52<2:36:34,  2.12it/s]


Step 11300 | Loss: 4.1880 | Perplexity: 63.01 | LR: 1.20e-04 | 


Epoch 1/1:  36%|███▋      | 11351/31250 [1:30:16<2:35:05,  2.14it/s]


Step 11350 | Loss: 4.1702 | Perplexity: 65.72 | LR: 1.18e-04 | 


Epoch 1/1:  36%|███▋      | 11401/31250 [1:30:39<2:34:12,  2.15it/s]


Step 11400 | Loss: 4.1494 | Perplexity: 56.53 | LR: 1.17e-04 | 


Epoch 1/1:  37%|███▋      | 11451/31250 [1:31:02<2:34:35,  2.13it/s]


Step 11450 | Loss: 4.1849 | Perplexity: 74.22 | LR: 1.16e-04 | 


Epoch 1/1:  37%|███▋      | 11501/31250 [1:31:26<2:34:18,  2.13it/s]


Step 11500 | Loss: 4.1733 | Perplexity: 60.73 | LR: 1.15e-04 | 


Epoch 1/1:  37%|███▋      | 11551/31250 [1:31:49<2:34:58,  2.12it/s]


Step 11550 | Loss: 4.1925 | Perplexity: 64.75 | LR: 1.14e-04 | 


Epoch 1/1:  37%|███▋      | 11601/31250 [1:32:13<2:33:53,  2.13it/s]


Step 11600 | Loss: 4.1435 | Perplexity: 57.02 | LR: 1.13e-04 | 


Epoch 1/1:  37%|███▋      | 11651/31250 [1:32:36<2:32:50,  2.14it/s]


Step 11650 | Loss: 4.1588 | Perplexity: 75.97 | LR: 1.12e-04 | 


Epoch 1/1:  37%|███▋      | 11701/31250 [1:33:00<2:32:30,  2.14it/s]


Step 11700 | Loss: 4.1607 | Perplexity: 62.80 | LR: 1.10e-04 | 


Epoch 1/1:  38%|███▊      | 11751/31250 [1:33:23<2:33:20,  2.12it/s]


Step 11750 | Loss: 4.1389 | Perplexity: 62.24 | LR: 1.09e-04 | 


Epoch 1/1:  38%|███▊      | 11801/31250 [1:33:46<2:33:12,  2.12it/s]


Step 11800 | Loss: 4.1769 | Perplexity: 63.56 | LR: 1.08e-04 | 


Epoch 1/1:  38%|███▊      | 11851/31250 [1:34:10<2:32:01,  2.13it/s]


Step 11850 | Loss: 4.1574 | Perplexity: 60.93 | LR: 1.07e-04 | 


Epoch 1/1:  38%|███▊      | 11901/31250 [1:34:33<2:31:13,  2.13it/s]


Step 11900 | Loss: 4.1424 | Perplexity: 69.42 | LR: 1.06e-04 | 


Epoch 1/1:  38%|███▊      | 11951/31250 [1:34:57<2:31:24,  2.12it/s]


Step 11950 | Loss: 4.1556 | Perplexity: 58.13 | LR: 1.05e-04 | 


Epoch 1/1:  38%|███▊      | 12001/31250 [1:35:20<2:32:01,  2.11it/s]


Step 12000 | Loss: 4.1591 | Perplexity: 62.27 | LR: 1.04e-04 | 


Epoch 1/1:  39%|███▊      | 12051/31250 [1:35:44<2:30:13,  2.13it/s]


Step 12050 | Loss: 4.1879 | Perplexity: 58.30 | LR: 1.03e-04 | 


Epoch 1/1:  39%|███▊      | 12101/31250 [1:36:08<2:49:46,  1.88it/s]


Step 12100 | Loss: 4.1666 | Perplexity: 65.69 | LR: 1.01e-04 | 


Epoch 1/1:  39%|███▉      | 12151/31250 [1:36:31<2:29:44,  2.13it/s]


Step 12150 | Loss: 4.1888 | Perplexity: 59.73 | LR: 1.00e-04 | 


Epoch 1/1:  39%|███▉      | 12201/31250 [1:36:54<2:29:26,  2.12it/s]


Step 12200 | Loss: 4.1506 | Perplexity: 73.18 | LR: 9.92e-05 | 


Epoch 1/1:  39%|███▉      | 12251/31250 [1:37:18<2:28:59,  2.13it/s]


Step 12250 | Loss: 4.1629 | Perplexity: 61.80 | LR: 9.81e-05 | 


Epoch 1/1:  39%|███▉      | 12301/31250 [1:37:41<2:27:36,  2.14it/s]


Step 12300 | Loss: 4.1559 | Perplexity: 62.32 | LR: 9.70e-05 | 


Epoch 1/1:  40%|███▉      | 12351/31250 [1:38:04<2:26:53,  2.14it/s]


Step 12350 | Loss: 4.1603 | Perplexity: 66.59 | LR: 9.59e-05 | 


Epoch 1/1:  40%|███▉      | 12401/31250 [1:38:28<2:26:28,  2.14it/s]


Step 12400 | Loss: 4.1995 | Perplexity: 64.27 | LR: 9.48e-05 | 


Epoch 1/1:  40%|███▉      | 12451/31250 [1:38:51<2:26:36,  2.14it/s]


Step 12450 | Loss: 4.1728 | Perplexity: 64.65 | LR: 9.37e-05 | 


Epoch 1/1:  40%|████      | 12501/31250 [1:39:15<2:25:52,  2.14it/s]


Step 12500 | Loss: 4.1641 | Perplexity: 58.64 | LR: 9.26e-05 | 


Epoch 1/1:  40%|████      | 12551/31250 [1:39:38<2:25:37,  2.14it/s]


Step 12550 | Loss: 4.1680 | Perplexity: 56.33 | LR: 9.15e-05 | 


Epoch 1/1:  40%|████      | 12601/31250 [1:40:02<2:27:14,  2.11it/s]


Step 12600 | Loss: 4.1535 | Perplexity: 85.71 | LR: 9.04e-05 | 


Epoch 1/1:  40%|████      | 12651/31250 [1:40:25<2:26:14,  2.12it/s]


Step 12650 | Loss: 4.1602 | Perplexity: 71.53 | LR: 8.93e-05 | 


Epoch 1/1:  41%|████      | 12701/31250 [1:40:49<2:24:05,  2.15it/s]


Step 12700 | Loss: 4.1669 | Perplexity: 67.32 | LR: 8.83e-05 | 


Epoch 1/1:  41%|████      | 12751/31250 [1:41:12<2:25:13,  2.12it/s]


Step 12750 | Loss: 4.1527 | Perplexity: 57.62 | LR: 8.72e-05 | 


Epoch 1/1:  41%|████      | 12801/31250 [1:41:35<2:23:35,  2.14it/s]


Step 12800 | Loss: 4.1749 | Perplexity: 59.04 | LR: 8.61e-05 | 


Epoch 1/1:  41%|████      | 12851/31250 [1:41:59<2:22:58,  2.14it/s]


Step 12850 | Loss: 4.1552 | Perplexity: 57.46 | LR: 8.50e-05 | 


Epoch 1/1:  41%|████▏     | 12901/31250 [1:42:22<2:24:33,  2.12it/s]


Step 12900 | Loss: 4.1481 | Perplexity: 65.60 | LR: 8.40e-05 | 


Epoch 1/1:  41%|████▏     | 12951/31250 [1:42:46<2:23:41,  2.12it/s]


Step 12950 | Loss: 4.1647 | Perplexity: 63.40 | LR: 8.29e-05 | 


Epoch 1/1:  42%|████▏     | 13001/31250 [1:43:09<2:23:18,  2.12it/s]


Step 13000 | Loss: 4.1516 | Perplexity: 67.89 | LR: 8.19e-05 | 


Epoch 1/1:  42%|████▏     | 13051/31250 [1:43:33<2:22:54,  2.12it/s]


Step 13050 | Loss: 4.1660 | Perplexity: 66.98 | LR: 8.08e-05 | 


Epoch 1/1:  42%|████▏     | 13101/31250 [1:43:56<2:21:23,  2.14it/s]


Step 13100 | Loss: 4.1568 | Perplexity: 68.29 | LR: 7.98e-05 | 


Epoch 1/1:  42%|████▏     | 13151/31250 [1:44:20<2:22:10,  2.12it/s]


Step 13150 | Loss: 4.1717 | Perplexity: 64.89 | LR: 7.88e-05 | 


Epoch 1/1:  42%|████▏     | 13201/31250 [1:44:43<2:21:18,  2.13it/s]


Step 13200 | Loss: 4.1737 | Perplexity: 51.86 | LR: 7.77e-05 | 


Epoch 1/1:  42%|████▏     | 13251/31250 [1:45:07<2:20:16,  2.14it/s]


Step 13250 | Loss: 4.1632 | Perplexity: 60.98 | LR: 7.67e-05 | 


Epoch 1/1:  43%|████▎     | 13301/31250 [1:45:30<2:20:09,  2.13it/s]


Step 13300 | Loss: 4.1475 | Perplexity: 71.69 | LR: 7.57e-05 | 


Epoch 1/1:  43%|████▎     | 13351/31250 [1:45:53<2:19:23,  2.14it/s]


Step 13350 | Loss: 4.1451 | Perplexity: 59.07 | LR: 7.46e-05 | 


Epoch 1/1:  43%|████▎     | 13401/31250 [1:46:17<2:18:33,  2.15it/s]


Step 13400 | Loss: 4.1791 | Perplexity: 58.78 | LR: 7.36e-05 | 


Epoch 1/1:  43%|████▎     | 13451/31250 [1:46:40<2:19:23,  2.13it/s]


Step 13450 | Loss: 4.1745 | Perplexity: 66.24 | LR: 7.26e-05 | 


Epoch 1/1:  43%|████▎     | 13501/31250 [1:47:04<2:18:21,  2.14it/s]


Step 13500 | Loss: 4.1936 | Perplexity: 64.27 | LR: 7.16e-05 | 


Epoch 1/1:  43%|████▎     | 13551/31250 [1:47:27<2:17:55,  2.14it/s]


Step 13550 | Loss: 4.1362 | Perplexity: 64.95 | LR: 7.06e-05 | 


Epoch 1/1:  44%|████▎     | 13601/31250 [1:47:51<2:18:41,  2.12it/s]


Step 13600 | Loss: 4.1669 | Perplexity: 62.98 | LR: 6.96e-05 | 


Epoch 1/1:  44%|████▎     | 13651/31250 [1:48:14<2:17:25,  2.13it/s]


Step 13650 | Loss: 4.1566 | Perplexity: 74.52 | LR: 6.86e-05 | 


Epoch 1/1:  44%|████▍     | 13701/31250 [1:48:37<2:16:13,  2.15it/s]


Step 13700 | Loss: 4.1949 | Perplexity: 63.96 | LR: 6.76e-05 | 


Epoch 1/1:  44%|████▍     | 13751/31250 [1:49:01<2:17:02,  2.13it/s]


Step 13750 | Loss: 4.1491 | Perplexity: 57.32 | LR: 6.66e-05 | 


Epoch 1/1:  44%|████▍     | 13801/31250 [1:49:24<2:15:54,  2.14it/s]


Step 13800 | Loss: 4.1839 | Perplexity: 60.70 | LR: 6.57e-05 | 


Epoch 1/1:  44%|████▍     | 13851/31250 [1:49:48<2:14:29,  2.16it/s]


Step 13850 | Loss: 4.1363 | Perplexity: 74.88 | LR: 6.47e-05 | 


Epoch 1/1:  44%|████▍     | 13901/31250 [1:50:11<2:15:15,  2.14it/s]


Step 13900 | Loss: 4.1727 | Perplexity: 61.26 | LR: 6.37e-05 | 


Epoch 1/1:  45%|████▍     | 13951/31250 [1:50:34<2:14:00,  2.15it/s]


Step 13950 | Loss: 4.1823 | Perplexity: 77.55 | LR: 6.28e-05 | 


Epoch 1/1:  45%|████▍     | 14001/31250 [1:50:58<2:14:36,  2.14it/s]


Step 14000 | Loss: 4.1710 | Perplexity: 70.87 | LR: 6.18e-05 | 


Epoch 1/1:  45%|████▍     | 14051/31250 [1:51:21<2:14:21,  2.13it/s]


Step 14050 | Loss: 4.1493 | Perplexity: 67.71 | LR: 6.09e-05 | 


Epoch 1/1:  45%|████▌     | 14101/31250 [1:51:44<2:12:56,  2.15it/s]


Step 14100 | Loss: 4.1754 | Perplexity: 65.66 | LR: 5.99e-05 | 


Epoch 1/1:  45%|████▌     | 14151/31250 [1:52:08<2:12:38,  2.15it/s]


Step 14150 | Loss: 4.1530 | Perplexity: 60.53 | LR: 5.90e-05 | 


Epoch 1/1:  45%|████▌     | 14201/31250 [1:52:31<2:13:52,  2.12it/s]


Step 14200 | Loss: 4.1590 | Perplexity: 61.51 | LR: 5.80e-05 | 


Epoch 1/1:  46%|████▌     | 14250/31250 [1:52:54<2:14:01,  2.11it/s]


Step 14250 | Loss: 4.1613 | Perplexity: 56.00 | LR: 5.71e-05 | 


Epoch 1/1:  46%|████▌     | 14301/31250 [1:53:34<2:12:07,  2.14it/s] 


Step 14300 | Loss: 4.1815 | Perplexity: 67.70 | LR: 5.62e-05 | 


Epoch 1/1:  46%|████▌     | 14351/31250 [1:53:57<2:11:14,  2.15it/s]


Step 14350 | Loss: 4.1585 | Perplexity: 61.82 | LR: 5.53e-05 | 


Epoch 1/1:  46%|████▌     | 14401/31250 [1:54:20<2:12:37,  2.12it/s]


Step 14400 | Loss: 4.1683 | Perplexity: 68.65 | LR: 5.44e-05 | 


Epoch 1/1:  46%|████▌     | 14451/31250 [1:54:44<2:11:01,  2.14it/s]


Step 14450 | Loss: 4.1804 | Perplexity: 69.20 | LR: 5.35e-05 | 


Epoch 1/1:  46%|████▋     | 14501/31250 [1:55:07<2:10:26,  2.14it/s]


Step 14500 | Loss: 4.1570 | Perplexity: 57.70 | LR: 5.26e-05 | 


Epoch 1/1:  47%|████▋     | 14551/31250 [1:55:31<2:10:42,  2.13it/s]


Step 14550 | Loss: 4.1604 | Perplexity: 67.33 | LR: 5.17e-05 | 


Epoch 1/1:  47%|████▋     | 14601/31250 [1:55:54<2:09:51,  2.14it/s]


Step 14600 | Loss: 4.1508 | Perplexity: 58.74 | LR: 5.08e-05 | 


Epoch 1/1:  47%|████▋     | 14651/31250 [1:56:17<2:09:37,  2.13it/s]


Step 14650 | Loss: 4.1628 | Perplexity: 63.29 | LR: 4.99e-05 | 


Epoch 1/1:  47%|████▋     | 14701/31250 [1:56:41<2:10:50,  2.11it/s]


Step 14700 | Loss: 4.1615 | Perplexity: 71.16 | LR: 4.90e-05 | 


Epoch 1/1:  47%|████▋     | 14751/31250 [1:57:05<2:08:55,  2.13it/s]


Step 14750 | Loss: 4.1604 | Perplexity: 65.53 | LR: 4.82e-05 | 


Epoch 1/1:  47%|████▋     | 14801/31250 [1:57:28<2:07:51,  2.14it/s]


Step 14800 | Loss: 4.1622 | Perplexity: 66.12 | LR: 4.73e-05 | 


Epoch 1/1:  48%|████▊     | 14851/31250 [1:57:51<2:08:38,  2.12it/s]


Step 14850 | Loss: 4.1547 | Perplexity: 57.78 | LR: 4.64e-05 | 


Epoch 1/1:  48%|████▊     | 14901/31250 [1:58:15<2:06:54,  2.15it/s]


Step 14900 | Loss: 4.1715 | Perplexity: 70.72 | LR: 4.56e-05 | 


Epoch 1/1:  48%|████▊     | 14951/31250 [1:58:38<2:06:26,  2.15it/s]


Step 14950 | Loss: 4.1563 | Perplexity: 70.89 | LR: 4.48e-05 | 


Epoch 1/1:  48%|████▊     | 15001/31250 [1:59:01<2:07:29,  2.12it/s]


Step 15000 | Loss: 4.1626 | Perplexity: 60.33 | LR: 4.39e-05 | 


Epoch 1/1:  48%|████▊     | 15051/31250 [1:59:25<2:06:16,  2.14it/s]


Step 15050 | Loss: 4.1189 | Perplexity: 62.35 | LR: 4.31e-05 | 


Epoch 1/1:  48%|████▊     | 15101/31250 [1:59:48<2:05:54,  2.14it/s]


Step 15100 | Loss: 4.1421 | Perplexity: 67.41 | LR: 4.23e-05 | 


Epoch 1/1:  48%|████▊     | 15151/31250 [2:00:12<2:05:51,  2.13it/s]


Step 15150 | Loss: 4.1599 | Perplexity: 50.97 | LR: 4.14e-05 | 


Epoch 1/1:  49%|████▊     | 15201/31250 [2:00:35<2:05:18,  2.13it/s]


Step 15200 | Loss: 4.1783 | Perplexity: 64.12 | LR: 4.06e-05 | 


Epoch 1/1:  49%|████▉     | 15251/31250 [2:00:59<2:05:07,  2.13it/s]


Step 15250 | Loss: 4.1599 | Perplexity: 71.48 | LR: 3.98e-05 | 


Epoch 1/1:  49%|████▉     | 15301/31250 [2:01:22<2:05:50,  2.11it/s]


Step 15300 | Loss: 4.1638 | Perplexity: 68.69 | LR: 3.90e-05 | 


Epoch 1/1:  49%|████▉     | 15351/31250 [2:01:45<2:03:43,  2.14it/s]


Step 15350 | Loss: 4.1874 | Perplexity: 73.26 | LR: 3.83e-05 | 


Epoch 1/1:  49%|████▉     | 15401/31250 [2:02:09<2:03:50,  2.13it/s]


Step 15400 | Loss: 4.1594 | Perplexity: 56.26 | LR: 3.75e-05 | 


Epoch 1/1:  49%|████▉     | 15451/31250 [2:02:33<2:03:52,  2.13it/s]


Step 15450 | Loss: 4.1650 | Perplexity: 68.48 | LR: 3.67e-05 | 


Epoch 1/1:  50%|████▉     | 15501/31250 [2:02:56<2:02:59,  2.13it/s]


Step 15500 | Loss: 4.1450 | Perplexity: 67.04 | LR: 3.59e-05 | 


Epoch 1/1:  50%|████▉     | 15551/31250 [2:03:20<2:02:40,  2.13it/s]


Step 15550 | Loss: 4.1551 | Perplexity: 57.65 | LR: 3.52e-05 | 


Epoch 1/1:  50%|████▉     | 15601/31250 [2:03:43<2:02:34,  2.13it/s]


Step 15600 | Loss: 4.1334 | Perplexity: 56.76 | LR: 3.44e-05 | 


Epoch 1/1:  50%|█████     | 15651/31250 [2:04:06<2:01:02,  2.15it/s]


Step 15650 | Loss: 4.1339 | Perplexity: 56.82 | LR: 3.37e-05 | 


Epoch 1/1:  50%|█████     | 15701/31250 [2:04:30<2:01:42,  2.13it/s]


Step 15700 | Loss: 4.1504 | Perplexity: 70.55 | LR: 3.29e-05 | 


Epoch 1/1:  50%|█████     | 15751/31250 [2:04:53<2:01:30,  2.13it/s]


Step 15750 | Loss: 4.1684 | Perplexity: 65.91 | LR: 3.22e-05 | 


Epoch 1/1:  51%|█████     | 15801/31250 [2:05:17<1:59:45,  2.15it/s]


Step 15800 | Loss: 4.1637 | Perplexity: 51.58 | LR: 3.15e-05 | 


Epoch 1/1:  51%|█████     | 15851/31250 [2:05:40<2:00:50,  2.12it/s]


Step 15850 | Loss: 4.1566 | Perplexity: 64.43 | LR: 3.07e-05 | 


Epoch 1/1:  51%|█████     | 15901/31250 [2:06:03<2:01:08,  2.11it/s]


Step 15900 | Loss: 4.1382 | Perplexity: 48.02 | LR: 3.00e-05 | 


Epoch 1/1:  51%|█████     | 15951/31250 [2:06:27<2:00:28,  2.12it/s]


Step 15950 | Loss: 4.1677 | Perplexity: 53.25 | LR: 2.93e-05 | 


Epoch 1/1:  51%|█████     | 16001/31250 [2:06:51<2:00:54,  2.10it/s]


Step 16000 | Loss: 4.1631 | Perplexity: 69.20 | LR: 2.86e-05 | 


Epoch 1/1:  51%|█████▏    | 16051/31250 [2:07:14<1:57:52,  2.15it/s]


Step 16050 | Loss: 4.1551 | Perplexity: 64.63 | LR: 2.79e-05 | 


Epoch 1/1:  52%|█████▏    | 16101/31250 [2:07:37<1:57:27,  2.15it/s]


Step 16100 | Loss: 4.1730 | Perplexity: 77.14 | LR: 2.73e-05 | 


Epoch 1/1:  52%|█████▏    | 16151/31250 [2:08:01<1:57:38,  2.14it/s]


Step 16150 | Loss: 4.1492 | Perplexity: 60.15 | LR: 2.66e-05 | 


Epoch 1/1:  52%|█████▏    | 16201/31250 [2:08:24<1:57:50,  2.13it/s]


Step 16200 | Loss: 4.1589 | Perplexity: 60.18 | LR: 2.59e-05 | 


Epoch 1/1:  52%|█████▏    | 16251/31250 [2:08:48<1:57:13,  2.13it/s]


Step 16250 | Loss: 4.1729 | Perplexity: 67.95 | LR: 2.53e-05 | 


Epoch 1/1:  52%|█████▏    | 16301/31250 [2:09:11<1:56:47,  2.13it/s]


Step 16300 | Loss: 4.1627 | Perplexity: 68.56 | LR: 2.46e-05 | 


Epoch 1/1:  52%|█████▏    | 16351/31250 [2:09:34<1:55:44,  2.15it/s]


Step 16350 | Loss: 4.1806 | Perplexity: 52.79 | LR: 2.40e-05 | 


Epoch 1/1:  52%|█████▏    | 16401/31250 [2:09:58<1:55:31,  2.14it/s]


Step 16400 | Loss: 4.1593 | Perplexity: 74.11 | LR: 2.33e-05 | 


Epoch 1/1:  53%|█████▎    | 16451/31250 [2:10:21<1:56:04,  2.12it/s]


Step 16450 | Loss: 4.1510 | Perplexity: 66.35 | LR: 2.27e-05 | 


Epoch 1/1:  53%|█████▎    | 16501/31250 [2:10:44<1:54:56,  2.14it/s]


Step 16500 | Loss: 4.1595 | Perplexity: 57.78 | LR: 2.21e-05 | 


Epoch 1/1:  53%|█████▎    | 16551/31250 [2:11:08<1:54:46,  2.13it/s]


Step 16550 | Loss: 4.1546 | Perplexity: 72.47 | LR: 2.15e-05 | 


Epoch 1/1:  53%|█████▎    | 16601/31250 [2:11:31<1:54:31,  2.13it/s]


Step 16600 | Loss: 4.1529 | Perplexity: 73.56 | LR: 2.09e-05 | 


Epoch 1/1:  53%|█████▎    | 16651/31250 [2:11:55<1:53:37,  2.14it/s]


Step 16650 | Loss: 4.1606 | Perplexity: 70.85 | LR: 2.03e-05 | 


Epoch 1/1:  53%|█████▎    | 16701/31250 [2:12:18<1:54:52,  2.11it/s]


Step 16700 | Loss: 4.1763 | Perplexity: 61.37 | LR: 1.97e-05 | 


Epoch 1/1:  54%|█████▎    | 16751/31250 [2:12:42<1:54:24,  2.11it/s]


Step 16750 | Loss: 4.1519 | Perplexity: 61.44 | LR: 1.91e-05 | 


Epoch 1/1:  54%|█████▍    | 16801/31250 [2:13:05<1:52:23,  2.14it/s]


Step 16800 | Loss: 4.1646 | Perplexity: 61.24 | LR: 1.85e-05 | 


Epoch 1/1:  54%|█████▍    | 16851/31250 [2:13:28<1:52:05,  2.14it/s]


Step 16850 | Loss: 4.1389 | Perplexity: 63.70 | LR: 1.80e-05 | 


Epoch 1/1:  54%|█████▍    | 16901/31250 [2:13:52<1:53:11,  2.11it/s]


Step 16900 | Loss: 4.1511 | Perplexity: 63.30 | LR: 1.74e-05 | 


Epoch 1/1:  54%|█████▍    | 16951/31250 [2:14:15<1:52:02,  2.13it/s]


Step 16950 | Loss: 4.1800 | Perplexity: 78.96 | LR: 1.69e-05 | 


Epoch 1/1:  54%|█████▍    | 17001/31250 [2:14:39<1:51:18,  2.13it/s]


Step 17000 | Loss: 4.1494 | Perplexity: 66.64 | LR: 1.63e-05 | 


Epoch 1/1:  55%|█████▍    | 17051/31250 [2:15:02<1:51:01,  2.13it/s]


Step 17050 | Loss: 4.1785 | Perplexity: 75.91 | LR: 1.58e-05 | 


Epoch 1/1:  55%|█████▍    | 17101/31250 [2:15:26<1:50:00,  2.14it/s]


Step 17100 | Loss: 4.1597 | Perplexity: 69.89 | LR: 1.53e-05 | 


Epoch 1/1:  55%|█████▍    | 17151/31250 [2:15:49<1:49:22,  2.15it/s]


Step 17150 | Loss: 4.1571 | Perplexity: 57.60 | LR: 1.48e-05 | 


Epoch 1/1:  55%|█████▌    | 17201/31250 [2:16:13<1:49:51,  2.13it/s]


Step 17200 | Loss: 4.1480 | Perplexity: 62.49 | LR: 1.43e-05 | 


Epoch 1/1:  55%|█████▌    | 17250/31250 [2:16:36<1:48:53,  2.14it/s]


Step 17250 | Loss: 4.1519 | Perplexity: 66.08 | LR: 1.38e-05 | 


Epoch 1/1:  55%|█████▌    | 17301/31250 [2:17:00<1:49:05,  2.13it/s]


Step 17300 | Loss: 4.1581 | Perplexity: 51.24 | LR: 1.33e-05 | 


Epoch 1/1:  56%|█████▌    | 17351/31250 [2:17:23<1:48:48,  2.13it/s]


Step 17350 | Loss: 4.1540 | Perplexity: 68.90 | LR: 1.28e-05 | 


Epoch 1/1:  56%|█████▌    | 17401/31250 [2:17:46<1:47:28,  2.15it/s]


Step 17400 | Loss: 4.1389 | Perplexity: 63.18 | LR: 1.23e-05 | 


Epoch 1/1:  56%|█████▌    | 17451/31250 [2:18:10<1:48:27,  2.12it/s]


Step 17450 | Loss: 4.1430 | Perplexity: 71.91 | LR: 1.19e-05 | 


Epoch 1/1:  56%|█████▌    | 17501/31250 [2:18:33<1:47:45,  2.13it/s]


Step 17500 | Loss: 4.1990 | Perplexity: 71.60 | LR: 1.14e-05 | 


Epoch 1/1:  56%|█████▌    | 17551/31250 [2:18:57<1:46:49,  2.14it/s]


Step 17550 | Loss: 4.1670 | Perplexity: 70.34 | LR: 1.10e-05 | 


Epoch 1/1:  56%|█████▋    | 17601/31250 [2:19:20<1:46:56,  2.13it/s]


Step 17600 | Loss: 4.1674 | Perplexity: 61.53 | LR: 1.05e-05 | 


Epoch 1/1:  56%|█████▋    | 17651/31250 [2:19:44<1:45:22,  2.15it/s]


Step 17650 | Loss: 4.1830 | Perplexity: 78.05 | LR: 1.01e-05 | 


Epoch 1/1:  57%|█████▋    | 17701/31250 [2:20:07<1:45:14,  2.15it/s]


Step 17700 | Loss: 4.1863 | Perplexity: 67.71 | LR: 9.68e-06 | 


Epoch 1/1:  57%|█████▋    | 17751/31250 [2:20:30<1:46:15,  2.12it/s]


Step 17750 | Loss: 4.1736 | Perplexity: 62.54 | LR: 9.26e-06 | 


Epoch 1/1:  57%|█████▋    | 17801/31250 [2:20:54<1:44:47,  2.14it/s]


Step 17800 | Loss: 4.1624 | Perplexity: 55.46 | LR: 8.86e-06 | 


Epoch 1/1:  57%|█████▋    | 17851/31250 [2:21:17<1:44:16,  2.14it/s]


Step 17850 | Loss: 4.1593 | Perplexity: 56.79 | LR: 8.47e-06 | 


Epoch 1/1:  57%|█████▋    | 17901/31250 [2:21:41<1:45:29,  2.11it/s]


Step 17900 | Loss: 4.1580 | Perplexity: 68.71 | LR: 8.08e-06 | 


Epoch 1/1:  57%|█████▋    | 17951/31250 [2:22:04<1:44:51,  2.11it/s]


Step 17950 | Loss: 4.1575 | Perplexity: 60.54 | LR: 7.70e-06 | 


Epoch 1/1:  58%|█████▊    | 18001/31250 [2:22:28<1:43:32,  2.13it/s]


Step 18000 | Loss: 4.1635 | Perplexity: 67.56 | LR: 7.33e-06 | 


Epoch 1/1:  58%|█████▊    | 18051/31250 [2:22:51<1:43:12,  2.13it/s]


Step 18050 | Loss: 4.1675 | Perplexity: 58.89 | LR: 6.97e-06 | 


Epoch 1/1:  58%|█████▊    | 18101/31250 [2:23:15<1:42:13,  2.14it/s]


Step 18100 | Loss: 4.1716 | Perplexity: 58.53 | LR: 6.62e-06 | 


Epoch 1/1:  58%|█████▊    | 18151/31250 [2:23:38<1:41:57,  2.14it/s]


Step 18150 | Loss: 4.1703 | Perplexity: 61.32 | LR: 6.28e-06 | 


Epoch 1/1:  58%|█████▊    | 18201/31250 [2:24:02<1:42:35,  2.12it/s]


Step 18200 | Loss: 4.1729 | Perplexity: 55.04 | LR: 5.95e-06 | 


Epoch 1/1:  58%|█████▊    | 18250/31250 [2:24:25<1:41:40,  2.13it/s]


Step 18250 | Loss: 4.1601 | Perplexity: 50.72 | LR: 5.63e-06 | 


Epoch 1/1:  59%|█████▊    | 18301/31250 [2:25:04<1:40:59,  2.14it/s] 


Step 18300 | Loss: 4.1631 | Perplexity: 74.74 | LR: 5.31e-06 | 


Epoch 1/1:  59%|█████▊    | 18351/31250 [2:25:27<1:40:25,  2.14it/s]


Step 18350 | Loss: 4.1804 | Perplexity: 53.55 | LR: 5.00e-06 | 


Epoch 1/1:  59%|█████▉    | 18401/31250 [2:25:51<1:41:07,  2.12it/s]


Step 18400 | Loss: 4.1648 | Perplexity: 49.76 | LR: 4.71e-06 | 


Epoch 1/1:  59%|█████▉    | 18451/31250 [2:26:14<1:39:42,  2.14it/s]


Step 18450 | Loss: 4.1660 | Perplexity: 56.92 | LR: 4.42e-06 | 


Epoch 1/1:  59%|█████▉    | 18501/31250 [2:26:38<1:40:08,  2.12it/s]


Step 18500 | Loss: 4.1766 | Perplexity: 67.92 | LR: 4.14e-06 | 


Epoch 1/1:  59%|█████▉    | 18551/31250 [2:27:01<1:38:04,  2.16it/s]


Step 18550 | Loss: 4.1647 | Perplexity: 76.68 | LR: 3.87e-06 | 


Epoch 1/1:  60%|█████▉    | 18601/31250 [2:27:24<1:39:21,  2.12it/s]


Step 18600 | Loss: 4.1645 | Perplexity: 62.56 | LR: 3.61e-06 | 


Epoch 1/1:  60%|█████▉    | 18651/31250 [2:27:48<1:38:06,  2.14it/s]


Step 18650 | Loss: 4.1626 | Perplexity: 69.42 | LR: 3.36e-06 | 


Epoch 1/1:  60%|█████▉    | 18701/31250 [2:28:11<1:38:18,  2.13it/s]


Step 18700 | Loss: 4.1385 | Perplexity: 69.02 | LR: 3.11e-06 | 


Epoch 1/1:  60%|██████    | 18750/31250 [2:28:34<1:37:12,  2.14it/s]


Step 18750 | Loss: 4.1484 | Perplexity: 59.44 | LR: 2.88e-06 | 


Epoch 1/1:  60%|██████    | 18801/31250 [2:28:58<1:37:21,  2.13it/s]


Step 18800 | Loss: 4.1595 | Perplexity: 72.56 | LR: 2.65e-06 | 


Epoch 1/1:  60%|██████    | 18851/31250 [2:29:22<1:37:52,  2.11it/s]


Step 18850 | Loss: 4.1611 | Perplexity: 62.76 | LR: 2.44e-06 | 


Epoch 1/1:  60%|██████    | 18901/31250 [2:29:45<1:36:02,  2.14it/s]


Step 18900 | Loss: 4.1888 | Perplexity: 56.27 | LR: 2.23e-06 | 


Epoch 1/1:  61%|██████    | 18951/31250 [2:30:09<1:35:34,  2.14it/s]


Step 18950 | Loss: 4.1503 | Perplexity: 63.45 | LR: 2.03e-06 | 


Epoch 1/1:  61%|██████    | 19001/31250 [2:30:32<1:36:26,  2.12it/s]


Step 19000 | Loss: 4.1391 | Perplexity: 52.18 | LR: 1.84e-06 | 


Epoch 1/1:  61%|██████    | 19051/31250 [2:30:55<1:34:56,  2.14it/s]


Step 19050 | Loss: 4.1699 | Perplexity: 53.22 | LR: 1.66e-06 | 


Epoch 1/1:  61%|██████    | 19101/31250 [2:31:19<1:34:20,  2.15it/s]


Step 19100 | Loss: 4.1587 | Perplexity: 55.76 | LR: 1.49e-06 | 


Epoch 1/1:  61%|██████▏   | 19151/31250 [2:31:42<1:34:29,  2.13it/s]


Step 19150 | Loss: 4.1499 | Perplexity: 63.32 | LR: 1.33e-06 | 


Epoch 1/1:  61%|██████▏   | 19201/31250 [2:32:06<1:33:50,  2.14it/s]


Step 19200 | Loss: 4.1547 | Perplexity: 77.91 | LR: 1.18e-06 | 


Epoch 1/1:  62%|██████▏   | 19251/31250 [2:32:29<1:33:54,  2.13it/s]


Step 19250 | Loss: 4.1775 | Perplexity: 61.26 | LR: 1.04e-06 | 


Epoch 1/1:  62%|██████▏   | 19301/31250 [2:32:52<1:33:09,  2.14it/s]


Step 19300 | Loss: 4.1581 | Perplexity: 64.76 | LR: 9.03e-07 | 


Epoch 1/1:  62%|██████▏   | 19351/31250 [2:33:16<1:32:19,  2.15it/s]


Step 19350 | Loss: 4.1632 | Perplexity: 68.45 | LR: 7.79e-07 | 


Epoch 1/1:  62%|██████▏   | 19401/31250 [2:33:39<1:32:23,  2.14it/s]


Step 19400 | Loss: 4.1570 | Perplexity: 55.23 | LR: 6.63e-07 | 


Epoch 1/1:  62%|██████▏   | 19451/31250 [2:34:03<1:32:06,  2.14it/s]


Step 19450 | Loss: 4.1639 | Perplexity: 62.40 | LR: 5.57e-07 | 


Epoch 1/1:  62%|██████▏   | 19501/31250 [2:34:26<1:31:43,  2.13it/s]


Step 19500 | Loss: 4.1533 | Perplexity: 54.48 | LR: 4.61e-07 | 


Epoch 1/1:  63%|██████▎   | 19551/31250 [2:34:49<1:30:58,  2.14it/s]


Step 19550 | Loss: 4.1556 | Perplexity: 61.49 | LR: 3.73e-07 | 


Epoch 1/1:  63%|██████▎   | 19601/31250 [2:35:13<1:30:55,  2.14it/s]


Step 19600 | Loss: 4.1563 | Perplexity: 53.03 | LR: 2.95e-07 | 


Epoch 1/1:  63%|██████▎   | 19651/31250 [2:35:36<1:30:32,  2.13it/s]


Step 19650 | Loss: 4.1495 | Perplexity: 60.54 | LR: 2.25e-07 | 


Epoch 1/1:  63%|██████▎   | 19701/31250 [2:36:00<1:30:40,  2.12it/s]


Step 19700 | Loss: 4.1653 | Perplexity: 51.94 | LR: 1.65e-07 | 


Epoch 1/1:  63%|██████▎   | 19751/31250 [2:36:23<1:30:35,  2.12it/s]


Step 19750 | Loss: 4.1768 | Perplexity: 61.02 | LR: 1.15e-07 | 


Epoch 1/1:  63%|██████▎   | 19801/31250 [2:36:47<1:29:30,  2.13it/s]


Step 19800 | Loss: 4.1717 | Perplexity: 75.70 | LR: 7.33e-08 | 


Epoch 1/1:  64%|██████▎   | 19851/31250 [2:37:10<1:29:37,  2.12it/s]


Step 19850 | Loss: 4.1485 | Perplexity: 72.64 | LR: 4.11e-08 | 


Epoch 1/1:  64%|██████▎   | 19901/31250 [2:37:34<1:29:17,  2.12it/s]


Step 19900 | Loss: 4.1735 | Perplexity: 63.60 | LR: 1.81e-08 | 


Epoch 1/1:  64%|██████▍   | 19951/31250 [2:37:57<1:28:15,  2.13it/s]


Step 19950 | Loss: 4.1818 | Perplexity: 59.73 | LR: 4.44e-09 | 


Epoch 1/1:  64%|██████▍   | 19999/31250 [2:38:20<1:29:04,  2.10it/s]


Training completed!


In [58]:
def generate_text(text, max_new_tokens=50):
    input_tokens = tokenizer.encode(text, return_tensors="pt").to(gpt.device)
    generated_tokens = gpt.generate(input_tokens, max_new_tokens=max_new_tokens)
    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    
    return generated_text

In [33]:
input_text = "Barack Obama is"

print(generate_text(input_text))

Barack Obama is technically one of the least moral reasons the country remains idling of a U.S. pledge to tax new taxes and answer elections, sources loose in the way of forming the U.S. government-of-state party told ABC 22 less than


In [39]:
input_text_2 = "Why is the sky blue?"

print(generate_text(input_text_2))

Why is the sky blue? Take the above. I shared about the myriad authorials who offered complex and history books, ranging from armour to hair to in tones to styles.

If you think the sun was grinding to select, you can decided to look after 1967.


In [41]:
input_text_3 = "My favorite songs is"

print(generate_text(input_text_3))

My favorite songs is survival values and everything, behavior. Actually, most of the rest of the song that played playoffs and recorded – some when most of the die of millennials gave it in danger swirling and nerb about it and took the same difficulty on closets.


In [59]:
input_text_4 = "President"

print(generate_text(input_text_4, max_new_tokens=200))

President John E. Cornispy has criticised President Donald Trump for promising to impose an intelligence plan entirely. "The president asked us on it when the secretary of state got our stuff comes out and seriously the dog died. This decision is the only step I hear from him".

Trump stands for James Khan, pointing out the possibility of placing a collective view of American President Donald Trump, with Netanyahu pressing for the public's dreaming of doing what is likely to be even future sources of cooperation over his handling of climate change last year. The group hasn't repeatedly collaborated for a single rebuke of former president John J. Trump: former president Donald Trump. It looks about speaks to Shakespeare at the 1/11 summit where Trump will host a meeting of rival Republican Governor Sergei Carney. Trump's first official foreign policy announcement will be a key short-term reminder that Trump hasn't existed for the last couple months, and that by Monday the president had 

In [60]:
input_text_5 = "Sunrise is very pretty"

print(generate_text(input_text_5, max_new_tokens=200))

Sunrise is very pretty good. Snow is anything new to be certain. It’s best for science fiction tale

No, let’s not warp fire

The horror rise of scenes from UFO colonization

The world’s only impacted planets somehow action most of the seeping planets operating, finding a bigger proportion than Earth’s most important event.

Follow Yastala Kuba for the epic, public and industry-bending and high-school-educated Firefly project to build Oct. 25

Mutibont-rich, current Artists of Hiroshima

The tricky beauty is to modernize how a planet is on the planet’s farm, whether or not to live up to 30 million souls (including a fly bridge driving). It depends on its weight, its weight, and its possibility that it keeps itself is overy to its peg. The attraction of Muhammad the Fenigoral and Namoria goaling being poised to do well in the next few


In [64]:
input_text_6 = "Crocodiles are very dangerous"

print(generate_text(input_text_6, max_new_tokens=200))

Crocodiles are very dangerous when scientists continue adjusting to psilendment of Assyrification compromises, forensics.

The public arm of one of the most unpy mirrors is the fatal prefrontal cortex. Bions flux worsukes are typical only to test the functioning protection, and rates of height cannot survive or yield few light on-nule workouts with a thin response light.

The study, published in the March paper, cites a single microx. 1 microbes with less life- less warm tissue particles (currently acupuncture- cancer cells) that produce more flexibility or unpredictable behaviors. As these notes are amplified at two important processes, one could discover the sort of ability so the new sensor will cause an end to a "bitcoin" scanner.1 is a perfect proof that a proposed bug will be "active" without a few lockers. Each in some type that use two main variables and typically gets a higher validated shot.

The combined flaw is separated from a battery of UVBs, which naturally infringe on
