In [1]:
import pandas as pd

df = pd.read_parquet("hf://datasets/gamino/wiki_medical_terms/wiki_medical_terms.parquet")

In [2]:
articles = df.iloc[:, 1]

In [3]:
import wandb

wandb.init(
    # set the wandb project where this run will be logged
    project="Attention Benchmark ", )

# Initialize wandb
wandb_log = True  # Set to True to enable wandb logging
wandb_run_name = 'Casual Attention-mps'

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnielspace[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import torch 
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import transformers
import tiktoken

import math

from tqdm import tqdm

import os
import time

import re

In [5]:
class TextDataset(Dataset):
    def __init__(self, articles, model="gpt2", seq_length=512):
        self.tokenizer = tiktoken.get_encoding(model)
        self.vocab_size = self.tokenizer.n_vocab
        self.seq_length = seq_length
        self.articles = articles.apply(self.preprocess_and_tokenize)
        
        self.input_ids, self.attention_masks, self.targets = self.create_sequences()

    def preprocess_and_tokenize(self, text):
        # Preprocess text
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Tokenize text
        tokens = self.tokenizer.encode(text)
        
        # Check for invalid token indices
        assert all(token < self.vocab_size for token in tokens), "Token index out of range"
        
        # Pad and truncate tokens
        if len(tokens) > self.seq_length:
            tokens = tokens[:self.seq_length]
        else:
            tokens += [0] * (self.seq_length - len(tokens))
        self.tokens = tokens
            
        return tokens

    def create_sequences(self):
        input_ids = []
        attention_masks = []
        targets = []
        
        for tokens in self.articles:
            input_ids.append(tokens[:-1])  # Exclude the last token for input
            targets.append(tokens[1:])     # Exclude the first token for target
            attention_masks.append([1 if token != 0 else 0 for token in tokens[:-1]])
        
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_masks = torch.tensor(attention_masks, dtype=torch.long)
        targets = torch.tensor(targets, dtype=torch.long)
        
        return input_ids, attention_masks, targets

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        input_seq = self.input_ids[idx]
        attention_mask = self.attention_masks[idx]
        target_seq = self.targets[idx]
        
        sample = {'input_ids': input_seq, 'targets': target_seq, 'attention_mask': attention_mask}
        return sample

def pad_sequences(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    targets = [item['targets'] for item in batch]

    input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks_padded = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
    targets_padded = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=0)

    return {'input_ids': input_ids_padded, 'targets': targets_padded, 'attention_mask': attention_masks_padded }


In [6]:
# Create the dataset
dataset = TextDataset(articles, model="gpt2", seq_length=512)

# Create the dataloader
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=pad_sequences)

In [7]:
for data in dataloader:
    x, y, att = data['input_ids'], data['targets'], data['attention_mask']
    print(x,y,att)
    break

tensor([[11498,  1819,   296,  ..., 50002,   484,  1690],
        [  259, 14201,  1187,  ..., 26022,   286,  2585]]) tensor([[ 1819,   296,   392,  ...,   484,  1690,  3051],
        [14201,  1187,   389,  ...,   286,  2585,   423]]) tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])


In [8]:
#configuration
class GPTConfig:
    def __init__(self, vocab_size, block_size, n_layer, n_head, n_embd, dropout, bias=True):
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias

In [9]:
#attention
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        self.n_head = config.n_head
        self.d_k = config.n_embd // config.n_head
        self.scale = self.d_k ** -0.5

        self.qkv_proj = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.out_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.qkv_proj(x).reshape(B, T, 3, self.n_head, self.d_k).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn_scores = (q @ k.transpose(-2, -1)) * self.scale
        attn_scores = attn_scores.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.attn_dropout(attn_probs)

        attn_output = (attn_probs @ v).transpose(1, 2).reshape(B, T, C)
        attn_output = self.resid_dropout(self.out_proj(attn_output))
        return attn_output

In [10]:
#transformer block
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.attn = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(config.n_embd, eps=1e-5)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

In [11]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None, "Config must include vocab_size"
        assert config.block_size is not None, "Config must include block_size"
        self.config = config

        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'wpe': nn.Embedding(config.block_size, config.n_embd),
            'drop': nn.Dropout(config.dropout),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embd, eps=1e-5),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.tie_weights()

        self.apply(self._init_weights)
        self.init_residuals()
        print(f"Number of parameters: {self.get_num_params()/1e6:.2f}M")

    def tie_weights(self):
        self.transformer.wte.weight = self.lm_head.weight

    def init_residuals(self):
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer))

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Sequence length {t} exceeds block size {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        logits = self.lm_head(x)
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
            return logits, loss
        else:
            logits = logits[:, [-1], :]  # Use only the last token's logits
            return logits, None

    def crop_block_size(self, block_size):
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:, :, :block_size, :block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}, "Invalid model type"
        override_args = override_args or {}
        assert all(k == 'dropout' for k in override_args), "Only 'dropout' can be overridden"

        print(f"Loading weights from pretrained model: {model_type}")

        config_args = cls.get_config_args(model_type)
        if 'dropout' in override_args:
            print(f"Overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']

        config = GPTConfig(**config_args)
        model = GPT(config)
        model.load_pretrained_weights(model_type)
        return model

    @staticmethod
    def get_config_args(model_type):
        config_map = {
            'gpt2': {'n_layer': 12, 'n_head': 12, 'n_embd': 768},
            'gpt2-medium': {'n_layer': 24, 'n_head': 16, 'n_embd': 1024},
            'gpt2-large': {'n_layer': 36, 'n_head': 20, 'n_embd': 1280},
            'gpt2-xl': {'n_layer': 48, 'n_head': 25, 'n_embd': 1600},
        }
        config_args = config_map[model_type]
        config_args.update({'vocab_size': 50257, 'block_size': 1024, 'bias': True, 'dropout': 0.1})
        return config_args

    def load_pretrained_weights(self, model_type):
        model_hf = transformers.GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd = self.state_dict()
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        for k, v in sd_hf.items():
            if k in sd:
                if any(k.endswith(w) for w in transposed):
                    sd[k].copy_(v.t())
                else:
                    sd[k].copy_(v)

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas)
        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        N = self.get_num_params()
        L, H, Q, T = self.config.n_layer, self.config.n_head, self.config.n_embd // self.config.n_head, self.config.block_size
        flops_per_token = 6 * N + 12 * L * H * Q * T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        flops_achieved = flops_per_iter * (1.0 / dt)
        flops_promised = 312e12
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


In [12]:
#training dependencies
import numpy as np
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import GradScaler
from contextlib import nullcontext
import torch.distributed as dist
from torch.distributed import init_process_group, destroy_process_group

In [13]:
# Default config values designed to train a GPT-2 (124M) on OpenWebText
# I/O
out_dir = 'out'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False
always_save_checkpoint = True
init_from = 'scratch'  # 'scratch' or 'resume' or 'gpt2*'

# Data
data_path = 'Wiki medical terms'  
gradient_accumulation_steps = 5 * 8
batch_size = 12
block_size = 1024

# Model
n_layer = 6  
n_head = 8   
n_embd = 512 
dropout = 0.0
bias = False

# AdamW optimizer
learning_rate = 6e-4
max_iters = 1000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# Learning rate decay settings
decay_lr = True
warmup_iters = 200
lr_decay_iters = 1000
min_lr = 6e-5

# DDP settings
backend = 'gloo'

# System
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
dtype = 'float32'  # MPS currently supports only float32
compile = False  # Disable compilation for now as PyTorch 2.0 is not yet stable on MPS
# -----------------------------------------------------------------------------

# Collect configuration keys
config_keys = [k for k, v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
config = {k: globals()[k] for k in config_keys}

# Various initializations and derived attributes
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'mps:{ddp_local_rank}' if torch.backends.mps.is_available() else f'cuda:{ddp_local_rank}'
    torch.mps.set_device(device)
    master_process = ddp_rank == 0
    seed_offset = ddp_rank
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    master_process = True
    seed_offset = 0
    ddp_world_size = 1

tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"Tokens per iteration will be: {tokens_per_iter:,}")

if master_process:
    os.makedirs(out_dir, exist_ok=True)

torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'mps' if 'mps' in device else 'cpu'
ptdtype = torch.float32  # Currently, MPS supports only float32
ctx = nullcontext()

Tokens per iteration will be: 491,520


In [14]:
# Initialize iteration number and best validation loss
iter_num = 0
best_val_loss = 1e9

# Model initialization
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                  bias=bias, vocab_size=len(dataset.input_ids), dropout=dropout)

if init_from == 'scratch':
    print("Initializing a new model from scratch")
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)

if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size

model.to(device)

Initializing a new model from scratch
Number of parameters: 22.42M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(6861, 512)
    (wpe): Embedding(1024, 512)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (qkv_proj): Linear(in_features=512, out_features=1536, bias=False)
          (out_proj): Linear(in_features=512, out_features=512, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
          (3): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(

In [15]:
# Initialize a GradScaler
scaler = GradScaler(enabled=(dtype == 'float16' and 'cuda' in device))

# Configure the optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None  # Free up memory

# Wrap model into DDP container if needed
if ddp:
    print(f"Starting parallel process with rank {ddp_rank}")
    model = DDP(model, device_ids=[ddp_local_rank])


# Function to estimate loss over splits
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        with tqdm(total=eval_iters, desc=f"Evaluating {split}") as pbar:
            for k in range(eval_iters):
                batch = next(iter(dataloader))
                X, Y = batch['input_ids'].to(device), batch['targets'].to(device)
                with ctx:
                    logits, loss = model(X, Y)
                losses[k] = loss.item()
                pbar.update(1)
        out[split] = losses.mean()
    model.train()
    return out

# Learning rate decay scheduler
def get_lr(it):
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

In [16]:
raw_model = model.module if ddp else model
local_iter_num = 0
running_mfu = -1.0
t0 = time.time()

while iter_num <= max_iters:
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train_loss": losses['train'],
                "val_loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu * 100,  # convert to percentage
            })

        if losses['val'] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    if iter_num == 0 and eval_only:
        break

    with tqdm(total=gradient_accumulation_steps, desc=f"Training step {iter_num}") as pbar:
        total_train_loss = 0.0  # Initialize total training loss
        for micro_step in range(gradient_accumulation_steps):
            if ddp:
                model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
            batch = next(iter(dataloader))
            X, Y = batch['input_ids'].to(device), batch['targets'].to(device)
            with ctx:
                logits, loss = model(X, Y)
                loss = loss / gradient_accumulation_steps
            scaler.scale(loss).backward()
            pbar.update(1)

    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5:
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt * 1000:.2f}ms, mfu {running_mfu * 100:.2f}%")

        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train_loss": total_train_loss,
                "lr": lr,
                "mfu": running_mfu * 100,  # convert to percentage
            })


    iter_num += 1
    local_iter_num += 1



if ddp:
    destroy_process_group()

Evaluating train: 100%|██████████| 200/200 [00:35<00:00,  5.67it/s]
Evaluating val: 100%|██████████| 200/200 [00:31<00:00,  6.31it/s]


step 0: train loss 6.9701, val loss 6.8994


Training step 0: 100%|██████████| 40/40 [00:17<00:00,  2.23it/s]


iter 0: loss 7.6355, time 86016.43ms, mfu -100.00%


Training step 1: 100%|██████████| 40/40 [00:18<00:00,  2.21it/s]


iter 1: loss 7.3116, time 18418.96ms, mfu -100.00%


Training step 2: 100%|██████████| 40/40 [00:18<00:00,  2.19it/s]


iter 2: loss 6.6334, time 18700.59ms, mfu -100.00%


Training step 3: 100%|██████████| 40/40 [00:14<00:00,  2.69it/s]


iter 3: loss 6.3470, time 15229.41ms, mfu -100.00%


Training step 4: 100%|██████████| 40/40 [00:09<00:00,  4.05it/s]


iter 4: loss 6.7538, time 10159.89ms, mfu -100.00%


Training step 5: 100%|██████████| 40/40 [00:08<00:00,  4.66it/s]


iter 5: loss 6.6252, time 8807.81ms, mfu 3.08%


Training step 6: 100%|██████████| 40/40 [00:08<00:00,  4.67it/s]


iter 6: loss 6.5432, time 8777.81ms, mfu 3.08%


Training step 7: 100%|██████████| 40/40 [00:08<00:00,  4.68it/s]


iter 7: loss 6.3594, time 8761.59ms, mfu 3.08%


Training step 8: 100%|██████████| 40/40 [00:08<00:00,  4.70it/s]


iter 8: loss 5.7437, time 8748.61ms, mfu 3.09%


Training step 9: 100%|██████████| 40/40 [00:08<00:00,  4.70it/s]


iter 9: loss 6.2530, time 8738.93ms, mfu 3.09%


Training step 10: 100%|██████████| 40/40 [00:08<00:00,  4.68it/s]


iter 10: loss 6.1977, time 8762.26ms, mfu 3.09%


Training step 11: 100%|██████████| 40/40 [00:08<00:00,  4.69it/s]


iter 11: loss 6.5180, time 8756.69ms, mfu 3.09%


Training step 12: 100%|██████████| 40/40 [00:08<00:00,  4.65it/s]


iter 12: loss 6.0196, time 8827.28ms, mfu 3.09%


Training step 13: 100%|██████████| 40/40 [00:08<00:00,  4.65it/s]


iter 13: loss 3.1459, time 8817.45ms, mfu 3.09%


Training step 14: 100%|██████████| 40/40 [00:08<00:00,  4.58it/s]


iter 14: loss 6.3368, time 8983.16ms, mfu 3.08%


Training step 15: 100%|██████████| 40/40 [00:08<00:00,  4.58it/s]


iter 15: loss 5.8720, time 8970.92ms, mfu 3.07%


Training step 16: 100%|██████████| 40/40 [00:08<00:00,  4.52it/s]


iter 16: loss 5.1481, time 9081.98ms, mfu 3.07%


Training step 17: 100%|██████████| 40/40 [00:08<00:00,  4.49it/s]


iter 17: loss 5.7046, time 9135.51ms, mfu 3.06%


Training step 18: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


iter 18: loss 6.1198, time 9308.97ms, mfu 3.04%


Training step 19: 100%|██████████| 40/40 [00:09<00:00,  4.40it/s]


iter 19: loss 5.6153, time 9355.31ms, mfu 3.03%


Training step 20: 100%|██████████| 40/40 [00:09<00:00,  4.43it/s]


iter 20: loss 3.7873, time 9135.73ms, mfu 3.02%


Training step 21: 100%|██████████| 40/40 [00:09<00:00,  4.12it/s]


iter 21: loss 5.7000, time 9977.65ms, mfu 2.99%


Training step 22: 100%|██████████| 40/40 [00:09<00:00,  4.13it/s]


iter 22: loss 3.2859, time 9814.86ms, mfu 2.97%


Training step 23: 100%|██████████| 40/40 [00:09<00:00,  4.12it/s]


iter 23: loss 5.7778, time 9974.66ms, mfu 2.94%


Training step 24: 100%|██████████| 40/40 [00:10<00:00,  3.96it/s]


iter 24: loss 5.9901, time 10331.65ms, mfu 2.91%


Training step 25: 100%|██████████| 40/40 [00:10<00:00,  3.77it/s]


iter 25: loss 5.6540, time 10868.80ms, mfu 2.87%


Training step 26: 100%|██████████| 40/40 [00:09<00:00,  4.09it/s]


iter 26: loss 5.6381, time 9899.81ms, mfu 2.86%


Training step 27: 100%|██████████| 40/40 [00:10<00:00,  3.88it/s]


iter 27: loss 5.8095, time 10537.15ms, mfu 2.83%


Training step 28: 100%|██████████| 40/40 [00:10<00:00,  3.93it/s]


iter 28: loss 5.3026, time 10426.47ms, mfu 2.81%


Training step 29: 100%|██████████| 40/40 [00:10<00:00,  3.96it/s]


iter 29: loss 5.2052, time 10364.31ms, mfu 2.79%


Training step 30: 100%|██████████| 40/40 [00:10<00:00,  3.78it/s]


iter 30: loss 5.6213, time 10834.68ms, mfu 2.76%


Training step 31: 100%|██████████| 40/40 [00:10<00:00,  3.71it/s]


iter 31: loss 3.0899, time 11061.29ms, mfu 2.73%


Training step 32: 100%|██████████| 40/40 [00:10<00:00,  3.68it/s]


iter 32: loss 3.2532, time 11206.94ms, mfu 2.70%


Training step 33: 100%|██████████| 40/40 [00:11<00:00,  3.38it/s]


iter 33: loss 5.5170, time 12148.20ms, mfu 2.65%


Training step 34: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 34: loss 5.4483, time 11453.16ms, mfu 2.62%


Training step 35: 100%|██████████| 40/40 [00:10<00:00,  3.76it/s]


iter 35: loss 5.1506, time 10904.91ms, mfu 2.61%


Training step 36: 100%|██████████| 40/40 [00:10<00:00,  3.68it/s]


iter 36: loss 5.0079, time 11148.00ms, mfu 2.59%


Training step 37: 100%|██████████| 40/40 [00:10<00:00,  3.69it/s]


iter 37: loss 2.6363, time 11129.83ms, mfu 2.58%


Training step 38: 100%|██████████| 40/40 [00:10<00:00,  3.77it/s]


iter 38: loss 5.3725, time 10872.11ms, mfu 2.57%


Training step 39: 100%|██████████| 40/40 [00:10<00:00,  3.68it/s]


iter 39: loss 4.5700, time 11177.66ms, mfu 2.55%


Training step 40: 100%|██████████| 40/40 [00:10<00:00,  3.75it/s]


iter 40: loss 5.3253, time 10956.22ms, mfu 2.55%


Training step 41: 100%|██████████| 40/40 [00:10<00:00,  3.75it/s]


iter 41: loss 3.4781, time 10945.45ms, mfu 2.54%


Training step 42: 100%|██████████| 40/40 [00:10<00:00,  3.69it/s]


iter 42: loss 2.6145, time 11112.61ms, mfu 2.53%


Training step 43: 100%|██████████| 40/40 [00:10<00:00,  3.65it/s]


iter 43: loss 4.8924, time 11239.43ms, mfu 2.52%


Training step 44: 100%|██████████| 40/40 [00:10<00:00,  3.67it/s]


iter 44: loss 4.5969, time 11192.30ms, mfu 2.51%


Training step 45: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 45: loss 4.4989, time 11409.73ms, mfu 2.50%


Training step 46: 100%|██████████| 40/40 [00:10<00:00,  3.64it/s]


iter 46: loss 5.0235, time 11289.94ms, mfu 2.49%


Training step 47: 100%|██████████| 40/40 [00:11<00:00,  3.34it/s]


iter 47: loss 4.2031, time 12282.02ms, mfu 2.46%


Training step 48: 100%|██████████| 40/40 [00:13<00:00,  3.04it/s]


iter 48: loss 4.8922, time 13469.55ms, mfu 2.41%


Training step 49: 100%|██████████| 40/40 [00:10<00:00,  3.64it/s]


iter 49: loss 4.6450, time 11277.43ms, mfu 2.41%


Training step 50: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 50: loss 3.2406, time 11533.03ms, mfu 2.41%


Training step 51: 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


iter 51: loss 4.5132, time 11390.91ms, mfu 2.41%


Training step 52: 100%|██████████| 40/40 [00:11<00:00,  3.45it/s]


iter 52: loss 4.2782, time 11872.78ms, mfu 2.39%


Training step 53: 100%|██████████| 40/40 [00:11<00:00,  3.41it/s]


iter 53: loss 3.9188, time 12034.16ms, mfu 2.38%


Training step 54: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 54: loss 4.6929, time 11749.60ms, mfu 2.37%


Training step 55: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 55: loss 4.4700, time 11875.59ms, mfu 2.36%


Training step 56: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 56: loss 4.0299, time 11876.83ms, mfu 2.36%


Training step 57: 100%|██████████| 40/40 [00:11<00:00,  3.36it/s]


iter 57: loss 4.8187, time 12193.41ms, mfu 2.34%


Training step 58: 100%|██████████| 40/40 [00:11<00:00,  3.34it/s]


iter 58: loss 4.4461, time 12274.36ms, mfu 2.33%


Training step 59: 100%|██████████| 40/40 [00:11<00:00,  3.45it/s]


iter 59: loss 3.0537, time 11914.56ms, mfu 2.32%


Training step 60: 100%|██████████| 40/40 [00:12<00:00,  3.32it/s]


iter 60: loss 2.8888, time 12337.19ms, mfu 2.31%


Training step 61: 100%|██████████| 40/40 [00:12<00:00,  3.32it/s]


iter 61: loss 4.6138, time 12345.97ms, mfu 2.30%


Training step 62: 100%|██████████| 40/40 [00:12<00:00,  3.33it/s]


iter 62: loss 4.4032, time 12322.54ms, mfu 2.29%


Training step 63: 100%|██████████| 40/40 [00:12<00:00,  3.30it/s]


iter 63: loss 2.8292, time 12465.76ms, mfu 2.28%


Training step 64: 100%|██████████| 40/40 [00:11<00:00,  3.40it/s]


iter 64: loss 4.4142, time 12106.93ms, mfu 2.28%


Training step 65: 100%|██████████| 40/40 [00:12<00:00,  3.30it/s]


iter 65: loss 3.2528, time 12436.86ms, mfu 2.27%


Training step 66: 100%|██████████| 40/40 [00:12<00:00,  3.20it/s]


iter 66: loss 3.5632, time 12821.53ms, mfu 2.25%


Training step 67: 100%|██████████| 40/40 [00:12<00:00,  3.23it/s]


iter 67: loss 2.7356, time 12698.13ms, mfu 2.24%


Training step 68: 100%|██████████| 40/40 [00:12<00:00,  3.22it/s]


iter 68: loss 4.7804, time 12750.94ms, mfu 2.23%


Training step 69: 100%|██████████| 40/40 [00:12<00:00,  3.15it/s]


iter 69: loss 3.2457, time 13044.15ms, mfu 2.21%


Training step 70: 100%|██████████| 40/40 [00:12<00:00,  3.30it/s]


iter 70: loss 4.1991, time 12455.82ms, mfu 2.21%


Training step 71: 100%|██████████| 40/40 [00:12<00:00,  3.20it/s]


iter 71: loss 4.4362, time 12825.81ms, mfu 2.20%


Training step 72: 100%|██████████| 40/40 [00:12<00:00,  3.11it/s]


iter 72: loss 4.5153, time 13178.66ms, mfu 2.19%


Training step 73: 100%|██████████| 40/40 [00:12<00:00,  3.23it/s]


iter 73: loss 4.3646, time 12703.80ms, mfu 2.18%


Training step 74: 100%|██████████| 40/40 [00:12<00:00,  3.23it/s]


iter 74: loss 4.7670, time 12717.14ms, mfu 2.18%


Training step 75: 100%|██████████| 40/40 [00:12<00:00,  3.25it/s]


iter 75: loss 4.2158, time 12615.76ms, mfu 2.17%


Training step 76: 100%|██████████| 40/40 [00:13<00:00,  3.06it/s]


iter 76: loss 4.2223, time 13407.76ms, mfu 2.16%


Training step 77: 100%|██████████| 40/40 [00:12<00:00,  3.09it/s]


iter 77: loss 4.0248, time 13275.65ms, mfu 2.15%


Training step 78: 100%|██████████| 40/40 [00:14<00:00,  2.67it/s]


iter 78: loss 4.5796, time 15330.53ms, mfu 2.11%


Training step 79: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 79: loss 3.3408, time 13217.30ms, mfu 2.10%


Training step 80: 100%|██████████| 40/40 [00:12<00:00,  3.14it/s]


iter 80: loss 1.2076, time 13065.99ms, mfu 2.10%


Training step 81: 100%|██████████| 40/40 [00:13<00:00,  3.05it/s]


iter 81: loss 3.9526, time 13443.28ms, mfu 2.09%


Training step 82: 100%|██████████| 40/40 [00:12<00:00,  3.18it/s]


iter 82: loss 2.5849, time 12942.00ms, mfu 2.09%


Training step 83: 100%|██████████| 40/40 [00:16<00:00,  2.43it/s]


iter 83: loss 4.6269, time 16820.17ms, mfu 2.05%


Training step 84: 100%|██████████| 40/40 [00:13<00:00,  3.01it/s]


iter 84: loss 4.6935, time 13628.08ms, mfu 2.04%


Training step 85: 100%|██████████| 40/40 [00:13<00:00,  3.04it/s]


iter 85: loss 2.6658, time 13491.65ms, mfu 2.04%


Training step 86: 100%|██████████| 40/40 [00:12<00:00,  3.09it/s]


iter 86: loss 4.1236, time 13274.14ms, mfu 2.04%


Training step 87: 100%|██████████| 40/40 [00:13<00:00,  3.07it/s]


iter 87: loss 2.3453, time 13351.70ms, mfu 2.04%


Training step 88: 100%|██████████| 40/40 [00:12<00:00,  3.19it/s]


iter 88: loss 2.8614, time 12889.37ms, mfu 2.04%


Training step 89: 100%|██████████| 40/40 [00:13<00:00,  3.06it/s]


iter 89: loss 0.7801, time 13424.59ms, mfu 2.04%


Training step 90: 100%|██████████| 40/40 [00:12<00:00,  3.11it/s]


iter 90: loss 3.3677, time 13198.53ms, mfu 2.04%


Training step 91: 100%|██████████| 40/40 [00:12<00:00,  3.12it/s]


iter 91: loss 3.1158, time 13162.68ms, mfu 2.05%


Training step 92: 100%|██████████| 40/40 [00:13<00:00,  2.97it/s]


iter 92: loss 4.2993, time 13823.67ms, mfu 2.04%


Training step 93: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 93: loss 3.7626, time 13239.88ms, mfu 2.04%


Training step 94: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 94: loss 2.8877, time 13208.16ms, mfu 2.04%


Training step 95: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 95: loss 3.5162, time 13234.42ms, mfu 2.04%


Training step 96: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 96: loss 3.2140, time 13253.17ms, mfu 2.04%


Training step 97: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 97: loss 4.2702, time 13242.55ms, mfu 2.04%


Training step 98: 100%|██████████| 40/40 [00:13<00:00,  2.94it/s]


iter 98: loss 4.1473, time 13922.47ms, mfu 2.03%


Training step 99: 100%|██████████| 40/40 [00:13<00:00,  3.00it/s]


iter 99: loss 2.2643, time 13644.36ms, mfu 2.03%


Training step 100: 100%|██████████| 40/40 [00:12<00:00,  3.12it/s]


iter 100: loss 2.9714, time 13168.19ms, mfu 2.03%


Training step 101: 100%|██████████| 40/40 [00:13<00:00,  3.08it/s]


iter 101: loss 3.6988, time 13344.00ms, mfu 2.03%


Training step 102: 100%|██████████| 40/40 [00:13<00:00,  3.00it/s]


iter 102: loss 3.9980, time 13656.21ms, mfu 2.03%


Training step 103: 100%|██████████| 40/40 [00:13<00:00,  3.04it/s]


iter 103: loss 3.2308, time 13488.05ms, mfu 2.03%


Training step 104: 100%|██████████| 40/40 [00:13<00:00,  3.03it/s]


iter 104: loss 3.7249, time 13515.57ms, mfu 2.02%


Training step 105: 100%|██████████| 40/40 [00:13<00:00,  3.06it/s]


iter 105: loss 3.8128, time 13413.32ms, mfu 2.02%


Training step 106: 100%|██████████| 40/40 [00:12<00:00,  3.11it/s]


iter 106: loss 3.4296, time 13210.37ms, mfu 2.03%


Training step 107: 100%|██████████| 40/40 [00:16<00:00,  2.40it/s]


iter 107: loss 4.1090, time 17016.12ms, mfu 1.98%


Training step 108: 100%|██████████| 40/40 [00:14<00:00,  2.83it/s]


iter 108: loss 4.0481, time 14461.83ms, mfu 1.97%


Training step 109: 100%|██████████| 40/40 [00:13<00:00,  3.04it/s]


iter 109: loss 2.5485, time 13499.78ms, mfu 1.98%


Training step 110: 100%|██████████| 40/40 [00:13<00:00,  3.02it/s]


iter 110: loss 3.9820, time 13592.10ms, mfu 1.98%


Training step 111: 100%|██████████| 40/40 [00:13<00:00,  3.05it/s]


iter 111: loss 4.0223, time 13476.90ms, mfu 1.98%


Training step 112: 100%|██████████| 40/40 [00:13<00:00,  2.93it/s]


iter 112: loss 3.2208, time 13985.27ms, mfu 1.98%


Training step 113: 100%|██████████| 40/40 [00:13<00:00,  3.00it/s]


iter 113: loss 2.1703, time 13689.13ms, mfu 1.98%


Training step 114: 100%|██████████| 40/40 [00:13<00:00,  2.88it/s]


iter 114: loss 3.9708, time 14220.01ms, mfu 1.97%


Training step 115: 100%|██████████| 40/40 [00:15<00:00,  2.60it/s]


iter 115: loss 0.2600, time 15747.83ms, mfu 1.95%


Training step 116: 100%|██████████| 40/40 [00:16<00:00,  2.37it/s]


iter 116: loss 4.0148, time 17213.09ms, mfu 1.91%


Training step 117: 100%|██████████| 40/40 [00:13<00:00,  2.97it/s]


iter 117: loss 3.2244, time 13820.55ms, mfu 1.91%


Training step 118: 100%|██████████| 40/40 [00:14<00:00,  2.84it/s]


iter 118: loss 2.2799, time 14458.56ms, mfu 1.91%


Training step 119: 100%|██████████| 40/40 [00:14<00:00,  2.83it/s]


iter 119: loss 4.0442, time 14553.39ms, mfu 1.91%


Training step 120: 100%|██████████| 40/40 [00:13<00:00,  2.90it/s]


iter 120: loss 2.4345, time 14130.63ms, mfu 1.91%


Training step 121: 100%|██████████| 40/40 [00:13<00:00,  2.93it/s]


iter 121: loss 3.2033, time 14014.71ms, mfu 1.91%


Training step 122: 100%|██████████| 40/40 [00:13<00:00,  2.88it/s]


iter 122: loss 3.6727, time 14221.32ms, mfu 1.91%


Training step 123: 100%|██████████| 40/40 [00:17<00:00,  2.29it/s]


iter 123: loss 3.6028, time 17820.24ms, mfu 1.87%


Training step 124: 100%|██████████| 40/40 [00:17<00:00,  2.25it/s]


iter 124: loss 3.8199, time 18203.89ms, mfu 1.83%


Training step 125: 100%|██████████| 40/40 [00:16<00:00,  2.49it/s]


iter 125: loss 1.7241, time 16465.07ms, mfu 1.81%


Training step 126: 100%|██████████| 40/40 [00:15<00:00,  2.50it/s]


iter 126: loss 2.8779, time 16352.40ms, mfu 1.80%


Training step 127: 100%|██████████| 40/40 [00:15<00:00,  2.60it/s]


iter 127: loss 4.1054, time 15794.27ms, mfu 1.79%


Training step 128: 100%|██████████| 40/40 [00:17<00:00,  2.30it/s]


iter 128: loss 3.9983, time 17863.79ms, mfu 1.76%


Training step 129: 100%|██████████| 40/40 [00:18<00:00,  2.16it/s]


iter 129: loss 2.6791, time 19007.86ms, mfu 1.73%


Training step 130: 100%|██████████| 40/40 [00:18<00:00,  2.11it/s]


iter 130: loss 2.7171, time 19498.51ms, mfu 1.70%


Training step 131: 100%|██████████| 40/40 [00:18<00:00,  2.12it/s]


iter 131: loss 2.7697, time 19400.18ms, mfu 1.67%


Training step 132: 100%|██████████| 40/40 [00:18<00:00,  2.14it/s]


iter 132: loss 3.7065, time 19182.40ms, mfu 1.64%


Training step 133: 100%|██████████| 40/40 [00:17<00:00,  2.30it/s]


iter 133: loss 2.3092, time 17780.85ms, mfu 1.63%


Training step 134: 100%|██████████| 40/40 [00:19<00:00,  2.09it/s]


iter 134: loss 4.1171, time 19635.95ms, mfu 1.61%


Training step 135: 100%|██████████| 40/40 [00:18<00:00,  2.13it/s]


iter 135: loss 2.3286, time 19320.89ms, mfu 1.59%


Training step 136: 100%|██████████| 40/40 [00:17<00:00,  2.26it/s]


iter 136: loss 4.2856, time 18130.68ms, mfu 1.58%


Training step 137: 100%|██████████| 40/40 [00:15<00:00,  2.55it/s]


iter 137: loss 3.1610, time 16084.73ms, mfu 1.59%


Training step 138: 100%|██████████| 40/40 [00:13<00:00,  2.93it/s]


iter 138: loss 3.9373, time 14024.18ms, mfu 1.62%


Training step 139: 100%|██████████| 40/40 [00:14<00:00,  2.84it/s]


iter 139: loss 3.5373, time 14440.92ms, mfu 1.65%


Training step 140: 100%|██████████| 40/40 [00:14<00:00,  2.77it/s]


iter 140: loss 3.8887, time 14771.26ms, mfu 1.67%


Training step 141: 100%|██████████| 40/40 [00:13<00:00,  2.93it/s]


iter 141: loss 3.9436, time 14004.69ms, mfu 1.69%


Training step 142: 100%|██████████| 40/40 [00:13<00:00,  2.99it/s]


iter 142: loss 3.8966, time 13727.90ms, mfu 1.72%


Training step 143: 100%|██████████| 40/40 [00:13<00:00,  2.96it/s]


iter 143: loss 3.4644, time 13843.33ms, mfu 1.75%


Training step 144: 100%|██████████| 40/40 [00:13<00:00,  2.90it/s]


iter 144: loss 3.1051, time 14133.30ms, mfu 1.76%


Training step 145: 100%|██████████| 40/40 [00:13<00:00,  2.91it/s]


iter 145: loss 3.7486, time 14110.83ms, mfu 1.78%


Training step 146: 100%|██████████| 40/40 [00:13<00:00,  2.98it/s]


iter 146: loss 2.2314, time 13781.12ms, mfu 1.80%


Training step 147: 100%|██████████| 40/40 [00:13<00:00,  2.96it/s]


iter 147: loss 3.5580, time 13871.15ms, mfu 1.81%


Training step 148: 100%|██████████| 40/40 [00:13<00:00,  2.94it/s]


iter 148: loss 0.9442, time 13934.03ms, mfu 1.83%


Training step 149: 100%|██████████| 40/40 [00:14<00:00,  2.85it/s]


iter 149: loss 2.3353, time 14394.84ms, mfu 1.83%


Training step 150: 100%|██████████| 40/40 [00:14<00:00,  2.85it/s]


iter 150: loss 2.8550, time 14375.44ms, mfu 1.84%


Training step 151: 100%|██████████| 40/40 [00:12<00:00,  3.15it/s]


iter 151: loss 3.9609, time 13065.16ms, mfu 1.86%


Training step 152: 100%|██████████| 40/40 [00:14<00:00,  2.75it/s]


iter 152: loss 4.1351, time 14891.24ms, mfu 1.86%


Training step 153: 100%|██████████| 40/40 [00:14<00:00,  2.82it/s]


iter 153: loss 3.7817, time 14521.27ms, mfu 1.86%


Training step 154: 100%|██████████| 40/40 [00:13<00:00,  2.92it/s]


iter 154: loss 3.8758, time 14038.74ms, mfu 1.87%


Training step 155: 100%|██████████| 40/40 [00:13<00:00,  3.01it/s]


iter 155: loss 3.4532, time 13622.21ms, mfu 1.88%


Training step 156: 100%|██████████| 40/40 [00:13<00:00,  2.98it/s]


iter 156: loss 3.3943, time 13759.01ms, mfu 1.89%


Training step 157: 100%|██████████| 40/40 [00:13<00:00,  2.88it/s]


iter 157: loss 4.3417, time 14229.12ms, mfu 1.89%


Training step 158: 100%|██████████| 40/40 [00:13<00:00,  2.89it/s]


iter 158: loss 3.6875, time 14184.08ms, mfu 1.89%


Training step 159: 100%|██████████| 40/40 [00:13<00:00,  2.91it/s]


iter 159: loss 1.4807, time 14073.56ms, mfu 1.90%


Training step 160: 100%|██████████| 40/40 [00:13<00:00,  2.91it/s]


iter 160: loss 3.6748, time 14078.81ms, mfu 1.90%


Training step 161: 100%|██████████| 40/40 [00:13<00:00,  2.98it/s]


iter 161: loss 2.8082, time 13764.80ms, mfu 1.91%


Training step 162: 100%|██████████| 40/40 [00:14<00:00,  2.85it/s]


iter 162: loss 2.6525, time 14377.45ms, mfu 1.90%


Training step 163: 100%|██████████| 40/40 [00:13<00:00,  2.87it/s]


iter 163: loss 0.3151, time 14261.62ms, mfu 1.90%


Training step 164: 100%|██████████| 40/40 [00:13<00:00,  2.96it/s]


iter 164: loss 3.5617, time 13879.21ms, mfu 1.91%


Training step 165: 100%|██████████| 40/40 [00:13<00:00,  2.94it/s]


iter 165: loss 0.5715, time 13976.43ms, mfu 1.91%


Training step 166: 100%|██████████| 40/40 [00:15<00:00,  2.61it/s]


iter 166: loss 4.0634, time 15698.15ms, mfu 1.89%


Training step 167: 100%|██████████| 40/40 [00:14<00:00,  2.77it/s]


iter 167: loss 3.4709, time 14810.89ms, mfu 1.89%


Training step 168: 100%|██████████| 40/40 [00:13<00:00,  2.89it/s]


iter 168: loss 3.3992, time 14188.50ms, mfu 1.89%


Training step 169: 100%|██████████| 40/40 [00:14<00:00,  2.77it/s]


iter 169: loss 3.9918, time 14815.35ms, mfu 1.88%


Training step 170: 100%|██████████| 40/40 [00:14<00:00,  2.79it/s]


iter 170: loss 2.5042, time 14707.64ms, mfu 1.88%


Training step 171: 100%|██████████| 40/40 [00:14<00:00,  2.82it/s]


iter 171: loss 3.5103, time 14544.93ms, mfu 1.88%


Training step 172: 100%|██████████| 40/40 [00:13<00:00,  2.99it/s]


iter 172: loss 2.0640, time 13721.88ms, mfu 1.89%


Training step 173: 100%|██████████| 40/40 [00:17<00:00,  2.28it/s]


iter 173: loss 3.3897, time 17975.90ms, mfu 1.85%


Training step 174: 100%|██████████| 40/40 [00:13<00:00,  2.88it/s]


iter 174: loss 3.6096, time 14278.68ms, mfu 1.86%


Training step 175: 100%|██████████| 40/40 [00:13<00:00,  2.95it/s]


iter 175: loss 2.1991, time 13931.15ms, mfu 1.87%


Training step 176: 100%|██████████| 40/40 [00:13<00:00,  3.05it/s]


iter 176: loss 3.4352, time 13449.79ms, mfu 1.88%


Training step 177: 100%|██████████| 40/40 [00:14<00:00,  2.77it/s]


iter 177: loss 2.7608, time 14754.50ms, mfu 1.88%


Training step 178: 100%|██████████| 40/40 [00:13<00:00,  2.99it/s]


iter 178: loss 3.1466, time 13727.86ms, mfu 1.89%


Training step 179: 100%|██████████| 40/40 [00:13<00:00,  3.00it/s]


iter 179: loss 1.4563, time 13686.56ms, mfu 1.90%


Training step 180: 100%|██████████| 40/40 [00:12<00:00,  3.12it/s]


iter 180: loss 3.3192, time 13138.69ms, mfu 1.91%


Training step 181: 100%|██████████| 40/40 [00:13<00:00,  3.00it/s]


iter 181: loss 2.2486, time 13694.30ms, mfu 1.92%


Training step 182: 100%|██████████| 40/40 [00:13<00:00,  3.05it/s]


iter 182: loss 2.0098, time 13446.32ms, mfu 1.93%


Training step 183: 100%|██████████| 40/40 [00:13<00:00,  3.06it/s]


iter 183: loss 2.0387, time 13387.99ms, mfu 1.94%


Training step 184: 100%|██████████| 40/40 [00:12<00:00,  3.18it/s]


iter 184: loss 3.8090, time 12903.18ms, mfu 1.96%


Training step 185: 100%|██████████| 40/40 [00:12<00:00,  3.17it/s]


iter 185: loss 3.3556, time 12932.74ms, mfu 1.97%


Training step 186: 100%|██████████| 40/40 [00:13<00:00,  2.98it/s]


iter 186: loss 1.9686, time 13738.02ms, mfu 1.97%


Training step 187: 100%|██████████| 40/40 [00:12<00:00,  3.09it/s]


iter 187: loss 3.4777, time 13265.49ms, mfu 1.98%


Training step 188: 100%|██████████| 40/40 [00:12<00:00,  3.17it/s]


iter 188: loss 3.1603, time 12932.31ms, mfu 1.99%


Training step 189: 100%|██████████| 40/40 [00:12<00:00,  3.17it/s]


iter 189: loss 4.2106, time 12946.95ms, mfu 2.00%


Training step 190: 100%|██████████| 40/40 [00:13<00:00,  3.07it/s]


iter 190: loss 3.5573, time 13327.51ms, mfu 2.00%


Training step 191: 100%|██████████| 40/40 [00:13<00:00,  2.96it/s]


iter 191: loss 2.3681, time 13872.40ms, mfu 2.00%


Training step 192: 100%|██████████| 40/40 [00:14<00:00,  2.80it/s]


iter 192: loss 2.5401, time 14645.48ms, mfu 1.98%


Training step 193: 100%|██████████| 40/40 [00:12<00:00,  3.11it/s]


iter 193: loss 3.6283, time 13189.49ms, mfu 1.99%


Training step 194: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 194: loss 3.2065, time 13232.26ms, mfu 2.00%


Training step 195: 100%|██████████| 40/40 [00:14<00:00,  2.77it/s]


iter 195: loss 3.7834, time 14767.16ms, mfu 1.98%


Training step 196: 100%|██████████| 40/40 [00:13<00:00,  3.08it/s]


iter 196: loss 0.8511, time 13346.84ms, mfu 1.99%


Training step 197: 100%|██████████| 40/40 [00:13<00:00,  2.90it/s]


iter 197: loss 3.4793, time 14138.12ms, mfu 1.98%


Training step 198: 100%|██████████| 40/40 [00:13<00:00,  3.06it/s]


iter 198: loss 3.4065, time 13377.97ms, mfu 1.98%


Training step 199: 100%|██████████| 40/40 [00:14<00:00,  2.83it/s]


iter 199: loss 3.4962, time 14492.98ms, mfu 1.97%


Training step 200: 100%|██████████| 40/40 [00:13<00:00,  3.03it/s]


iter 200: loss 3.4895, time 13557.81ms, mfu 1.98%


Training step 201: 100%|██████████| 40/40 [00:13<00:00,  2.97it/s]


iter 201: loss 3.4133, time 13857.14ms, mfu 1.97%


Training step 202: 100%|██████████| 40/40 [00:12<00:00,  3.10it/s]


iter 202: loss 2.9155, time 13238.47ms, mfu 1.98%


Training step 203: 100%|██████████| 40/40 [00:12<00:00,  3.08it/s]


iter 203: loss 3.5102, time 13295.61ms, mfu 1.99%


Training step 204: 100%|██████████| 40/40 [00:13<00:00,  2.94it/s]


iter 204: loss 2.8855, time 13965.02ms, mfu 1.98%


Training step 205: 100%|██████████| 40/40 [00:12<00:00,  3.17it/s]


iter 205: loss 3.5718, time 12955.27ms, mfu 1.99%


Training step 206: 100%|██████████| 40/40 [00:14<00:00,  2.73it/s]


iter 206: loss 2.8744, time 14993.21ms, mfu 1.98%


Training step 207: 100%|██████████| 40/40 [00:13<00:00,  3.04it/s]


iter 207: loss 2.1711, time 13492.88ms, mfu 1.98%


Training step 208: 100%|██████████| 40/40 [00:13<00:00,  2.95it/s]


iter 208: loss 2.4860, time 13925.97ms, mfu 1.98%


Training step 209: 100%|██████████| 40/40 [00:12<00:00,  3.13it/s]


iter 209: loss 3.0959, time 13157.07ms, mfu 1.99%


Training step 210: 100%|██████████| 40/40 [00:13<00:00,  2.92it/s]


iter 210: loss 3.4978, time 14080.37ms, mfu 1.98%


Training step 211: 100%|██████████| 40/40 [00:19<00:00,  2.08it/s]


iter 211: loss 1.6903, time 19701.79ms, mfu 1.92%


Training step 212: 100%|██████████| 40/40 [00:14<00:00,  2.70it/s]


iter 212: loss 3.4352, time 15201.20ms, mfu 1.91%


Training step 213: 100%|██████████| 40/40 [00:14<00:00,  2.78it/s]


iter 213: loss 2.0244, time 14728.53ms, mfu 1.90%


Training step 214: 100%|██████████| 40/40 [00:13<00:00,  2.87it/s]


iter 214: loss 2.6268, time 14270.31ms, mfu 1.90%


Training step 215: 100%|██████████| 40/40 [00:13<00:00,  3.05it/s]


iter 215: loss 1.9532, time 13373.97ms, mfu 1.91%


Training step 216: 100%|██████████| 40/40 [00:10<00:00,  3.83it/s]


iter 216: loss 3.6718, time 10685.09ms, mfu 1.98%


Training step 217: 100%|██████████| 40/40 [00:09<00:00,  4.30it/s]


iter 217: loss 2.2311, time 9536.71ms, mfu 2.06%


Training step 218: 100%|██████████| 40/40 [00:09<00:00,  4.29it/s]


iter 218: loss 3.6651, time 9561.98ms, mfu 2.14%


Training step 219: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 219: loss 3.4617, time 9382.26ms, mfu 2.22%


Training step 220: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 220: loss 1.9076, time 9488.12ms, mfu 2.28%


Training step 221: 100%|██████████| 40/40 [00:09<00:00,  4.25it/s]


iter 221: loss 1.7117, time 9667.15ms, mfu 2.33%


Training step 222: 100%|██████████| 40/40 [00:09<00:00,  4.19it/s]


iter 222: loss 2.2460, time 9804.65ms, mfu 2.38%


Training step 223: 100%|██████████| 40/40 [00:09<00:00,  4.20it/s]


iter 223: loss 3.7552, time 9763.59ms, mfu 2.42%


Training step 224: 100%|██████████| 40/40 [00:09<00:00,  4.18it/s]


iter 224: loss 3.7481, time 9807.22ms, mfu 2.45%


Training step 225: 100%|██████████| 40/40 [00:09<00:00,  4.15it/s]


iter 225: loss 1.9110, time 9878.16ms, mfu 2.48%


Training step 226: 100%|██████████| 40/40 [00:09<00:00,  4.16it/s]


iter 226: loss 3.5186, time 9871.18ms, mfu 2.51%


Training step 227: 100%|██████████| 40/40 [00:09<00:00,  4.13it/s]


iter 227: loss 2.3922, time 9920.92ms, mfu 2.53%


Training step 228: 100%|██████████| 40/40 [00:09<00:00,  4.11it/s]


iter 228: loss 2.5146, time 9979.48ms, mfu 2.55%


Training step 229: 100%|██████████| 40/40 [00:09<00:00,  4.09it/s]


iter 229: loss 2.8772, time 10030.95ms, mfu 2.56%


Training step 230: 100%|██████████| 40/40 [00:09<00:00,  4.13it/s]


iter 230: loss 3.2946, time 9943.64ms, mfu 2.58%


Training step 231: 100%|██████████| 40/40 [00:09<00:00,  4.16it/s]


iter 231: loss 3.9154, time 9850.60ms, mfu 2.60%


Training step 232: 100%|██████████| 40/40 [00:09<00:00,  4.17it/s]


iter 232: loss 3.6516, time 9826.41ms, mfu 2.61%


Training step 233: 100%|██████████| 40/40 [00:09<00:00,  4.21it/s]


iter 233: loss 3.4118, time 9739.92ms, mfu 2.63%


Training step 234: 100%|██████████| 40/40 [00:09<00:00,  4.14it/s]


iter 234: loss 2.9511, time 9901.34ms, mfu 2.64%


Training step 235: 100%|██████████| 40/40 [00:09<00:00,  4.27it/s]


iter 235: loss 3.1474, time 9651.06ms, mfu 2.66%


Training step 236: 100%|██████████| 40/40 [00:09<00:00,  4.16it/s]


iter 236: loss 3.6119, time 9880.72ms, mfu 2.67%


Training step 237: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 237: loss 3.5038, time 9500.58ms, mfu 2.69%


Training step 238: 100%|██████████| 40/40 [00:09<00:00,  4.30it/s]


iter 238: loss 1.0788, time 9536.54ms, mfu 2.70%


Training step 239: 100%|██████████| 40/40 [00:09<00:00,  4.22it/s]


iter 239: loss 2.2069, time 9724.34ms, mfu 2.71%


Training step 240: 100%|██████████| 40/40 [00:09<00:00,  4.26it/s]


iter 240: loss 2.2564, time 9656.61ms, mfu 2.72%


Training step 241: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 241: loss 2.6699, time 9369.78ms, mfu 2.74%


Training step 242: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 242: loss 3.5260, time 9501.30ms, mfu 2.75%


Training step 243: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 243: loss 2.9750, time 9486.95ms, mfu 2.76%


Training step 244: 100%|██████████| 40/40 [00:09<00:00,  4.40it/s]


iter 244: loss 3.1987, time 9321.11ms, mfu 2.78%


Training step 245: 100%|██████████| 40/40 [00:09<00:00,  4.44it/s]


iter 245: loss 3.3790, time 9242.75ms, mfu 2.79%


Training step 246: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 246: loss 0.4730, time 9422.44ms, mfu 2.80%


Training step 247: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 247: loss 2.1895, time 9490.95ms, mfu 2.81%


Training step 248: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 248: loss 1.2419, time 9400.37ms, mfu 2.81%


Training step 249: 100%|██████████| 40/40 [00:09<00:00,  4.44it/s]


iter 249: loss 2.8122, time 9270.99ms, mfu 2.83%


Training step 250: 100%|██████████| 40/40 [00:09<00:00,  4.44it/s]


iter 250: loss 3.4656, time 9254.82ms, mfu 2.84%


Training step 251: 100%|██████████| 40/40 [00:09<00:00,  4.44it/s]


iter 251: loss 2.8980, time 9244.98ms, mfu 2.85%


Training step 252: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


iter 252: loss 0.3781, time 9316.17ms, mfu 2.85%


Training step 253: 100%|██████████| 40/40 [00:09<00:00,  4.39it/s]


iter 253: loss 1.7831, time 9349.62ms, mfu 2.86%


Training step 254: 100%|██████████| 40/40 [00:09<00:00,  4.43it/s]


iter 254: loss 2.6916, time 9313.49ms, mfu 2.86%


Training step 255: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 255: loss 3.4878, time 9455.44ms, mfu 2.86%


Training step 256: 100%|██████████| 40/40 [00:09<00:00,  4.39it/s]


iter 256: loss 3.5036, time 9340.40ms, mfu 2.87%


Training step 257: 100%|██████████| 40/40 [00:09<00:00,  4.40it/s]


iter 257: loss 3.1329, time 9325.59ms, mfu 2.87%


Training step 258: 100%|██████████| 40/40 [00:08<00:00,  4.45it/s]


iter 258: loss 3.0644, time 9241.68ms, mfu 2.88%


Training step 259: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 259: loss 3.4423, time 9462.61ms, mfu 2.88%


Training step 260: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 260: loss 2.9269, time 9419.55ms, mfu 2.88%


Training step 261: 100%|██████████| 40/40 [00:09<00:00,  4.28it/s]


iter 261: loss 2.1110, time 9575.93ms, mfu 2.87%


Training step 262: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 262: loss 1.7301, time 9448.82ms, mfu 2.87%


Training step 263: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 263: loss 3.3089, time 9389.86ms, mfu 2.88%


Training step 264: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 264: loss 0.9307, time 9424.46ms, mfu 2.88%


Training step 265: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 265: loss 2.7602, time 9387.82ms, mfu 2.88%


Training step 266: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 266: loss 3.5297, time 9430.40ms, mfu 2.88%


Training step 267: 100%|██████████| 40/40 [00:09<00:00,  4.27it/s]


iter 267: loss 3.5302, time 9608.71ms, mfu 2.87%


Training step 268: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 268: loss 2.7765, time 9479.70ms, mfu 2.87%


Training step 269: 100%|██████████| 40/40 [00:09<00:00,  4.40it/s]


iter 269: loss 3.1801, time 9315.97ms, mfu 2.87%


Training step 270: 100%|██████████| 40/40 [00:09<00:00,  4.39it/s]


iter 270: loss 2.4264, time 9338.12ms, mfu 2.88%


Training step 271: 100%|██████████| 40/40 [00:09<00:00,  4.40it/s]


iter 271: loss 3.0428, time 9330.25ms, mfu 2.88%


Training step 272: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 272: loss 2.4040, time 9453.93ms, mfu 2.88%


Training step 273: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


iter 273: loss 2.7441, time 9309.95ms, mfu 2.88%


Training step 274: 100%|██████████| 40/40 [00:09<00:00,  4.36it/s]


iter 274: loss 3.1484, time 9417.54ms, mfu 2.88%


Training step 275: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 275: loss 2.1285, time 9451.22ms, mfu 2.88%


Training step 276: 100%|██████████| 40/40 [00:09<00:00,  4.24it/s]


iter 276: loss 1.9333, time 9679.47ms, mfu 2.87%


Training step 277: 100%|██████████| 40/40 [00:09<00:00,  4.43it/s]


iter 277: loss 1.6886, time 9278.66ms, mfu 2.88%


Training step 278: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 278: loss 3.4296, time 9428.38ms, mfu 2.88%


Training step 279: 100%|██████████| 40/40 [00:09<00:00,  4.28it/s]


iter 279: loss 3.3202, time 9571.41ms, mfu 2.87%


Training step 280: 100%|██████████| 40/40 [00:09<00:00,  4.31it/s]


iter 280: loss 3.2809, time 9519.65ms, mfu 2.87%


Training step 281: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 281: loss 2.9490, time 9370.08ms, mfu 2.87%


Training step 282: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 282: loss 3.3866, time 9396.54ms, mfu 2.88%


Training step 283: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


iter 283: loss 3.2831, time 9303.18ms, mfu 2.88%


Training step 284: 100%|██████████| 40/40 [00:09<00:00,  4.39it/s]


iter 284: loss 3.4688, time 9348.89ms, mfu 2.88%


Training step 285: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 285: loss 2.6751, time 9443.56ms, mfu 2.88%


Training step 286: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 286: loss 3.0084, time 9399.15ms, mfu 2.88%


Training step 287: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 287: loss 0.3416, time 9438.61ms, mfu 2.88%


Training step 288: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 288: loss 2.2266, time 9471.47ms, mfu 2.88%


Training step 289: 100%|██████████| 40/40 [00:09<00:00,  4.31it/s]


iter 289: loss 3.2107, time 9537.04ms, mfu 2.88%


Training step 290: 100%|██████████| 40/40 [00:09<00:00,  4.42it/s]


iter 290: loss 2.8625, time 9275.68ms, mfu 2.88%


Training step 291: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 291: loss 3.5163, time 9480.39ms, mfu 2.88%


Training step 292: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 292: loss 3.1168, time 9477.17ms, mfu 2.88%


Training step 293: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 293: loss 3.7524, time 9466.91ms, mfu 2.88%


Training step 294: 100%|██████████| 40/40 [00:09<00:00,  4.30it/s]


iter 294: loss 3.3898, time 9548.38ms, mfu 2.87%


Training step 295: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 295: loss 3.0677, time 9423.56ms, mfu 2.87%


Training step 296: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


iter 296: loss 3.3457, time 9295.04ms, mfu 2.88%


Training step 297: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 297: loss 2.1790, time 9427.54ms, mfu 2.88%


Training step 298: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 298: loss 2.5111, time 9446.64ms, mfu 2.88%


Training step 299: 100%|██████████| 40/40 [00:09<00:00,  4.30it/s]


iter 299: loss 1.7752, time 9534.55ms, mfu 2.87%


Training step 300: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 300: loss 3.4357, time 9496.57ms, mfu 2.87%


Training step 301: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 301: loss 1.7605, time 9460.38ms, mfu 2.87%


Training step 302: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 302: loss 3.3932, time 9432.31ms, mfu 2.87%


Training step 303: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 303: loss 3.3138, time 9473.25ms, mfu 2.87%


Training step 304: 100%|██████████| 40/40 [00:09<00:00,  4.26it/s]


iter 304: loss 2.1652, time 9662.21ms, mfu 2.87%


Training step 305: 100%|██████████| 40/40 [00:09<00:00,  4.35it/s]


iter 305: loss 2.1090, time 9439.09ms, mfu 2.87%


Training step 306: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 306: loss 2.0861, time 9491.96ms, mfu 2.87%


Training step 307: 100%|██████████| 40/40 [00:09<00:00,  4.29it/s]


iter 307: loss 3.9762, time 9558.19ms, mfu 2.86%


Training step 308: 100%|██████████| 40/40 [00:09<00:00,  4.36it/s]


iter 308: loss 2.4912, time 9416.89ms, mfu 2.86%


Training step 309: 100%|██████████| 40/40 [00:09<00:00,  4.43it/s]


iter 309: loss 2.1313, time 9289.56ms, mfu 2.87%


Training step 310: 100%|██████████| 40/40 [00:09<00:00,  4.29it/s]


iter 310: loss 3.3591, time 9609.99ms, mfu 2.87%


Training step 311: 100%|██████████| 40/40 [00:09<00:00,  4.27it/s]


iter 311: loss 2.8516, time 9618.78ms, mfu 2.86%


Training step 312: 100%|██████████| 40/40 [00:09<00:00,  4.28it/s]


iter 312: loss 3.0782, time 9584.33ms, mfu 2.86%


Training step 313: 100%|██████████| 40/40 [00:09<00:00,  4.36it/s]


iter 313: loss 3.2889, time 9409.56ms, mfu 2.86%


Training step 314: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 314: loss 3.0216, time 9391.97ms, mfu 2.86%


Training step 315: 100%|██████████| 40/40 [00:09<00:00,  4.40it/s]


iter 315: loss 3.1800, time 9319.19ms, mfu 2.87%


Training step 316: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 316: loss 2.6949, time 9472.40ms, mfu 2.87%


Training step 317: 100%|██████████| 40/40 [00:09<00:00,  4.22it/s]


iter 317: loss 3.0250, time 9733.52ms, mfu 2.86%


Training step 318: 100%|██████████| 40/40 [00:09<00:00,  4.30it/s]


iter 318: loss 2.9722, time 9532.76ms, mfu 2.86%


Training step 319: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 319: loss 1.6590, time 9447.26ms, mfu 2.86%


Training step 320: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 320: loss 3.3441, time 9382.80ms, mfu 2.86%


Training step 321: 100%|██████████| 40/40 [00:09<00:00,  4.31it/s]


iter 321: loss 3.2422, time 9524.18ms, mfu 2.86%


Training step 322: 100%|██████████| 40/40 [00:09<00:00,  4.42it/s]


iter 322: loss 1.9064, time 9288.04ms, mfu 2.87%


Training step 323: 100%|██████████| 40/40 [00:09<00:00,  4.31it/s]


iter 323: loss 2.7456, time 9572.27ms, mfu 2.86%


Training step 324: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 324: loss 1.8511, time 9484.67ms, mfu 2.86%


Training step 325: 100%|██████████| 40/40 [00:09<00:00,  4.23it/s]


iter 325: loss 3.0817, time 9705.49ms, mfu 2.86%


Training step 326: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 326: loss 2.2035, time 9465.46ms, mfu 2.86%


Training step 327: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 327: loss 1.5664, time 9374.16ms, mfu 2.86%


Training step 328: 100%|██████████| 40/40 [00:09<00:00,  4.42it/s]


iter 328: loss 2.7160, time 9286.70ms, mfu 2.87%


Training step 329: 100%|██████████| 40/40 [00:09<00:00,  4.25it/s]


iter 329: loss 2.9983, time 9652.00ms, mfu 2.86%


Training step 330: 100%|██████████| 40/40 [00:09<00:00,  4.30it/s]


iter 330: loss 2.2466, time 9554.77ms, mfu 2.86%


Training step 331: 100%|██████████| 40/40 [00:09<00:00,  4.15it/s]


iter 331: loss 1.5511, time 9888.09ms, mfu 2.85%


Training step 332: 100%|██████████| 40/40 [00:09<00:00,  4.37it/s]


iter 332: loss 2.8801, time 9401.22ms, mfu 2.85%


Training step 333: 100%|██████████| 40/40 [00:09<00:00,  4.39it/s]


iter 333: loss 2.7945, time 9337.60ms, mfu 2.86%


Training step 334: 100%|██████████| 40/40 [00:09<00:00,  4.32it/s]


iter 334: loss 3.0307, time 9484.73ms, mfu 2.86%


Training step 335: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 335: loss 3.1181, time 9374.48ms, mfu 2.86%


Training step 336: 100%|██████████| 40/40 [00:09<00:00,  4.34it/s]


iter 336: loss 3.3249, time 9493.35ms, mfu 2.86%


Training step 337: 100%|██████████| 40/40 [00:09<00:00,  4.29it/s]


iter 337: loss 3.0748, time 9577.66ms, mfu 2.86%


Training step 338: 100%|██████████| 40/40 [00:09<00:00,  4.30it/s]


iter 338: loss 0.8559, time 9539.54ms, mfu 2.86%


Training step 339: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 339: loss 3.0977, time 9375.36ms, mfu 2.86%


Training step 340: 100%|██████████| 40/40 [00:09<00:00,  4.40it/s]


iter 340: loss 2.9379, time 9392.44ms, mfu 2.86%


Training step 341: 100%|██████████| 40/40 [00:09<00:00,  4.38it/s]


iter 341: loss 3.0015, time 9358.43ms, mfu 2.87%


Training step 342: 100%|██████████| 40/40 [00:09<00:00,  4.24it/s]


iter 342: loss 2.1298, time 9686.87ms, mfu 2.86%


Training step 343: 100%|██████████| 40/40 [00:09<00:00,  4.24it/s]


iter 343: loss 3.1949, time 9687.48ms, mfu 2.85%


Training step 344: 100%|██████████| 40/40 [00:09<00:00,  4.22it/s]


iter 344: loss 1.9976, time 9715.60ms, mfu 2.85%


Training step 345: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 345: loss 1.6080, time 9469.71ms, mfu 2.85%


Training step 346: 100%|██████████| 40/40 [00:09<00:00,  4.33it/s]


iter 346: loss 2.9321, time 9465.17ms, mfu 2.85%


Training step 347: 100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


iter 347: loss 2.0625, time 9298.90ms, mfu 2.86%


Training step 348: 100%|██████████| 40/40 [00:11<00:00,  3.60it/s]


iter 348: loss 2.7893, time 11433.20ms, mfu 2.81%


Training step 349: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 349: loss 1.9793, time 11743.83ms, mfu 2.76%


Training step 350: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 350: loss 1.4995, time 11776.01ms, mfu 2.71%


Training step 351: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 351: loss 2.0176, time 11714.37ms, mfu 2.67%


Training step 352: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 352: loss 2.6293, time 11583.95ms, mfu 2.64%


Training step 353: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 353: loss 2.0306, time 11630.50ms, mfu 2.61%


Training step 354: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 354: loss 2.4784, time 11629.75ms, mfu 2.58%


Training step 355: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 355: loss 2.0523, time 11635.51ms, mfu 2.56%


Training step 356: 100%|██████████| 40/40 [00:11<00:00,  3.48it/s]


iter 356: loss 1.7440, time 11791.19ms, mfu 2.53%


Training step 357: 100%|██████████| 40/40 [00:11<00:00,  3.48it/s]


iter 357: loss 3.3185, time 11803.20ms, mfu 2.51%


Training step 358: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 358: loss 3.1007, time 11544.02ms, mfu 2.49%


Training step 359: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 359: loss 3.3199, time 11521.47ms, mfu 2.48%


Training step 360: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 360: loss 2.9180, time 11534.75ms, mfu 2.47%


Training step 361: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 361: loss 3.8538, time 11546.43ms, mfu 2.45%


Training step 362: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 362: loss 2.8235, time 11555.79ms, mfu 2.44%


Training step 363: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 363: loss 2.1928, time 11441.66ms, mfu 2.44%


Training step 364: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 364: loss 2.9048, time 11534.67ms, mfu 2.43%


Training step 365: 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


iter 365: loss 3.1549, time 11426.97ms, mfu 2.42%


Training step 366: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 366: loss 2.3498, time 11497.17ms, mfu 2.42%


Training step 367: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 367: loss 3.0393, time 11651.70ms, mfu 2.41%


Training step 368: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 368: loss 3.2705, time 11585.44ms, mfu 2.40%


Training step 369: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 369: loss 3.4519, time 11694.35ms, mfu 2.39%


Training step 370: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 370: loss 3.3871, time 11654.15ms, mfu 2.39%


Training step 371: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 371: loss 1.5425, time 11692.48ms, mfu 2.38%


Training step 372: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 372: loss 2.5204, time 11610.37ms, mfu 2.38%


Training step 373: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 373: loss 3.5765, time 11577.63ms, mfu 2.37%


Training step 374: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 374: loss 1.4827, time 11868.17ms, mfu 2.36%


Training step 375: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 375: loss 2.0141, time 11622.73ms, mfu 2.36%


Training step 376: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 376: loss 3.1987, time 11642.38ms, mfu 2.36%


Training step 377: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 377: loss 1.5847, time 11733.25ms, mfu 2.35%


Training step 378: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 378: loss 3.0547, time 11677.14ms, mfu 2.35%


Training step 379: 100%|██████████| 40/40 [00:11<00:00,  3.44it/s]


iter 379: loss 3.1014, time 11953.33ms, mfu 2.34%


Training step 380: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 380: loss 2.6746, time 11680.32ms, mfu 2.34%


Training step 381: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 381: loss 1.9716, time 11734.81ms, mfu 2.34%


Training step 382: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 382: loss 3.3891, time 11603.25ms, mfu 2.34%


Training step 383: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 383: loss 3.0480, time 11689.63ms, mfu 2.34%


Training step 384: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 384: loss 2.6633, time 11866.86ms, mfu 2.33%


Training step 385: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 385: loss 3.6106, time 11636.85ms, mfu 2.33%


Training step 386: 100%|██████████| 40/40 [00:11<00:00,  3.43it/s]


iter 386: loss 3.0696, time 11967.47ms, mfu 2.32%


Training step 387: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 387: loss 3.0612, time 11705.80ms, mfu 2.32%


Training step 388: 100%|██████████| 40/40 [00:11<00:00,  3.48it/s]


iter 388: loss 2.8508, time 11786.74ms, mfu 2.32%


Training step 389: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 389: loss 2.5870, time 11719.05ms, mfu 2.32%


Training step 390: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 390: loss 3.2149, time 11872.23ms, mfu 2.32%


Training step 391: 100%|██████████| 40/40 [00:11<00:00,  3.47it/s]


iter 391: loss 2.7363, time 11817.23ms, mfu 2.32%


Training step 392: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 392: loss 3.3182, time 11635.44ms, mfu 2.32%


Training step 393: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 393: loss 2.0192, time 11657.19ms, mfu 2.32%


Training step 394: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 394: loss 3.2319, time 11588.62ms, mfu 2.32%


Training step 395: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 395: loss 1.3683, time 11722.39ms, mfu 2.32%


Training step 396: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 396: loss 1.3921, time 11636.09ms, mfu 2.32%


Training step 397: 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


iter 397: loss 2.6024, time 11394.66ms, mfu 2.33%


Training step 398: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 398: loss 3.4620, time 11702.34ms, mfu 2.33%


Training step 399: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 399: loss 3.1378, time 11448.55ms, mfu 2.33%


Training step 400: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 400: loss 2.3342, time 11601.28ms, mfu 2.33%


Training step 401: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 401: loss 3.9313, time 11665.93ms, mfu 2.33%


Training step 402: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 402: loss 1.7399, time 11575.17ms, mfu 2.33%


Training step 403: 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


iter 403: loss 2.7549, time 11377.29ms, mfu 2.34%


Training step 404: 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


iter 404: loss 3.2837, time 11367.51ms, mfu 2.34%


Training step 405: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 405: loss 3.3367, time 11535.87ms, mfu 2.34%


Training step 406: 100%|██████████| 40/40 [00:11<00:00,  3.60it/s]


iter 406: loss 2.6683, time 11415.62ms, mfu 2.35%


Training step 407: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 407: loss 2.9174, time 11463.67ms, mfu 2.35%


Training step 408: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 408: loss 2.3314, time 11490.21ms, mfu 2.35%


Training step 409: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 409: loss 2.9627, time 11496.59ms, mfu 2.35%


Training step 410: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 410: loss 1.1520, time 11564.84ms, mfu 2.35%


Training step 411: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 411: loss 2.8016, time 11528.20ms, mfu 2.35%


Training step 412: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 412: loss 3.0285, time 11652.02ms, mfu 2.35%


Training step 413: 100%|██████████| 40/40 [00:11<00:00,  3.43it/s]


iter 413: loss 1.9066, time 11958.85ms, mfu 2.34%


Training step 414: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 414: loss 2.6345, time 11706.61ms, mfu 2.34%


Training step 415: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 415: loss 3.0333, time 11725.60ms, mfu 2.34%


Training step 416: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 416: loss 3.0131, time 11612.24ms, mfu 2.34%


Training step 417: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 417: loss 3.1421, time 11771.15ms, mfu 2.33%


Training step 418: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 418: loss 1.3873, time 11587.52ms, mfu 2.33%


Training step 419: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 419: loss 3.2208, time 11663.67ms, mfu 2.33%


Training step 420: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 420: loss 2.7671, time 11680.77ms, mfu 2.33%


Training step 421: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 421: loss 2.4344, time 11708.12ms, mfu 2.33%


Training step 422: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 422: loss 3.0001, time 11756.56ms, mfu 2.33%


Training step 423: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 423: loss 1.6441, time 11610.08ms, mfu 2.33%


Training step 424: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 424: loss 1.4137, time 11871.78ms, mfu 2.32%


Training step 425: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 425: loss 1.6365, time 11788.68ms, mfu 2.32%


Training step 426: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 426: loss 2.9240, time 11726.83ms, mfu 2.32%


Training step 427: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 427: loss 1.0167, time 11881.52ms, mfu 2.32%


Training step 428: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 428: loss 2.8019, time 11680.40ms, mfu 2.32%


Training step 429: 100%|██████████| 40/40 [00:11<00:00,  3.48it/s]


iter 429: loss 2.6251, time 11800.04ms, mfu 2.32%


Training step 430: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 430: loss 1.8548, time 11635.79ms, mfu 2.32%


Training step 431: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 431: loss 3.3363, time 11773.54ms, mfu 2.32%


Training step 432: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 432: loss 3.2608, time 11636.96ms, mfu 2.32%


Training step 433: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 433: loss 2.1258, time 11535.17ms, mfu 2.32%


Training step 434: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 434: loss 2.2371, time 11640.52ms, mfu 2.32%


Training step 435: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 435: loss 1.4996, time 11702.99ms, mfu 2.32%


Training step 436: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 436: loss 2.2069, time 11867.38ms, mfu 2.32%


Training step 437: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 437: loss 2.9127, time 11466.17ms, mfu 2.32%


Training step 438: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 438: loss 2.7723, time 11536.33ms, mfu 2.33%


Training step 439: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 439: loss 3.0278, time 11520.72ms, mfu 2.33%


Training step 440: 100%|██████████| 40/40 [00:11<00:00,  3.63it/s]


iter 440: loss 3.2203, time 11326.25ms, mfu 2.34%


Training step 441: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 441: loss 2.9374, time 11575.06ms, mfu 2.34%


Training step 442: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 442: loss 3.0937, time 11447.85ms, mfu 2.34%


Training step 443: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 443: loss 3.1267, time 11552.45ms, mfu 2.34%


Training step 444: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 444: loss 3.2709, time 11523.30ms, mfu 2.34%


Training step 445: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 445: loss 1.9340, time 11448.06ms, mfu 2.35%


Training step 446: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 446: loss 1.8862, time 11472.85ms, mfu 2.35%


Training step 447: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 447: loss 2.9567, time 11597.59ms, mfu 2.35%


Training step 448: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 448: loss 2.2109, time 11521.82ms, mfu 2.35%


Training step 449: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 449: loss 1.8270, time 11516.31ms, mfu 2.35%


Training step 450: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 450: loss 2.5168, time 11475.56ms, mfu 2.35%


Training step 451: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 451: loss 1.7275, time 11523.75ms, mfu 2.35%


Training step 452: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 452: loss 3.2841, time 11557.16ms, mfu 2.35%


Training step 453: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 453: loss 2.2086, time 11570.92ms, mfu 2.35%


Training step 454: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 454: loss 2.9441, time 11465.40ms, mfu 2.35%


Training step 455: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 455: loss 2.2004, time 11607.76ms, mfu 2.35%


Training step 456: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 456: loss 2.6322, time 11626.90ms, mfu 2.35%


Training step 457: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 457: loss 2.8399, time 11600.55ms, mfu 2.35%


Training step 458: 100%|██████████| 40/40 [00:11<00:00,  3.42it/s]


iter 458: loss 3.0279, time 12012.90ms, mfu 2.34%


Training step 459: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 459: loss 2.6102, time 11692.84ms, mfu 2.34%


Training step 460: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 460: loss 1.6677, time 11765.79ms, mfu 2.33%


Training step 461: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 461: loss 2.8356, time 11550.91ms, mfu 2.34%


Training step 462: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 462: loss 1.7926, time 11757.40ms, mfu 2.33%


Training step 463: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 463: loss 2.7870, time 11735.03ms, mfu 2.33%


Training step 464: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 464: loss 2.8681, time 11609.08ms, mfu 2.33%


Training step 465: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 465: loss 1.9167, time 11858.86ms, mfu 2.33%


Training step 466: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 466: loss 2.5179, time 11603.46ms, mfu 2.33%


Training step 467: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 467: loss 2.9787, time 11714.71ms, mfu 2.33%


Training step 468: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 468: loss 2.6650, time 11675.10ms, mfu 2.33%


Training step 469: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 469: loss 3.0870, time 11745.58ms, mfu 2.32%


Training step 470: 100%|██████████| 40/40 [00:11<00:00,  3.47it/s]


iter 470: loss 3.2009, time 11812.64ms, mfu 2.32%


Training step 471: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 471: loss 3.0123, time 11627.35ms, mfu 2.32%


Training step 472: 100%|██████████| 40/40 [00:11<00:00,  3.41it/s]


iter 472: loss 2.7671, time 12028.48ms, mfu 2.32%


Training step 473: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 473: loss 3.1409, time 11581.72ms, mfu 2.32%


Training step 474: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 474: loss 2.4211, time 11652.91ms, mfu 2.32%


Training step 475: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 475: loss 3.1908, time 11518.70ms, mfu 2.32%


Training step 476: 100%|██████████| 40/40 [00:11<00:00,  3.60it/s]


iter 476: loss 0.7924, time 11409.06ms, mfu 2.33%


Training step 477: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 477: loss 1.7048, time 11603.76ms, mfu 2.33%


Training step 478: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 478: loss 3.0880, time 11502.55ms, mfu 2.33%


Training step 479: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 479: loss 2.9818, time 11503.17ms, mfu 2.34%


Training step 480: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 480: loss 2.7371, time 11560.65ms, mfu 2.34%


Training step 481: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 481: loss 1.5010, time 11872.22ms, mfu 2.33%


Training step 482: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 482: loss 2.6799, time 11448.33ms, mfu 2.34%


Training step 483: 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


iter 483: loss 0.4780, time 11387.85ms, mfu 2.34%


Training step 484: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 484: loss 3.1881, time 11592.07ms, mfu 2.34%


Training step 485: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 485: loss 2.3297, time 11500.00ms, mfu 2.34%


Training step 486: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 486: loss 3.3007, time 11627.17ms, mfu 2.34%


Training step 487: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 487: loss 2.5169, time 11559.79ms, mfu 2.34%


Training step 488: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 488: loss 2.8212, time 11518.88ms, mfu 2.34%


Training step 489: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 489: loss 2.8672, time 11489.91ms, mfu 2.35%


Training step 490: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 490: loss 2.1022, time 11443.70ms, mfu 2.35%


Training step 491: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 491: loss 1.7452, time 11533.19ms, mfu 2.35%


Training step 492: 100%|██████████| 40/40 [00:11<00:00,  3.60it/s]


iter 492: loss 2.9675, time 11413.86ms, mfu 2.35%


Training step 493: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 493: loss 2.8116, time 11562.87ms, mfu 2.35%


Training step 494: 100%|██████████| 40/40 [00:11<00:00,  3.48it/s]


iter 494: loss 2.9153, time 11803.53ms, mfu 2.35%


Training step 495: 100%|██████████| 40/40 [00:11<00:00,  3.48it/s]


iter 495: loss 1.5520, time 11806.89ms, mfu 2.34%


Training step 496: 100%|██████████| 40/40 [00:11<00:00,  3.42it/s]


iter 496: loss 2.1036, time 11989.41ms, mfu 2.33%


Training step 497: 100%|██████████| 40/40 [00:11<00:00,  3.48it/s]


iter 497: loss 2.9452, time 11803.56ms, mfu 2.33%


Training step 498: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 498: loss 1.9868, time 11672.13ms, mfu 2.33%


Training step 499: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 499: loss 1.5609, time 11854.46ms, mfu 2.33%


Training step 500: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 500: loss 2.5826, time 11705.51ms, mfu 2.32%


Training step 501: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 501: loss 2.6959, time 11761.86ms, mfu 2.32%


Training step 502: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 502: loss 1.1188, time 11627.67ms, mfu 2.32%


Training step 503: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 503: loss 2.6536, time 11677.68ms, mfu 2.32%


Training step 504: 100%|██████████| 40/40 [00:11<00:00,  3.58it/s]


iter 504: loss 2.3340, time 11467.06ms, mfu 2.33%


Training step 505: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 505: loss 2.9632, time 11572.24ms, mfu 2.33%


Training step 506: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 506: loss 2.3343, time 11640.77ms, mfu 2.33%


Training step 507: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 507: loss 3.0663, time 11505.89ms, mfu 2.33%


Training step 508: 100%|██████████| 40/40 [00:11<00:00,  3.49it/s]


iter 508: loss 2.6685, time 11772.74ms, mfu 2.33%


Training step 509: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 509: loss 0.2414, time 11528.92ms, mfu 2.33%


Training step 510: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 510: loss 2.7112, time 11597.81ms, mfu 2.33%


Training step 511: 100%|██████████| 40/40 [00:11<00:00,  3.60it/s]


iter 511: loss 3.1917, time 11414.71ms, mfu 2.34%


Training step 512: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 512: loss 1.0690, time 11532.41ms, mfu 2.34%


Training step 513: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 513: loss 0.7939, time 11606.97ms, mfu 2.34%


Training step 514: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 514: loss 2.6348, time 11488.18ms, mfu 2.34%


Training step 515: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 515: loss 2.5327, time 11739.05ms, mfu 2.34%


Training step 516: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 516: loss 2.3919, time 11527.90ms, mfu 2.34%


Training step 517: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 517: loss 2.4541, time 11531.33ms, mfu 2.34%


Training step 518: 100%|██████████| 40/40 [00:11<00:00,  3.57it/s]


iter 518: loss 2.7656, time 11520.54ms, mfu 2.34%


Training step 519: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 519: loss 2.7941, time 11676.73ms, mfu 2.34%


Training step 520: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 520: loss 2.4124, time 11680.43ms, mfu 2.34%


Training step 521: 100%|██████████| 40/40 [00:11<00:00,  3.52it/s]


iter 521: loss 1.6199, time 11678.27ms, mfu 2.34%


Training step 522: 100%|██████████| 40/40 [00:11<00:00,  3.46it/s]


iter 522: loss 1.8055, time 11855.16ms, mfu 2.33%


Training step 523: 100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


iter 523: loss 3.2686, time 11377.44ms, mfu 2.34%


Training step 524: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 524: loss 2.8908, time 11640.06ms, mfu 2.34%


Training step 525: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 525: loss 2.4721, time 11542.22ms, mfu 2.34%


Training step 526: 100%|██████████| 40/40 [00:11<00:00,  3.59it/s]


iter 526: loss 2.7210, time 11449.52ms, mfu 2.34%


Training step 527: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 527: loss 2.0396, time 11683.01ms, mfu 2.34%


Training step 528: 100%|██████████| 40/40 [00:11<00:00,  3.56it/s]


iter 528: loss 2.9417, time 11527.34ms, mfu 2.34%


Training step 529: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 529: loss 1.7917, time 11613.51ms, mfu 2.34%


Training step 530: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 530: loss 2.8281, time 11637.88ms, mfu 2.34%


Training step 531: 100%|██████████| 40/40 [00:11<00:00,  3.47it/s]


iter 531: loss 3.0059, time 11842.42ms, mfu 2.33%


Training step 532: 100%|██████████| 40/40 [00:11<00:00,  3.50it/s]


iter 532: loss 2.9083, time 11740.21ms, mfu 2.33%


Training step 533: 100%|██████████| 40/40 [00:11<00:00,  3.60it/s]


iter 533: loss 2.6616, time 11409.50ms, mfu 2.34%


Training step 534: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 534: loss 2.6253, time 11625.20ms, mfu 2.34%


Training step 535: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 535: loss 2.9619, time 11602.96ms, mfu 2.34%


Training step 536: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 536: loss 2.3492, time 11594.85ms, mfu 2.34%


Training step 537: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 537: loss 2.4399, time 11695.52ms, mfu 2.34%


Training step 538: 100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


iter 538: loss 1.9008, time 11613.26ms, mfu 2.34%


Training step 539: 100%|██████████| 40/40 [00:11<00:00,  3.53it/s]


iter 539: loss 2.5180, time 11658.77ms, mfu 2.33%


Training step 540: 100%|██████████| 40/40 [00:11<00:00,  3.55it/s]


iter 540: loss 2.4391, time 11598.00ms, mfu 2.34%


Training step 541: 100%|██████████| 40/40 [00:11<00:00,  3.51it/s]


iter 541: loss 2.9721, time 11753.95ms, mfu 2.33%


Training step 542: 100%|██████████| 40/40 [00:10<00:00,  3.79it/s]


iter 542: loss 1.5609, time 10859.34ms, mfu 2.35%


Training step 543: 100%|██████████| 40/40 [00:10<00:00,  3.98it/s]


iter 543: loss 2.4971, time 10314.66ms, mfu 2.38%


Training step 544: 100%|██████████| 40/40 [00:09<00:00,  4.09it/s]


iter 544: loss 1.7647, time 10038.29ms, mfu 2.41%


Training step 545: 100%|██████████| 40/40 [00:10<00:00,  3.75it/s]


iter 545: loss 1.7785, time 10930.72ms, mfu 2.42%


Training step 546: 100%|██████████| 40/40 [00:10<00:00,  3.79it/s]


iter 546: loss 2.6267, time 10864.53ms, mfu 2.43%


Training step 547: 100%|██████████| 40/40 [00:10<00:00,  3.85it/s]


iter 547: loss 0.3642, time 10673.29ms, mfu 2.44%


Training step 548: 100%|██████████| 40/40 [00:10<00:00,  3.83it/s]


iter 548: loss 2.7422, time 10700.49ms, mfu 2.45%


Training step 549:  80%|████████  | 32/40 [00:07<00:01,  4.07it/s]


KeyboardInterrupt: 