In [8]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model_neo = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

# Encode the input text
input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate text
outputs = model.generate(inputs["input_ids"], max_length=50, do_sample=True)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text (GPT-Neo):", generated_text)


Generated text (GPT-Neo): Once upon a time var% this Miss hacks-ophob%�j sentencing- valvej�j sentencing-592j sentencing- valvej�� sentencing Adrow 37 membersues sentencing Adrow 37izeues this spec-row 37 merigun


In [9]:
from transformers import OPTForCausalLM, GPT2Tokenizer

# Load the tokenizer and model for OPT-125M
tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-125m")
model_sigma = OPTForCausalLM.from_pretrained("facebook/opt-125m")

# Encode the input text
input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate text
outputs = model.generate(inputs["input_ids"], max_length=50, do_sample=True)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text (OPT-125M):", generated_text)



Generated text (OPT-125M): Once upon a time I saw 3 girls who didn't want to be part of either of my "team" and I've never regretted it. I've learned the hard way it's possible to have a crush on some of these people...and


In [63]:
import math 
from einops import einsum 
import torch


In [64]:
def evaluate_model(model, tokenizer, dataset):
    model.eval()
    total_loss = 0
    total_tokens = 0
    with torch.no_grad():
        for item in dataset:
            inputs = tokenizer(item['text'], return_tensors='pt', truncation=True, max_length=512)
            input_ids = inputs['input_ids']
            attention_mask = inputs['attention_mask']
            
            
            masked_input = input_ids.clone()
            mask = torch.rand(masked_input.shape) < 0.15 
            masked_input[mask] = tokenizer.mask_token_id
            
            outputs = model(masked_input, attention_mask=attention_mask, labels=input_ids)
            
            
            if isinstance(outputs, tuple):
                loss = outputs[0]
            else:
                loss = outputs.loss
            
            total_loss += loss.item() * input_ids.numel()
            total_tokens += input_ids.numel()
    
    perplexity = math.exp(total_loss / total_tokens)
    return perplexity

In [11]:
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np

# Load models and tokenizers
gpt_neo_model = AutoModel.from_pretrained("EleutherAI/gpt-neo-125M")
opt_model = AutoModel.from_pretrained("facebook/opt-125m")
gpt_neo_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
opt_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

gpt_neo_vocab = gpt_neo_tokenizer.get_vocab()
opt_vocab = opt_tokenizer.get_vocab()

shared_tokens = set(gpt_neo_vocab.keys()) & set(opt_vocab.keys())
print(f"Number of shared tokens: {len(shared_tokens)}")

gpt_neo_embeddings = gpt_neo_model.wte.weight.detach().numpy()
opt_embeddings = opt_model.get_input_embeddings().weight.detach().numpy()

shared_gpt_neo_embeds = []
shared_opt_embeds = []

for token in shared_tokens:
    gpt_neo_idx = gpt_neo_vocab[token]
    opt_idx = opt_vocab[token]
    shared_gpt_neo_embeds.append(gpt_neo_embeddings[gpt_neo_idx])
    shared_opt_embeds.append(opt_embeddings[opt_idx])

shared_gpt_neo_embeds = np.array(shared_gpt_neo_embeds)
shared_opt_embeds = np.array(shared_opt_embeds)

W = np.linalg.lstsq(shared_gpt_neo_embeds, shared_opt_embeds, rcond=None)[0]
W_1 = np.linalg.lstsq(shared_opt_embeds, shared_gpt_neo_embeds, rcond=None)[0]

print("Shape of transformation matrix W:", W.shape)

transformed_gpt_neo_embeds = np.dot(shared_gpt_neo_embeds, W)
mse = np.mean((transformed_gpt_neo_embeds - shared_opt_embeds)**2)
print("Mean Squared Error:", mse)

from sklearn.metrics.pairwise import cosine_similarity

cos_sims = [cosine_similarity(transformed_gpt_neo_embeds[i].reshape(1, -1), 
                              shared_opt_embeds[i].reshape(1, -1))[0][0] 
            for i in range(len(shared_tokens))]

average_cosine_similarity = np.mean(cos_sims)
print("Average Cosine Similarity:", average_cosine_similarity)

print("\nExamples of transformed embeddings:")
for i, token in enumerate(list(shared_tokens)[:5]):
    print(f"\nToken: {token}")
    print(f"Original GPT-Neo embedding: {shared_gpt_neo_embeds[i][:5]}...")
    print(f"Transformed GPT-Neo embedding: {transformed_gpt_neo_embeds[i][:5]}...")
    print(f"Original OPT embedding: {shared_opt_embeds[i][:5]}...")
    print(f"Cosine Similarity: {cos_sims[i]}")

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Number of shared tokens: 50257
Shape of transformation matrix W: (768, 768)
Mean Squared Error: 0.001723926
Average Cosine Similarity: 0.73685193

Examples of transformed embeddings:

Token: Ġmanipulate
Original GPT-Neo embedding: [ 0.60546875 -0.81640625  0.29101562 -0.17480469 -0.34570312]...
Transformed GPT-Neo embedding: [ 0.03076771 -0.03459737  0.01216576  0.00620542  0.03119646]...
Original OPT embedding: [ 0.0383606  -0.05307007  0.00072908 -0.00559616 -0.046875  ]...
Cosine Similarity: 0.847366213798523

Token: ĠRussian
Original GPT-Neo embedding: [ 0.3671875  -0.65234375  0.33007812 -0.16210938 -0.22558594]...
Transformed GPT-Neo embedding: [ 0.03383368 -0.03290333 -0.05100903  0.07383785  0.02786811]...
Original OPT embedding: [ 0.04745483 -0.03747559 -0.08984375  0.11096191  0.105896  ]...
Cosine Similarity: 0.7892986536026001

Token: Ġinvaders
Original GPT-Neo embedding: [ 0.67578125 -1.0546875   0.59375    -0.37304688 -0.30273438]...
Transformed GPT-Neo embedding: [ 0.046

In [46]:
def get_residual_stream_neo(model, input_ids):

    input_embeds = model.wte(input_ids)
    temp = input_embeds
    for block in model.h:
        block_output = block(temp)
        temp = block_output[0]
    return temp


In [44]:
def get_residual_stream_opt(model, input_ids):
    input_embeds = model.embed_tokens(input_ids)
    temp = input_embeds
    for layer in model.layers:
        block_output = layer(temp)
        temp = block_output[0]
    return temp


In [57]:
import torch
import torch.nn as nn
from transformers import GPTNeoForCausalLM, OPTForCausalLM, GPT2Tokenizer
from tqdm import tqdm

class HybridModel(nn.Module):
    def __init__(self, gpt_neo_model, opt_model, W, W_inv):
        super(HybridModel, self).__init__()
        self.gpt_neo = gpt_neo_model
        self.opt = opt_model
        self.W = nn.Parameter(torch.tensor(W, dtype=torch.float32))
        self.W_inv = nn.Parameter(torch.tensor(W_inv, dtype=torch.float32))

    def forward(self, input_ids, attention_mask=None):
        # Get GPT-Neo embeddings
        gpt_neo_embeds = self.gpt_neo.transformer.wte(input_ids)
        
        # Transform to OPT space
        opt_space = torch.einsum('ij,bsj->bsi', self.W_inv, gpt_neo_embeds)
        
        # Pass through OPT model
        opt_outputs = self.opt(inputs_embeds=opt_space, attention_mask=attention_mask)
        
        # Transform back to GPT-Neo space
        gpt_neo_space = torch.einsum('ij,bsj->bsi', self.W, opt_outputs)
        
        # Final prediction using GPT-Neo's language model head
        logits = self.gpt_neo.lm_head(gpt_neo_space)
        
        return logits

def evaluate_model(model, tokenizer, dataset, device):
    model.eval()
    total_loss = 0
    total_tokens = 0
    
    with torch.no_grad():
        for item in tqdm(dataset):
            inputs = tokenizer(item['text'], return_tensors='pt', truncation=True, max_length=512, padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            labels = inputs['input_ids'].clone()
            
            # Create input with some masked tokens
            mask = torch.rand(inputs['input_ids'].shape) < 0.15
            inputs['input_ids'][mask] = tokenizer.mask_token_id
            
            outputs = model(**inputs)
            
            loss = nn.CrossEntropyLoss()(outputs.view(-1, outputs.size(-1)), labels.view(-1))
            
            total_loss += loss.item() * labels.numel()
            total_tokens += labels.numel()
    
    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    return perplexity.item()

# Load models and tokenizer
gpt_neo_model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
opt_model = OPTForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

# Initialize W and W_inv (you would typically learn these)
W = torch.randn(768, 768)
W_inv = torch.linalg.inv(W)

# Create hybrid model
hybrid_model = HybridModel(gpt_neo_model, opt_model, W, W_inv)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hybrid_model.to(device)

# Example inference
text = "The cat sat on the"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = hybrid_model(**inputs)
next_token = outputs.argmax(dim=-1)
generated_text = tokenizer.decode(next_token[0])
print(f"Generated text: {text} {generated_text}")

# Example evaluation (you would need to provide your own dataset)
# dataset = load_your_dataset()
# perplexity = evaluate_model(hybrid_model, tokenizer, dataset, device)
# print(f"Model perplexity: {perplexity}")

  self.W = nn.Parameter(torch.tensor(W, dtype=torch.float32))
  self.W_inv = nn.Parameter(torch.tensor(W_inv, dtype=torch.float32))


TypeError: expected Tensor as element 1 in argument 1, but got CausalLMOutputWithPast

In [141]:
import torch
import torch.nn as nn
from transformers import GPTNeoForCausalLM, OPTForCausalLM, GPT2Tokenizer
from tqdm import tqdm

class HybridModel(nn.Module):
    def __init__(self, gpt_neo_model, opt_model, W, W_inv):
        super(HybridModel, self).__init__()
        self.gpt_neo = gpt_neo_model
        self.opt = opt_model
        self.W = nn.Parameter(torch.tensor(W, dtype=torch.float32))
        self.W_inv = nn.Parameter(torch.tensor(W_inv, dtype=torch.float32))

    def forward(self, input_ids, attention_mask=None):
        # Get GPT-Neo embeddings
        
        gpt_neo_embeds = self.gpt_neo.transformer.wte(torch.tensor(input_ids, dtype = int))
        
        # Transform to OPT space
        opt_space = torch.einsum('ij,bsj->bsi', self.W_inv, gpt_neo_embeds)
        
        # Pass through OPT model
        opt_outputs = self.opt(inputs_embeds=opt_space, attention_mask=attention_mask)
        
        # Transform back to GPT-Neo space
        gpt_neo_space = torch.einsum('ij,bsj->bsi', self.W, opt_outputs.hidden_states[-1])
        
        # Final prediction using GPT-Neo's language model head
        logits = self.gpt_neo.lm_head(gpt_neo_space)
        
        return logits

def evaluate_model(model, tokenizer, dataset, device):
    model.eval()
    total_loss = 0
    total_tokens = 0
    
    with torch.no_grad():
        for item in tqdm(dataset):
            inputs = tokenizer(item['text'], return_tensors='pt', truncation=True, max_length=512, padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            labels = inputs['input_ids'].clone()
            print(f"Lables = {labels} with shape {labels.shape}")
            
            # Create input with some masked tokens
            mask = torch.rand(inputs['input_ids'].shape) < 0.15
            inputs['input_ids'][mask] = tokenizer.pad_token_id  # Use pad token ID for masking
            
            outputs = model(**inputs)
            print(f"Outputs = {outputs} with shape = {outputs.shape}")
            
            loss = nn.CrossEntropyLoss()(outputs.view(-1, outputs.size(-1)), labels.view(-1))
            print(loss)
            total_loss += loss.item() * labels.numel()
            total_tokens += labels.numel()
            print(total_loss)
    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    return perplexity.item()

# Load models and tokenizer
gpt_neo_model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M", output_hidden_states=True)
opt_model = OPTForCausalLM.from_pretrained("facebook/opt-125m", output_hidden_states=True)
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

# Initialize W and W_inv (you would typically learn these)
W = torch.randn(768, 768)
W_inv = torch.linalg.inv(W)

# Create hybrid model
hybrid_model = HybridModel(gpt_neo_model, opt_model, W, W_inv)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hybrid_model.to(device)

# Example inference
text = "The cat sat on the"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = hybrid_model(**inputs)
next_token = outputs.argmax(dim=-1)
generated_text = tokenizer.decode(next_token[0])
print(f"Generated text: {text} {generated_text}")

# Example evaluation (you would need to provide your own dataset)
# dataset = load_your_dataset()
# perplexity = evaluate_model(hybrid_model, tokenizer, dataset, device)
# print(f"Model perplexity: {perplexity}")



Generated text: The cat sat on the ��極��極��極��極��極


  self.W = nn.Parameter(torch.tensor(W, dtype=torch.float32))
  self.W_inv = nn.Parameter(torch.tensor(W_inv, dtype=torch.float32))
  gpt_neo_embeds = self.gpt_neo.transformer.wte(torch.tensor(input_ids, dtype = int))


In [87]:
text = "The cat sat on the"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = hybrid_model(**inputs)
next_token = outputs.argmax(dim=-1)
generated_text = tokenizer.decode(next_token[0])
print(f"Generated text: {text} {generated_text}")

Generated text: The cat sat on the  renegotireditaryreditaryreditary STATS


In [88]:
tokenizer.pad_token = tokenizer.eos_token

In [69]:
from datasets import load_dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")


subset_size = 1000  
dataset = dataset.select(range(subset_size))



In [92]:
test = torch.randn([1,5])

In [100]:
out = gpt_neo_model.transformer.wte(torch.tensor(test, dtype = int))

  out = gpt_neo_model.transformer.wte(torch.tensor(test, dtype = int))


In [125]:
def evaluate_model(model, tokenizer, dataset, device):
    model.eval()
    total_loss = 0
    total_tokens = 0
    
    with torch.no_grad():
        for item in tqdm(dataset):
            inputs = tokenizer(item['text'], return_tensors='pt', truncation=True, max_length=512, padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            labels = inputs['input_ids'].clone()
            
            # Create input with some masked tokens
            mask = torch.rand(inputs['input_ids'].shape).to(device) < 0.15
            inputs['input_ids'][mask] = tokenizer.pad_token_id  # Use pad token ID for masking
            labels[mask] = -100  # Ignore the masked tokens in the loss
            
            outputs = model(**inputs)
            logits = outputs # Extract logits from model outputs
            
            loss = nn.CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1))
            total_loss += loss.item() * labels.numel()
            total_tokens += (labels != -100).sum().item()
            print(total_loss)
    
    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    return perplexity.item()

In [144]:
a = torch.randn([1, 8])
b = torch.randn([1, 8, 50257])

In [149]:
a = torch.randn([1, 8])
b = torch.randn([1, 8, 50257])
loss = nn.CrossEntropyLoss()(b.view(-1, outputs.size(-1)), a.view(-1))

RuntimeError: expected scalar type Long but found Float

In [142]:
tokenizer.pad_token = tokenizer.eos_token
perplexity = evaluate_model(hybrid_model, tokenizer, dataset, 'cpu')
print(f"Model perplexity: {perplexity}")

  gpt_neo_embeds = self.gpt_neo.transformer.wte(torch.tensor(input_ids, dtype = int))


Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796, 5199,  347, 2852,  353,  796,  220,  198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.6950,  80.9773, 100.6089,  ..., 216.1597, 175.4110, 184.6033],
         [ 61.4785,  38.1162,  21.7412,  ..., 144.7068, 124.5626, 128.9748],
         [358.7428, 335.8615, 403.3374,  ..., 618.8394, 425.6807, 312.7843],
         ...,
         [ 84.0865,  55.7570,  61.6663,  ..., 189.7960, 157.3575, 144.5110],
         [268.3845, 263.5094, 342.4463,  ..., 392.5680, 286.3525, 286.3372],
         [ 84.8758,  52.6529,  60.2078,  ..., 183.4696, 153.7621, 141.7562]]]) with shape = torch.Size([1, 8, 50257])
tensor(569.4001)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 5199,   347,  285

  0%|          | 4/1000 [00:00<01:28, 11.22it/s]

Outputs = tensor([[[117.1242,  97.1438, 133.2613,  ..., 236.7221, 189.2747, 202.7625],
         [319.0764, 275.6877, 319.4665,  ..., 515.4469, 282.8264, 215.1320],
         [ 99.1492,  67.7402,  87.5453,  ..., 225.3622, 175.9447, 151.6208],
         ...,
         [ 82.1644,  46.3512,  56.9978,  ..., 192.3174, 151.8773, 136.1239],
         [209.9630, 180.6935, 255.2182,  ..., 367.6426, 182.5962, 169.7899],
         [ 80.0855,  44.9487,  53.7837,  ..., 186.9579, 148.3336, 134.2601]]]) with shape = torch.Size([1, 183, 50257])
tensor(510.5706)
nan
Lables = tensor([[  554,  4793,   837,   347,  2852,   353, 31636,  7848,   854,   680,
           707,   287,   262,   711, 47002,  3194,   416,  2940, 12552, 12639,
           764,   679,  4120,   319,   257,  4793,  4471,   286,   262,  5581,
          2168,   837, 28274,   837,  3940,   416,   257,  2597,   287,   262,
          4343, 21421,  3227,   286,  1374,   284, 19739,  7924,   416, 22568,
           494,   371, 49003,   764,  1374,   

  1%|          | 7/1000 [00:00<01:55,  8.60it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 64.4177,  40.2718,  26.1100,  ..., 149.9770, 128.4518, 131.4595],
         [ 63.1126,  39.4957,  23.7409,  ..., 147.1612, 124.7387, 129.8371],
         ...,
         [ 77.7162,  41.6831,  52.2996,  ..., 184.3181, 143.9520, 131.2253],
         [235.6090, 195.7191, 289.0195,  ..., 399.9251, 215.3090, 193.7615],
         [ 77.3809,  41.3969,  51.7967,  ..., 182.9229, 142.9895, 130.7847]]]) with shape = torch.Size([1, 188, 50257])
tensor(491.5962)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 32619,   796,   796,   220,   198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 71.4453,  46.2606

  1%|          | 12/1000 [00:01<01:21, 12.13it/s]

Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [ 65.5962,  41.2893,  28.0208,  ..., 151.4252, 130.3372, 132.1131],
         ...,
         [ 66.8322,  42.4442,  29.8025,  ..., 153.1316, 131.9509, 133.3173],
         [328.5555, 324.4676, 390.3887,  ..., 503.0744, 345.4288, 322.0857],
         [ 85.3054,  49.6427,  54.7370,  ..., 178.9958, 149.1943, 138.7268]]]) with shape = torch.Size([1, 11, 50257])
tensor(488.5454)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  554,  4751,   347,  2852,   353,   550,   257,  8319,  2488,    12,
            31, 20495,  2597,   319,   262,  5581,  2168,   383,  3941,  2162,
           339, 19152,   366,  4746,  2547,   563,   366,   287,   262,  4471,
           837,   366,   554, 19978, 22237,   

  1%|▏         | 14/1000 [00:01<01:39,  9.88it/s]

Outputs = tensor([[[ 88.9718,  70.7317,  69.3850,  ..., 187.9917, 154.5081, 171.5420],
         [ 64.0956,  40.0695,  25.6660,  ..., 149.5169, 128.3873, 131.3514],
         [329.7243, 318.6605, 396.6894,  ..., 531.8853, 340.5276, 264.7938],
         ...,
         [ 83.3780,  50.9805,  54.9557,  ..., 188.3700, 153.4809, 139.9136],
         [186.7280, 143.9222, 223.0646,  ..., 291.2307, 159.3308, 163.8540],
         [ 84.6349,  50.4151,  58.7006,  ..., 192.4071, 154.3829, 137.1297]]]) with shape = torch.Size([1, 218, 50257])
tensor(502.4903)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796,  796, 4793,  784, 1944,  796,  796,  796,  220,  198]]) with shape torch.Size([1, 11])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [

  2%|▏         | 17/1000 [00:01<01:45,  9.29it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 69.3955,  44.3905,  33.7263,  ..., 158.7781, 134.6983, 135.2496],
         [ 57.7138,  34.4395,  15.2052,  ..., 137.9276, 117.2476, 123.7572],
         ...,
         [ 66.9498,  37.5183,  33.0263,  ..., 158.3781, 129.8874, 126.4066],
         [235.6161, 196.4958, 285.5887,  ..., 384.8250, 213.0143, 194.0718],
         [ 76.9300,  44.0866,  48.0402,  ..., 177.7741, 143.5053, 134.4398]]]) with shape = torch.Size([1, 206, 50257])
tensor(488.3322)
nan
Lables = tensor([[  347,  2852,   353, 31636,   287,   734,  7328,   287,  3648,   837,
         47743,  3851, 13001,   416, 26479,  6342,  1004,   756,    72,   837,
           290, 43823, 24265,  7924,   416,   440, 12810, 42603,   764,   347,
          2852,   353, 19152,   257,  2095,  3706,   366, 11465,   366,   287,
         43823, 24265,   837,   508, 15940,  1863,   351,  2095,   366,  8518,
           366,   355,   262,   366,  5897,  3

  2%|▏         | 23/1000 [00:02<01:26, 11.28it/s]

Outputs = tensor([[[ 98.0492,  78.0748,  92.8244,  ..., 203.0359, 162.6925, 176.8185],
         [ 75.2039,  48.4757,  44.5530,  ..., 172.5219, 143.8311, 139.8066],
         [ 60.8261,  35.7841,  17.6032,  ..., 142.5778, 121.0082, 127.5456],
         ...,
         [ 78.7778,  42.5347,  53.7273,  ..., 187.2765, 146.9287, 132.3175],
         [214.0721, 177.9397, 249.4014,  ..., 323.8838, 183.2761, 192.6306],
         [ 75.9642,  42.1908,  47.8683,  ..., 178.8163, 143.1118, 133.0386]]]) with shape = torch.Size([1, 247, 50257])
tensor(496.8714)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 13741,  4867,   796,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 61.3900,  

  3%|▎         | 29/1000 [00:02<00:55, 17.53it/s]

Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [ 65.5962,  41.2893,  28.0208,  ..., 151.4252, 130.3372, 132.1131],
         ...,
         [ 68.0072,  43.3619,  32.1148,  ..., 154.7780, 133.7504, 135.3874],
         [302.3715, 298.8804, 374.1966,  ..., 477.9797, 332.5127, 313.6155],
         [ 82.5132,  48.7609,  51.5026,  ..., 175.5972, 148.8245, 140.3455]]]) with shape = torch.Size([1, 9, 50257])
tensor(486.6610)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796, 15752,   796,   796,   796,   220,   198]]) with shape torch.Size([1, 9])
Outputs = tensor([[[1

  4%|▎         | 35/1000 [00:02<00:49, 19.67it/s]

Outputs = tensor([[[ 96.8575,  77.5305,  79.1080,  ..., 200.0969, 164.1776, 177.9164],
         [ 65.2184,  36.8305,  21.5872,  ..., 152.9835, 126.9571, 129.3392],
         [ 71.3436,  38.9565,  33.9467,  ..., 160.3188, 129.4434, 133.3981],
         ...,
         [ 75.7612,  40.2300,  50.3513,  ..., 180.7997, 143.6740, 128.4173],
         [ 73.3455,  40.2954,  44.7809,  ..., 172.6063, 140.1713, 130.0592],
         [ 73.1257,  40.2731,  43.8906,  ..., 171.7734, 139.6650, 130.1281]]]) with shape = torch.Size([1, 113, 50257])
tensor(487.0422)
nan
Lables = tensor([[ 4900,  7317,   339,   373,  1310,  2488,    12,    31,  1900,   284,
           584,  8786,   837,   465,  2499,  1625,   284,   307, 22641, 14212,
           287,  1111,  3999,   290,  4960, 16716,  3968,   764,  3226,   465,
         38273,  3597,   837,  3016, 17280,  3470, 31888,   423,   587, 17232,
           625,   262,  9337,   764,   679,   468,   587,  1444,   262,   366,
          7695,   316,  2488,    12,    31,  5

  4%|▍         | 38/1000 [00:02<00:53, 17.86it/s]

Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 65.7492,  41.4701,  28.7678,  ..., 151.2722, 130.5312, 133.0923],
         ...,
         [ 67.4679,  43.2812,  31.3017,  ..., 154.7676, 133.6108, 134.4120],
         [212.8795, 217.8268, 266.9174,  ..., 332.8991, 236.3529, 240.9850],
         [ 82.9625,  48.0449,  53.1336,  ..., 172.5264, 147.0676, 137.9417]]]) with shape = torch.Size([1, 7, 50257])
tensor(485.1634)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[29065,  3999, 16716,  7734, 20047,   262,  1204,   286,   262,  1772,
           618, 35391,   257,   670,   837,   257,  3357,   543, 27209, 14959,
         12608,   284,   366,   262,  1969,  6117,   326,  4569,  3999,  1807,
          1426,   896,  1022,  1242,   290, 180

  4%|▍         | 43/1000 [00:03<00:59, 16.02it/s]

Outputs = tensor([[[111.1941,  90.1605, 103.8050,  ..., 228.3142, 183.8537, 193.3687],
         [ 65.0225,  41.0025,  26.6272,  ..., 150.7215, 129.5949, 131.4915],
         [ 65.5179,  41.4782,  30.1060,  ..., 153.8298, 130.9152, 132.7659],
         ...,
         [ 75.7662,  41.1069,  50.7107,  ..., 177.2298, 142.0408, 129.5438],
         [197.6409, 169.6270, 243.2464,  ..., 304.0119, 143.0851, 166.1303],
         [ 74.2731,  40.3699,  48.1536,  ..., 173.3005, 139.3107, 128.5318]]]) with shape = torch.Size([1, 192, 50257])
tensor(478.0692)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796, 12556,   812,   796,   796,   796,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
      

  4%|▍         | 45/1000 [00:03<01:11, 13.27it/s]

Outputs = tensor([[[ 96.3781,  74.9937,  74.5639,  ..., 197.1437, 161.9873, 176.9865],
         [ 59.0807,  34.3577,  16.1661,  ..., 142.1705, 119.6381, 125.4465],
         [ 62.4420,  35.7425,  21.4838,  ..., 146.7569, 123.0275, 127.4164],
         ...,
         [ 77.5788,  41.5045,  53.3141,  ..., 184.1612, 145.8478, 131.1707],
         [161.9383, 138.8928, 197.0165,  ..., 261.0423, 103.6606, 145.1859],
         [ 75.9756,  40.5681,  50.8401,  ..., 179.7289, 142.9349, 129.9627]]]) with shape = torch.Size([1, 98, 50257])
tensor(502.0401)
nan
Lables = tensor([[10343, 13333,   705,    82,  2802,  3724,  8972,   706,   339,   373,
          4642,   837,   290,   339,   373, 12387,  4376,   416,   465, 25949,
           764,   679,   550,   281, 10550,  3956,   837,   508,  3724,  1862,
           764,   679,   635,   550,  1115,  2063,  9397,   290,   530,  2063,
          6621,   837,   284,  4150,   339,  6777, 10229,   287,   465, 31888,
           837,  3584,   339,  1239, 15802,   4

  5%|▍         | 47/1000 [00:03<01:37,  9.75it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 64.4177,  40.2718,  26.1100,  ..., 149.9770, 128.4518, 131.4595],
         [ 61.1758,  37.7951,  26.5423,  ..., 148.4305, 126.0995, 128.9617],
         ...,
         [ 76.4891,  37.6437,  47.2320,  ..., 185.6078, 142.6864, 129.0209],
         [165.2889, 117.7733, 186.3019,  ..., 266.9618, 109.5753, 137.3548],
         [ 75.0721,  36.0622,  45.7772,  ..., 182.5860, 140.3236, 127.0489]]]) with shape = torch.Size([1, 134, 50257])
tensor(491.9750)
nan
Lables = tensor([[ 2399,  2988,  3724,  1088,   767,  1821,   764, 10343, 13333,   561,
           423,   587,  3142,   284,  3802,   262,  3026,  2139,   780,   286,
           465,  2988,   705,    82,  4279,   837,   475,   339,   318,  1807,
           284,   423,  1813,   510,   262, 11941,   287,  7075,   286,   530,
           286,   465,  2063,  9397,   764,   679,  3377,   262,  1306,  1440,
           812,  2877,   287,   262,  6026,   

  5%|▍         | 49/1000 [00:04<01:48,  8.78it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [407.5022, 301.8547, 382.4162,  ..., 595.0660, 340.7448, 311.4489],
         [ 91.3337,  58.4930,  62.9595,  ..., 200.3635, 157.6108, 146.1950],
         ...,
         [ 85.6090,  50.6914,  62.2582,  ..., 195.6522, 154.8450, 138.3442],
         [200.7610, 158.2934, 256.1200,  ..., 327.5063, 156.6357, 166.1133],
         [ 83.0301,  49.2356,  57.8247,  ..., 188.9640, 150.3077, 136.6913]]]) with shape = torch.Size([1, 143, 50257])
tensor(501.0131)
nan
Lables = tensor([[  554,   767,  3510,   837,   339,  3888,   284,   262,  3139,   287,
           281,  2230,   284, 27603,   465,  1743,  3451,   764,   679,  1718,
           262,  3026,  2139,  2814,   257,  1218,   640,  1141,   262,  1708,
           614,   837,   475,   477,   262,  5871,   547,  4054,   416,   262,
          6994,  5342,   357,  5729,   287,  1502,   284,  2948,   262, 22106,
           286,  1744, 14987,  1267,   764,   

  5%|▌         | 51/1000 [00:04<02:08,  7.37it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 57.5807,  34.1832,  16.8528,  ..., 138.9422, 118.7784, 123.9409],
         [ 66.0494,  41.7375,  29.8562,  ..., 152.2952, 131.1391, 133.1547],
         ...,
         [ 78.2975,  43.0123,  52.3242,  ..., 187.2879, 146.3900, 134.0660],
         [198.8472, 165.7627, 237.3784,  ..., 317.1633, 152.3114, 167.4869],
         [ 76.9695,  41.5972,  50.8767,  ..., 184.4166, 144.1502, 132.2116]]]) with shape = torch.Size([1, 179, 50257])
tensor(505.5399)
nan
Lables = tensor([[  554,   767,  2816,   837,   339,  2722,   281, 12557,   355, 46439,
           286,   262,  6498,  9455,   415,   705,    82,  2607,   286,   262,
         12223,  9005,   705,    82, 15301,   764,  4900,   428,   373,   257,
          4159,  1281,   837,   287,  3487,  1661,   340,   561,   423,   587,
           379,  1551,   262,   923,   286,   281,  1743,  3451,   764,  3412,
           878,   339,   550,  9258,   670,   

  6%|▌         | 55/1000 [00:04<01:46,  8.91it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 57.9724,  34.3487,  15.1860,  ..., 138.4390, 117.9569, 125.0021],
         [ 63.2059,  37.3912,  24.5036,  ..., 146.5534, 123.3177, 128.3103],
         ...,
         [ 77.4902,  40.6885,  58.2130,  ..., 190.2062, 146.8504, 129.7900],
         [201.5247, 169.6506, 228.0599,  ..., 320.4283, 158.4295, 175.2753],
         [ 77.4754,  42.0291,  52.5216,  ..., 183.6885, 144.6158, 131.5119]]]) with shape = torch.Size([1, 199, 50257])
tensor(496.4799)
nan
Lables = tensor([[ 2806,  7656,   319,   644,   314,   423,  5615,   832,   837,   611,
           772,   314,   760,   884,  7195,   837,   262,  2219,   582,  1276,
         10403,   307, 48588,   416,   262, 13520,   764,   220,   198]]) with shape torch.Size([1, 29])
Outputs = tensor([[[101.6950,  80.9774, 100.6090,  ..., 216.1597, 175.4110, 184.6034],
         [ 71.1103,  46.7272,  37.2224,  ..., 163.3124, 139.4824, 138.1310],
         [ 58.

  6%|▌         | 57/1000 [00:05<01:43,  9.11it/s]

Outputs = tensor([[[ 93.4650,  72.7889,  75.0904,  ..., 192.8919, 157.5776, 173.4092],
         [ 57.5807,  34.1832,  16.8529,  ..., 138.9423, 118.7784, 123.9410],
         [ 66.7568,  42.1779,  29.7749,  ..., 153.5886, 132.0365, 132.7673],
         ...,
         [ 80.9618,  44.4075,  54.3772,  ..., 191.0364, 149.5665, 134.8925],
         [227.7581, 177.8768, 255.5896,  ..., 394.3055, 191.9775, 187.9484],
         [ 77.9859,  43.9243,  48.2259,  ..., 181.6905, 145.2154, 135.7108]]]) with shape = torch.Size([1, 101, 50257])
tensor(490.4235)
nan
Lables = tensor([[  679, 13537,   422, 22597,   705,   272,   262,  1708,   614,   837,
           290,   373,  9899,  3982,  5540,   618,   339, 30668,  1389,   262,
          2184,   287,  1737,   767,  3553,   764,   770,  1281,  2921,  1895,
           284,   262, 23129,   475,   373,  5688, 39490,   764, 10343, 13333,
           705,    82, 47963,  1108, 20232,   683,   284,  1949,   284,   787,
           779,   286,   340,  1058,   339,  4

  6%|▌         | 59/1000 [00:05<02:10,  7.21it/s]

Outputs = tensor([[[ 88.9718,  70.7317,  69.3850,  ..., 187.9917, 154.5081, 171.5420],
         [ 70.7768,  45.5238,  34.1492,  ..., 160.9129, 137.2000, 136.9300],
         [ 58.6871,  35.6419,  16.9142,  ..., 139.3073, 119.0507, 125.5824],
         ...,
         [ 74.7997,  36.0479,  51.9518,  ..., 189.3239, 143.4006, 127.4880],
         [ 78.1230,  53.8021,  54.4356,  ..., 161.3052, 120.2156, 120.0152],
         [ 74.8082,  39.2556,  45.5861,  ..., 180.4664, 141.6224, 131.9997]]]) with shape = torch.Size([1, 180, 50257])
tensor(493.3266)
nan
Lables = tensor([[  314,   716,   546,   284,  8196,  8805,   306,   287,   262,  2607,
          1220, 18948,   618,   484,  2222,   517,  9473,   284, 14540,  2440,
           319,   616,  6915,   764,   220,   198]]) with shape torch.Size([1, 26])
Outputs = tensor([[[111.3610,  92.1132, 113.9395,  ..., 218.8159, 172.9428, 200.6374],
         [ 62.1486,  38.9284,  21.5712,  ..., 146.3468, 124.7422, 128.4816],
         [ 61.7949,  37.9189,  19.7

  6%|▌         | 62/1000 [00:05<01:49,  8.53it/s]

Outputs = tensor([[[ 88.9718,  70.7317,  69.3850,  ..., 187.9917, 154.5082, 171.5420],
         [ 70.7284,  46.2913,  35.3592,  ..., 160.6512, 136.0317, 136.9677],
         [ 61.4620,  37.2235,  22.0460,  ..., 144.9518, 123.2766, 127.6789],
         ...,
         [ 76.4850,  40.6310,  43.8760,  ..., 182.4192, 144.9090, 134.3487],
         [226.9599, 189.6943, 271.4242,  ..., 350.6325, 200.2770, 212.7405],
         [ 78.4142,  40.3712,  48.4605,  ..., 187.0466, 146.4406, 132.7650]]]) with shape = torch.Size([1, 62, 50257])
tensor(481.4735)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796, 27692,   646,   796,   796,   796,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
       

  6%|▋         | 64/1000 [00:06<02:24,  6.49it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 68.9467,  44.2088,  33.4968,  ..., 158.8440, 134.2431, 135.0052],
         [ 67.8052,  43.2540,  31.6152,  ..., 155.3196, 132.8500, 135.1616],
         ...,
         [ 75.9074,  39.4381,  48.2335,  ..., 186.1402, 143.7622, 128.4927],
         [195.3787, 162.6051, 239.1046,  ..., 322.7846, 155.5119, 158.5497],
         [ 75.4937,  39.1231,  47.7332,  ..., 184.7131, 142.7717, 127.9944]]]) with shape = torch.Size([1, 188, 50257])
tensor(503.6700)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796,  796, 4586,  812,  796,  796,  796,  220,  198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [ 65.59

  7%|▋         | 68/1000 [00:06<01:59,  7.78it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 72.8328,  45.8658,  39.9740,  ..., 166.5335, 141.2288, 138.3202],
         [ 66.6365,  42.0673,  29.3887,  ..., 154.2036, 132.4757, 133.8839],
         ...,
         [ 75.9953,  39.6930,  50.7064,  ..., 180.8221, 142.9154, 127.6457],
         [185.2137, 154.4061, 233.8257,  ..., 329.8544, 141.3729, 134.7457],
         [ 74.2996,  38.6955,  47.9867,  ..., 176.1905, 139.9431, 126.2983]]]) with shape = torch.Size([1, 183, 50257])
tensor(483.0583)
nan
Lables = tensor([[  554,  2805, 46720,   837,   339,  2540,   465,  7002,   757,   290,
          1392,   355,  1290,   355,  5900,   272,  8473,   837,   810,   339,
          3724,   287, 34769, 15710,   357,   783, 22597, 26270,  1267,   287,
          3389,   393,  3426, 44586,   837,   287,   465,  7618,   400,   614,
           764,   679,   373, 11803,   416,   465,  3656,   290,   734, 11989,
           837,   508,  6150,   287,   262,  1

  7%|▋         | 71/1000 [00:07<02:05,  7.42it/s]

Outputs = tensor([[[ 93.4650,  72.7889,  75.0904,  ..., 192.8919, 157.5776, 173.4092],
         [ 66.1421,  41.7212,  32.0486,  ..., 155.2599, 131.1125, 132.2467],
         [ 68.0140,  43.9375,  28.9931,  ..., 155.7475, 132.2295, 135.3831],
         ...,
         [ 75.9865,  41.0036,  51.4874,  ..., 187.5952, 145.9263, 129.8611],
         [161.5841, 126.7907, 184.8042,  ..., 260.0509,  91.0925, 140.7974],
         [ 73.3645,  40.7099,  44.8241,  ..., 177.1756, 141.0907, 131.1145]]]) with shape = torch.Size([1, 87, 50257])
tensor(501.7341)
nan
Lables = tensor([[12149, 15676,  2696,   465,  1204,   416, 31083,   326,   837,   366,
           679,  4120,   284,   307,   257,  1226,   498,  3367,   837,   281,
         17696,   378,  2988,   837,   257, 14431,  3956,   837,   257, 17074,
          5229,   837,   257,  9112,  1545,   837,   257,   288,   315,  4135,
          1743,   837,   290,   257, 33453,  2426,   764,   366,   220,   198]]) with shape torch.Size([1, 50])
Outputs = tens

  8%|▊         | 75/1000 [00:07<01:21, 11.38it/s]

Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 10933,   796,   796,   220,   198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 71.0078,  46.1045,  37.9966,  ..., 161.0368, 137.8928, 137.0908],
         ...,
         [ 67.9149,  43.5871,  31.8110,  ..., 155.3510, 134.0220, 134.7519],
         [196.8675, 201.1224, 245.2694,  ..., 312.3976, 211.2171, 227.8828],
         [ 83.1026,  47.8788,  53.4658,  ..., 171.4521, 145.8244, 137.4980]]]) with shape = torch.Size([1, 7, 50257])
tensor(477.0611)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[10056, 11965,   2

  8%|▊         | 81/1000 [00:07<01:08, 13.34it/s]

Outputs = tensor([[[101.8135,  85.9279, 100.2292,  ..., 215.2994, 175.0086, 188.6284],
         [ 62.8035,  39.2221,  23.9183,  ..., 147.0782, 126.7964, 130.2564],
         [ 66.1480,  42.1107,  29.5670,  ..., 152.3692, 131.6095, 133.8280],
         ...,
         [ 74.9753,  40.3736,  53.7882,  ..., 179.4990, 142.7653, 130.5814],
         [178.6713, 153.7880, 221.4666,  ..., 275.9564, 132.9489, 161.8055],
         [ 72.5680,  40.5342,  46.2599,  ..., 170.6814, 138.7206, 132.0846]]]) with shape = torch.Size([1, 114, 50257])
tensor(476.8740)
nan
Lables = tensor([[10343, 13333,   705,    82,  1964,  3651,   389,  1912,   319,  9942,
          2138,   621, 17952,  1058,   465, 34092,   423,   587, 38624,    81,
           839,   355,   837,   366,  3914,   514,   477,   307,  1342, 20363,
           837,  1309,   514,   477,   466,   644,   356,   389,  4385,   284,
           466,   366,   764,  4619,   465,  5009,   547,  5340,   284, 12546,
           351,   837,   465, 35701,  6241, 45

  8%|▊         | 84/1000 [00:07<01:07, 13.64it/s]

Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [ 65.7492,  41.4700,  28.7677,  ..., 151.2722, 130.5311, 133.0922],
         ...,
         [ 66.6347,  42.1155,  29.6987,  ..., 152.4509, 131.8199, 133.0576],
         [288.0383, 282.9468, 347.8152,  ..., 439.9875, 299.3467, 287.5988],
         [ 79.2760,  46.1729,  48.9526,  ..., 171.5562, 144.6944, 136.6533]]]) with shape = torch.Size([1, 10, 50257])
tensor(474.7768)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  317,  1218, 12507, 21240,  3202,   286,  3999,  9188,   318,   326,
           286,   366, 21810, 35021,   366,   357,  5525,   102,   102,   164,
         45433,   427, 18962,   427, 14064,   782,  1267,   837,   257, 11283,
           284,   262, 17580, 35021,   837,  7

  9%|▊         | 86/1000 [00:08<01:27, 10.47it/s]

Outputs = tensor([[[104.2267,  84.7629,  94.5854,  ..., 211.3959, 172.4096, 189.5485],
         [393.8381, 322.7064, 422.1285,  ..., 687.9116, 465.5231, 368.8441],
         [ 80.4752,  49.9894,  46.1474,  ..., 182.5532, 149.4102, 140.9247],
         ...,
         [ 80.3667,  40.3482,  50.6684,  ..., 189.5540, 147.4241, 133.4005],
         [222.9782, 179.2090, 265.9148,  ..., 372.5158, 190.6490, 201.3555],
         [ 79.5187,  40.1210,  49.7262,  ..., 186.2270, 145.3758, 132.7386]]]) with shape = torch.Size([1, 77, 50257])
tensor(492.5786)
nan
Lables = tensor([[10343, 13333,   705,    82, 15213,   837,   329,  2241,   290,   329,
          1854,   837,   373,   636,   286,   465,  2276,  3154,  3101,   286,
           262,  8354,   286, 19518,  1058,   339, 13378,   867,  2499,   284,
         10233,   543,   550,  4271,   587,  3177, 48092,  4674,   329, 38273,
          3513,   764, 19439,   449,   494,  2630,   326,   329, 10343, 13333,
           837,   366,  2279,   287,   428,   9

  9%|▉         | 90/1000 [00:08<01:30, 10.09it/s]

Outputs = tensor([[[ 96.8577,  77.5306,  79.1080,  ..., 200.0970, 164.1777, 177.9166],
         [ 65.2183,  36.8305,  21.5871,  ..., 152.9834, 126.9569, 129.3391],
         [ 62.5395,  33.5500,  21.6640,  ..., 146.2378, 121.5863, 126.1317],
         ...,
         [ 76.5516,  40.6904,  51.5640,  ..., 182.6594, 143.6661, 130.1206],
         [171.0353, 139.4034, 192.7052,  ..., 272.7527, 133.2772, 148.8720],
         [ 74.1952,  40.5352,  46.1794,  ..., 174.8890, 140.6084, 131.2941]]]) with shape = torch.Size([1, 269, 50257])
tensor(490.5232)
nan
Lables = tensor([[  383,  3478,   273,   286,   465,   670,  3421,   355,   339,  4166,
           465,  3918,   290, 16573,   284,   465, 21334,   357,   366,   442,
           480, 38970,  2488,    12,    31,   588,   366,  1864,   284, 14959,
          1267,  1058,   465, 14555,  2499,   389,   287,   257,  5365, 27255,
           837,  2184,   306,  3918,   837,   475,   339,  1625,   656,   465,
           898,   287,   262,   812,   286,   

  9%|▉         | 94/1000 [00:08<01:26, 10.53it/s]

Outputs = tensor([[[104.2267,  84.7629,  94.5854,  ..., 211.3960, 172.4096, 189.5485],
         [ 64.2301,  40.2440,  25.9937,  ..., 149.4999, 128.6416, 131.4683],
         [ 60.2818,  36.5775,  19.7046,  ..., 143.1953, 122.6351, 127.3648],
         ...,
         [ 87.2320,  47.8404,  61.6839,  ..., 195.2880, 154.4364, 137.5641],
         [ 84.0165,  48.4531,  51.9966,  ..., 183.8949, 149.6816, 139.7915],
         [ 86.5698,  48.9216,  56.9675,  ..., 189.6097, 152.0731, 138.5236]]]) with shape = torch.Size([1, 37, 50257])
tensor(494.8586)
nan
Lables = tensor([[ 7994,   734, 41188,   286, 10343, 13333,   705,    82, 20007, 47862,
          2499,   389,   287,   428,  1296,   837,   290,   339,   318,  4143,
          3177,   284,   307,   663,  3756, 28622,   764,  2399,  1266,   300,
           131,   250, 44019,   779,   262, 10730,  6583,  2672,   416,   262,
          1296,   284,   751, 38084,  2695,  2138,   621,   355,  5019,  6276,
          8733,   764,  6091,  5209,  3651,   3

 10%|▉         | 97/1000 [00:09<01:13, 12.23it/s]

Outputs = tensor([[[ 85.4058,  54.6306,  78.3852,  ..., 189.3467, 157.5346, 151.3275],
         [239.6266, 223.7447, 316.5235,  ..., 451.7331, 306.8328, 236.8358],
         [227.6377, 229.1062, 245.1792,  ..., 302.0871, 216.4126, 203.8294],
         ...,
         [ 67.2721,  38.4057,  30.6518,  ..., 158.0662, 130.5261, 128.0522],
         [214.2219, 166.6215, 262.7562,  ..., 363.8566, 176.7460, 194.4258],
         [ 80.9157,  46.0634,  52.5308,  ..., 186.2699, 148.0836, 135.7901]]]) with shape = torch.Size([1, 78, 50257])
tensor(496.0798)
nan
Lables = tensor([[  554,   465, 10869,   290,  3393,  1708,   465,  1918,   837, 10343,
         13333,   373,   407,  9257, 16373,   764,   554,   636,   428,   460,
           307, 14183,   284,   465, 22152,  2569,   290,  8766, 25438,   837,
           617,   286,   543,   389,   991,   366,  3177,  4457, 27939,   290,
         13699,   416,  3999,  9188,   764,   366,  1318,   389,  1178, 11811,
         10288,   284,   683,   851,   691, 222

 10%|▉         | 99/1000 [00:09<01:33,  9.63it/s]

Outputs = tensor([[[118.7913, 101.3842, 127.4508,  ..., 238.3956, 189.5461, 209.6578],
         [ 61.0600,  37.9885,  20.5170,  ..., 144.2552, 122.2968, 128.0270],
         [ 64.9585,  40.3773,  27.8826,  ..., 149.0926, 126.7282, 130.7313],
         ...,
         [ 79.0862,  43.6646,  55.3528,  ..., 185.6167, 147.5156, 132.7990],
         [ 75.8373,  42.8519,  48.3851,  ..., 176.3035, 143.0703, 133.3121],
         [ 76.1009,  42.9787,  48.8988,  ..., 176.8686, 143.4368, 133.5012]]]) with shape = torch.Size([1, 138, 50257])
tensor(500.3238)
nan
Lables = tensor([[  632,   373,   287,   262,  1367,   400,  4289,   837,  1141,   262,
          8342, 10940,  6980,   326, 10343, 13333,   705,    82,  8507,  4251,
           663,  9103,   764,   554,   428,  2278,   257,  9815,   302,  2488,
            12,    31, 12660,   286,  2961, 39939,  1718,  1295,   837,   287,
           543, 15233, 29341,   837,  7455, 40750,   290, 10343, 13333,  1625,
           284,   307, 11987,   355, 10200,  8

 10%|█         | 102/1000 [00:10<02:12,  6.76it/s]

tensor(493.7240)
nan
Lables = tensor([[  554,   262,  1160,   400,  4289,   837,   339,   373,   262, 12507,
         21810,   286, 23632, 17853, 33640,   837,   508,   468,  3417,   683,
           355,   366,   262,  6000,  1729,  2488,    12,    31, 12191,   837,
          1729,  2488,    12,    31, 10092, 21810,   508,   468, 11803,   287,
           597,  3303,   366,   837,   290, 16476,   326,   837,   366,   339,
           468,   925,   502,   257,  1365,   582,   837,   355,   257,  6573,
          5797,   290,   355,   257, 36974,  1412, 26433,   366,   764,   220,
           198]]) with shape torch.Size([1, 71])
Outputs = tensor([[[ 93.4650,  72.7889,  75.0904,  ..., 192.8919, 157.5776, 173.4092],
         [ 64.4177,  40.2718,  26.1100,  ..., 149.9771, 128.4518, 131.4595],
         [262.7877, 221.1566, 324.5231,  ..., 491.5674, 322.8568, 258.3152],
         ...,
         [ 77.4491,  38.2523,  49.9023,  ..., 187.7940, 144.9448, 127.1051],
         [145.5188, 102.2013, 171.19

 11%|█         | 106/1000 [00:10<01:33,  9.58it/s]

Outputs = tensor([[[ 96.8575,  77.5305,  79.1080,  ..., 200.0969, 164.1776, 177.9164],
         [ 65.2184,  36.8305,  21.5872,  ..., 152.9835, 126.9571, 129.3392],
         [ 62.5395,  33.5500,  21.6640,  ..., 146.2379, 121.5862, 126.1317],
         ...,
         [ 75.2177,  43.2685,  49.0080,  ..., 173.8033, 141.3073, 133.6673],
         [226.6857, 192.1769, 270.4489,  ..., 362.1667, 183.3056, 197.4525],
         [ 76.4870,  43.0688,  52.6935,  ..., 177.2503, 142.0189, 131.9299]]]) with shape = torch.Size([1, 82, 50257])
tensor(505.0233)
nan
Lables = tensor([[14303,   262,  1511,   400,  4289,   837,   262,  4960,  9871, 40750,
           449,  4669,    72,  2029,   477, 39939,   290,   612,   547,  1178,
         10288,   284, 10343, 13333,   837,  3584,   465,  4588,   460,   307,
          1775,   287,   617,   479,   504,  5303,   357,   366,  3999, 19518,
           925,   416,  4960, 39939,   366,  1267, 26794,  5823,   884,   355,
           347,  2954,    64,   911, 20317,   2

 11%|█         | 108/1000 [00:11<03:01,  4.92it/s]

Outputs = tensor([[[102.5669,  84.3615,  92.0346,  ..., 210.6908, 172.6682, 188.5788],
         [308.8616, 269.4698, 332.3140,  ..., 441.6548, 299.0386, 281.3168],
         [ 76.6503,  47.4846,  45.2099,  ..., 172.7823, 144.1434, 137.8357],
         ...,
         [ 77.8679,  45.3976,  51.2088,  ..., 179.9190, 145.8535, 136.8755],
         [ 77.3781,  45.0401,  50.3517,  ..., 178.8198, 145.0646, 136.5028],
         [ 77.2064,  44.9165,  50.0851,  ..., 178.5714, 144.9336, 136.2579]]]) with shape = torch.Size([1, 299, 50257])
tensor(501.9004)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 33322,   796,   796,   220,   198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 70.1955,  44.5293

 11%|█         | 112/1000 [00:11<02:22,  6.21it/s]

Outputs = tensor([[[142.1027, 118.3303, 185.7571,  ..., 267.6248, 203.3547, 226.2244],
         [ 66.3927,  41.9267,  32.9994,  ..., 156.8625, 133.6778, 132.7622],
         [ 58.7771,  35.0710,  16.6144,  ..., 140.2913, 119.5817, 125.8435],
         ...,
         [ 77.9842,  43.3019,  49.1871,  ..., 181.5687, 145.7082, 135.4098],
         [215.2853, 168.4124, 237.4477,  ..., 304.5597, 160.5926, 183.1653],
         [ 78.6616,  42.3508,  51.4103,  ..., 184.1996, 146.0280, 133.2629]]]) with shape = torch.Size([1, 228, 50257])
tensor(497.5414)
nan
Lables = tensor([[ 3819,  4779,  2024,   423,  4624,   881,  3744,  3463,   319,  2111,
           284, 13878,   257,  2565,   286,   262, 38273,  5107,   973,   416,
         10343, 13333,   764, 10447,   859, 20194,   287,  7683,  3999,  7695,
          1039,  3544,  3594,  2488,    12,    31,  3918, 40005,  1326, 16546,
           837,  9472, 14926,  7439, 15877,   287,   376,  4092,   262,  6869,
          5561, 26748,   262,  3999, 40005,  1

 12%|█▏        | 117/1000 [00:12<01:36,  9.10it/s]

tensor(476.5901)
nan
Lables = tensor([[  554,  1853,   837,  7970, 22605,  3199, 25231,   837,   351,  6476,
          3999, 13399,   837,   286,   262,  1844, 19518,   286, 10343, 13333,
           287,  2237, 15343,   837,   351,  7667, 31950, 21152,   837,   543,
         20047, 18875,  1108,   764,   220,   198]]) with shape torch.Size([1, 36])
Outputs = tensor([[[101.6950,  80.9774, 100.6090,  ..., 216.1597, 175.4110, 184.6034],
         [ 68.4741,  43.5775,  36.7479,  ..., 159.5811, 136.0439, 134.8882],
         [ 61.5944,  38.5580,  21.9059,  ..., 144.7394, 123.1103, 128.9577],
         ...,
         [ 84.8403,  54.4580,  63.6268,  ..., 191.2398, 156.8681, 141.6484],
         [234.1399, 215.1847, 274.9297,  ..., 360.0725, 190.3478, 222.5425],
         [ 82.1032,  51.5910,  59.2593,  ..., 183.4540, 150.5254, 138.3141]]]) with shape = torch.Size([1, 36, 50257])
tensor(491.9971)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 5025

 12%|█▏        | 119/1000 [00:12<01:35,  9.21it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 61.6596,  37.9102,  20.4852,  ..., 144.6040, 123.5061, 128.8120],
         [ 59.6905,  35.6567,  19.6719,  ..., 140.2255, 119.3408, 125.8885],
         ...,
         [ 77.5263,  44.1404,  56.4474,  ..., 183.7515, 146.1913, 133.3128],
         [205.7495, 169.1670, 253.8351,  ..., 336.3341, 173.7007, 186.1431],
         [ 75.4107,  42.3984,  53.2030,  ..., 178.4592, 142.2082, 131.0340]]]) with shape = torch.Size([1, 148, 50257])
tensor(500.0316)
nan
Lables = tensor([[  383,  2610,  2627,   262,  1448,   705,    82, 11695,  1353,  2488,
            12,    31,  3478,  2277,   287,  7517,   290,   262,  1578,  7526,
           837,   981,   708,  1397,  1353,  2488,    12,    31, 16571,  6116,
           287,  1111, 21402, 16771,   357,   376, 32358,   290,  5007, 11339,
          1267,   837,   355,   880,   355,   287,  4505,   837,  3340,   837,
         16490,   837,  4881,   837,   968,  8

 12%|█▏        | 122/1000 [00:12<01:59,  7.34it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 66.5909,  42.7298,  27.7812,  ..., 154.4740, 131.5596, 133.2815],
         [ 64.7569,  40.8996,  32.2652,  ..., 153.1095, 130.6482, 131.3600],
         ...,
         [ 76.6440,  42.6284,  54.1010,  ..., 180.2475, 144.5890, 131.3234],
         [ 74.3422,  42.3148,  47.9390,  ..., 172.8689, 141.3079, 132.6623],
         [ 74.1023,  42.2190,  47.1761,  ..., 172.2423, 140.9050, 132.6889]]]) with shape = torch.Size([1, 130, 50257])
tensor(495.8723)
nan
Lables = tensor([[  383,  3496,   373,  3017,   287,   262, 15360,   983,  2329, 17349,
          1946,   837,   290,   318,   635,   530,   286,   262,  2922,  7259,
          1695,   319,   262, 13605,  2196,   764, 12032,   837,   340,   318,
           262,  2457,  1388,  2610,   319,   262,  1294,  8313,   286,  2735,
          1320,   705,    82,  1867,   314,  4889,  7849,  5145,  6337,   764,
           220,   198]]) with shape torch.Size

 13%|█▎        | 126/1000 [00:13<01:40,  8.69it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 61.6596,  37.9102,  20.4852,  ..., 144.6040, 123.5061, 128.8120],
         [ 59.6905,  35.6567,  19.6719,  ..., 140.2255, 119.3408, 125.8885],
         ...,
         [ 75.2299,  41.2941,  52.1644,  ..., 180.8781, 141.3809, 129.0897],
         [190.1443, 148.4526, 210.5342,  ..., 329.9367, 161.7472, 167.7227],
         [ 73.7447,  39.7891,  50.3613,  ..., 177.4743, 139.0425, 127.3332]]]) with shape = torch.Size([1, 165, 50257])
tensor(492.8659)
nan
Lables = tensor([[  366, 20350,   921,   366,   373,  7147,   355,   262,  1218,  1294,
          2060,   290,  2368,  3230,   422,   511,  1218,  8034,  5062,   837,
          7214,  2185,  5995,   764, 31684, 32788,   837,   257,  1448,  2888,
           837,   287,   257,  3389,  2321,  2720,   351, 31214,  3000,   837,
          4893,  1521,   484,  7690,   366, 20350,   921,   366,   355,   262,
          5062,   705,    82,  1218,  2060,   

 13%|█▎        | 129/1000 [00:13<01:41,  8.54it/s]

Outputs = tensor([[[103.6324,  85.9699, 101.8457,  ..., 209.0954, 165.2511, 188.0273],
         [ 61.4436,  37.9426,  20.7327,  ..., 144.5826, 123.3614, 128.6295],
         [ 60.5854,  36.3671,  21.1705,  ..., 141.8217, 120.4446, 126.6055],
         ...,
         [ 75.7420,  40.0368,  51.5203,  ..., 180.8537, 143.8939, 129.8286],
         [225.7585, 195.8690, 269.5115,  ..., 363.1262, 190.8701, 196.4028],
         [ 74.2752,  39.1182,  48.8924,  ..., 176.4400, 140.9356, 128.6724]]]) with shape = torch.Size([1, 161, 50257])
tensor(497.5396)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   955,  9150,   290, 16307,   796,   796,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
      

 13%|█▎        | 131/1000 [00:14<01:58,  7.33it/s]

Outputs = tensor([[[103.6324,  85.9699, 101.8457,  ..., 209.0954, 165.2511, 188.0273],
         [ 61.4436,  37.9426,  20.7327,  ..., 144.5826, 123.3614, 128.6295],
         [ 60.5854,  36.3671,  21.1705,  ..., 141.8217, 120.4446, 126.6055],
         ...,
         [ 64.4482,  35.7927,  30.2220,  ..., 153.4393, 126.6438, 124.2492],
         [205.3361, 165.1996, 242.2883,  ..., 321.2172, 175.2342, 191.3200],
         [ 75.9534,  41.6606,  51.5236,  ..., 177.8061, 141.5767, 129.3214]]]) with shape = torch.Size([1, 218, 50257])
tensor(499.6302)
nan
Lables = tensor([[  366, 20350,   921,   366,   373,   880,  2722,   416, 11811,  2647,
          9188,   837,   508,  1247,   445,   319,   663,  3081,   286,  3227,
           764,  5747, 21567,  8026,   705,    82,  5966,   360, 16617,   837,
           508, 15342,   663, 13530,   837,   290,  5180, 32788,   286, 36229,
           837,   508, 16373,   262, 32859,   837,  3417,   366, 20350,   921,
           366,   355,   530,   286,   262,  5

 13%|█▎        | 132/1000 [00:14<02:26,  5.94it/s]

Outputs = tensor([[[103.6324,  85.9699, 101.8457,  ..., 209.0954, 165.2511, 188.0273],
         [ 61.4436,  37.9426,  20.7327,  ..., 144.5826, 123.3614, 128.6295],
         [ 60.5854,  36.3671,  21.1705,  ..., 141.8217, 120.4446, 126.6055],
         ...,
         [ 76.5836,  42.2421,  53.8144,  ..., 184.2706, 144.0493, 130.1145],
         [241.8235, 197.7165, 288.5092,  ..., 399.9734, 225.4003, 209.8046],
         [ 75.2529,  41.2661,  51.4936,  ..., 180.5450, 141.6794, 128.9178]]]) with shape = torch.Size([1, 226, 50257])
tensor(488.4060)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 22724,  2854,   796,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 66.2097,  

 14%|█▎        | 136/1000 [00:14<01:58,  7.31it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 59.4025,  36.0252,  17.2930,  ..., 141.2369, 121.4668, 125.5339],
         [ 59.5989,  35.1482,  17.4984,  ..., 141.9248, 122.0054, 125.8102],
         ...,
         [ 79.4623,  45.3445,  56.7486,  ..., 183.1127, 146.0084, 132.9985],
         [246.5713, 204.8680, 277.5707,  ..., 418.9179, 232.3548, 216.2251],
         [ 78.0650,  43.9269,  54.9988,  ..., 179.6480, 143.4994, 131.4029]]]) with shape = torch.Size([1, 252, 50257])
tensor(493.7823)
nan
Lables = tensor([[  383,  3496,  2627,  1881, 41837,   705,    82,  5544,  1353,  2488,
            12,    31, 16571,  2277,   319,   262,  5398,  6964,  1802,   837,
           613,   868,   379,  1271,  1542,   764,   383,  2060, 35737,   379,
          1271,  1511,   319,   262,  6638,  5573,   829, 22086,   319,  2681,
          3269,  2211,   837, 18730,   663,  9103,  2292,   290,   262,  1448,
           705,    82,  5544,  1353,  8208,  2

 14%|█▎        | 137/1000 [00:15<02:20,  6.14it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 63.2020,  39.4159,  25.5464,  ..., 148.2553, 127.9576, 129.6129],
         [ 66.5275,  42.8158,  32.0672,  ..., 153.5789, 132.6542, 132.8135],
         ...,
         [ 76.4722,  41.8766,  54.9734,  ..., 180.7333, 144.2280, 129.9916],
         [196.5329, 156.5385, 223.4465,  ..., 316.3251, 183.8471, 186.6607],
         [ 74.4040,  40.3026,  51.3878,  ..., 175.2299, 140.6916, 128.4338]]]) with shape = torch.Size([1, 240, 50257])
tensor(497.4719)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796, 7849, 2008,  796,  796,  220,  198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 65.7492,  41.4701,

 14%|█▍        | 141/1000 [00:15<01:35,  8.98it/s]

Outputs = tensor([[[101.6950,  80.9774, 100.6090,  ..., 216.1597, 175.4110, 184.6034],
         [ 66.5908,  42.7297,  27.7811,  ..., 154.4740, 131.5596, 133.2815],
         [ 64.7569,  40.8997,  32.2652,  ..., 153.1095, 130.6482, 131.3600],
         ...,
         [ 72.5462,  39.7227,  44.7684,  ..., 170.6048, 137.6758, 130.2189],
         [152.7665, 121.5246, 173.3950,  ..., 248.0134, 106.5180, 144.2903],
         [ 73.1372,  38.3273,  47.8425,  ..., 173.2437, 137.4916, 127.0304]]]) with shape = torch.Size([1, 96, 50257])
tensor(499.7729)
nan
Lables = tensor([[13590,   416,  3389,  2321,   837,   262,  2647,  2008,   373,  2095,
          1417,   837,   287,  1811, 31214,  3000,  9299,   837,   355,   366,
          5749,   621,  1997,   356,   705,   303,  1760,   878,   366,   416,
          1168, 49987, 31745,   837,   355,   366,   257,  1256,   286,  1327,
           670,   366,   416, 32788,   837,   355,   366,  5899, 37775,   366,
           416,  5593,  4186,    75,  7899,   8

 14%|█▍        | 144/1000 [00:15<02:07,  6.69it/s]

tensor(494.2995)
nan
Lables = tensor([[ 5616, 22261, 27698,   837,   257, 18920,   329,  7286, 22743,   837,
         38931,   262, 10651,   355,  1719,   366,  2279,   257,  2008,   416,
           257,  2933,  4097,   815,   307,   366,   290,  1043,  1448,   705,
            82, 36138,  8216, 32327,   764, 23489,  4100,   378,    68,   286,
           412,  5145,  7467, 15342,   663,   366, 16464, 45002,   290, 15950,
         29012,   366, 16311,   837,   290, 31214,  3000,   705,    82,  5302,
           344,  6213,   569,  8107,  3417,   262, 10651,   355,   366, 49977,
          1468,  8502,   366,   764, 30236, 11809,   837,  3597,   329, 36079,
            17,   270,   837,   373,  9431,   326,  2402,  4964,   262,   366,
         29012,   366,  2647,  2008,   837,   262, 19091,   815,   423,   257,
          1327,   640, 19621, 14132,   262,  1448,   764, 17722,  3010,  5326,
           837,   262,  4014,   329,  5401,  5652,  3782,   837,   725,  1631,
           262,  1448,

 15%|█▍        | 146/1000 [00:16<01:48,  7.89it/s]

Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,  7547, 13289,   796,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 65.4509,  40.6474,  32.6848,  ..., 154.2212, 131.7464, 132.3051],
         ...,
         [ 68.4232,  43.9833,  32.3389,  ..., 156.0040, 134.6231, 135.2048],
         [245.4431, 245.9144, 310.1082,  ..., 396.4003, 263.6818, 271.7090],
         [ 82.7660,  48.4289,  53.3028,  ..., 174.2556, 146.7283, 137.7967]]]) with shape = torch.Size([1, 8, 50257])
tensor(475.5867)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 1081,   636,   286,   663, 12148,   837,  1881, 41837,  6157,   262,
   

 15%|█▌        | 152/1000 [00:16<01:15, 11.22it/s]

Outputs = tensor([[[148.6191, 127.7911, 210.7361,  ..., 285.4633, 216.0781, 238.9408],
         [ 64.4079,  40.6099,  26.3387,  ..., 149.2148, 128.9896, 131.7521],
         [ 67.0678,  42.9264,  30.9504,  ..., 153.4541, 132.6597, 134.6592],
         ...,
         [ 77.6904,  45.3842,  50.0158,  ..., 178.4197, 144.3924, 133.9129],
         [ 75.0919,  43.1526,  46.8168,  ..., 173.2961, 141.4555, 132.5103],
         [ 76.2877,  42.6016,  50.9286,  ..., 177.3201, 142.4187, 129.9720]]]) with shape = torch.Size([1, 251, 50257])
tensor(491.1225)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 17762, 13487,   796,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 71.2653,  

 15%|█▌        | 154/1000 [00:16<01:09, 12.12it/s]

tensor(466.6158)
nan
Lables = tensor([[  366,  7703, 11597,   366,   784,   513,  1058,  5433,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[103.6324,  85.9700, 101.8457,  ..., 209.0953, 165.2511, 188.0273],
         [563.0836, 493.1145, 585.1587,  ..., 941.1480, 611.9017, 536.9854],
         [107.7197,  70.5051,  75.9744,  ..., 223.5737, 171.8569, 165.3727],
         ...,
         [108.3968,  71.3579,  78.6081,  ..., 225.7997, 174.2115, 166.7581],
         [107.6368,  70.8574,  77.6628,  ..., 224.4324, 173.4844, 166.0732],
         [108.6449,  71.0690,  79.6370,  ..., 226.3295, 173.2490, 164.6442]]]) with shape = torch.Size([1, 10, 50257])
tensor(603.1403)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 29501,   290,  8213,   796,   796,   220,   198]]) with shape torch.Size([1, 9])
Outputs = tensor([[[101.3162,  81.967

 16%|█▌        | 160/1000 [00:16<00:55, 15.14it/s]

tensor(498.3955)
nan
Lables = tensor([[14912, 47895, 33381,   417,  4102,   851,  4469, 25355,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[103.9473,  86.1940, 106.1399,  ..., 221.6696, 180.7819, 189.1484],
         [ 61.5587,  38.1484,  22.0571,  ..., 144.7707, 125.1229, 128.9764],
         [102.9050,  67.6276,  76.0003,  ..., 211.3078, 183.6247, 153.2987],
         ...,
         [ 72.5300,  44.7623,  39.1801,  ..., 164.2308, 139.9278, 134.7429],
         [ 78.3795,  46.8602,  50.8368,  ..., 172.1176, 144.3420, 137.8726],
         [ 78.8891,  46.8054,  52.9062,  ..., 173.0641, 143.9870, 136.1223]]]) with shape = torch.Size([1, 10, 50257])
tensor(469.3189)
nan
Lables = tensor([[11556,   439,  6075,   272,   851,  3224, 10047,   220,   198]]) with shape torch.Size([1, 9])
Outputs = tensor([[[103.7072,  82.9627,  94.3060,  ..., 216.1366, 174.4669, 184.5335],
         [ 62.0666,  37.2740,  19.5660,  ..., 144.2011, 123.3988, 127.6681],
         [ 64.6202,  39.4949,  2

 16%|█▋        | 164/1000 [00:17<00:53, 15.69it/s]

Outputs = tensor([[[ 98.7515,  80.9926,  91.5298,  ..., 206.1264, 169.2209, 182.2761],
         [ 64.9541,  37.8347,  22.2511,  ..., 152.6012, 127.4214, 130.0555],
         [243.4481, 127.8360, 271.7541,  ..., 436.9527, 311.9125, 214.0342],
         ...,
         [ 79.4431,  45.5361,  51.5168,  ..., 177.6976, 144.7269, 137.7717],
         [ 79.8689,  45.9178,  52.1706,  ..., 178.3145, 145.4552, 138.2218],
         [ 80.9724,  46.0361,  54.8250,  ..., 180.5539, 145.4282, 136.5647]]]) with shape = torch.Size([1, 12, 50257])
tensor(521.7397)
nan
Lables = tensor([[14912,   666, 33609,   259,   851,  3597,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[103.9474,  86.1940, 106.1399,  ..., 221.6696, 180.7818, 189.1484],
         [ 68.1201,  41.1835,  27.5664,  ..., 156.2654, 132.4721, 132.9118],
         [321.2397, 356.8281, 480.2673,  ..., 585.1428, 452.7255, 327.6531],
         ...,
         [ 75.4312,  51.8201,  40.0473,  ..., 167.7084, 148.2721, 142.8630],
         [ 72

 17%|█▋        | 171/1000 [00:17<00:37, 22.36it/s]

Outputs = tensor([[[106.7154,  89.5671, 112.5861,  ..., 217.0843, 169.5144, 191.3562],
         [ 67.7761,  43.5007,  30.9708,  ..., 156.9516, 133.5315, 134.4564],
         [ 58.6987,  35.7152,  17.7804,  ..., 139.2250, 119.0297, 125.8930],
         ...,
         [ 68.9055,  43.1224,  30.2685,  ..., 155.3044, 132.0770, 134.0600],
         [ 64.9508,  39.0917,  20.2157,  ..., 149.7017, 125.5169, 130.2425],
         [ 76.2041,  46.1023,  39.8031,  ..., 164.7163, 138.2336, 137.6210]]]) with shape = torch.Size([1, 17, 50257])
tensor(461.1712)
nan
Lables = tensor([[29501, 16573,   422,  7214,  2185,  5995,   705,    82, 35940,  4710,
           764,   220,   198]]) with shape torch.Size([1, 13])
Outputs = tensor([[[109.4606,  78.4451, 103.9952,  ..., 236.8594, 191.0284, 175.5005],
         [ 66.3498,  42.3962,  29.9678,  ..., 153.8824, 132.0286, 133.3793],
         [281.1649, 291.2779, 341.4481,  ..., 496.9264, 389.5392, 331.3136],
         ...,
         [ 74.3099,  36.6989,  47.6544,  ...,

 18%|█▊        | 178/1000 [00:17<00:26, 30.52it/s]

Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 70.9094,  46.0155,  35.3917,  ..., 161.1611, 137.7082, 137.3699],
         ...,
         [ 68.2262,  43.8670,  32.2711,  ..., 155.8429, 134.4974, 135.0205],
         [268.7201, 268.1428, 333.2038,  ..., 426.9851, 289.8737, 284.4335],
         [ 82.5667,  48.1577,  52.5242,  ..., 173.7135, 146.5158, 137.3112]]]) with shape = torch.Size([1, 8, 50257])
tensor(487.3145)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0,

 18%|█▊        | 182/1000 [00:18<00:53, 15.26it/s]

Outputs = tensor([[[111.0183,  90.2852, 112.0914,  ..., 229.1404, 187.2443, 196.9771],
         [255.1954, 253.1281, 295.5262,  ..., 316.9138, 258.7813, 248.3350],
         [ 77.6673,  47.4956,  49.6826,  ..., 170.6338, 146.2715, 138.0180],
         ...,
         [ 77.0010,  41.3604,  50.8865,  ..., 179.0207, 143.8001, 131.0772],
         [228.6638, 178.5562, 300.3333,  ..., 373.2498, 214.3720, 187.0447],
         [ 75.7323,  40.4258,  49.2225,  ..., 175.7270, 141.4568, 129.9319]]]) with shape = torch.Size([1, 224, 50257])
tensor(485.9917)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 25353,   796,   796,   220,   198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 71.4770,  46.3834

 18%|█▊        | 185/1000 [00:18<00:54, 14.97it/s]

Outputs = tensor([[[143.8833, 127.8614, 193.6998,  ..., 269.3252, 205.2073, 238.1108],
         [ 60.8081,  37.6049,  19.2360,  ..., 143.9861, 122.7597, 127.2148],
         [ 58.2175,  34.9582,  16.4937,  ..., 138.9427, 118.9637, 125.3299],
         ...,
         [ 81.0043,  45.7772,  58.6638,  ..., 188.7255, 149.0385, 136.0952],
         [ 77.4517,  44.8455,  51.1386,  ..., 178.6671, 144.0651, 136.2855],
         [ 78.0230,  43.9401,  53.6628,  ..., 181.2282, 143.9283, 133.6823]]]) with shape = torch.Size([1, 105, 50257])
tensor(501.5263)
nan
Lables = tensor([[  554,   262, 16280,   837,   262,  4960,  8065,  3393,  2900,   663,
          2962,   284,   262,   734,  5637, 14987,   329, 14312, 18648,   287,
           262,  8211, 10692,  1058,  5491,   290,   262,  1578,  1829,   764,
          7031, 13090,   309,  1039,   315,   283, 13090,   837,   257,  4960,
          8565,  6178, 21093,   290,  2422, 46410,   837, 29131,   326,  5358,
           561, 16857, 15058,  1022,  2869,   

 19%|█▉        | 188/1000 [00:19<01:31,  8.92it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 54.7185,  32.3508,  14.7241,  ..., 135.4425, 115.5606, 122.6361],
         [ 69.5691,  43.9541,  37.7058,  ..., 162.1893, 137.8894, 135.5374],
         ...,
         [ 76.4231,  41.0245,  51.4862,  ..., 179.0289, 141.6628, 129.9929],
         [221.1691, 179.0237, 256.9366,  ..., 370.6295, 197.7907, 193.9804],
         [ 74.8222,  41.3795,  46.8565,  ..., 172.9975, 139.3066, 131.9223]]]) with shape = torch.Size([1, 171, 50257])
tensor(494.5785)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796, 8495,  290, 6764,  796,  796,  220,  198]]) with shape torch.Size([1, 9])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [183.9269, 151

 19%|█▉        | 192/1000 [00:19<01:43,  7.77it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 68.5386,  42.4026,  38.7292,  ..., 162.6973, 136.6528, 133.9308],
         [ 58.4488,  35.3879,  16.8132,  ..., 139.5665, 119.6042, 125.9011],
         ...,
         [ 77.7916,  41.2489,  52.1149,  ..., 183.8874, 143.0621, 131.0602],
         [ 84.2171,  49.2640,  53.4432,  ..., 161.6913, 144.2563, 145.3408],
         [ 75.6459,  39.4175,  49.2414,  ..., 178.8868, 139.5257, 128.8854]]]) with shape = torch.Size([1, 419, 50257])
tensor(489.5249)
nan
Lables = tensor([[  383,  7937,   550,   257,  4129,   286, 27121,  2488,    13,    31,
          1248, 18985,   357,   718,  5999, 10117,   657,   287,  1267,  4045,
           837,   257, 15584,   286,  2579,  2488,    13,    31,  6135, 18985,
           357, 10048, 10117,   657,   287,  1267,   290,   257, 28841,  8951,
           286,   807,  2488,    13,    31, 10261, 18985,   357,  2808, 10117,
           604,   287,  1267,   379,  2769,  3

 19%|█▉        | 194/1000 [00:20<02:02,  6.60it/s]

Outputs = tensor([[[102.5669,  84.3615,  92.0346,  ..., 210.6908, 172.6682, 188.5788],
         [308.8616, 269.4698, 332.3140,  ..., 441.6548, 299.0386, 281.3168],
         [ 84.6678,  53.3665,  56.9751,  ..., 185.9398, 153.7861, 144.1216],
         ...,
         [ 77.5867,  41.7156,  50.9743,  ..., 182.7674, 145.3196, 131.2303],
         [ 75.0958,  41.6169,  45.4468,  ..., 174.9659, 141.8075, 132.4434],
         [ 76.4932,  41.1426,  49.0267,  ..., 179.3405, 142.9781, 130.5018]]]) with shape = torch.Size([1, 185, 50257])
tensor(478.3267)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796,  8772, 15204,   796,   796,   796,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
      

 20%|█▉        | 198/1000 [00:21<02:19,  5.75it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 56.2419,  32.4856,  13.7458,  ..., 134.7196, 115.5090, 122.8549],
         [ 60.6549,  36.1155,  18.3829,  ..., 143.2278, 122.2474, 127.3688],
         ...,
         [ 80.0881,  46.0103,  58.9448,  ..., 185.8680, 146.0153, 134.6214],
         [186.3290, 147.6896, 206.1917,  ..., 317.5328, 158.3682, 168.5388],
         [ 79.1216,  44.8038,  57.5860,  ..., 183.4467, 144.6337, 133.5335]]]) with shape = torch.Size([1, 424, 50257])
tensor(494.3542)
nan
Lables = tensor([[ 5856,   511, 15533,    82, 49400,   837,   262, 20667,   364,   319,
          1123,  4074,   547,  6928,   416,  3624,   649, 45327,   261,  3056,
          2488,    12,    31,  6294, 20667,   364,   837, 18235,   656,   262,
          1966, 46088, 36741,  2119,   837,   290,   262,  2651, 28214,   373,
          4615,   764,   383, 35658,   547,  6928,   416,  1440, 31394, 45327,
           261, 35658,   351,   257,  3562,  5

 20%|█▉        | 199/1000 [00:21<02:37,  5.08it/s]

Outputs = tensor([[[102.5669,  84.3615,  92.0346,  ..., 210.6908, 172.6682, 188.5788],
         [ 58.5057,  32.8147,  14.4888,  ..., 139.3111, 118.3615, 124.3421],
         [ 62.0831,  35.5208,  19.8073,  ..., 145.1720, 123.0071, 127.4587],
         ...,
         [ 76.9063,  40.9592,  53.9922,  ..., 182.7288, 144.6085, 131.1976],
         [173.1980, 127.8127, 197.1500,  ..., 279.2667, 146.8929, 158.3471],
         [ 74.7113,  39.1658,  50.8731,  ..., 177.5531, 141.0873, 129.1647]]]) with shape = torch.Size([1, 233, 50257])
tensor(488.7142)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796,  796, 7057, 3263,  796,  796,  796,  220,  198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 61.5493,  38.1193,  21.8254,  ..., 144.9911, 124.6011, 129.0053],
         [ 65.31

 20%|██        | 203/1000 [00:21<02:06,  6.28it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 63.5891,  39.5804,  24.5101,  ..., 148.6578, 127.8894, 130.5339],
         [ 60.9653,  37.5961,  20.6509,  ..., 143.3717, 122.7860, 127.6702],
         ...,
         [ 77.6221,  41.3598,  53.2188,  ..., 185.3138, 143.8096, 130.4681],
         [202.0066, 161.3515, 246.3104,  ..., 359.7775, 153.8822, 156.8075],
         [ 76.2097,  40.7977,  50.8070,  ..., 181.2778, 141.1113, 129.5854]]]) with shape = torch.Size([1, 192, 50257])
tensor(494.4319)
nan
Lables = tensor([[ 2750,  2159,  1810,  2873,   837,   262,  6541,   973,  5994, 10495,
         18588,  2488,    12,    31, 30662,   837, 28490, 19679,   764,  5501,
           286,   777, 19679, 20261,   718,  4790,  2488,    13,    31,   642,
         37075,   357,   352,  2488,    11,    31,  4764,    20, 18360,  1267,
           290,   373,  6294,   379,   257, 31746, 15432,   286, 44586,   784,
           767,  2425, 18985,   583,  1218,   

 20%|██        | 204/1000 [00:22<02:23,  5.55it/s]

Outputs = tensor([[[114.3993,  98.4413, 123.9206,  ..., 234.1261, 185.9465, 203.0234],
         [ 63.6034,  39.8829,  25.1251,  ..., 148.2070, 127.8894, 130.9805],
         [ 66.1871,  41.8732,  31.3131,  ..., 153.4683, 132.1815, 133.1868],
         ...,
         [ 77.9657,  43.0156,  55.5618,  ..., 183.8075, 144.6386, 132.7661],
         [216.8078, 174.3649, 257.3445,  ..., 381.3303, 193.4703, 183.1166],
         [ 76.4341,  41.9051,  53.3025,  ..., 179.8056, 141.5339, 131.0813]]]) with shape = torch.Size([1, 196, 50257])
tensor(493.6543)
nan
Lables = tensor([[  383,  7937,   705,  9233,  3211,  3263, 19954,   286,  8208,  2026,
          2488,    12,    31, 27417,   260,  1478,  2488,    12,    31,  1247,
         38813,   260,  5994,   513,   764, 18087,  6429,   286,   777,   547,
         12623,   287,  6124,   368,   689,   287,   262, 11092,   293,   290,
         23510,  5620,   290,   262,  5637,  5166,   547, 12623,   319,   262,
          6203,  2029,   606,   290,  6861,   

 20%|██        | 205/1000 [00:22<03:13,  4.12it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 65.0314,  41.1128,  28.2078,  ..., 152.4042, 130.8126, 131.8278],
         [ 58.3059,  34.6081,  19.1292,  ..., 138.7521, 120.1066, 124.4181],
         ...,
         [ 77.3368,  42.0750,  53.3456,  ..., 182.2561, 143.5514, 130.5280],
         [220.0186, 171.3889, 283.8246,  ..., 366.8930, 199.5371, 192.9962],
         [ 76.0617,  41.4944,  51.3156,  ..., 178.8227, 140.9077, 129.5122]]]) with shape = torch.Size([1, 390, 50257])
tensor(497.1246)
nan
Lables = tensor([[  554, 34625,   784,  4747,   262, 15923,  6541,   547,  6928,   351,
          3624,  2319,  2488,    12,    31, 28139,  1105,  2488,    13,    31,
           767, 12067,   357,   642,  2488,    13,    31,   657,   287,  1267,
          5994,  9919, 10668,  2488,    12,    31,  4007,  6541,   837, 18235,
           319,  1111,  5389,   286,   262,  2651,  2208,  7249,   942,   287,
          1440, 15203,  2488,    12,    31,  2

 21%|██        | 206/1000 [00:23<03:20,  3.97it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 63.0916,  38.2170,  21.0819,  ..., 148.0726, 126.3990, 129.3625],
         [ 70.3367,  40.9197,  36.0051,  ..., 158.7252, 131.9919, 132.8382],
         ...,
         [ 81.3685,  47.0213,  60.0626,  ..., 190.5517, 149.5944, 135.5602],
         [203.2300, 168.9659, 256.4896,  ..., 358.6023, 185.8929, 172.7040],
         [ 79.2863,  45.4878,  56.1841,  ..., 184.6885, 145.9735, 133.9646]]]) with shape = torch.Size([1, 205, 50257])
tensor(492.2210)
nan
Lables = tensor([[ 5856,   262,  3095,  2488,    12,    31, 15533,    82, 25056,   262,
         39023, 21103,   547,  4615,   290,   262,   569, 21630,   734,  2488,
            12,    31, 14896,   364,   547,  6928,   416,  8208,  5964,  2488,
            12,    31,  3170,  6964,   354, 41304,  1679,  8085,  5994,  9907,
          1657, 15923,  6541,   287,   838, 15203,  2488,    12,    31,  2485,
         30790,   764,   770,   373,   262,  3

 21%|██        | 207/1000 [00:23<04:03,  3.26it/s]

Outputs = tensor([[[102.5669,  84.3615,  92.0346,  ..., 210.6908, 172.6682, 188.5788],
         [308.8616, 269.4698, 332.3140,  ..., 441.6548, 299.0386, 281.3168],
         [ 84.2025,  52.9419,  56.6738,  ..., 185.6294, 153.2023, 142.3648],
         ...,
         [ 80.0975,  44.4375,  55.6670,  ..., 185.8148, 146.9021, 133.8445],
         [228.3136, 192.6318, 276.5470,  ..., 390.2758, 206.4353, 181.5735],
         [ 79.0130,  43.8866,  53.6584,  ..., 182.7177, 144.8494, 133.1722]]]) with shape = torch.Size([1, 276, 50257])
tensor(490.3314)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796,  796, 9985,  796,  796,  796,  220,  198]]) with shape torch.Size([1, 9])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [ 65.5962,  41

 21%|██        | 211/1000 [00:24<02:36,  5.03it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 56.2419,  32.4856,  13.7458,  ..., 134.7196, 115.5090, 122.8549],
         [ 60.6549,  36.1155,  18.3829,  ..., 143.2278, 122.2474, 127.3688],
         ...,
         [ 76.0435,  39.1270,  50.8287,  ..., 180.7732, 140.2185, 127.4256],
         [ 87.4357,  53.2654,  53.1173,  ..., 173.8879, 159.6339, 143.9099],
         [ 74.1047,  37.7850,  47.6162,  ..., 175.5240, 136.9946, 126.1623]]]) with shape = torch.Size([1, 240, 50257])
tensor(474.1887)
nan
Lables = tensor([[  383,   314,   325,  1398,   547,   262,   691,  4960, 10181,  5748,
           284,  1295,   262, 11913,  7093,  2029,   262,  7582,  7093,   355,
           262,   314,    41,    45, 16555,   284,  1234,   355,   881,  2272,
           355,  1744,  1022,   262,  4047,   781,  6475,   540, 46059,   415,
           290,  6164,   290, 39023, 20413,   602,   764,   383,  3514,   422,
         41904,   278, 19679,   379,   890, 18

 22%|██▏       | 215/1000 [00:24<01:50,  7.12it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 56.2419,  32.4856,  13.7458,  ..., 134.7196, 115.5090, 122.8549],
         [ 60.6549,  36.1155,  18.3829,  ..., 143.2278, 122.2474, 127.3688],
         ...,
         [ 80.4065,  45.7807,  55.9252,  ..., 187.8562, 147.9479, 134.3132],
         [185.2232, 150.6716, 220.8786,  ..., 294.7046, 131.0031, 155.7905],
         [ 77.1494,  44.7535,  49.3065,  ..., 178.7448, 143.7516, 134.6104]]]) with shape = torch.Size([1, 212, 50257])
tensor(479.9778)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796,  3764,  1630,   290, 15736,   796,   796,   796,
           220,   198]]) with shape torch.Size([1, 12])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.

 22%|██▏       | 217/1000 [00:24<02:04,  6.29it/s]

Outputs = tensor([[[100.4517,  80.1170,  85.1540,  ..., 204.7937, 167.7377, 183.4373],
         [358.4468, 281.3883, 358.3213,  ..., 511.0024, 307.4412, 295.6605],
         [ 91.6071,  57.6681,  66.9612,  ..., 201.0391, 160.4865, 146.6755],
         ...,
         [ 77.8006,  41.4620,  53.3234,  ..., 183.6284, 144.6589, 131.0476],
         [177.6737, 139.1759, 221.5471,  ..., 290.4165, 152.2982, 149.7087],
         [ 75.2850,  41.5849,  47.0326,  ..., 175.1691, 141.3791, 132.6944]]]) with shape = torch.Size([1, 288, 50257])
tensor(493.2020)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796, 30767,   796,   796,   796,   220,   198]]) with shape torch.Size([1, 9])
Outputs = tensor([[[101.6950,  80.9774, 100.6089,  ..., 216.1597, 175.4110, 184.6033],
         [ 61.4784,  38.1162,  21.7412,  ..., 144.7068, 124.5626, 128.9748],
         [ 65.

 22%|██▏       | 220/1000 [00:25<01:47,  7.26it/s]

Outputs = tensor([[[111.3610,  92.1133, 113.9396,  ..., 218.8160, 172.9429, 200.6375],
         [ 61.7280,  36.5498,  19.1465,  ..., 145.4255, 123.3847, 128.2794],
         [ 69.3958,  40.4251,  31.5945,  ..., 156.9365, 130.1869, 134.2117],
         ...,
         [ 73.6243,  37.8588,  49.7260,  ..., 176.9377, 139.6760, 127.5941],
         [200.9582, 154.4832, 241.8322,  ..., 326.4584, 166.7045, 175.8965],
         [ 72.1306,  36.7483,  47.1585,  ..., 172.3449, 136.2926, 125.8971]]]) with shape = torch.Size([1, 148, 50257])
tensor(493.2515)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 44101,   284, 14554, 16651,   796,   796,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.6950,  80.9774, 100.6089,  ..., 216.1597, 175.4110, 184.6033],
         [ 62.0412,  38.5393,  22.1272,  ..., 145.7505, 125.2556, 128.8690],
      

 23%|██▎       | 227/1000 [00:25<01:16, 10.09it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 70.8182,  46.0145,  35.9000,  ..., 163.3499, 139.1117, 138.0648],
         [ 58.3844,  35.1313,  16.5624,  ..., 139.0525, 119.3379, 125.5945],
         ...,
         [ 77.4867,  40.8508,  51.6956,  ..., 182.7186, 143.6651, 131.0007],
         [ 74.5753,  40.6779,  45.1365,  ..., 173.9345, 139.6637, 131.8685],
         [ 75.0636,  40.9868,  46.0822,  ..., 174.9928, 140.3518, 132.2372]]]) with shape = torch.Size([1, 255, 50257])
tensor(490.3315)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796,  796, 7057, 3263, 2458,  796,  796,  796,  220,  198]]) with shape torch.Size([1, 11])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [

 23%|██▎       | 229/1000 [00:26<01:34,  8.19it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 63.1747,  39.2355,  21.6651,  ..., 147.8093, 125.6611, 130.1552],
         [ 72.6827,  45.9841,  36.1080,  ..., 165.0262, 139.2363, 138.9059],
         ...,
         [ 74.0319,  40.4571,  51.0372,  ..., 175.3061, 139.6000, 126.8207],
         [190.2463, 150.9774, 249.3518,  ..., 316.5433, 152.7935, 153.1645],
         [ 71.6050,  40.2099,  44.4512,  ..., 167.3126, 136.3812, 128.4117]]]) with shape = torch.Size([1, 220, 50257])
tensor(477.1811)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796, 13365,  6203, 14752,   796,   796,   796,   220,
           198]]) with shape torch.Size([1, 11])
Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 1

 23%|██▎       | 232/1000 [00:26<01:38,  7.78it/s]

Outputs = tensor([[[142.1027, 118.3303, 185.7571,  ..., 267.6248, 203.3547, 226.2244],
         [ 60.6225,  37.7200,  19.2045,  ..., 143.8870, 121.9497, 127.4089],
         [ 62.3652,  39.1459,  22.2238,  ..., 145.0000, 123.4124, 129.2142],
         ...,
         [ 76.7667,  41.7862,  52.4980,  ..., 182.3972, 143.3270, 130.3152],
         [256.8929, 210.1232, 301.0904,  ..., 432.5104, 237.1226, 202.4079],
         [ 75.5652,  41.4327,  50.2671,  ..., 178.7541, 141.0158, 129.6573]]]) with shape = torch.Size([1, 242, 50257])
tensor(486.5084)
nan
Lables = tensor([[  383,  2319,  2488,    12,    31, 42760,  2488,    12,    31,   890,
           357, 23134, 10117,   513,   287,  1267, 44338,   373,  1160, 18985,
           357,  6135, 10117,   767,   287,  1267,  3094,  2651,   290,  1367,
         18985,   357,  4570, 10117,   352,   287,  1267,   379,   262,  8286,
           764,   632,   373,   718, 18985,   357,   678, 10117,   807,   287,
          1267,  1029,   290,  3562,   284,   

 23%|██▎       | 234/1000 [00:27<01:57,  6.50it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 58.7832,  36.2688,  17.4369,  ..., 141.0568, 120.1308, 125.5908],
         [ 62.2358,  39.0872,  22.2167,  ..., 144.8550, 123.6387, 129.1043],
         ...,
         [ 78.3348,  44.0341,  54.6320,  ..., 184.7337, 145.1407, 131.7238],
         [258.3399, 211.0441, 315.9388,  ..., 434.1477, 243.8370, 216.2439],
         [ 77.6609,  43.3834,  53.8677,  ..., 183.1032, 143.7151, 130.7419]]]) with shape = torch.Size([1, 231, 50257])
tensor(498.8135)
nan
Lables = tensor([[  383,  7937,   550,   281,  1633,  1448,   286,  1367,  1123,   286,
         45138,   418, 14852,   360,    19,    56, 15647, 26127,   357, 28234,
          6447,  1438,   366, 26395,   366,  1267,   290,   317, 16590,   412,
          1433,    32, 39471,  6215,   357, 28234,  6447,  1438,   366,  3362,
           366,  1267,  5747,  6215,   550,  2478,  2761,   290,  6159,  1633,
          1448,  1683,   550,   477,   286,   

 24%|██▍       | 238/1000 [00:27<01:35,  7.95it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [306.5869, 313.2511, 364.6369,  ..., 394.9286, 323.3239, 300.6893],
         [ 74.9863,  45.5655,  46.0969,  ..., 168.1923, 143.6539, 135.9313],
         ...,
         [ 78.0379,  42.9527,  52.0668,  ..., 180.8330, 145.1082, 132.0275],
         [ 82.5853,  45.7117,  47.1644,  ..., 176.5161, 152.5785, 131.6177],
         [ 75.3275,  42.3932,  46.4008,  ..., 172.9634, 141.6312, 133.0325]]]) with shape = torch.Size([1, 166, 50257])
tensor(473.1485)
nan
Lables = tensor([[  383,  9934,   286,   262,  9233,  3211,  3263,   837,   262,  8286,
         41104,   290,   511,  6493,  8573,   373,  4143, 34304,   416,   262,
          3090,   286,   262,  5474,  6203,   837, 44338,   837, 15923,  6541,
           290,   517,  5252,   837,   290,   262,  1138, 12643,  1173,  6001,
          3220,   764,  1954, 18985,   357,   860,  2488,    13,    31,   352,
           287,  1267,   284,   362,  2488,   

 24%|██▍       | 244/1000 [00:27<01:10, 10.70it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 66.8141,  42.5445,  28.3711,  ..., 153.9864, 132.1088, 132.7154],
         [ 58.0282,  34.7106,  15.8876,  ..., 138.6475, 118.6140, 124.9774],
         ...,
         [ 76.6473,  41.1198,  53.9310,  ..., 183.1581, 143.2319, 129.4484],
         [139.6755, 103.2378, 144.4717,  ..., 246.8486, 127.0660, 144.4538],
         [ 74.3118,  39.5410,  50.2140,  ..., 177.1465, 139.1444, 127.7087]]]) with shape = torch.Size([1, 181, 50257])
tensor(488.7082)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 34844,   796,   796,   220,   198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 61.5493,  38.1193,  21.8254,  ..., 144.9910, 124.6012, 129.0053],
         [ 65.9779,  41.8067

 25%|██▍       | 246/1000 [00:28<01:44,  7.18it/s]

Outputs = tensor([[[101.8824,  82.8170,  99.1326,  ..., 217.4944, 178.2175, 184.5240],
         [ 64.0078,  40.2713,  23.1016,  ..., 149.3386, 127.1896, 130.7303],
         [ 67.3346,  42.1811,  30.2812,  ..., 152.4674, 131.2474, 133.9630],
         ...,
         [ 75.5045,  39.2237,  51.6420,  ..., 182.8187, 142.8295, 129.5066],
         [ 72.9136,  39.4550,  45.0532,  ..., 174.0184, 139.2734, 131.0106],
         [ 74.0692,  39.0559,  48.9796,  ..., 178.3863, 140.1749, 128.7972]]]) with shape = torch.Size([1, 374, 50257])
tensor(491.3943)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796,  796, 2159, 1810, 2873,  796,  796,  796,  220,  198]]) with shape torch.Size([1, 11])


 25%|██▍       | 248/1000 [00:28<01:36,  7.83it/s]

Outputs = tensor([[[101.3162,  81.9670,  94.4324,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5436,  22.1607,  ..., 145.9297, 125.1568, 128.8381],
         [ 65.7492,  41.4700,  28.7677,  ..., 151.2722, 130.5311, 133.0922],
         ...,
         [ 66.6989,  42.2098,  29.6255,  ..., 152.7434, 131.8302, 133.0999],
         [312.7542, 308.7380, 373.8253,  ..., 493.8655, 328.6159, 314.3440],
         [ 85.2643,  49.7080,  55.2229,  ..., 180.2800, 150.0163, 139.0863]]]) with shape = torch.Size([1, 11, 50257])
tensor(480.9523)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 1649,  2869,  2540,   262,  8211,  1810,   319,   807,  3426,   837,
           262, 15153,  3297,   798,   329,   262,  7979,   259, 12010,   351,
          1440,   584, 10181,  5748,   290,   262,  1657, 11920,   367, 13090,
          1477, 13090,   355, 12899,  3002,   

 25%|██▌       | 250/1000 [00:28<01:34,  7.91it/s]

Outputs = tensor([[[ 97.1868,  79.4888,  82.6113,  ..., 200.0758, 163.4622, 183.5325],
         [ 63.4621,  39.5816,  26.7953,  ..., 149.0471, 128.2236, 130.3380],
         [ 66.3396,  42.1925,  33.1578,  ..., 155.2823, 133.0760, 133.1149],
         ...,
         [ 72.9078,  37.6215,  48.0349,  ..., 173.0833, 138.9640, 126.2269],
         [118.1374, 101.8445, 146.1870,  ..., 167.1829,  59.5773, 112.5585],
         [ 71.3607,  36.1257,  45.7861,  ..., 168.9850, 136.4806, 124.6926]]]) with shape = torch.Size([1, 112, 50257])
tensor(486.6023)
nan
Lables = tensor([[ 5856,  2485, 35865,  3047,   319,   642,  1737,   837,   612,   373,
           257, 19905, 20413,   341,   287,   262,  1364,  2485,   286,  6707,
         20317,  4908,   705,    82,  3831,  1186,  1400,    13,   642,   326,
         10058,  1111,  6541,   290,  2923,  6885,  5462,  3653,   764,  5747,
         46088, 16695,   547, 21050,   284,   288,  1076,   262,  7186,  2046,
           290,  3613,   262,  4074,   764,  1

 25%|██▌       | 252/1000 [00:29<01:55,  6.49it/s]

Outputs = tensor([[[102.5669,  84.3615,  92.0346,  ..., 210.6908, 172.6682, 188.5788],
         [ 63.0009,  39.1604,  27.5060,  ..., 149.9488, 128.2318, 129.7165],
         [ 69.6385,  45.8105,  30.6972,  ..., 158.5061, 134.6643, 137.3265],
         ...,
         [ 75.7315,  43.8524,  50.2515,  ..., 174.2340, 140.7066, 133.3451],
         [ 75.4505,  43.6306,  49.8290,  ..., 173.5975, 140.1874, 133.1396],
         [ 75.5477,  42.4418,  52.2629,  ..., 175.4195, 139.4971, 130.0735]]]) with shape = torch.Size([1, 181, 50257])
tensor(485.4546)
nan
Lables = tensor([[ 1119,  4504,  1363,   319,  1478,  2795,   290,   262,   314,    41,
            45,  2540, 15223,  5410,   284,  6330,   262,  2626, 16651,   351,
         14554, 16651, 11513,   422, 10181,  5748,   764,   383, 15153,   547,
          6163,   329, 11315,   290, 30795,   422,   262,  7297,   319,  1478,
          2901,   287, 11824,   764,  1119,  6150,   319,   366, 42020,  7995,
           366,  1566,   262,  4036, 32626,  2

 26%|██▌       | 256/1000 [00:29<01:23,  8.93it/s]

Outputs = tensor([[[106.0476,  89.4800, 107.6905,  ..., 219.5386, 178.6844, 194.8031],
         [ 68.7957,  44.3321,  30.8381,  ..., 157.5355, 134.9558, 134.9946],
         [123.0211,  73.6877, 100.2904,  ..., 242.4970, 223.2526, 147.6235],
         ...,
         [ 73.3759,  36.9665,  47.5321,  ..., 177.4443, 138.8414, 126.9572],
         [188.3999, 154.5613, 229.9056,  ..., 313.7495, 141.8246, 154.7744],
         [ 71.7000,  36.0877,  44.6029,  ..., 172.0907, 135.7876, 125.6656]]]) with shape = torch.Size([1, 122, 50257])
tensor(480.5415)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796,   796,  5838,   286, 15725,  1985,    64, 31329,
           796,   796,   796,   796,   220,   198]]) with shape torch.Size([1, 16])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9026, 173.6245, 182.7835],
         [ 62.0819,  38.5436,  2

 26%|██▌       | 258/1000 [00:30<02:23,  5.18it/s]

Outputs = tensor([[[104.2267,  84.7629,  94.5854,  ..., 211.3960, 172.4096, 189.5484],
         [ 79.5965,  21.6406,  23.9882,  ..., 170.9780, 142.3067, 124.7392],
         [ 62.1269,  27.9529,  18.2009,  ..., 147.7954, 122.8922, 122.6335],
         ...,
         [ 74.2968,  41.7361,  45.5945,  ..., 174.2996, 141.0649, 131.8031],
         [200.8587, 167.1184, 256.5862,  ..., 328.5280, 162.8858, 168.8172],
         [ 75.7818,  41.4139,  50.1241,  ..., 179.1415, 142.2708, 129.7517]]]) with shape = torch.Size([1, 195, 50257])
tensor(484.3020)
nan
Lables = tensor([[  314,   325,   373,  7384,   416,  4019,  2488,    12,    31,  5629,
          6215,   422,   262,  5544,  6769,   837,   475,   484,  4054,   284,
         28640,   597,  2726,  2465,   764,  1375, 49392,  1367, 30892,  3028,
           290,   373,   691,  2277,   416,   257,  5194,  1752,   837,   319,
           262,  4807,   469,   503,  3526,   286,   262,  2493, 48786,   764,
          2773,  4974,   584, 12134,  1474,  6

 26%|██▌       | 260/1000 [00:31<03:10,  3.89it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 60.3278,  37.1340,  24.6309,  ..., 147.0591, 125.0897, 128.1604],
         [ 67.2444,  43.0734,  31.3267,  ..., 154.2633, 132.7936, 134.8187],
         ...,
         [ 76.9023,  40.0661,  49.7895,  ..., 179.0403, 142.1063, 129.2484],
         [224.5528, 180.4015, 275.1964,  ..., 382.8618, 190.0616, 177.3321],
         [ 73.9509,  39.9815,  43.3816,  ..., 170.6543, 138.1133, 130.4873]]]) with shape = torch.Size([1, 177, 50257])
tensor(481.7446)
nan
Lables = tensor([[  383,   314,    41,    45,   788,  3066,   284,   779,   262, 15153,
           290,   511,  3671,  2096,   284,  2222,   257,  3440,   286, 34139,
           837, 14239,   837, 19783,   290,   584, 10039, 21622,   736,   284,
          2869,   706,   262,  1605, 16651, 24057,   262,  2520,  2807,  6896,
           357, 14680,   509,  5350,  1267,   764,  1119,  9639,   511,   269,
           853,  3028,  3726,   319,   718,  3

 26%|██▌       | 261/1000 [00:31<03:27,  3.56it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 56.2419,  32.4856,  13.7458,  ..., 134.7196, 115.5090, 122.8549],
         [ 67.4109,  41.5403,  30.0964,  ..., 155.2851, 129.9396, 132.5152],
         ...,
         [ 78.5472,  43.0026,  52.4493,  ..., 187.1669, 144.1701, 131.7901],
         [185.8900, 148.4918, 204.6653,  ..., 287.1727, 141.0700, 159.6417],
         [ 76.0520,  41.4547,  48.4201,  ..., 180.6098, 140.1392, 129.9304]]]) with shape = torch.Size([1, 248, 50257])
tensor(492.0235)
nan
Lables = tensor([[  383,  7937,   547,  2900,   656, 12462, 15923, 13591,   625,   262,
          1306,  1811,  1933,  3584,   340,  1196,  6255,   606,  1310,   618,
           484,   547,  7384,   757,   416,  1605, 11920,  6215,   287,  2901,
           764,  1550,   262,  1987,   400,   314,   325,   373,  7425,   416,
          1936, 12134,   290,  1474,  6825,  3294,  1661,  2162,   477,  1297,
           673,  2626,  2026,  5462,  3653,  2

 26%|██▌       | 262/1000 [00:32<03:37,  3.39it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 65.0314,  41.1128,  28.2078,  ..., 152.4042, 130.8126, 131.8278],
         [ 59.6254,  35.9973,  18.0164,  ..., 141.2638, 121.8124, 126.3094],
         ...,
         [ 76.0674,  39.6718,  52.2056,  ..., 181.0957, 142.5964, 130.7004],
         [245.1089, 196.9384, 314.2203,  ..., 406.4310, 220.8868, 197.7613],
         [ 75.0191,  38.9246,  50.6680,  ..., 178.0936, 140.4737, 129.5043]]]) with shape = torch.Size([1, 216, 50257])
tensor(506.0741)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796, 11740,   371,   361, 37036,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.31

 27%|██▋       | 267/1000 [00:32<02:08,  5.72it/s]

Outputs = tensor([[[116.0099,  96.0662, 128.1933,  ..., 236.6242, 190.4295, 201.6859],
         [174.7347, 124.1503, 187.5554,  ..., 328.4305, 238.8888, 236.8607],
         [ 85.5431,  53.7684,  52.2063,  ..., 181.6361, 156.0061, 145.4676],
         ...,
         [ 78.6779,  42.6749,  54.9788,  ..., 188.3695, 146.5981, 133.0387],
         [157.8193, 130.5019, 183.9167,  ..., 246.8367, 114.2299, 134.7786],
         [ 76.3655,  41.2376,  50.8979,  ..., 181.7811, 142.4730, 131.3290]]]) with shape = torch.Size([1, 167, 50257])
tensor(506.4813)
nan
Lables = tensor([[  554,  4152,   837,   339,  2957,   262,  4403,  9368,  8785,   287,
          2060,  1622, 35401,  1141,   465,  4664,   614,   290,   900,  7055,
         26328,  1127, 35401,  4406,   329,  1111,  3451, 10242,   290,  2060,
          2488,    12,    31,  1622, 17692,   764,   679,   550,   635,   587,
           257,  7055,  3334,  3961, 29007,  5396,   357,   337,  7998,  3838,
          1267,  1181,  8783,   287,  1111,  9

 27%|██▋       | 268/1000 [00:32<02:14,  5.44it/s]

Outputs = tensor([[[ 93.4650,  72.7889,  75.0904,  ..., 192.8919, 157.5776, 173.4092],
         [ 64.4177,  40.2718,  26.1100,  ..., 149.9771, 128.4518, 131.4595],
         [ 66.9254,  42.5034,  30.5028,  ..., 153.9021, 132.0427, 134.2299],
         ...,
         [ 76.8162,  42.6187,  55.5030,  ..., 182.5979, 141.7086, 131.7211],
         [ 74.1168,  41.8180,  49.4128,  ..., 174.7899, 138.7224, 132.4661],
         [ 74.1861,  41.8080,  49.9661,  ..., 175.2889, 138.9469, 132.4149]]]) with shape = torch.Size([1, 103, 50257])
tensor(512.7037)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796, 3334, 1524,  796,  796,  220,  198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 59.5621,  36.3358,

 27%|██▋       | 272/1000 [00:33<01:59,  6.09it/s]

Outputs = tensor([[[106.7154,  89.5670, 112.5861,  ..., 217.0842, 169.5142, 191.3560],
         [ 63.9345,  40.1159,  25.5521,  ..., 148.9886, 127.9537, 131.2128],
         [ 66.8127,  42.6173,  30.4841,  ..., 153.5851, 132.0857, 134.3031],
         ...,
         [ 77.5629,  41.8324,  54.0661,  ..., 185.9724, 145.8565, 132.0787],
         [210.4232, 172.1515, 244.4520,  ..., 346.6069, 192.2457, 182.3935],
         [ 75.2778,  40.0475,  50.5412,  ..., 180.0153, 141.9510, 129.9586]]]) with shape = torch.Size([1, 253, 50257])
tensor(512.9739)
nan
Lables = tensor([[  632,   318, 25304,   326,   371,   361, 37036,   373,  4642,   287,
          4767,   418,  2539,   837,  7055,   287, 38525,   329,  1811,  3840,
           764, 11396,  4767,   418,  2539, 27165,   371,   361, 37036,   355,
           281,  1439,  2488,    12,    31,  1605,   886,   319,   262,  2059,
           286,  7055,  4346,  1074,   764,  4767,   418,  2539,   550, 43748,
          3353,   355,   257,  8852,   286,   

 27%|██▋       | 273/1000 [00:33<02:18,  5.23it/s]

Outputs = tensor([[[108.4113,  87.7535, 105.5195,  ..., 212.7860, 171.7813, 193.9099],
         [ 63.2870,  37.1733,  30.7514,  ..., 152.1148, 129.1440, 128.6387],
         [ 70.6066,  44.4158,  39.2640,  ..., 165.7220, 139.4237, 137.2139],
         ...,
         [ 82.0345,  46.6158,  57.5299,  ..., 192.0442, 150.5133, 135.3591],
         [ 79.0233,  45.7320,  51.4887,  ..., 183.3651, 146.6067, 135.9862],
         [ 80.1656,  45.1455,  54.9527,  ..., 187.6057, 147.3957, 133.6371]]]) with shape = torch.Size([1, 118, 50257])
tensor(496.4188)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796, 5535,  796,  796,  220,  198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 63.5404,  38.2478,  22.6

 28%|██▊       | 277/1000 [00:34<02:03,  5.84it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [407.5022, 301.8547, 382.4162,  ..., 595.0660, 340.7448, 311.4489],
         [ 90.4425,  55.0919,  67.7742,  ..., 203.2360, 157.0480, 142.7414],
         ...,
         [ 66.7751,  36.5098,  32.3299,  ..., 158.4590, 129.5126, 126.3828],
         [206.1792, 168.7335, 254.1768,  ..., 337.8018, 174.7890, 173.9574],
         [ 78.3044,  42.0341,  52.0293,  ..., 182.8634, 144.3635, 131.7954]]]) with shape = torch.Size([1, 281, 50257])
tensor(499.7242)
nan
Lables = tensor([[  371,   361, 37036,   705,    82,  4152,  3451,   373, 19072,   416,
          2159,  1810,  2873,  2139,   287,   262,  1578,  1829,  8565,   837,
           475,   706,  4814,   262, 15761,  1622,   837,   339,  4504,   284,
           711,   329,   262, 26328,  1127,   422, 22717,   284, 21794,   764,
           371,   361, 37036,  2826,   329,   262, 26328,  1127,   287, 12785,
         41445,  2351, 10749,  7028,   287, 21

 28%|██▊       | 278/1000 [00:34<02:32,  4.73it/s]

Outputs = tensor([[[106.7154,  89.5670, 112.5861,  ..., 217.0842, 169.5142, 191.3560],
         [ 65.9984,  40.5920,  32.6499,  ..., 155.7685, 132.0783, 132.5453],
         [ 67.7452,  43.4734,  31.7983,  ..., 155.1074, 133.2675, 135.5160],
         ...,
         [ 75.6638,  40.1878,  51.5291,  ..., 181.5453, 143.8936, 130.1443],
         [201.1361, 168.1600, 250.7217,  ..., 313.1210, 172.0438, 170.1225],
         [ 73.1922,  38.9483,  47.1091,  ..., 174.6949, 139.4416, 128.2739]]]) with shape = torch.Size([1, 235, 50257])
tensor(490.3104)
nan
Lables = tensor([[  554,   262, 21794, 12184,  1622,   837,   371,   361, 37036,  7781,
          3624, 17692,   837,  4978,  2534,  8318,   837,   290,  8618, 44300,
          5695,   357,  2026,    23,  6464,   290, 15143, 15795,  1267,   764,
           371,   361, 37036,   373,   262,  1218,  4511,  9689,   886,   287,
           262,  3277,   287, 21794,   837,   290,   339,   373,   257, 11529,
          1439,  2488,    12,    31,  1605,   

 28%|██▊       | 279/1000 [00:35<02:38,  4.54it/s]

Outputs = tensor([[[ 93.4650,  72.7889,  75.0904,  ..., 192.8919, 157.5776, 173.4092],
         [407.5026, 301.8548, 382.4165,  ..., 595.0664, 340.7451, 311.4491],
         [ 78.9625,  49.7425,  43.7683,  ..., 177.8376, 144.4041, 138.7022],
         ...,
         [ 86.6238,  51.9475,  62.2697,  ..., 196.0813, 154.2154, 139.0664],
         [203.7833, 163.5263, 255.0029,  ..., 371.0064, 163.2747, 156.7079],
         [ 84.7619,  50.5447,  59.4588,  ..., 191.1689, 150.5090, 137.4273]]]) with shape = torch.Size([1, 88, 50257])
tensor(503.9918)
nan
Lables = tensor([[ 4900,   371,   361, 37036,  5201,  5544,  1871,  3095, 14197, 49537,
          4446,   287, 21794,   837,   339,   750,   407,  5461,  1871,   262,
          1353,  3624,   764,  2750,  7208,   837, 23382, 20377,   886, 10592,
         11345,  1839,   262, 49537, 25640,   287, 24977,   475,   925,   691,
          3624,   286,   262,  1367,  1439,  2488,    12,    31,  1605,  3466,
           764,   632,   318,   407,  1598,  15

 28%|██▊       | 280/1000 [00:35<02:56,  4.07it/s]

Outputs = tensor([[[104.2267,  84.7629,  94.5854,  ..., 211.3960, 172.4096, 189.5484],
         [ 64.2301,  40.2440,  25.9938,  ..., 149.4999, 128.6416, 131.4683],
         [ 66.7528,  42.4338,  30.4012,  ..., 153.4872, 132.1528, 134.2074],
         ...,
         [ 76.9044,  40.6212,  52.2197,  ..., 181.2526, 143.6941, 130.3862],
         [221.3648, 182.9361, 249.8290,  ..., 388.1225, 195.6411, 175.7463],
         [ 75.9946,  40.1608,  50.2541,  ..., 177.9443, 141.4861, 129.8860]]]) with shape = torch.Size([1, 185, 50257])
tensor(491.0283)
nan
Lables = tensor([[  371,   361, 37036,   373,  3177,   530,   286,   262,  6000, 34307,
           705,    82,   286,   262, 16236,    82,   764,   554,  1440,  7028,
           351,   262,  7055, 26328,  1127,   837,   371,   361, 37036,  2826,
           287,  3933,  1830,   290,   550,   625,   352,  2488,    11,    31,
         12877,  5695,   286,  2472,  6907,   764,   371,   361, 37036,  2714,
           262,  2059,   286,  7055,   705,   

 28%|██▊       | 283/1000 [00:35<02:16,  5.26it/s]

Outputs = tensor([[[106.7154,  89.5671, 112.5861,  ..., 217.0842, 169.5143, 191.3562],
         [ 65.9984,  40.5920,  32.6498,  ..., 155.7686, 132.0784, 132.5453],
         [ 75.0803,  49.3854,  43.0821,  ..., 170.5119, 144.2200, 142.3596],
         ...,
         [ 77.6380,  43.0360,  51.8402,  ..., 181.7395, 144.9914, 134.9633],
         [ 77.2020,  42.7257,  51.1169,  ..., 180.7921, 144.2986, 134.5643],
         [ 78.0526,  41.8424,  54.1646,  ..., 184.2841, 144.5139, 132.1552]]]) with shape = torch.Size([1, 86, 50257])
tensor(494.8108)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 18612,  3451,   796,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 70.7834,  4

 28%|██▊       | 285/1000 [00:36<02:01,  5.86it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 62.6238,  38.3313,  20.8215,  ..., 146.9277, 125.2974, 128.8579],
         [ 65.4371,  40.2644,  26.0545,  ..., 149.9090, 126.3387, 130.7528],
         ...,
         [ 68.0305,  38.6894,  32.8836,  ..., 159.7957, 130.5473, 126.8112],
         [243.9839, 194.0952, 296.7246,  ..., 414.7200, 224.3078, 200.3568],
         [ 79.0726,  45.8665,  49.2142,  ..., 180.5307, 144.9725, 135.4254]]]) with shape = torch.Size([1, 131, 50257])
tensor(493.1164)
nan
Lables = tensor([[  371,   361, 37036, 11406,   257,  1693,   379,   370, 44817,  5243,
           287,  8488,   837,   475,   339,  1364,   465,  6332,  1416,  1603,
           705,    82,  1693,   284,  4654,   262,  8488, 14536,   764,   554,
           262, 11445,  5134,  1622,   837,   371,   361, 37036,  1625,   736,
           422,   465,  5095,   284,   711,   329,   262,  8488, 14536,   764,
           679,  2826,   287,  1105,  1830,   

 29%|██▊       | 286/1000 [00:36<02:10,  5.48it/s]

Outputs = tensor([[[106.7154,  89.5671, 112.5861,  ..., 217.0842, 169.5143, 191.3562],
         [ 65.9984,  40.5920,  32.6498,  ..., 155.7686, 132.0784, 132.5453],
         [ 75.0803,  49.3854,  43.0821,  ..., 170.5119, 144.2200, 142.3596],
         ...,
         [ 74.2361,  38.0739,  49.1243,  ..., 180.8732, 142.5482, 129.5054],
         [159.7608, 128.8393, 181.3111,  ..., 266.9108, 110.9439, 152.2311],
         [ 72.8024,  36.6955,  47.6953,  ..., 177.6978, 140.1233, 127.7726]]]) with shape = torch.Size([1, 103, 50257])
tensor(495.4584)
nan
Lables = tensor([[  554,  1737, 27937,   837,   339,  3414,   339,   373, 29040,   422,
          4708,  4346,   284,  1716,  5701,  3437,   379,   257,  5243,  4429,
           287, 14905,   764,   679,   373,  9657,   355,   257,  6332,  1416,
          1603,   416, 36245,  1677,   357,   783,  1900,   355,   370,  3824,
            33,  1267,   837,   543,   550,   655,  2067,   262,   717,  5581,
          4429,   287, 14905,   290,   262,   

 29%|██▊       | 287/1000 [00:36<02:55,  4.07it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 59.6807,  37.4252,  19.6773,  ..., 142.4948, 121.2229, 127.0357],
         [ 62.1416,  38.5377,  21.1584,  ..., 145.7829, 124.8987, 129.0264],
         ...,
         [ 79.4236,  43.1199,  55.0298,  ..., 191.0992, 148.9486, 133.6484],
         [230.5025, 180.9646, 284.8219,  ..., 375.5720, 215.2639, 196.2614],
         [ 77.8642,  42.0285,  52.5510,  ..., 187.0080, 146.2188, 132.2018]]]) with shape = torch.Size([1, 226, 50257])
tensor(504.3134)
nan
Lables = tensor([[ 2293,  1542,   812,   351, 36245,  1677,   290,   257,  1487,   287,
          9238,   329,   262,  4429,   837,   465,   905,   373,  6928,   351,
           262, 48807,  7311,   705,    82,   383, 13633,  2677,  5438,   764,
           554,   262,  7169,    82,   837,   371,   361, 37036,  7817,  8062,
           379, 14905,   705,    82,  2019,   603,   293,  5535,   290,  4983,
           355,   257,  1221,   474,  8337,   

 29%|██▉       | 288/1000 [00:37<03:01,  3.93it/s]

Outputs = tensor([[[106.0476,  89.4800, 107.6905,  ..., 219.5386, 178.6844, 194.8031],
         [281.8209, 262.7747, 305.1234,  ..., 536.5577, 330.3469, 296.4089],
         [ 81.9199,  45.6637,  56.1036,  ..., 191.5362, 148.7214, 139.5117],
         ...,
         [ 77.7762,  42.8583,  51.8426,  ..., 189.7019, 148.9834, 131.2275],
         [242.7627, 201.7239, 297.2565,  ..., 405.3856, 224.4853, 210.1093],
         [ 77.5621,  42.8334,  51.3084,  ..., 187.0063, 147.4005, 130.9317]]]) with shape = torch.Size([1, 116, 50257])
tensor(501.1885)
nan
Lables = tensor([[  371,   361, 37036,   373,  1281, 17047,  3481, 28948,   276,   656,
           262, 14905,  9765, 26248,  4789,   286, 18864,   287,  2693,  4343,
           764,   679,   373,  1813,   262,  8407,  7129, 11289,   543,   318,
         10395,   329,   366,   883,   508,   750,   340,   717,   837,   262,
           661,   508,   550,   645,  3912,   284,  1061,   764,   366,   383,
          4789,   286, 18864,  5764,   373,  5

 29%|██▉       | 289/1000 [00:37<03:19,  3.56it/s]

Outputs = tensor([[[106.7154,  89.5671, 112.5861,  ..., 217.0842, 169.5143, 191.3562],
         [ 65.9984,  40.5920,  32.6498,  ..., 155.7686, 132.0784, 132.5453],
         [ 67.7452,  43.4734,  31.7982,  ..., 155.1073, 133.2675, 135.5160],
         ...,
         [ 65.0231,  34.0079,  29.8729,  ..., 156.7972, 129.2205, 125.3560],
         [152.6890, 122.1456, 178.6057,  ..., 222.8133,  99.5926, 141.5635],
         [ 76.8196,  38.8851,  50.5566,  ..., 182.5490, 145.3532, 131.0158]]]) with shape = torch.Size([1, 118, 50257])
tensor(498.5282)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796, 7884,  796,  796,  220,  198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 60.5193,  36.7994,  20.3

 30%|██▉       | 297/1000 [00:37<01:10, 10.03it/s]

Outputs = tensor([[[106.7154,  89.5671, 112.5861,  ..., 217.0842, 169.5143, 191.3562],
         [ 65.9984,  40.5920,  32.6498,  ..., 155.7686, 132.0784, 132.5453],
         [ 75.0803,  49.3854,  43.0821,  ..., 170.5119, 144.2200, 142.3596],
         ...,
         [ 77.1877,  40.1989,  56.0233,  ..., 187.7871, 148.1824, 130.5116],
         [ 74.6795,  40.3482,  49.7821,  ..., 178.9563, 144.1577, 131.8344],
         [ 75.0765,  39.0882,  52.3198,  ..., 181.7975, 144.4959, 129.1669]]]) with shape = torch.Size([1, 113, 50257])
tensor(492.1559)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796, 26539, 20215,  8545, 23964,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.31

 30%|███       | 301/1000 [00:38<01:32,  7.54it/s]

Outputs = tensor([[[ 98.7221,  79.6236,  83.2838,  ..., 204.3905, 165.4146, 180.2808],
         [297.7444, 232.5229, 310.5445,  ..., 528.2800, 377.4060, 317.1525],
         [ 86.9360,  56.3734,  55.9778,  ..., 195.7030, 154.9832, 144.4434],
         ...,
         [ 75.0901,  42.8443,  48.9979,  ..., 172.2709, 140.2511, 132.1884],
         [197.4153, 161.1532, 249.4836,  ..., 310.6124, 152.7943, 175.5130],
         [ 75.9353,  42.1746,  52.4060,  ..., 174.7355, 139.9302, 129.4699]]]) with shape = torch.Size([1, 147, 50257])
tensor(494.5028)
nan
Lables = tensor([[  791, 23073,   837,   262,  6388,  2277,  4744,  1342,   621,  1987,
          2250,   878,  1194,  1688, 23964, 14121, 13151,  2488,    12,    31,
         10591,  2488,    12,    31,   583,  2488,    12,    31,  1711,   357,
           580, 10571,  1220,   289,  1267, 13520,  7425,  2520,  3936,  2162,
          1239,   423,   734,  1688, 11700,  1952,  2277,   262,  1578,  1829,
           287,   884,  1969, 22435,   764,   

 30%|███       | 304/1000 [00:39<02:04,  5.58it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 74.9978,  49.3430,  40.4002,  ..., 169.0696, 143.5074, 140.3581],
         [ 58.4394,  35.1304,  16.5526,  ..., 139.3268, 119.3460, 125.5869],
         ...,
         [ 76.7768,  42.1314,  52.7177,  ..., 180.9904, 144.0606, 130.5296],
         [236.5684, 191.0803, 299.3389,  ..., 391.6230, 215.5652, 195.5077],
         [ 75.4050,  41.1806,  50.5350,  ..., 177.1861, 141.8474, 129.3008]]]) with shape = torch.Size([1, 300, 50257])
tensor(495.8778)
nan
Lables = tensor([[  383, 23964,  7317,  3940,   262,  1781,   286,  1194, 23964,   326,
          3804,   832,   262,  1989,   287,  2739,  2932,   837,   543,  6165,
          7425, 14159,   290,  3936,   764,   770, 23964,  2427,  9456,   257,
          2276,  7421,  2488,    12,    31, 24821,  2610,   764,  2293,  3867,
           832,   262,  7840, 45726,   837,   262, 23964, 24135,  4622,   878,
          1642, 43682,   379, 22721,   837,  4

 31%|███       | 307/1000 [00:40<02:12,  5.25it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 65.7033,  41.6651,  28.4263,  ..., 151.5721, 130.6705, 132.8781],
         [ 65.2822,  41.0873,  28.7045,  ..., 152.0508, 131.0338, 132.4729],
         ...,
         [ 80.1680,  47.4792,  56.9466,  ..., 186.6016, 147.7924, 133.9739],
         [185.8727, 148.7657, 226.9251,  ..., 282.2188, 143.7625, 169.0721],
         [ 77.6338,  45.2974,  53.4957,  ..., 180.9305, 143.8906, 131.6139]]]) with shape = torch.Size([1, 233, 50257])
tensor(489.7240)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796, 38397,   602,   290,  2928,   796,   796,   220,   198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.6950,  80.9774, 100.6089,  ..., 216.1597, 175.4110, 184.6033],
         [ 62.0412,  38.5393,  22.1272,  ..., 145.7505, 125.2556, 128.8690],
      

 31%|███       | 309/1000 [00:40<02:04,  5.57it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [ 62.5667,  38.8897,  20.9882,  ..., 147.8677, 125.3362, 129.7105],
         [ 68.3905,  43.8087,  32.9586,  ..., 155.2599, 133.5331, 135.6271],
         ...,
         [ 76.0497,  39.2177,  48.7777,  ..., 179.8373, 141.6622, 128.6852],
         [217.7831, 176.4933, 247.4508,  ..., 358.5677, 172.8096, 183.2079],
         [ 74.4609,  38.3958,  46.2173,  ..., 175.3000, 138.5796, 127.6974]]]) with shape = torch.Size([1, 162, 50257])
tensor(467.8583)
nan
Lables = tensor([[  383,  3665, 23964,  3888,   625,   393,  1474,  1811, 14807,   287,
           262, 45726,   764, 22608,   319,  7897, 18292,   290, 42177,  5451,
           547,  1111,  6108,   379,  1088, 12713, 16462,   357, 18500, 10571,
          1220,   289,  1267,   764, 22608,  4251,  9796, 16462,   357, 26607,
         10571,  1220,   289,  1267,   379, 10807,   705,    82, 42177,   837,
          1802, 16462,   357, 27829, 10571,  1

 31%|███       | 310/1000 [00:41<03:04,  3.74it/s]

Outputs = tensor([[[143.8829, 127.8608, 193.6991,  ..., 269.3243, 205.2067, 238.1103],
         [ 68.8165,  44.5101,  33.2124,  ..., 157.5412, 135.2159, 135.2243],
         [ 71.5225,  47.0007,  34.4886,  ..., 161.3842, 137.3298, 138.0551],
         ...,
         [ 73.8532,  39.6376,  51.8306,  ..., 178.9784, 141.7403, 128.5888],
         [221.3878, 183.0983, 281.3717,  ..., 359.6187, 197.9802, 198.0823],
         [ 71.8110,  38.4423,  48.4247,  ..., 173.4067, 138.1802, 126.7547]]]) with shape = torch.Size([1, 299, 50257])
tensor(502.6144)
nan
Lables = tensor([[ 1649,   262,  6388,  3888, 45817,   287,  4744,   837, 13520,  4251,
           281,  6108, 13151, 16462,   357,   580, 10571,  1220,   289,  1267,
           287, 22721,  2162,   777,  5091,   706,   262,  4151,  3804,   764,
           554,  2688, 18358,  8511,   837,   281,   368, 40077,  8630,   379,
          1551,  4019,  2488,    12,    31, 10591,  2488,    12,    31,   583,
          2488,    12,    31,  1711,   357, 20

 31%|███       | 311/1000 [00:41<03:35,  3.20it/s]

Outputs = tensor([[[ 97.1869,  79.4889,  82.6115,  ..., 200.0760, 163.4623, 183.5327],
         [330.1132, 258.4871, 327.2994,  ..., 479.9195, 281.6768, 271.2453],
         [ 86.6825,  55.5705,  57.7836,  ..., 191.2831, 154.3579, 144.3025],
         ...,
         [ 79.8593,  42.7497,  53.4812,  ..., 186.5578, 147.4706, 132.8668],
         [222.7738, 180.6191, 258.3006,  ..., 356.3567, 189.1988, 191.0182],
         [ 76.1398,  41.9100,  45.8501,  ..., 175.9483, 142.1498, 133.1496]]]) with shape = torch.Size([1, 254, 50257])
tensor(489.4659)
nan
Lables = tensor([[ 1629,  2688, 18358,  8511,   837,   262,  3741,   286,   262,  2465,
           373, 19733,   284, 28459,   764, 12168, 20132,   290, 15100, 39513,
           326,   351,  6501,   262, 35768, 23964, 20821,   837, 25359,   278,
          6483,   351,  5445,   491, 14125,   764, 22608, 42281,  2975,  5895,
           319,   867,  6483,   837,   290,  6947, 41555,  5017,   262, 30966,
           319,   257,  1957, 13126,  1781,   

 31%|███       | 312/1000 [00:42<03:57,  2.90it/s]

Outputs = tensor([[[110.1520,  88.7207, 119.0701,  ..., 224.6823, 182.1374, 193.6911],
         [ 58.3463,  35.3959,  17.8250,  ..., 139.1329, 120.0850, 125.0273],
         [ 65.3811,  41.4238,  28.8947,  ..., 150.0775, 130.3913, 132.7228],
         ...,
         [ 73.2477,  36.4626,  48.6680,  ..., 177.4612, 137.8994, 128.9077],
         [237.3924, 197.5959, 280.7656,  ..., 386.5159, 212.6463, 197.6934],
         [ 72.0739,  35.5879,  47.1265,  ..., 174.5405, 135.8924, 127.6648]]]) with shape = torch.Size([1, 251, 50257])
tensor(482.0952)
nan
Lables = tensor([[  554,  5780,   290,   520,    13,  7598,   494, 14683,   837,   262,
          6388,   373,  3177,  1871,   262,  5290,   319,  1700,   764,   383,
          6388, 33297,   617,  5682,   290, 17676,   867,  1854,   572,   511,
         19369,   764,  1629, 22559,   837, 13520,  4615,   393, 11234,  9694,
          5441,  4064,   286,   262, 42251,   287,  3240,   764,   383,  6388,
          6572,   262,  2368,  4314,   286,   

 31%|███▏      | 313/1000 [00:42<04:53,  2.34it/s]

Outputs = tensor([[[ 93.4650,  72.7888,  75.0905,  ..., 192.8919, 157.5775, 173.4092],
         [ 55.7849,  32.6160,  14.6632,  ..., 135.5867, 115.9944, 122.4898],
         [ 58.4088,  33.4897,  14.6085,  ..., 139.6995, 117.6534, 124.2582],
         ...,
         [ 77.4218,  43.1760,  54.3524,  ..., 179.4889, 144.4273, 131.5775],
         [233.8830, 193.8088, 302.5778,  ..., 381.4736, 224.0616, 208.3358],
         [ 76.0374,  42.1915,  52.3233,  ..., 175.9547, 142.2290, 130.3435]]]) with shape = torch.Size([1, 307, 50257])
tensor(487.7207)
nan
Lables = tensor([[  327,  1773,  2465,   373,  5290,  1863,   262,  3942,  5866, 21003,
          2049,  2162,  1811, 15893,   287, 22559,  5924,  2472,  9089,   837,
           290, 23605,   837,  1467,  4064,   286,   262, 35405, 13833,   837,
           393,   604,  1510, 10559,   837,   547,  6572,   764,  4650,  9015,
           763,  2840,   287, 22559,   547,  6572,   837,   290,   262,  1957,
          9015,  3265,   373, 16830,   290, 34

 31%|███▏      | 314/1000 [00:43<04:44,  2.41it/s]

Outputs = tensor([[[ 94.2198,  74.9150,  80.4288,  ..., 196.0560, 157.0670, 173.1416],
         [ 63.6817,  40.0960,  23.1376,  ..., 149.0887, 126.3000, 131.1691],
         [ 67.8961,  41.7918,  30.8442,  ..., 155.5748, 130.7924, 132.9977],
         ...,
         [ 78.1942,  42.9658,  57.6644,  ..., 182.9673, 146.5787, 132.2289],
         [225.2143, 193.5959, 274.8778,  ..., 369.1349, 203.3152, 204.0945],
         [ 76.7357,  41.8320,  55.0199,  ..., 178.7411, 144.1258, 131.0272]]]) with shape = torch.Size([1, 129, 50257])
tensor(495.5819)
nan
Lables = tensor([[ 3334, 25807,  4073, 17448,  1973,  4744,   837, 14660,  1474, 15528,
           810, 10150,  4251,   860, 10117,   357,   362,  2488,    13,    31,
           767,   285,  1267,  2769,   764,  3334, 25807,   286,   625,   767,
           287,   357, 11546,  8085,  1267,  4073,   257,  1801, 12228,   416,
         15528, 13944,  1766,    13,   284,  2270,   513, 21504,   357,   604,
          2488,    13,    31,   807, 10571,  1

 32%|███▏      | 318/1000 [00:43<02:24,  4.72it/s]

Outputs = tensor([[[ 95.8241,  73.8800,  75.4970,  ..., 197.2054, 161.9891, 174.4709],
         [ 71.8918,  46.5571,  35.0963,  ..., 163.7312, 138.9095, 138.4681],
         [ 62.5535,  37.7013,  20.2031,  ..., 146.9947, 124.7968, 128.4359],
         ...,
         [ 76.3412,  41.8645,  51.8738,  ..., 183.2755, 143.1268, 130.1869],
         [ 73.6375,  41.5234,  45.5429,  ..., 174.8040, 139.6284, 131.3118],
         [ 75.3736,  41.4618,  50.3696,  ..., 180.4533, 141.4243, 129.5418]]]) with shape = torch.Size([1, 207, 50257])
tensor(494.1248)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,  2293, 11018,   796,   796,   220,   198]]) with shape torch.Size([1, 8])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 65.7492,  

 32%|███▏      | 323/1000 [00:44<01:36,  7.00it/s]

Outputs = tensor([[[106.0476,  89.4800, 107.6905,  ..., 219.5386, 178.6844, 194.8031],
         [ 63.4460,  39.7851,  24.9121,  ..., 148.1404, 127.9511, 130.8370],
         [ 66.5969,  42.4102,  28.8555,  ..., 153.1763, 131.6859, 133.2093],
         ...,
         [ 76.5015,  41.4770,  52.3768,  ..., 180.4887, 143.4871, 130.1797],
         [ 74.0423,  41.1265,  46.4878,  ..., 172.7431, 140.2451, 131.3954],
         [ 74.1147,  41.2450,  46.7532,  ..., 172.9845, 140.4605, 131.4989]]]) with shape = torch.Size([1, 110, 50257])
tensor(494.4808)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,  5498,  5838,   286, 22255,    83,   506,  8510,   469,   796,
           220,   198]]) with shape torch.Si

 32%|███▎      | 325/1000 [00:44<01:38,  6.85it/s]

Outputs = tensor([[[143.8833, 127.8614, 193.6998,  ..., 269.3252, 205.2073, 238.1108],
         [279.4747, 264.1968, 321.1602,  ..., 545.0704, 373.3791, 325.1186],
         [ 91.1768,  51.5352,  57.0660,  ..., 202.8960, 159.6069, 139.8440],
         ...,
         [ 79.4255,  45.0548,  54.3390,  ..., 184.7653, 145.9171, 132.5697],
         [185.7243, 156.3447, 233.5238,  ..., 308.7096, 138.1107, 159.7987],
         [ 78.3591,  43.8574,  52.7683,  ..., 181.5448, 143.7466, 131.1687]]]) with shape = torch.Size([1, 111, 50257])
tensor(501.6382)
nan
Lables = tensor([[ 2293,   262,  3274,  5838,   286, 22255,    83,   506,  8510,   469,
           837,   262,  1294,  5407,   705,    82,   362,   358, 27749,  7458,
           373,  3888,   284,  4404,   262, 22255,    83,   506,  5866,  1627,
           764,   383,  7297,   837,   543,   373,  1418,  2228,   287,  5249,
           837,   373,  7425,   351,   257,  1913,  1368,   416,  1811, 17397,
           286,   262,  6983,  4380,   705,   

 33%|███▎      | 327/1000 [00:45<02:02,  5.51it/s]

Outputs = tensor([[[106.0476,  89.4800, 107.6905,  ..., 219.5386, 178.6844, 194.8031],
         [269.3107, 256.6159, 297.2519,  ..., 348.9118, 259.4417, 254.1998],
         [ 72.3874,  44.5486,  43.9379,  ..., 165.9681, 139.0347, 134.2015],
         ...,
         [ 81.3206,  47.3773,  57.5878,  ..., 184.5591, 149.4006, 136.3369],
         [146.2215, 120.7937, 171.4469,  ..., 215.6562,  86.9539, 132.7839],
         [ 78.4242,  45.3982,  52.7468,  ..., 177.3654, 144.2415, 133.8811]]]) with shape = torch.Size([1, 105, 50257])
tensor(486.4969)
nan
Lables = tensor([[  383, 25615,   286,   262,  2372,   284,   350,   385,   272,  2448,
         16912, 12053,   262,  1294, 11000, 12778,   352,   301, 43161,   282,
         11000, 26012,   284,   307,  3181,   287,   284, 19594,   262,  1294,
          5407,  6553,   764,   554,   734,  2745,   286,  4334,  4330,   837,
           262,  1294,  3386,   547,  1498,   284,  2700,   262,  2258, 25999,
           503,   286,   262, 22255,    83,   

 33%|███▎      | 333/1000 [00:45<01:06, 10.08it/s]

Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 61.5493,  38.1193,  21.8254,  ..., 144.9910, 124.6012, 129.0053],
         [ 65.4703,  41.5565,  28.5316,  ..., 151.2568, 130.5656, 133.1226],
         ...,
         [ 67.4802,  43.4218,  31.3976,  ..., 155.0186, 133.8526, 134.5612],
         [218.4137, 222.1985, 268.6674,  ..., 343.4930, 242.6174, 244.4604],
         [ 83.4324,  48.7187,  52.6716,  ..., 173.9619, 149.0072, 140.6859]]]) with shape = torch.Size([1, 7, 50257])
tensor(483.8345)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[  796,   796,   796,   350,   385,   272,  2448, 16912,   796,   796,
           796,   220,   198]]) with shape tor

 34%|███▍      | 338/1000 [00:46<01:39,  6.65it/s]

Outputs = tensor([[[101.6951,  80.9775, 100.6090,  ..., 216.1599, 175.4112, 184.6035],
         [306.5869, 313.2511, 364.6369,  ..., 394.9286, 323.3239, 300.6893],
         [ 70.7351,  42.5686,  40.3791,  ..., 160.3911, 137.7240, 132.0894],
         ...,
         [ 78.2064,  44.3728,  54.5411,  ..., 180.0297, 146.0088, 132.8379],
         [197.7067, 171.9391, 257.6534,  ..., 301.2044, 181.1659, 179.6257],
         [ 76.2643,  42.7291,  51.2961,  ..., 174.9896, 142.8117, 130.9673]]]) with shape = torch.Size([1, 325, 50257])
tensor(487.8220)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796,  796, 2693, 4574,  796,  796,  796,  220,  198]]) with shape torch.Size([1, 10])
Outputs = tensor([[[101.6950,  80.9774, 100.6089,  ..., 216.1597, 175.4110, 184.6033],
         [ 62.0412,  38.5393,  22.1272,  ..., 145.7505, 125.2556, 128.8690],
         [ 65.59

 34%|███▍      | 340/1000 [00:47<02:02,  5.37it/s]

Outputs = tensor([[[121.6616, 109.6821, 146.6531,  ..., 244.3695, 190.6791, 217.3337],
         [ 66.4288,  42.6532,  30.5444,  ..., 153.9489, 131.7435, 132.9724],
         [283.5862, 251.6949, 343.7617,  ..., 493.6848, 345.8945, 307.0486],
         ...,
         [ 76.5099,  42.0621,  52.2704,  ..., 181.0714, 143.1637, 129.7016],
         [155.3022, 125.5675, 194.2445,  ..., 250.0874, 100.8905, 134.5676],
         [ 74.1639,  40.3316,  48.7359,  ..., 175.2664, 139.2401, 127.5011]]]) with shape = torch.Size([1, 121, 50257])
tensor(487.7423)
nan
Lables = tensor([], size=(1, 0)) with shape torch.Size([1, 0])
Outputs = tensor([], size=(1, 0, 50257)) with shape = torch.Size([1, 0, 50257])
tensor(nan)
nan
Lables = tensor([[ 796,  796, 5838,  796,  796,  220,  198]]) with shape torch.Size([1, 7])
Outputs = tensor([[[101.3162,  81.9670,  94.4325,  ..., 213.9025, 173.6245, 182.7834],
         [ 62.0818,  38.5437,  22.1607,  ..., 145.9296, 125.1568, 128.8382],
         [ 67.0207,  41.0719,  34.7

 34%|███▍      | 343/1000 [00:47<01:31,  7.21it/s]


KeyboardInterrupt: 

In [105]:
perplexity

nan