In [None]:
# !pip install torchtext==0.6
# from google.colab import drive
# drive.mount('/content/drive')

# with open('/content/drive/My Drive/Assignment4_2/trainen.txt', encoding='utf8') as f:
#     eng_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
# with open('/content/drive/My Drive/Assignment4_2/trainta.txt', encoding='utf8') as f:
#     tamil_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
# with open('/content/drive/My Drive/Assignment4_2/deven.txt', encoding='utf8') as f:
#     eng_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
# with open('/content/drive/My Drive/Assignment4_2/devta.txt', encoding='utf8') as f:
#     tamil_test = list(map(lambda x: x.rstrip(), f.readlines()))

# embedding_glove = pickle.load(open('/content/drive/My Drive/Assignment4_2/glove.sav', 'rb'))


In [1]:
import pdb
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer, Transformer
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import spacy
import math

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
#Reading files

with open('trainen.txt', encoding='utf8') as f:
    eng_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('trainta.txt', encoding='utf8') as f:
    tamil_train = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('deven.txt', encoding='utf8') as f:
    eng_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
with open('devta.txt', encoding='utf8') as f:
    tamil_test = list(map(lambda x: x.rstrip(), f.readlines()))
    
embedding_glove = GloVe(name='6B', dim=100)

spacy_en = spacy.load('en_core_web_sm')

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

stop_words = [',','.','?','!',')','(',':',']','[','$','#','&','%','--']
ENG = Field(tokenize = tokenize_en, init_token='sos', eos_token = 'eos', lower=True, stop_words=stop_words)
processed_eng_train = list(map(lambda x: ENG.preprocess(x), eng_train))
processed_eng_test = list(map(lambda x: ENG.preprocess(x), eng_test))

ENG.build_vocab(processed_eng_train, vectors=embedding_glove)

In [5]:
def preprocess(processed_eng):
    
    #function to return the numericalized version of the tokenized sentences
    X = []
    for tokenized_sentence in processed_eng:
        int_sequence = [2]  #first element is the SOS token 
        for token in tokenized_sentence:
            int_sequence.append(ENG.vocab.stoi[token])
        int_sequence.append(3) #last element is the EOS token
        X.append(int_sequence)
    
    return X

# X_train and X_test are lists of lists with the integer sequences for a given sentence
X_train = preprocess(processed_eng_train)
X_test = preprocess(processed_eng_test)

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Same thing for Tamil sentences
TAM = Tokenizer()
TAM.fit_on_texts(tamil_train)
Y_train = TAM.texts_to_sequences(tamil_train)
Y_test = TAM.texts_to_sequences(tamil_test)

#adding EOS token
EOS = len(TAM.word_index)+2
SOS = len(TAM.word_index)+1
_ = [y.append(EOS) for y in Y_train]
_ = [y.append(EOS) for y in Y_test]
_ = [y.insert(0,SOS) for y in Y_train]
_ = [y.insert(0,SOS) for y in Y_test]

Y_train = pad_sequences(Y_train, padding='post', value=0, maxlen=30)
Y_test = pad_sequences(Y_test, padding='post', value=0, maxlen=30)

Using TensorFlow backend.


In [7]:
Y_train[0]

array([18669,    11,    63,   359,     7, 18670,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0])

In [8]:
# pre-padding the inputs with value 1, from the dictionary ENG.vocab.stoi
X_train = pad_sequences(X_train, padding='pre', value=1, maxlen=55)
X_test = pad_sequences(X_test, padding='pre', value=1, maxlen=55)

In [9]:
len(Y_test[0])

30

In [10]:
source_vocab_size = len(ENG.vocab)
target_vocab_size = len(TAM.word_index)+3
print(source_vocab_size)
print(target_vocab_size)

9723
18671


In [11]:
from torch.utils.data import Dataset, DataLoader, ConcatDataset, random_split

class DatasetClass(Dataset):
    
    def __init__(self, source, target):
        
        self.source = source
        self.target = target
      
    def __len__(self):
        
        return len(self.source)
    
    def __getitem__(self, idx):
        
        return self.source[idx], self.target[idx]

In [12]:
def train_test_loader(source_train, target_train, source_test, target_test, num_workers=0, batch_size=32):

    train_data = DatasetClass(source_train, target_train)
    test_data = DatasetClass(source_test, target_test)

    trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    testloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    
    return trainloader, testloader

In [13]:
class PositionEncoding(nn.Module):

    def __init__(self, d_model, max_len=55):
        
        super(PositionEncoding, self).__init__()
        pos_enc = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # even columns use sin
        pos_enc[:, 0::2] = torch.sin(position * div_term)
        # odd columns use cos
        pos_enc[:, 1::2] = torch.cos(position * div_term)
        pos_enc = pos_enc.unsqueeze(0).transpose(0, 1)    # 55 x 1 x 100
        print('pos_enc.shape', pos_enc.shape)
        
        # to prevent the optimiser from changing the position encodings
        self.register_buffer('pos_enc', pos_enc)

    def forward(self, x):
        x = x + self.pos_enc[:x.size(0), :]
        return x

In [14]:
class Transformer(nn.Module):
    
    def __init__(self, target_vocab_size, embed_size, num_heads, num_layers, dropout, ENG):
        
        super(Transformer, self).__init__()
        self.pos_encoder = PositionEncoding(embed_size)
        self.encoder = nn.Embedding.from_pretrained(ENG.vocab.vectors)
        self.transformer = nn.Transformer(embed_size, num_heads, num_layers, num_layers, dropout=dropout)
        self.decoder = nn.Embedding(target_vocab_size, embed_size)
        self.fc = nn.Linear(embed_size, target_vocab_size)
        self.logsoftmax = nn.LogSoftmax(dim=2)
        
    def forward(self, src, trg):
        
        # src and trg will be of sizes: batch_size x Ts and batch_size x Tt
        src_pad_mask = (src==1).to(device) # 1 is the padding used for source
        trg_pad_mask = (trg==0).to(device) # 0 is the padding used for target
        src = self.encoder(src)     # batch_size x Ts x 100
        src = self.pos_encoder(src.transpose(0,1)) # Ts x batch_size x 100
        trg = self.decoder(trg)     # batch_size x Tt x 100
        trg = self.pos_encoder(trg.transpose(0,1)) # Tt x batch_size x 100
        #src_mask = (torch.tril(torch.ones(src.shape[0], src.shape[0])) == 0).to(device)
        trg_mask = (torch.tril(torch.ones(trg.shape[0], trg.shape[0])) == 0).to(device)
        output = self.transformer(src, trg, tgt_mask=trg_mask, src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=trg_pad_mask) 
        output = self.fc(output)  # output will now be Tt x batch_size x target_vocab_size
        output = self.logsoftmax(output)
        
        return output

In [24]:
embed_size = 100
num_heads = 4
num_layers = 2
dropout = 0.2
trans_model = Transformer(target_vocab_size, embed_size, num_heads, num_layers, dropout, ENG)

pos_enc.shape torch.Size([55, 1, 100])


In [25]:
trans_model = trans_model.to(device)

In [26]:
def train_model(trainloader, trans, lr=0.001, mom=0.9):
    
    loss_fn = nn.NLLLoss()
    trans_optimiser = optim.SGD(trans.parameters(), lr=lr, momentum=mom)
    
    max_epochs = 100
    old_loss = np.inf
    
    for epoch in range(max_epochs):
        
        running_loss = 0.0
    
        for i in trainloader:
            
            loss_val = 0
            i[0] = i[0].type(torch.LongTensor)   # batch_size x Ts
            i[1] = i[1].type(torch.LongTensor)   # batch_size x Tt
            source_batch = i[0].to(device)
            target_batch = i[1].to(device)

            output = trans(source_batch, target_batch)  # Tt x batch_size x target_vocab_size

            # calculating loss for the batch
            for j in range(output.shape[1]):
                logprobs = output[:, j, :].squeeze(1)
                target_classes = target_batch[j].squeeze(0)
                # inputs to NLL Loss should be of sizes Tt x n_classes and Tt
                loss_val += loss_fn(logprobs, target_classes)

            loss_val.backward()

            nn.utils.clip_grad_norm_(trans.parameters(), 0.5)
            trans_optimiser.step()
            
            running_loss += loss_val.item()
        
        print('Epoch', epoch+1, ': Loss =', running_loss)
        
        if abs(running_loss-old_loss)/running_loss < 1e-3:
            print('Converged')
            break
            
        old_loss = running_loss
    
    print('Finished Training')

In [27]:
trainloader, testloader = train_test_loader(X_train, Y_train, X_test, Y_test)

In [None]:
#torch.save(trans_model.state_dict(), 'q5_wts.pt')

In [19]:
trans_model.load_state_dict(torch.load('q5_wts.pt'))

<All keys matched successfully>

In [28]:
train_model(trainloader, trans_model)

Epoch 1 : Loss = 67989.87211227417
Epoch 2 : Loss = 32402.623106002808
Epoch 3 : Loss = 29782.40039539337
Epoch 4 : Loss = 27946.705994606018


KeyboardInterrupt: 

In [29]:
_ , testloader = train_test_loader(X_train, Y_train, X_test, Y_test, batch_size=1)

In [30]:
# trainloader with batch_size = 1 must be used for testing
def test_model(testloader, trans_model):
    
    with torch.no_grad():
    
        for i in testloader:

            i[0] = i[0].type(torch.LongTensor)   # batch_size x Ts
            i[1] = i[1].type(torch.LongTensor)   # batch_size x Tt
            source_sentence = i[0].to(device)
            target_sentence = i[1].to(device)
            y_t = torch.cuda.LongTensor([SOS]).unsqueeze(0)  # 1 x 1
            Tt = i[1].shape[1]  # 30
            for i in range(Tt):
                # y_t is appended to make it of size 1 x t at every time
                output = trans_model(source_sentence, y_t)
                last_output = output[-1,:,:]
                last_output = last_output.squeeze(0) # to make it 1 x target_vocab_size
                _ , index = last_output.topk(1)
                latest_predicted = index
                if latest_predicted == EOS:
                    print('eos')
                    break
                else:
                    y_t = torch.cat((y_t, latest_predicted.unsqueeze(0)), dim=1)
                    
            print("Predicted tokens=", y_t)
            print('Target=', target_sentence )

In [31]:
test_model(testloader, trans_model)

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   345,  3720, 16052,  1865,    80,   620,    36,   196,    43,
          4457, 18670,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   833,  1015,  3061,   160,   809,  1198, 18670,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    10,   264,   629,   409,    81,   166,    52,     7,    12,
          2979,    20, 18670,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   241,  3405,   118,    19,    27,     3,  1931,  4241,  2038,
            20, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,    55,  1281,    86,     9,    41,    81,   328,    16,
           716, 18670,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   218,     1,     2,   446,   284,     2,   953,     1,  3444,
           408,   858,     8,    29, 18670,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    10,    17,  1008,    12,    57,    21,    23,     1,  2472,
             2,    12,  3331,    21,   483,  2754,   109,     6, 18670,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   178,  1562,    13,    27,   143,   171,  9071,   113,     3,
         18670,     0,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  2814, 18017,   101,  1074,     8,   348,   321,  4056,    19,
         18670,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   231,  2196,    53,  6452,  4121,   858,    21, 18670,     0,
             0,     0,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    15,   270,  1974,  1200,  4656,   876,  1627,  1572, 16664,
         18670,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    22,     3,   320,   275,   607,    89,    33,    25,     1,
            72,  1251, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   140,   482,    49, 10321,     6,     4,   441,  1325,  4929,
          3967,    25,     3,   755,  4273, 18670,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,    48,    23,   214,     3,   348,    53,    26,     6,
          2825,   324, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,    42,   202,   332,  1910,   189,    43,   677,    12,
           759, 11211, 18670,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    41,    81,  3444,   408, 18085,     2,   199,   506, 11974,
           188,     6, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   102,  1054,   551,    25,  1682,   729,    62,    22, 16973,
           941,   356,    51, 18670,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    14,  2364,  2161,     1,   174,   198,  2161,    14,    28,
             1,   199,     4, 18670,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,     4,     1,     2,   213,  3623,  2520,  2699,   751,
            33, 18670,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  1475,  1332,  2609,    41,     2,    49,   200,     6,    24,
             2,   105,     6,  3595, 18670,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669, 15484,    44,  1206,    81,     1,    89,     6,   164,  3388,
            16,   416, 17909,   188,   131, 18670,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]], device='cuda:0')
Target= tensor([[18669,  3406,    12,    53,  6299,     1,   398,   857,     8,  3288,
           859,  2502,  2786,   857, 18670,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   290,    45, 11952,    25,     1,     3,   108,   992,   367,
          1735,    45, 14329,    37, 18670,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    10,    12,   983,     1,   690,   629,   458,    62, 18670,
             0,     0,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    75,     2,    50,    29,    62,   164,   108,     8, 11265,
          1075,     1,   135,   139, 18670,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    76,  1931,   526,     8,  5787,    25,    76,    95,  6930,
            20, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    24,     1,   593,   943,  1585,   943,  3767,    27,   187,
           409, 18670,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,    21,    12,  4891,     6,     7,    17,    93,    24,
             2,   170,   126,    73,     6, 18670,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    16, 15063,  4125,   107, 13329,    11,    43,    26,    68,
             7, 18670,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   184, 13772,   274,  1317,  1294,  3358, 11587,    45,     5,
         14977,   101,    17,   546, 18670,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  2411,     1,    59,     3,   569,     3,   728,    54,     7,
            16,  2453,    68,     7,   458, 18670,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,    47,    48, 14259,   143,    16,  3295,  1247,   327,
          2862,  1209, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   218,    19,     2,     7,     2,  7872,     2,    19,    50,
           750,    37,   223, 18670,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,    31,   234,   188,  2367,  1038,     2,     3,  1652,
            73,    24, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    55,    93,    14,   809,  3487,  1913,  2549, 18670,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    45,   896,    37,     2,   383,   179,   122,    66,  2186,
            37,     6, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    24,   106,     8,  1097,    16,  2662,   517,  5440,   107,
           618,     7, 18670,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    14, 15121,    87,     3,   731,     2,   339,     3,   268,
           342, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    30,   477,    26,     6,    24,    22, 13589,   215,  5986,
           463,    25,    76,   274,  2585,   286,   665, 18670,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,   141,    12,   318,  6874,   145,   393,    75,  1393,
           384,     1,   113,  7025,   342,  1396,    33,   34

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     4,     1,    31,    14,   220,    28,   148,   871,    26,
             2,    17,   321,   448, 18670,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     8,   629,   127,  6589,   722,   468,   127,  4442,  1781,
         13112, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   273,     4,    43,  9054,   108,    75,    16,   184,    48,
          6052,    33, 18670,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    16, 15063,  4125,   107, 13329,    11,    43,    26,    68,
             7, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,   954,   610,   147,  1229,   743,  3710,     4,    62,
           170,     2,  5093,     4,   119, 18670,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    25,    25,  1493,    48,    23,   108,   182,     2,    70,
           276,   150, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    15,   137,  3460,  1232,   222,   151,   100,    23,   429,
         18670,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1, 10529,    72,   212,   289,    24,  7713,   919,    11,
            46,  8665,  3125, 18670,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,  6989,   753,    58,    67,     1,    16, 18039,    28,
             2,   149,   139, 18670,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    11,   482,   873,  5552,  3007,  1107,    25,   264,  1768,
          2142, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   121,    59,    32,     1,   199,    18,    13,    56, 12463,
           313,     8, 18670,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  3277,    24,     1,    16,   805,   530,    23, 17768,   130,
            23,    14,    71,    40, 18670,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  3998, 15891,   956,   143,    72,   452,    24,     1,    19,
          3826,    43, 12689,  1260,     6, 18670,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  2841,  1313,  2227,     1,    86,    14,   220,    60,    80,
           560,   149, 18670,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   532,    64, 10608,  1319,  4345,  7047,   335, 15916,  2334,
         18670,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1, 11300,  3708,    25,  2006,   344,   773,  3025, 17695,
           136, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  1678,    27,    11,     9,   379,   387,   619,    18,   693,
             8, 13301,  3243,  4413, 18670,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     2,    21,   751,   223,    62,     1,    36,    59,   260,
            29, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    16,  2863,   983,   458,  2891,   346,    37,     6, 15242,
             2,   690,   291,    49,    73,   443, 18670,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   168,     1,     3,   129,  6302,     4,     1,     3,  1289,
             4, 18670,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    24,    11,     8,   147,   411,     6,   543,  2682, 13063,
             2,   690,   291,   179,    58,   474,    25,   461,  1511, 18670,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,  7151,     2,     3,   175,   202,  1914,  1099,     2,   230,
            81,    69,     2,    38,  1783, 18670,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     8,  6563,   170,    22,     7,    22,   126,   140,  1179,
          2151, 18670,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    11,   125,    15,    89,    29,    77,  1091,    11,  3630,
         16320,   190,     6, 18670,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,    11,     3,  3492,     6,   425,   415,     8, 13538,    58,
            49,  1157, 18670,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,   271,  2172,   562,    13,  1420,   817, 16858,  1420,  7141,
         18670,     0,     0,     0,     0,     0,     0,     

Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,   112,   100,    28,   446,     4,     3,    83,   326,
            33,    32, 18670,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       device='cuda:0')
Predicted tokens= tensor([[18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669, 18669,
         18669]], device='cuda:0')
Target= tensor([[18669,     1,  2781,    12,  5998,  4456,    25,   333,  4250,     5,
             7,   465, 18670,     0,     0,     0,     0,     