## Attention Is All You Need

Original paper: https://arxiv.org/pdf/1706.03762.pdf

Implementing the Transformer architecture (mostly) from scratch using NumPy and PyTorch.

In [1]:
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import math
import spacy
import os
from torch.utils.data import DataLoader, Dataset
import re

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


Need to implement:
- [x] Scaled dot-product attention
- [x] Multi-head attention
- [x] Positional encoding
- [x] Layer normalization
- [x] Position-wise feed forward
- [x] Embeddings
- [x] Encoder layer (combination of some of the above)
- [x] Encoder (stack of encoder layers)
- [x] Multi-head cross attention
- [x] Decoder layer
- [x] Decoder
- [x] Transformer (combining encoder and decoder, plus some additional stuff)
- [ ] Weight init
- [ ] Optimization
- [ ] Preprocess, Dataset, DataLoader

In [2]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute scaled dot-product attention.

    Args:
        q: Query tensor of shape (..., q_len, d_k).
        k: Key tensor of shape (..., k_len, d_k).
        v: Value tensor of shape (..., k_len, d_v).
        mask: Optional additive mask (0 keeps a position, a large negative
            value blocks it). Accepted shapes are anything broadcastable to
            the score tensor, e.g. (q_len, k_len) or
            (batch, heads or 1, q_len, k_len). A 3-D (batch, q_len, k_len)
            mask is shared across the heads of a 4-D score tensor.

    Returns:
        A tuple ``(result, attn)`` where ``attn`` holds the float32 softmax
        weights over the key axis and ``result`` is ``attn @ v``.
    """
    scores = q @ k.transpose(-2, -1)
    if mask is not None:
        # A per-batch mask needs an explicit head axis to line up with
        # (batch, heads, q_len, k_len) scores.
        if mask.dim() == 3 and scores.dim() == 4:
            mask = mask.unsqueeze(1)
        # The mask is added before scaling, so its values are divided by
        # sqrt(d_k) along with the scores.
        scores = scores + mask
    scale = math.sqrt(k.shape[-1])
    attn = F.softmax(scores / scale, dim=-1, dtype=torch.float32)
    result = attn @ v
    return result, attn

In [3]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention with learned q/k/v/output projections.

    Args:
        heads: Number of attention heads.
        d_model: Model (embedding) width; must be divisible by ``heads``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, heads, d_model):
        super().__init__()
        if d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )
        self.heads = heads
        self.d_model = d_model
        self.head_dim = d_model // heads
        self.q_linear = nn.Linear(self.d_model, self.d_model)
        self.k_linear = nn.Linear(self.d_model, self.d_model)
        self.v_linear = nn.Linear(self.d_model, self.d_model)
        self.linear_out = nn.Linear(self.d_model, self.d_model)

    def _split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, head_dim)."""
        # view + transpose keeps each position's features together; the
        # sequence length is inferred so k/v may differ in length from q.
        return x.view(batch_size, -1, self.heads, self.head_dim).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Tensor of shape (batch, q_len, d_model).
            k: Tensor of shape (batch, k_len, d_model).
            v: Tensor of shape (batch, k_len, d_model).
            mask: Optional additive mask forwarded unchanged to
                ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch, q_len, d_model).
        """
        batch_size, q_len, _ = q.size()
        q = self._split_heads(self.q_linear(q), batch_size)
        k = self._split_heads(self.k_linear(k), batch_size)
        v = self._split_heads(self.v_linear(v), batch_size)
        values, _ = scaled_dot_product_attention(q, k, v, mask)
        # Merge the heads back into a single feature axis.
        merged = values.transpose(1, 2).reshape(batch_size, q_len, self.d_model)
        return self.linear_out(merged)

In [4]:
# Smoke test: run self-attention on a random (30, 50, 512) tensor and print the
# output shape, which should match the input shape.
test = torch.randn((30,50,512))

mh = MultiHeadAttention(8, 512)
res = mh(test, test, test)
print(res.shape)

# Disabled comparison against PyTorch's built-in nn.MultiheadAttention.
# NOTE(review): the two modules have independently initialised weights, so the
# outputs would not be expected to match unless the weights are copied over.
# mh_torch = nn.MultiheadAttention(512, 8, bias=False, batch_first=True)
# res1 = mh_torch(test, test, test)
# print(res1[0].shape)

# Check if tensors equal within threshold
#torch.all(torch.lt(torch.abs(torch.add(res, -res1[0])), 1e-2))

torch.Size([30, 50, 512])


In [5]:
# Example showing why the heads are split with
#   x.view(batch_size, seq_length, self.heads, self.head_dim).transpose(1,2)
# rather than by viewing directly to (batch, heads, seq, head_dim).
# ex_k and ex_v are created for symmetry but are not used in this cell.
ex_q = torch.randint(low=0, high=10, size=(2,5,18))
ex_k = torch.randint(low=0, high=10, size=(2,5,18))
ex_v = torch.randint(low=0, high=10, size=(2,5,18))
# r: direct view to (batch, heads, seq, head_dim) -- the incorrect split.
r = ex_q.view(2,3,5,6)
# t: view to (batch, seq, heads, head_dim), then swap the seq and heads axes -- the correct split.
t = ex_q.view(2,5,3,6).transpose(1,2)

# Toy example: a batch of 2 sequences, each of length 5, with an embedding of size 18.
# ex_q stands in for what q would look like. Printing ex_q, r and t shows that r simply
# walks through ex_q's memory row by row, so the rows of a single "head" mix features from
# different positions in the sequence. That is clearly wrong, which is why both the view
# and the transpose are needed: t gives each head a contiguous 6-feature slice of every
# position.

In [6]:
class PositionWiseFeedForward(nn.Module):
    """Feed-forward sub-layer: linear -> ReLU -> dropout -> linear.

    Args:
        d_model: Width of the input and output features.
        hidden: Width of the inner layer.
        drop_prob: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Apply the two-layer network to the last axis of ``x``."""
        activated = self.relu(self.linear1(x))
        return self.linear2(self.dropout(activated))

In [7]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    Args:
        d_model: Feature width of the inputs.
        max_seq_len: Longest sequence length the table is precomputed for.
        drop_prob: Dropout probability applied after adding the encodings.
    """

    def __init__(self, d_model, max_seq_len, drop_prob=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=drop_prob)
        self.max_seq_len = max_seq_len

        # The denominator is shared by each sin/cos pair, so compute it once.
        evens = torch.arange(0, d_model, 2).float()
        denom = torch.pow(10000, evens / d_model)

        positions = torch.arange(0, max_seq_len).float().reshape(max_seq_len, 1)
        angles = positions / denom  # (max_seq_len, ceil(d_model / 2))

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A non-persistent buffer follows .to(device)/.cuda() with the module
        # while staying out of state_dict (it was never a saved parameter).
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``dropout(x + pe)`` for ``x`` of shape (batch, seq_len, d_model).

        ``seq_len`` may be any length up to ``max_seq_len``; only the first
        ``seq_len`` rows of the table are used.
        """
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)


In [8]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last axis with a learnable scale and shift.

    Args:
        parameter_shape: Shape of the learnable ``gamma``/``beta`` tensors
            (normally the size of the last input axis).
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, parameter_shape, eps=1e-5):
        super().__init__()
        self.parameter_shape = parameter_shape
        self.eps = eps

        # Learnable scale (gamma) and shift (beta).
        self.gamma = nn.Parameter(torch.ones(parameter_shape))
        self.beta = nn.Parameter(torch.zeros(parameter_shape))

    def forward(self, inputs):
        """Normalize each feature vector (last axis) independently of the batch."""
        mean = inputs.mean(-1, keepdim=True)
        # Use the biased (population) variance with eps inside the square
        # root. This matches standard layer norm and stays finite for a
        # single-feature input, where the unbiased std would be NaN.
        var = inputs.var(-1, keepdim=True, unbiased=False)
        normalized = (inputs - mean) / torch.sqrt(var + self.eps)
        return self.gamma * normalized + self.beta

In [9]:
class Embeddings(nn.Module):
    """Token embedding lookup scaled by ``sqrt(d_model)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings for the index tensor ``x`` and scale them."""
        vectors = self.embed(x)
        return vectors * math.sqrt(self.d_model)

In [10]:
# x -> Multi-Head Attention -> LayerNorm(residual + x) -> PWFeedForward -> LayerNorm(residual + x)
#
# MultiHeadAttention: heads, d_model
# LayerNormalization: parameter_shape, eps=1e-5
# PositionWiseFeedForward: d_model, hidden, drop_prob=0.1

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is followed by a residual add, layer norm and dropout.

    Args:
        heads: Number of attention heads.
        d_model: Model width.
        hidden: Inner width of the feed-forward network.
        drop_prob: Dropout probability used throughout the layer.
    """

    def __init__(self, heads, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.heads = heads
        self.d_model = d_model
        self.hidden = hidden
        self.drop_prob = drop_prob

        self.attn = MultiHeadAttention(self.heads, self.d_model)
        self.norm1 = LayerNormalization(self.d_model)
        self.drop1 = nn.Dropout(p=drop_prob)
        self.pwff = PositionWiseFeedForward(self.d_model, self.hidden, self.drop_prob)
        self.norm2 = LayerNormalization(self.d_model)
        self.drop2 = nn.Dropout(p=drop_prob)

    def forward(self, x, mask):
        """Run the block on ``x`` using ``mask`` for the self-attention."""
        # Sub-layer 1: self-attention + residual -> norm -> dropout.
        attended = self.attn(x, x, x, mask=mask)
        x = self.drop1(self.norm1(x + attended))

        # Sub-layer 2: feed-forward + residual -> norm -> dropout.
        transformed = self.pwff(x)
        return self.drop2(self.norm2(x + transformed))
        

In [11]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by a residual add, its own layer norm and its
    own dropout.

    Args:
        heads: Number of attention heads.
        d_model: Model width.
        hidden: Inner width of the feed-forward network.
        drop_prob: Dropout probability used throughout the layer.
    """

    def __init__(self, heads, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.heads = heads
        self.d_model = d_model
        self.hidden = hidden
        self.drop_prob = drop_prob

        self.mask_attn = MultiHeadAttention(self.heads, self.d_model)
        self.norm1 = LayerNormalization(self.d_model)
        self.drop1 = nn.Dropout(p=drop_prob)
        self.cross_attn = MultiHeadAttention(self.heads, self.d_model)
        self.norm2 = LayerNormalization(self.d_model)
        self.drop2 = nn.Dropout(p=drop_prob)
        self.pwff = PositionWiseFeedForward(self.d_model, self.hidden, self.drop_prob)
        self.norm3 = LayerNormalization(self.d_model)
        self.drop3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_mask, cross_mask):
        """Run the block.

        Args:
            x: Decoder-side input.
            y: Encoder output, used as keys and values in cross-attention.
            self_mask: Mask for the decoder self-attention.
            cross_mask: Mask for the decoder-to-encoder attention.
        """
        # Sub-layer 1: masked self-attention.
        x = self.drop1(self.norm1(x + self.mask_attn(x, x, x, mask=self_mask)))

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        x = self.drop2(self.norm2(x + self.cross_attn(x, y, y, mask=cross_mask)))

        # Sub-layer 3: feed-forward, with its own norm3/drop3 (previously
        # norm2/drop2 were reused here, leaving norm3/drop3 untrained).
        x = self.drop3(self.norm3(x + self.pwff(x)))

        return x

In [12]:
class SequentialEncoder(nn.Sequential):
    """``nn.Sequential`` variant whose layers take ``(x, mask)``.

    The same ``mask`` is passed to every layer, and each layer's output
    becomes the next layer's input.
    """

    def forward(self, *inputs):
        """Run ``x`` through every layer in order and return the final output.

        Args:
            *inputs: Exactly two values, ``(x, mask)``.
        """
        x, mask = inputs
        for module in self:
            # Feed the previous layer's output forward; reusing the original
            # input here would make all but the last layer dead code.
            x = module(x, mask)
        return x

In [13]:
class SequentialDecoder(nn.Sequential):
    """``nn.Sequential`` variant whose layers take ``(x, y, self_mask, cross_mask)``.

    ``y`` and both masks are passed unchanged to every layer, and each
    layer's output becomes the next layer's ``x``.
    """

    def forward(self, *inputs):
        """Run ``x`` through every layer in order and return the final output.

        Args:
            *inputs: Exactly four values, ``(x, y, self_mask, cross_mask)``.
        """
        x, y, self_mask, cross_mask = inputs
        for module in self:
            # Feed the previous layer's output forward; reusing the original
            # input here would make all but the last layer dead code.
            x = module(x, y, self_mask, cross_mask)
        return x

In [14]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` encoder layers sharing one attention mask.

    Args:
        heads: Number of attention heads per layer.
        d_model: Model width.
        hidden: Inner width of each layer's feed-forward network.
        num_layers: Number of encoder layers to stack.
    """

    def __init__(self, heads, d_model, hidden, num_layers):
        super().__init__()
        stack = [EncoderLayer(heads, d_model, hidden) for _ in range(num_layers)]
        self.layers = SequentialEncoder(*stack)

    def forward(self, x, mask):
        """Pass ``x`` and ``mask`` to the layer stack and return its output."""
        return self.layers(x, mask)

In [15]:
class Decoder(nn.Module):
    """Stack of ``num_layers`` decoder layers.

    Args:
        heads: Number of attention heads per layer.
        d_model: Model width.
        hidden: Inner width of each layer's feed-forward network.
        num_layers: Number of decoder layers to stack.
    """

    def __init__(self, heads, d_model, hidden, num_layers):
        super().__init__()
        stack = [DecoderLayer(heads, d_model, hidden) for _ in range(num_layers)]
        self.layers = SequentialDecoder(*stack)

    def forward(self, x, y, self_mask, cross_mask):
        """Pass the decoder input, encoder output and masks to the layer stack."""
        return self.layers(x, y, self_mask, cross_mask)

In [16]:
class Transformer(nn.Module):
    """Encoder-decoder model producing target-vocabulary logits.

    Args:
        max_sequence_length: Length the positional-encoding tables are built for.
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (and of the output logits).
        num_layers: Number of layers in both the encoder and the decoder.
        heads: Number of attention heads.
        d_model: Model width.
        hidden: Inner width of the feed-forward networks.
        drop_prob: Dropout probability for the positional-encoding modules.
    """

    def __init__(self, max_sequence_length, src_vocab_size, tgt_vocab_size,
                 num_layers, heads, d_model, hidden, drop_prob=0.1):
        super().__init__()
        # Token embeddings, one table per language.
        self.src_embed = Embeddings(src_vocab_size, d_model)
        self.tgt_embed = Embeddings(tgt_vocab_size, d_model)

        # Positional encodings for the encoder and decoder inputs.
        self.enc_pe = PositionalEncoding(d_model, max_sequence_length, drop_prob)
        self.dec_pe = PositionalEncoding(d_model, max_sequence_length, drop_prob)

        # The two layer stacks.
        self.encoder = Encoder(heads, d_model, hidden, num_layers)
        self.decoder = Decoder(heads, d_model, hidden, num_layers)

        # Final projection from model width to target-vocabulary logits.
        self.linear = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, enc_self_mask, dec_self_mask, dec_cross_mask):
        """Return unnormalised logits over the target vocabulary.

        Args:
            src: Source token indices.
            tgt: Target token indices.
            enc_self_mask: Mask for the encoder self-attention.
            dec_self_mask: Mask for the decoder self-attention.
            dec_cross_mask: Mask for the decoder-to-encoder attention.
        """
        src_repr = self.enc_pe(self.src_embed(src))
        tgt_repr = self.dec_pe(self.tgt_embed(tgt))

        memory = self.encoder(src_repr, enc_self_mask)
        decoded = self.decoder(tgt_repr, memory, dec_self_mask, dec_cross_mask)

        return self.linear(decoded)
        

In [17]:
# From: https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning
import pickle
from collections import Counter
import io

class Vocabulary(object):
    """Two-way mapping between words and consecutive integer ids."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0  # next id to hand out

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of ``word``, or the id of ``'<unk>'`` if it is unknown."""
        key = word if word in self.word2idx else '<unk>'
        return self.word2idx[key]

    def __len__(self):
        """Return the number of registered words."""
        return len(self.word2idx)

def build_vocab(file, tokenizer, threshold=4):
    """Build a ``Vocabulary`` from a text file with one sentence per line.

    Args:
        file: Path of the UTF-8 text file to read.
        tokenizer: Tokenizer object passed through to ``tokenize``.
        threshold: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``Vocabulary`` holding the four special tokens followed by every
        token that occurs at least ``threshold`` times.
    """
    with io.open(file, 'r', encoding='utf-8') as handle:
        sentences = handle.read().split('\n')

    # Count every token across all sentences.
    counts = Counter()
    for line in sentences:
        counts.update(tokenize(line, tokenizer))

    # Special tokens go first, so '<pad>' gets id 0.
    vocab = Vocabulary()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    # Tokens seen fewer than `threshold` times are discarded.
    for word, freq in counts.items():
        if freq >= threshold:
            vocab.add_word(word)
    return vocab

In [18]:
# From: https://nlp.seas.harvard.edu/annotated-transformer/
# Load spacy tokenizer models, download them if they haven't been downloaded already
def _load_spacy_model(name):
    """Load the spaCy pipeline ``name``, downloading it first if it is missing."""
    try:
        return spacy.load(name)
    except IOError:
        import subprocess
        import sys

        # Download with the interpreter that is running this code, so the
        # model lands in the same environment (a bare "python" on PATH may
        # be a different one). A failed download is not raised here; the
        # spacy.load below surfaces it as the same error as before.
        subprocess.run([sys.executable, "-m", "spacy", "download", name])
        return spacy.load(name)


def load_tokenizers():
    """Load the German and English spaCy pipelines.

    Missing pipelines are downloaded on first use.

    Returns:
        A tuple ``(spacy_de, spacy_en)``.
    """
    spacy_de = _load_spacy_model("de_core_news_sm")
    spacy_en = _load_spacy_model("en_core_web_sm")

    return spacy_de, spacy_en


def tokenize(text, tokenizer):
    """Split ``text`` with ``tokenizer.tokenizer`` and return the token strings."""
    pieces = []
    for tok in tokenizer.tokenizer(text):
        pieces.append(tok.text)
    return pieces



In [19]:
# Create dataset class
class Multi30k(Dataset):
    """Parallel English/German sentence dataset yielding fixed-length id tensors.

    Args:
        en_list: English sentences.
        de_list: German sentences, aligned with ``en_list`` by index.
        en_tokenizer: Tokenizer object used for English (see ``tokenize``).
        de_tokenizer: Tokenizer object used for German.
        en_vocab: Callable mapping an English token to its id.
        de_vocab: Callable mapping a German token to its id.
        max_seq_len: Length every returned tensor is padded or truncated to.
    """

    def __init__(self, en_list, de_list, en_tokenizer, de_tokenizer, en_vocab, de_vocab, max_seq_len):
        self.en_list = en_list
        self.de_list = de_list
        self.en_tokenizer = en_tokenizer
        self.de_tokenizer = de_tokenizer
        self.en_vocab = en_vocab
        self.de_vocab = de_vocab
        self.max_seq_len = max_seq_len

    def _encode(self, sentence, tokenizer, vocab):
        """Turn one sentence into a ``max_seq_len``-long tensor of token ids."""
        ids = [vocab('<start>')]
        ids.extend(vocab(token) for token in tokenize(sentence, tokenizer))
        ids.append(vocab('<end>'))

        limit = self.max_seq_len
        if len(ids) > limit:
            # The tokenizer can produce more tokens than a whitespace word
            # count suggests. Truncate, keeping '<end>' last, so every item
            # has the same length and the batch can be stacked.
            ids = ids[:limit - 1] + [vocab('<end>')]
        else:
            # Pad with id 0, which is also what the loss and masks treat as padding.
            ids.extend([0] * (limit - len(ids)))

        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(src, tgt)`` id tensors for the sentence pair at ``idx``."""
        src = self._encode(self.en_list[idx], self.en_tokenizer, self.en_vocab)
        tgt = self._encode(self.de_list[idx], self.de_tokenizer, self.de_vocab)

        return src, tgt

    def viewSentences(self, idx):
        """Return the raw ``(english, german)`` sentence strings at ``idx``."""
        return self.en_list[idx], self.de_list[idx]

    def __len__(self):
        """Return the number of sentence pairs (length of the English list)."""
        return len(self.en_list)

In [20]:
def filter_sentences(english_sentences, german_sentences, max_words):
    """Keep short sentence pairs and strip their punctuation.

    A pair is kept only when both sentences have at most ``max_words - 2``
    whitespace-separated words; the two spare slots are for the start and
    stop tokens. Word counts are taken before the punctuation is removed.

    Args:
        english_sentences: English sentences.
        german_sentences: German sentences, paired with the English ones by position.
        max_words: Maximum sequence length including start and stop tokens.

    Returns:
        A tuple ``(filtered_english, filtered_german)`` of punctuation-free
        sentences.
    """
    limit = max_words - 2
    kept_en, kept_de = [], []

    for en, de in zip(english_sentences, german_sentences):
        if len(en.split()) > limit or len(de.split()) > limit:
            continue
        kept_en.append(re.sub(r'[^\w\s]', '', en))
        kept_de.append(re.sub(r'[^\w\s]', '', de))

    return kept_en, kept_de

In [21]:
def collate_fn(data):
    """Collate ``(src, tgt)`` pairs into batched tensors plus unpadded labels.

    Args:
        data: Iterable of ``(src, tgt)`` pairs of equal-length 1-D tensors.

    Returns:
        A tuple ``(src, tgt, labels)``: ``src`` and ``tgt`` are the inputs
        stacked along a new first axis, and ``labels`` is a list with one
        1-D tensor per target holding its non-zero (non-padding) entries.
    """
    src, tgt = zip(*data)

    src = torch.stack(src, 0)
    tgt = torch.stack(tgt, 0)

    # Boolean-mask indexing always gives a 1-D tensor, even when a target has
    # exactly one non-pad token (nonzero().squeeze() collapsed that case into
    # a 0-dim scalar).
    labels = [row[row != 0] for row in tgt]

    return src, tgt, labels

In [22]:
def create_dataloader(en_list, de_list, en_tokenizer, de_tokenizer, en_vocab, de_vocab, max_seq_length, batch_size):
    """Wrap the sentence lists in a ``Multi30k`` dataset and a shuffling ``DataLoader``.

    Args:
        en_list: English sentences.
        de_list: German sentences, aligned with ``en_list``.
        en_tokenizer: Tokenizer object for English.
        de_tokenizer: Tokenizer object for German.
        en_vocab: English vocabulary.
        de_vocab: German vocabulary.
        max_seq_length: Fixed length of every encoded sequence.
        batch_size: Number of sentence pairs per batch.

    Returns:
        A ``DataLoader`` that shuffles the dataset and batches with ``collate_fn``.
    """
    dataset = Multi30k(
        en_list, de_list, en_tokenizer, de_tokenizer, en_vocab, de_vocab, max_seq_length
    )
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )

In [23]:
# Builds a single-sentence-per-batch loader for manual inspection.
# This cell depends on filtered_en, filtered_de, spacy_en, spacy_de, en_vocab and
# de_vocab, which are only created by the setup cell further down; run that cell
# first, otherwise this fails with the NameError recorded below.
data_loader_test = create_dataloader(filtered_en, filtered_de, spacy_en, spacy_de, en_vocab, de_vocab,
                               max_seq_length=20, batch_size=1)

NameError: name 'filtered_en' is not defined

In [24]:
# a, b = data_loader_test[35]
# a = a.unsqueeze(0)
# b = b.unsqueeze(0)

# s_enc, s_dec, c_dec = create_masks(a, b, 20)

In [59]:
# Adapted from: https://github.com/ajhalthor/Transformer-Neural-Network/blob/main/Sentence_Tokenization.ipynb
NEG_INFTY = -1e9


def _padding_vector(token_ids, max_sequence_length):
    """Return a bool vector that is True from the first pad token (id 0) onwards.

    When the sequence contains no pad token the vector is all False.
    """
    pad_positions = torch.where(token_ids == 0)[0]
    if pad_positions.numel() > 0:
        first_pad = pad_positions[0].item()
    else:
        first_pad = max_sequence_length
    return torch.arange(max_sequence_length) >= first_pad


def create_masks(eng_batch, de_batch, max_sequence_length):
    """Build additive attention masks for a batch of padded sentence pairs.

    Masked positions hold ``NEG_INFTY`` and visible positions hold 0, so the
    masks can be added to the attention scores before the softmax. Token id
    0 is treated as padding.

    Args:
        eng_batch: Source batch; indexing it yields one 1-D id tensor per sentence.
        de_batch: Target batch, aligned with ``eng_batch``.
        max_sequence_length: Side length of each square mask.

    Returns:
        A tuple ``(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)``, each of shape
        ``(num_sentences, max_sequence_length, max_sequence_length)``.
    """
    num_sentences = len(eng_batch)
    size = [num_sentences, max_sequence_length, max_sequence_length]

    # True above the diagonal: a position may not attend to later positions.
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length], True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)

    encoder_padding_mask = torch.full(size, False)
    decoder_padding_mask_self_attention = torch.full(size, False)
    decoder_padding_mask_cross_attention = torch.full(size, False)

    for idx in range(num_sentences):
        # Masking starts AT the first pad token (not one past it), so no
        # padding position is left visible.
        eng_pad = _padding_vector(eng_batch[idx], max_sequence_length)
        de_pad = _padding_vector(de_batch[idx], max_sequence_length)

        # Rows are queries and columns are keys; both are blocked at padding.
        encoder_padding_mask[idx] = eng_pad[None, :] | eng_pad[:, None]
        decoder_padding_mask_self_attention[idx] = de_pad[None, :] | de_pad[:, None]
        # Cross-attention: German queries (rows) over English keys (columns).
        decoder_padding_mask_cross_attention[idx] = eng_pad[None, :] | de_pad[:, None]

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0.0)
    decoder_self_attention_mask = torch.where(
        look_ahead_mask | decoder_padding_mask_self_attention, NEG_INFTY, 0.0
    )
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0.0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [54]:
# Setup cell: loads the tokenizers and vocabularies, defines the hyperparameters,
# filters the training sentences, and builds the data loader, loss and model.

# Load the spaCy tokenizers (downloaded on first use).
spacy_de, spacy_en = load_tokenizers()

# One-off vocabulary construction; kept for reference. The vocabularies are
# loaded from the pickles below instead.
# en_vocab = build_vocab("train.en", spacy_en, threshold=2)
# print(len(en_vocab))

# de_vocab = build_vocab("train.de", spacy_de, threshold=2)
# print(len(de_vocab))

# NOTE: pickle.load executes arbitrary code from the file; only load vocab
# pickles that you created yourself.
with open("./en_vocab.pkl", 'rb') as f:
    en_vocab = pickle.load(f)

with open("./de_vocab.pkl", 'rb') as f:
    de_vocab = pickle.load(f)

# Define parameters (trailing comments hold alternative values: 512 for d_model, 6 for num_layers)
heads = 8
d_model = 240#512
hidden = 2048
max_sequence_length = 20
num_layers = 1#6
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(de_vocab)

# Read the raw training sentences, one per line.
with io.open("train.en", 'r', encoding='utf-8') as file:
    en_list = file.read().split('\n')
    
with io.open("train.de", 'r', encoding='utf-8') as file:
    de_list = file.read().split('\n')
    
# Keep only sentence pairs short enough to fit max_sequence_length.
filtered_en, filtered_de = filter_sentences(en_list, de_list, max_words=max_sequence_length)

print("Total sentences in dataset:", len(filtered_en))

data_loader = create_dataloader(filtered_en, filtered_de, spacy_en, spacy_de, en_vocab, de_vocab,
                               max_seq_length=max_sequence_length, batch_size=2)

# When computing the loss, we are ignoring cases when the label is the padding token.
# reduction='none' returns one loss value per target position rather than a
# scalar, so it must be reduced (e.g. summed or averaged) before backward().
criterion = nn.CrossEntropyLoss(ignore_index=de_vocab.word2idx['<pad>'],
                                reduction='none')

# NOTE(review): `device` is selected here, but nothing in this cell moves the
# model or the batches to it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = Transformer(max_sequence_length=max_sequence_length,
                    src_vocab_size=src_vocab_size,
                    tgt_vocab_size=tgt_vocab_size,
                    num_layers=num_layers,
                    heads=heads,
                    d_model=d_model,
                    hidden=hidden)

parameters = list(model.parameters())

# Xavier-initialise every parameter with more than one dimension (weight
# matrices); 1-D parameters such as biases keep their defaults.
for params in parameters:
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

# Total number of parameters
print("Parameters:",sum(p.nelement() for p in parameters))

Total sentences in dataset: 26945
Parameters: 8007710


In [58]:
# Training-loop scaffold. As written it only runs forward passes and prints the
# per-position loss: the zero_grad/backward/step calls are commented out, so no
# weights are updated.
opt = torch.optim.Adam(parameters, lr=0.0003)

model.train()

epochs = 1
for epoch in range(epochs):
    # `labels` (the unpadded targets from collate_fn) is unpacked but not used below.
    for i, (src, tgt, labels) in enumerate(data_loader):
        # Create masks
        enc_self_mask, dec_self_mask, dec_cross_mask = create_masks(src, tgt, max_sequence_length)
        logits = model(src, tgt, enc_self_mask, dec_self_mask, dec_cross_mask)
        # Flatten to (batch * seq_len, vocab) and (batch * seq_len,) for the loss.
        logits = logits.view(-1,tgt_vocab_size)
        # NOTE(review): the same `tgt` is both the decoder input and the label,
        # with no one-position shift, so each position is asked to predict the
        # token it was just given -- confirm whether a shift is intended.
        label = tgt.view(-1)
        # criterion uses reduction='none', so `loss` is a vector with 0.0 at padded positions.
        loss = criterion(logits, label)
        print(loss)
        #model.zero_grad()
        #loss.backward()
        #opt.step()
        
        # Placeholder for periodic logging/checkpointing; currently a no-op.
        if i % 25 == 0:
            continue
            #print(loss.item())
            #torch.save(decoder.state_dict(), './decoder_{}_{}.ckpt'.format(epoch, i))
            #torch.save(encoder.state_dict(), './encoder_{}_{}.ckpt'.format(epoch, i))
            
# Save the model
#torch.save(decoder.state_dict(), './decoder_final.ckpt')
#torch.save(encoder.state_dict(), './encoder_final.ckpt')

tensor([9.1838, 9.3030, 9.2869, 9.3258, 8.4496, 9.0553, 9.3912, 9.2222, 9.0135,
        9.1257, 9.2356, 9.4772, 9.5953, 8.8611, 9.8692, 8.7445, 0.0000, 0.0000,
        0.0000, 0.0000, 9.3377, 9.1465, 9.2013, 9.0884, 8.9458, 8.9641, 8.5061,
        9.0782, 8.6354, 9.2082, 8.8919, 8.6776, 9.1810, 9.3513, 9.3635, 9.2638,
        8.9157, 8.8610, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.0890, 9.4051, 9.1622, 9.0489, 9.2666, 9.0582, 9.1294, 8.9840, 8.8595,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.8314, 9.1910, 9.3888, 8.7623, 9.3059, 8.7719, 8.6368,
        8.9671, 9.1683, 8.9220, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.2605, 9.3449, 9.0096, 9.6747, 8.8460, 9.1411, 8.9451, 8.8977, 8.8350,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.3229, 8.6614, 9.2399, 8.7257, 9.3328

tensor([8.8655, 9.3804, 8.5216, 8.6145, 8.9933, 9.0134, 9.3343, 8.6669, 8.9622,
        9.1961, 8.8303, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.0292, 9.2496, 9.1914, 8.7279, 8.8529, 9.2317, 8.4807,
        9.0259, 8.8421, 8.8433, 8.5549, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.4437, 8.5972, 9.6934, 9.2752, 8.9925, 9.2993, 9.0313, 9.1058, 8.8005,
        9.3706, 8.9999, 9.0445, 9.4547, 8.8700, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.1368, 9.3472, 8.9807, 9.4305, 9.2115, 9.4650, 8.8516,
        9.3045, 8.8053, 8.6401, 8.8371, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.0287, 8.7422, 9.5694, 9.4668, 9.2761, 8.7413, 9.2927, 8.8489, 8.8790,
        8.7824, 8.9289, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.7332, 9.3476, 9.1267, 8.6970, 9.1842

tensor([9.0512, 9.2152, 8.9603, 9.1558, 8.8168, 8.6864, 9.1007, 8.8309, 9.1589,
        8.5682, 9.2918, 9.0089, 8.4273, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.0932, 8.7345, 9.2154, 8.8196, 9.3384, 8.9574, 9.2230,
        9.0763, 9.0452, 9.3917, 9.0920, 9.1086, 9.0254, 8.7158, 9.1348, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.3132, 8.8534, 9.1736, 9.1212, 9.2977, 9.3969, 8.9651, 9.0989, 9.4684,
        8.9063, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.8181, 8.7931, 9.0048, 8.9056, 8.7925, 8.8629, 8.8083,
        8.9006, 8.6741, 9.4158, 8.6368, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.0181, 9.2640, 8.6080, 9.1978, 8.9960, 9.0942, 9.3046, 9.4558, 9.4610,
        9.4810, 9.0849, 9.1652, 8.9076, 9.2955, 8.8021, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.2282, 9.3233, 8.9105, 9.0294, 8.8520

tensor([9.0050, 9.5351, 9.0406, 8.7038, 9.2058, 9.1914, 8.7267, 9.3306, 9.3288,
        8.8799, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.2695, 8.4946, 9.7367, 8.5050, 8.9322, 8.9048, 8.6884,
        8.9801, 9.4244, 8.7635, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.1600, 8.7034, 8.9239, 8.8080, 8.4964, 8.5164, 9.2558, 8.3187, 8.9097,
        9.1900, 9.1411, 8.9647, 8.8074, 8.6245, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.8805, 9.0032, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.2105, 9.4639, 8.8974, 9.2128, 8.9678, 9.0083, 9.4749, 8.8674, 9.2386,
        8.9522, 8.4467, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.8487, 9.2065, 9.1741, 8.8266, 8.9671

tensor([9.3145, 9.4237, 8.8799, 8.9998, 8.8825, 8.8960, 9.3513, 9.2820, 9.1282,
        9.1480, 8.3230, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.2336, 9.4977, 8.7429, 8.6604, 8.8328, 9.0587, 9.5618,
        9.0573, 8.9960, 9.1572, 9.0497, 9.0693, 8.6077, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.1909, 9.2587, 9.3751, 8.9733, 9.6324, 8.9079, 9.3075, 8.7586, 8.9515,
        8.7919, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.2198, 9.4129, 8.9956, 8.6905, 8.6223, 9.0786, 8.5810,
        8.8935, 9.2446, 8.8168, 9.0520, 9.3879, 8.7365, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.0983, 8.8916, 8.9756, 9.0142, 8.9792, 9.2973, 8.8374, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.9350, 8.8957, 9.3559, 8.7704, 9.3848

tensor([9.0549, 8.6233, 8.5719, 9.2400, 9.3141, 9.0309, 9.4492, 9.1127, 9.3769,
        8.6833, 8.8264, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.3939, 9.3931, 8.8693, 8.9955, 9.1851, 9.3247, 9.4267,
        9.1569, 8.8260, 9.3553, 8.7973, 8.8744, 8.8936, 9.0795, 8.7113, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.1426, 9.0392, 8.2080, 9.2634, 9.0506, 8.9194, 8.9741, 8.9580, 8.9990,
        9.3108, 8.4969, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.2610, 8.9481, 9.1211, 9.0737, 9.3852, 9.0518, 8.7602,
        9.0137, 8.9567, 8.9707, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([9.1310, 8.6738, 8.8573, 9.1456, 9.0147, 8.5282, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.1169, 9.0727, 8.9125, 8.7541, 9.0115

tensor([8.9888, 8.9087, 9.1772, 8.7602, 8.9075, 9.5739, 8.7630, 9.2520, 9.0584,
        8.9741, 8.6150, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.9881, 8.8761, 8.4339, 9.0760, 9.2088, 9.6362, 9.1084,
        9.0406, 8.9693, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([8.9205, 8.6803, 8.6748, 9.1607, 8.8659, 9.3698, 8.7203, 9.0819, 8.8867,
        8.7153, 9.1785, 8.6860, 9.4438, 8.8796, 8.7626, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 9.2727, 8.7768, 9.3147, 8.6097, 9.4001, 9.2968, 9.3510,
        8.7388, 8.7944, 8.6872, 9.1881, 8.7779, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000], grad_fn=<NllLossBackward0>)
tensor([8.8153, 8.9479, 9.4621, 8.8865, 9.1283, 8.6281, 9.1076, 8.6133, 8.8328,
        8.7142, 8.9983, 9.2269, 9.0635, 8.8302, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 8.8626, 9.3508, 8.9144, 8.5722, 9.2647

KeyboardInterrupt: 