<a href="https://colab.research.google.com/github/Soumya-2184/Transformer_English_to_Spanish/blob/main/Transformer_eng_to_spn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import torch
import math
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim import Adam
from torchtext.data.metrics import bleu_score




In [None]:
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')


In [None]:
def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: query, key and value tensors; the last dimension of ``q`` is
            used as the scaling width.
        mask: optional additive mask. When given, the scores must be 4-D
            because the first two axes are swapped before the addition (and
            swapped back afterwards), so the mask broadcasts against the
            trailing axes of that swapped layout.

    Returns:
        ``(values, attention)``: the attention-weighted values and the
        softmax-normalised attention weights.
    """
    width = q.size()[-1]
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(width)
    if mask is not None:
        # Swap the two leading axes, add the mask, then restore the layout.
        scores = (scores.permute(1, 0, 2, 3) + mask).permute(1, 0, 2, 3)
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, v), weights

In [None]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    The module holds no parameters or buffers; the table is recomputed on
    every call so the module contributes nothing to the ``state_dict``.

    Args:
        d_model: width of each position vector.
        max_sequence_length: number of positions (rows) in the table.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self, x):
        """Return the ``(max_sequence_length, d_model)`` encoding table.

        Args:
            x: only its device is used. When ``x`` is a tensor the table is
                created on ``x.device`` so it can be added to ``x`` without a
                device mismatch; any other value (e.g. ``None``) yields a
                table on the default device.

        Returns:
            Tensor whose even columns hold sines and odd columns hold cosines
            of ``position / 10000 ** (even_index / d_model)``.
        """
        device = x.device if isinstance(x, torch.Tensor) else None
        even_i = torch.arange(0, self.d_model, 2, device=device).float()
        denominator = torch.pow(10000, even_i / self.d_model)
        position = torch.arange(self.max_sequence_length, device=device).reshape(
            self.max_sequence_length, 1
        )
        angles = position / denominator
        # Interleave sin/cos per frequency: [sin_0, cos_0, sin_1, cos_1, ...].
        stacked = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For odd d_model the interleave produces d_model + 1 columns; trim so
        # the table always matches the embedding width (no-op for even widths).
        return PE[:, :self.d_model]


In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection.

    Args:
        d_model: width of the input and output features.
        num_heads: number of heads; ``head_dim`` is ``d_model // num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # A single projection produces q, k and v for every head at once.
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        """Attend over ``x`` (a 3-D tensor) and return the projected result.

        ``mask`` is forwarded unchanged to ``scaled_dot_product``.
        """
        n_batch, n_tokens, _ = x.size()
        fused = self.qkv_layer(x)
        # Split the feature axis into heads, then move heads ahead of tokens.
        per_head = fused.reshape(
            n_batch, n_tokens, self.num_heads, 3 * self.head_dim
        ).permute(0, 2, 1, 3)
        q, k, v = per_head.chunk(3, dim=-1)
        values, _ = scaled_dot_product(q, k, v, mask)
        # Move tokens back ahead of heads and merge the heads again.
        merged = values.permute(0, 2, 1, 3).reshape(
            n_batch, n_tokens, self.num_heads * self.head_dim
        )
        return self.linear_layer(merged)



In [None]:
class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing ``len(parameters_shape)`` axes.

    Args:
        parameters_shape: shape of the learnable scale (``gamma``, initialised
            to ones) and shift (``beta``, initialised to zeros).
        eps: constant added to the variance before the square root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Normalise ``inputs`` to zero mean / unit variance, then scale and shift."""
        # Negative indices of the trailing axes: -1, -2, ...
        reduce_axes = list(range(-1, -len(self.parameters_shape) - 1, -1))
        mean = inputs.mean(dim=reduce_axes, keepdim=True)
        centered = inputs - mean
        variance = centered.pow(2).mean(dim=reduce_axes, keepdim=True)
        normalised = centered / torch.sqrt(variance + self.eps)
        return self.gamma * normalised + self.beta


In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature width.
        hidden: width of the intermediate layer.
        drop_prob: dropout probability applied after the activation.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Expand to ``hidden`` features, activate, drop out, project back."""
        activated = self.relu(self.linear1(x))
        return self.linear2(self.dropout(activated))

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block.

    Self-attention and a feed-forward network are applied in turn; each
    sub-layer output goes through dropout, is added to the sub-layer input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        # Attention sub-layer.
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        # Feed-forward sub-layer.
        self.ffn = PositionwiseFeedForward(
            d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob
        )
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        """Run both sub-layers on ``x`` using ``self_attention_mask``."""
        skip = x.clone()
        attended = self.dropout1(self.attention(x, mask=self_attention_mask))
        x = self.norm1(attended + skip)

        skip = x.clone()
        transformed = self.dropout2(self.ffn(x))
        return self.norm2(transformed + skip)


In [None]:
class SequentialEncoder(nn.Sequential):
    """``nn.Sequential`` variant whose layers take ``(x, mask)``.

    The same mask is handed to every layer; only ``x`` is threaded through.
    """

    def forward(self, *inputs):
        """Unpack ``(x, self_attention_mask)`` and apply each layer in order."""
        hidden, self_attention_mask = inputs
        for layer in self:
            hidden = layer(hidden, self_attention_mask)
        return hidden

In [None]:
class Encoder(nn.Module):
    """Sentence embedding followed by a stack of ``num_layers`` encoder layers.

    The vocabulary mapping and the three special tokens are passed straight to
    ``SentenceEmbedding``; the remaining sizes configure each ``EncoderLayer``.
    """

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length,
            d_model,
            language_to_index,
            START_TOKEN,
            END_TOKEN,
            PADDING_TOKEN,
        )
        blocks = [
            EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialEncoder(*blocks)

    def forward(self, x, self_attention_mask, start_token, end_token):
        """Embed the batch ``x`` and pass it through every encoder layer.

        ``start_token`` / ``end_token`` are forwarded to the embedding and
        control whether the start/end markers are added to each sentence.
        """
        embedded = self.sentence_embedding(x, start_token, end_token)
        return self.layers(embedded, self_attention_mask)

In [None]:
class MultiHeadCrossAttention(nn.Module):
    """Multi-head attention where keys/values and queries come from different inputs.

    Args:
        d_model: width of the input and output features.
        num_heads: number of heads; ``head_dim`` is ``d_model // num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model, 2 * d_model)
        self.q_layer = nn.Linear(d_model, d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        """Attend from ``y`` (queries) over ``x`` (keys and values).

        Args:
            x: 3-D tensor projected into keys and values.
            y: 3-D tensor projected into queries; its sequence length may
                differ from that of ``x``.
            mask: forwarded unchanged to ``scaled_dot_product``.

        Returns:
            Tensor with the batch size and sequence length of ``y`` and
            ``d_model`` features.
        """
        batch_size, source_length, _ = x.size()
        # Queries keep their own length so source and target sequences are
        # not required to be padded to the same size.
        target_length = y.size(1)

        kv = self.kv_layer(x).reshape(
            batch_size, source_length, self.num_heads, 2 * self.head_dim
        )
        q = self.q_layer(y).reshape(
            batch_size, target_length, self.num_heads, self.head_dim
        )
        # Move the head axis ahead of the token axis.
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)

        values, _ = scaled_dot_product(q, k, v, mask)
        # One output row per query position, heads merged back together.
        values = values.permute(0, 2, 1, 3).reshape(
            batch_size, target_length, self.d_model
        )
        return self.linear_layer(values)


In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Every sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        # Sub-layer 1: self-attention over the decoder input.
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        # Sub-layer 2: attention from the decoder state over the encoder output.
        self.encoder_decoder_attention = MultiHeadCrossAttention(
            d_model=d_model, num_heads=num_heads
        )
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        # Sub-layer 3: feed-forward network.
        self.ffn = PositionwiseFeedForward(
            d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob
        )
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        """Update the decoder state ``y`` using the encoder output ``x``."""
        skip = y.clone()
        attended = self.dropout1(self.self_attention(y, mask=self_attention_mask))
        y = self.layer_norm1(attended + skip)

        skip = y.clone()
        crossed = self.dropout2(
            self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        )
        y = self.layer_norm2(crossed + skip)

        skip = y.clone()
        transformed = self.dropout3(self.ffn(y))
        return self.layer_norm3(transformed + skip)



In [None]:
class SequentialDecoder(nn.Sequential):
    """``nn.Sequential`` variant whose layers take ``(x, y, self_mask, cross_mask)``.

    ``x`` and both masks are passed unchanged to every layer; only ``y`` is
    threaded from one layer to the next.
    """

    def forward(self, *inputs):
        """Unpack the four inputs and apply each layer in order, returning ``y``."""
        x, state, self_attention_mask, cross_attention_mask = inputs
        for layer in self:
            state = layer(x, state, self_attention_mask, cross_attention_mask)
        return state

In [None]:
class Decoder(nn.Module):
    """Sentence embedding followed by a stack of ``num_layers`` decoder layers.

    The vocabulary mapping and the three special tokens are passed straight to
    ``SentenceEmbedding``; the remaining sizes configure each ``DecoderLayer``.
    """

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length,
            d_model,
            language_to_index,
            START_TOKEN,
            END_TOKEN,
            PADDING_TOKEN,
        )
        blocks = [
            DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialDecoder(*blocks)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        """Embed the target batch ``y`` and decode it against encoder output ``x``.

        ``start_token`` / ``end_token`` are forwarded to the embedding and
        control whether the start/end markers are added to each sentence.
        """
        embedded = self.sentence_embedding(y, start_token, end_token)
        return self.layers(x, embedded, self_attention_mask, cross_attention_mask)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder translation model with a final vocabulary projection.

    Args:
        d_model: feature width used throughout the model.
        ffn_hidden: hidden width of each feed-forward sub-layer.
        num_heads: number of attention heads.
        drop_prob: dropout probability inside the encoder/decoder layers.
        num_layers: number of encoder layers and of decoder layers.
        max_sequence_length: length every sentence is padded/truncated to.
        spn_vocab_size: output size of the final linear layer.
        english_to_index: token -> index mapping given to the encoder.
        spanish_to_index: token -> index mapping given to the decoder.
        START_TOKEN, END_TOKEN, PADDING_TOKEN: special-token strings that must
            be keys of both mappings.
    """

    def __init__(self,
                d_model,
                ffn_hidden,
                num_heads,
                drop_prob,
                num_layers,
                max_sequence_length,
                spn_vocab_size,
                english_to_index,
                spanish_to_index,
                START_TOKEN,
                END_TOKEN,
                PADDING_TOKEN
                ):
        super().__init__()
        # Build from the mappings passed in by the caller rather than from
        # module-level globals, so the class works with any vocabulary and
        # outside the notebook namespace.
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers,
                               max_sequence_length, english_to_index,
                               START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers,
                               max_sequence_length, spanish_to_index,
                               START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.linear = nn.Linear(d_model, spn_vocab_size)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False, # We should make this true
                dec_end_token=False): # x, y are batch of sentences
        """Encode the source batch ``x``, decode against target batch ``y``.

        The three masks are handed to the encoder self-attention, decoder
        self-attention and decoder cross-attention respectively. The four
        boolean flags control whether start/end markers are added when the
        encoder and decoder tokenize their sentences.

        Returns:
            The decoder output projected to ``spn_vocab_size`` scores.
        """
        x = self.encoder(x, encoder_self_attention_mask,
                         start_token=enc_start_token, end_token=enc_end_token)
        out = self.decoder(x, y, decoder_self_attention_mask, decoder_cross_attention_mask,
                           start_token=dec_start_token, end_token=dec_end_token)
        out = self.linear(out)
        return out



In [None]:

class SentenceEmbedding(nn.Module):
    """Tokenize sentences, embed the tokens and add positional encodings.

    Args:
        max_sequence_length: length every tokenized sentence is padded or
            truncated to.
        d_model: embedding width.
        language_to_index: token -> index mapping; its size is the vocabulary
            size of the embedding table.
        START_TOKEN, END_TOKEN, PADDING_TOKEN: special-token strings, looked
            up in ``language_to_index`` during tokenization.
    """

    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token, end_token):
        """Convert a batch of sentences into a ``long`` tensor of token indices.

        Each sentence is iterated element by element (characters, for a
        string). Elements missing from the vocabulary map to the padding
        index. Optional start/end markers are added, then the row is padded
        with the padding index and cut to exactly ``max_sequence_length``
        entries (an overlong row loses its tail, including the end marker).

        Returns:
            A tensor with one row per sentence, created on the CPU.
        """
        pad_index = self.language_to_index[self.PADDING_TOKEN]

        def tokenize(sentence):
            indices = [self.language_to_index.get(token, pad_index) for token in sentence]
            if start_token:
                indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                indices.append(self.language_to_index[self.END_TOKEN])
            # Pad the sequence to max_sequence_length
            padding_needed = self.max_sequence_length - len(indices)
            if padding_needed > 0:
                indices.extend([pad_index] * padding_needed)
            return indices[:self.max_sequence_length]

        tokenized = [tokenize(sentence) for sentence in batch]
        return torch.tensor(tokenized, dtype=torch.long)

    def forward(self, x, start_token, end_token):
        """Return dropout(embedding + positional encoding) for the batch ``x``."""
        # The indices are built on the CPU; move them (and the positional
        # table) to wherever the embedding weights live so the module also
        # works after ``model.to('cuda')``.
        device = self.embedding.weight.device
        x = self.batch_tokenize(x, start_token, end_token).to(device)
        x = self.embedding(x)
        pos = self.position_encoder(x).to(device)
        x = self.dropout(x + pos)
        return x

In [1]:
import os
import zipfile
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [2]:
!unzip "/content/archive (11).zip" -d "data/"

Archive:  /content/archive (11).zip
  inflating: data/data.csv           


In [3]:
# Load the English/Spanish sentence pairs from the extracted CSV
df= pd.read_csv("data/data.csv")

# Display the dimensions and a column summary of the dataframe.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly instead of being wrapped in print() (which printed "None").
print(df.shape)
df.info()

print("-----------------------------------------------------------------------")
# Display the first rows of the dataframe (head(), not a random sample)
print(df.head())


(118964, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118964 entries, 0 to 118963
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  118964 non-null  object
 1   spanish  118964 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB
None
-----------------------------------------------------------------------
  english  spanish
0     Go.      Ve.
1     Go.    Vete.
2     Go.    Vaya.
3     Go.  Váyase.
4     Hi.    Hola.


In [None]:
# Split the dataframe into its two parallel columns.
english_set = df.iloc[:, 0]  # column 0: English
spanish_set = df.iloc[:, 1]  # column 1: Spanish

# Special markers that share the vocabulary with ordinary characters.
START_TOKEN = '<start>'
PADDING_TOKEN = '<pad>'
END_TOKEN = '<end>'

# Character-level vocabularies. The position of every entry is its token
# index, so the order below must not change.
english_vocabulary = [
    START_TOKEN,
    *" !\"#$%&'()*+,-./",
    *"0123456789",
    *":<=>?@",
    *"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    *"[\\]^_`",
    *"abcdefghijklmnopqrstuvwxyz",
    *"{|}~",
    PADDING_TOKEN,
    END_TOKEN,
]

spanish_vocabulary = [
    START_TOKEN,
    *" !\"#$%&'()*+,-./",
    *"0123456789",
    *":<=>?",
    '¡', '¿',
    *"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    'á', 'é', 'í', 'ó', 'ú', 'ü', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ü',
    'ñ', 'Ñ',
    *"[\\]^_`",
    *"abcdefghijklmnopqrstuvwxyz",
    *"{|}~",
    PADDING_TOKEN,
    END_TOKEN,
]

# Index <-> token lookup tables for both languages.
ind_to_spanish = dict(enumerate(spanish_vocabulary))
spanish_to_ind = {token: position for position, token in enumerate(spanish_vocabulary)}
ind_to_english = dict(enumerate(english_vocabulary))
english_to_ind = {token: position for position, token in enumerate(english_vocabulary)}

english_sentences = english_set.to_list()
spanish_sentences = spanish_set.to_list()

# Every tokenized sentence is padded/truncated to this many tokens.
max_sequence_length = 150



In [None]:
# check for valid sentences
def isvalid(sentence, vocab):
    """Return True when every distinct element of ``sentence`` is in ``vocab``."""
    return all(token in vocab for token in set(sentence))

def isvalidlength(sentence, max_sequence_length):
    """Return True when ``sentence`` has fewer than ``max_sequence_length - 2`` elements.

    Two slots are held back for the start and end-of-sentence tokens that are
    added again later.
    """
    usable_slots = max_sequence_length - 2
    return sum(1 for _ in sentence) < usable_slots

# Collect the positions of sentence pairs where both sides are short enough
# and use only characters from their vocabulary.
valid_indices = []
for index, english_sentence in enumerate(english_sentences):
    spanish_sentence = spanish_sentences[index]
    fits = (isvalidlength(spanish_sentence, max_sequence_length)
            and isvalidlength(english_sentence, max_sequence_length))
    if not fits:
        continue
    if not isvalid(spanish_sentence, spanish_vocabulary):
        continue
    if not isvalid(english_sentence, english_vocabulary):
        continue
    valid_indices.append(index)

# Filter sentences based on valid indices
english_sentences = [english_sentences[position] for position in valid_indices]
spanish_sentences = [spanish_sentences[position] for position in valid_indices]



In [None]:
# Show the first five sentences of each language.
for language, corpus in (("English", english_sentences), ("Spanish", spanish_sentences)):
    print(f"Sample {language} sentences:", corpus[:5])

# Keep only the first `max_sentences` pairs.
max_sentences = 5000
english_sentences, spanish_sentences = (
    english_sentences[:max_sentences],
    spanish_sentences[:max_sentences],
)

assert len(english_sentences) == len(spanish_sentences), "mismatch"


Sample English sentences: ['Go.', 'Go.', 'Go.', 'Go.', 'Hi.']
Sample Spanish sentences: ['Ve.', 'Vete.', 'Vaya.', 'Váyase.', 'Hola.']


In [None]:
# Model and training hyperparameters.
d_model = 512
batch_size = 10
ffn_hidden = 1024
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 150
spn_vocab_size = len(spanish_vocabulary)

# English -> Spanish model; arguments are spelled out by keyword so each
# value is visibly matched to the constructor parameter it feeds.
transformer = Transformer(
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    num_heads=num_heads,
    drop_prob=drop_prob,
    num_layers=num_layers,
    max_sequence_length=max_sequence_length,
    spn_vocab_size=spn_vocab_size,
    english_to_index=english_to_ind,
    spanish_to_index=spanish_to_ind,
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PADDING_TOKEN,
)

# Stand-alone embedding over the English vocabulary (separate from the one
# inside `transformer`).
sentence_embedding = SentenceEmbedding(
    max_sequence_length,
    d_model,
    english_to_ind,
    START_TOKEN,
    END_TOKEN,
    PADDING_TOKEN,
)


class TextDataset(Dataset):
    """Dataset of parallel sentences; item ``idx`` is ``(english, spanish)``."""

    def __init__(self, english_sentences, spanish_sentences):
        self.english_sentences = english_sentences
        self.spanish_sentences = spanish_sentences

    def __len__(self):
        # The dataset size is taken from the English side.
        return len(self.english_sentences)

    def __getitem__(self, idx):
        source = self.english_sentences[idx]
        target = self.spanish_sentences[idx]
        return source, target

# Per-token cross-entropy (no reduction) that ignores padding targets.
criterion = nn.CrossEntropyLoss(
    ignore_index=spanish_to_ind[PADDING_TOKEN],
    reduction='none',
)
# NOTE(review): this name rebinds `optim`, which the imports at the top of the
# notebook bound to the torch.optim module.
optim = Adam(transformer.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = TextDataset(english_sentences, spanish_sentences)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Move model to device
transformer.to(device)

# Initialize parameters: Xavier-uniform for every tensor with more than one
# dimension; 1-D parameters keep their existing values.
for params in transformer.parameters():
    if params.dim() <= 1:
        continue
    nn.init.xavier_uniform_(params)


In [None]:
import torch
from nltk.translate.bleu_score import corpus_bleu

# Initializing lists for reference and candidate translations
references = []
candidates = []

iterator = iter(train_loader)
batch_size = 10

# Preview: print the first `batch_size` batches from the loader.
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num >= batch_size - 1:
        break

print(batch_size)

# Training mode is set once; nothing inside the loop switches to eval mode.
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10

# Special-token indices used when scoring and printing predictions.
pad_index = spanish_to_ind[PADDING_TOKEN]
end_index = spanish_to_ind[END_TOKEN]

# Create masks. The look-ahead mask (-inf above the diagonal, 0 elsewhere)
# does not depend on the batch, so it is built and moved to the device once.
# NOTE(review): the same causal mask is used for encoder self-attention and
# for cross-attention, and no padding mask is built. That is unusual for an
# encoder; it is left unchanged here because altering it changes what the
# model learns -- confirm the intent before replacing it.
mask = torch.full([max_sequence_length, max_sequence_length], float('-inf'))
mask = torch.triu(mask, diagonal=1).to(device)
encoder_self_attention_mask = mask
decoder_self_attention_mask = mask
decoder_cross_attention_mask = mask

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    iterator = iter(train_loader)
    total_correct = 0
    total_tokens = 0

    # BLEU is computed per epoch, so start from empty lists.
    references.clear()
    candidates.clear()

    for batch_num, batch in enumerate(iterator):
        eng_batch, spn_batch = batch

        optim.zero_grad()

        # Get predictions
        spn_predictions = transformer(eng_batch,
                                      spn_batch,
                                      encoder_self_attention_mask,
                                      decoder_self_attention_mask,
                                      decoder_cross_attention_mask,
                                      enc_start_token=True,
                                      enc_end_token=True,
                                      dec_start_token=True,
                                      dec_end_token=True)

        # Compute loss. Labels carry no start token, so position t of the
        # decoder input (which has one) is scored against label token t.
        labels = transformer.decoder.sentence_embedding.batch_tokenize(spn_batch, start_token=False, end_token=True)
        flat_labels = labels.view(-1).to(device)
        loss = criterion(spn_predictions.view(-1, spn_vocab_size).to(device),
                         flat_labels)

        # Mask out padding tokens. The mask is derived from the labels that
        # already live on `device`, so it can be multiplied with the loss
        # without a CPU/GPU mismatch.
        valid_indices = flat_labels != pad_index
        loss = loss * valid_indices.float()

        valid_loss = loss.sum()

        # Backpropagation
        valid_loss.backward()

        optim.step()

        # Get predicted translations
        spn_sentence_predicted = torch.argmax(spn_predictions, axis=2)

        # Convert predictions to character lists for the BLEU score; padding
        # and end tokens are dropped wherever they occur in the prediction.
        for reference, predicted_row in zip(spn_batch, spn_sentence_predicted.tolist()):
            candidate = [ind_to_spanish[idx] for idx in predicted_row
                         if idx not in (pad_index, end_index)]
            references.append([reference])
            candidates.append(candidate)

        if batch_num % 5 == 0:
            print(f"Iteration {batch_num} : {valid_loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Spanish Translation: {spn_batch[0]}")
            # Decode the first prediction up to (not including) the end token.
            predicted_chars = []
            for idx in spn_sentence_predicted[0].tolist():
                if idx == end_index:
                    break
                predicted_chars.append(ind_to_spanish[idx])
            predicted_sentence = "".join(predicted_chars)
            print(f"Spanish Prediction: {predicted_sentence}")

    # NOTE(review): this rebinds `bleu_score`, which the imports at the top of
    # the notebook bound to torchtext's function; the name is kept in case
    # later cells read it.
    bleu_score = corpus_bleu(references, candidates)
    print(f"Epoch {epoch} - BLEU Score: {bleu_score}")


[("I'm trapped.", 'Nobody died.', 'Stay with us.', 'Tom changed.', "I'm 19.", 'Catch him.', 'No way!', 'I admire Tom.', 'That was Tom.', 'I am joking.'), ('Estoy atrapado.', 'No murió nadie.', 'Quédate con nosotros.', 'Tom varió.', 'Tengo diecinueve.', 'Atrápalo.', '¡No puede ser!', 'Admiro a Tom.', 'Ese era Tom.', 'Estoy jodiendo.')]
[("I'm not crazy.", 'Keep in touch.', "We're here.", 'Give it to me.', 'He is kind.', "It's a doll.", 'Say something.', 'Is this it?', 'I am at home.', 'We know you.'), ('No estoy loca.', 'Mantente en contacto.', 'Estamos acá.', 'Démelo.', 'Él es gentil.', 'Es una muñeca.', '¡Di algo!', '¿Eso es todo?', 'Estoy en casa.', 'A ti te conocemos.')]
[("This'll do.", 'What a shock!', "I'm impartial.", "I won't lose!", "Let's dance.", 'Do I look OK?', 'Let me get it.', 'Make a list.', 'No one died.', "I'm full."), ('Con éste vale.', '¡Qué susto!', 'Soy imparcial.', '¡No perderé!', 'Bailemos.', '¿Luzco bien?', 'Déjame cogerlo.', 'Hagan una lista.', 'No murió nadie

In [None]:
# Save the trained model: only the state_dict (parameter tensors) is written,
# so the Transformer must be rebuilt with the same arguments before loading.
torch.save(transformer.state_dict(), 'transformer_model.pth')

In [None]:
# Load the trained model: rebuild the architecture with the same
# hyperparameters, then restore the saved parameters.
transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          max_sequence_length,
                          spn_vocab_size,
                          english_to_ind,
                          spanish_to_ind,
                          START_TOKEN, END_TOKEN, PADDING_TOKEN)
# The rebuilt model lives on the CPU, so map the checkpoint tensors there.
# Without map_location, a checkpoint saved from a GPU run cannot be loaded on
# a machine that has no CUDA device.
transformer.load_state_dict(
    torch.load('transformer_model.pth', map_location=torch.device('cpu'))
)
# Switch to inference mode (disables dropout).
transformer.eval()

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(97, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=1024, bias=True)
          (linear2): Linear(in_features=1024, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Decoder(
    (sentence_embedding):