# 1. Introduction

- To access the model paths first you need to mount google drive and download paths for Sequence to Sequence and T5 fine tuned version.

- Please take the models from following path : https://drive.google.com/drive/folders/1XEv-_yZOl5qXl7xok6pib9GE8mm2X94-?usp=share_link

- And remember to change the paths based on your folder !

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from nltk.translate.bleu_score import sentence_bleu
from transformers import MarianTokenizer
from transformers import T5ForConditionalGeneration,MarianTokenizer, T5Tokenizer, AdamW


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls

drive  sample_data  t5-model.pth


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


print ('Device set to {0}'.format(device))

Device set to cuda


#2. Seq2Seq Architecture

In [None]:
MAX_LENGTH = 20

class Encoder(nn.Module):

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True,bidirectional=False)


    def forward(self, x):

        embedded = self.dropout(self.embedding(x))
        output, (hidden,cell) = self.rnn(embedded)

        return  (hidden,cell)


class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden,cell):
        #print('----Decoder----')

        x = x.unsqueeze(1) #Shape is changed to [batch_size, 1, embedding_size]
        #print("Input shape:", x.shape)

        embedded = self.dropout(self.embedding(x))
        #print("Embedded shape:", embedded.shape)


        output, (hidden,cell) = self.rnn(embedded, (hidden,cell))
        output = output.squeeze(1)
        #print("Output shape from LSTM:", output.shape)

        predictions = self.fc(output)

        return predictions, hidden,cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target=None, teacher_forcing_ratio=0.5, max_length=20):
        # Encode the source sentence
        hidden, cell = self.encoder(source)

        # Prepare the output tensor
        batch_size = source.shape[0]
        target_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, max_length, target_vocab_size).to(self.device)

        # <sos> token is used as the initial input to the decoder
        decoder_input = torch.zeros(batch_size, dtype=torch.long).to(self.device)  # Assuming 0 is the <sos> token index

        for t in range(1, max_length):
            decoder_output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t, :] = decoder_output
            top1 = decoder_output.argmax(1)
            decoder_input = top1

        return outputs


In [None]:
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-sv')

num_token_id = tokenizer.convert_tokens_to_ids('<num>')
if num_token_id == tokenizer.unk_token_id:
    tokenizer.add_tokens(['<num>'])
#Inlcude bos id
if tokenizer.bos_token_id is None:
    tokenizer.add_special_tokens({'bos_token': '<s>'})

vocab_size = len(tokenizer.get_vocab())
print("Updated tokenizer vocab size:", vocab_size) #This one should be used


model_dict = torch.load('drive/MyDrive/s_models/seq-model.pth') # This is the place need to changed !

encoder = Encoder(vocab_size,300,1024,2,0.5)
decoder = Decoder(vocab_size,300,1024,2,0.5)
model = Seq2Seq(encoder, decoder, device).to(device)
model.load_state_dict(model_dict)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Updated tokenizer vocab size: 56436


<All keys matched successfully>

#2. 1 Inference of Seq2Seq model

In [None]:
def infer_translation(model, tokenizer, device, input_text, max_length=20):
    model.eval()
    with torch.no_grad():
        # Tokenize the user input
        input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

        # Generate translation
        outputs = model(input_ids, max_length=max_length)
        outputs = outputs.argmax(-1)

        # Decode the predicted tokens to a human-readable string
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return predicted_text

if __name__ == '__main__':
    # Assuming the model, tokenizer, and device are already defined
    while True:
        input_text = input("Enter text to translate (or 'exit' to quit): ")
        if input_text.lower() == 'exit':
            break
        translation = infer_translation(model, tokenizer, device, input_text)
        print(f"Translation: {translation}")


Enter text to translate (or 'exit' to quit): hello world
Translation: 
Enter text to translate (or 'exit' to quit): hi can you see i t
Translation: kan jag t
Enter text to translate (or 'exit' to quit): my name is this
Translation: det här s
Enter text to translate (or 'exit' to quit): this is my dog
Translation: detta är mint
Enter text to translate (or 'exit' to quit): this is the european union
Translation: unionen unionen etabler
Enter text to translate (or 'exit' to quit): let's start this meeting
Translation: det börjar
Enter text to translate (or 'exit' to quit): mr president I want to have your attention
Translation: herr talman för att g
Enter text to translate (or 'exit' to quit): I want to talk about this subject
Translation: det här sätt
Enter text to translate (or 'exit' to quit): I want to talk about european parliament
Translation: för att ått
Enter text to translate (or 'exit' to quit): this is a meeting 
Translation: det här en
Enter text to translate (or 'exit' to qui

# 3. T5 Arhitecture

In [16]:
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-sv')

modelt5 = T5ForConditionalGeneration.from_pretrained('t5-small')
modelt5.resize_token_embeddings(len(tokenizer))

#modelt5_path1 =torch.load('drive/MyDrive/s_models/t5-model.pth')
modelt5_path = torch.load('drive/MyDrive/s_models/t5-model.pth',map_location=torch.device('cpu')) # This part needs to be changed also

modelt5.load_state_dict(modelt5_path)



<All keys matched successfully>

In [None]:
def infer_translation2(model, tokenizer, device, input_text, max_length=20):
    model.eval()
    predictions = []
    with torch.no_grad():
        # Tokenize the input_text and ensure it's in the right format
        input_ids = tokenizer.encode(input_text, return_tensors='pt')  # Tokenize and convert to tensor
        input_ids = input_ids.to(device)  # Move to the appropriate device

        # Generate output using the model
        outputs = model.generate(input_ids=input_ids, max_length=max_length, num_beams=1, early_stopping=False)
        predicted_sentences = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(predicted_sentences)
        #print(predicted_sentences)

    return predicted_sentences


In [None]:
device = torch.device('cpu')
print ('Device set to {0}'.format(device))

while True:
  input_text = input("Enter text to translate (or 'exit' to quit): ")
  if input_text.lower() == 'exit':
    break
  translation2 = infer_translation2(modelt5, tokenizer, device, input_text)
  print(f"Translation: {translation2}")

Device set to cpu
Enter text to translate (or 'exit' to quit): this is the beginning of the session
Translation: ['detta är början på sessionen det är början på sessionen det är början på den sessionen']
Enter text to translate (or 'exit' to quit): exit


#Minor Note

In these models, we can observe that the sequence to sequence model gives some results where usually the start of the sentence matches. The T5 model, even though it gives much more accurate translation, however seems to suffer from an issue whereby it repeats the translation more than one times. Things that are worth investigating further...

- Have fun with playing with english to swedish translation