In [11]:
########################################################################################################################
## -- libraries and packages -- ########################################################################################
########################################################################################################################
import os
import sys
sys.path.append(os.path.abspath(".."))
import transformer
from torch.utils.data import DataLoader

########################################################################################################################
## -- testing the tokenizer module -- ##################################################################################
########################################################################################################################
# -- configuration: file locations, special tokens and sizes ---------------------------------------------------------
src_vocab_path = "../data/vocabs/en_vocab.json"
tgt_vocab_path = "../data/vocabs/fa_vocab.json"
src_path, src_name = "../data/dataset/Tatoeba.zip", "en.txt"
tgt_path, tgt_name = "../data/dataset/Tatoeba.zip", "fa.txt"
SOS_TOKEN, PAD_TOKEN, EOS_TOKEN = '<SOS>', '<PAD>', '<EOS>'

# Single source of truth for the sequence length: the data handler, both tokenizers and the
# preview slices below all read this one value, so they cannot drift apart.
max_sequence_length = 36
batch_size = 4

# -- load the sentence pairs and vocabularies ------------------------------------------------------------------------
data_handler = transformer.DataHandler(src_path, src_name, src_vocab_path, tgt_path, tgt_name, tgt_vocab_path,
                                       SOS_TOKEN, PAD_TOKEN, EOS_TOKEN,
                                       max_sequence_length = max_sequence_length, max_sentences = 1000)

# `data` is read below for `src_stoi`, `tgt_stoi`, `src_sentences` and `tgt_sentences`.
data = data_handler.data()

# -- one tokenizer per language, sharing the special tokens and the sequence length ----------------------------------
en_tokenizer = transformer.Tokenizer(data.src_stoi, max_sequence_length = max_sequence_length,
                                     SOS_TOKEN = SOS_TOKEN, PAD_TOKEN = PAD_TOKEN, EOS_TOKEN = EOS_TOKEN)
fa_tokenizer = transformer.Tokenizer(data.tgt_stoi, max_sequence_length = max_sequence_length,
                                     SOS_TOKEN = SOS_TOKEN, PAD_TOKEN = PAD_TOKEN, EOS_TOKEN = EOS_TOKEN)

# -- batch the raw sentence pairs; shuffle is off so the preview is the same on every run ----------------------------
dataset = transformer.TransformerDataset(data.src_sentences, data.tgt_sentences)
dataloader = DataLoader(dataset, batch_size = batch_size, shuffle = False)

# -- preview the first batch only (the loop exits after one iteration) -----------------------------------------------
for batch_idx, (en_batch, fa_batch) in enumerate(dataloader):
  # Report the real number of sentences in this batch: with DataLoader's default
  # drop_last=False a final batch can hold fewer than `batch_size` items.
  print(f"In batch {batch_idx}, there are {len(en_batch)} English and Persian sentences")
  print(f"The first English sentence in this batch is: \n{en_batch[0]}")
  print(f"The first Persian sentence in this batch is: \n{fa_batch[0]}")

  # Tokenize both sides, requesting the start and end markers via the sos_token / eos_token flags.
  en_batch_tokenized = en_tokenizer.batch_tokenize(en_batch, sos_token = True, eos_token = True)
  fa_batch_tokenized = fa_tokenizer.batch_tokenize(fa_batch, sos_token = True, eos_token = True)
  print()
  print(f"In batch {batch_idx}, there are {len(en_batch)} English and Persian sentences")
  print(f"The first tokenized English sentence in this batch is: \n{en_batch_tokenized.shape}, {en_batch_tokenized[0, :max_sequence_length]}")
  print(f"The first tokenized Persian sentence in this batch is: \n{fa_batch_tokenized.shape}, {fa_batch_tokenized[0, :max_sequence_length]}")
  break

In batch 0, there are 4 English and Persian sentences
The first English sentence in this batch is: 
i just don't know what to say.
The first Persian sentence in this batch is: 
من فقط نمی دانم چه بگویم.

In batch 0, there are 4 English and Persian sentences
The first tokenized English sentence in this batch is: 
torch.Size([4, 36]), tensor([ 0,  9, 95, 10, 21, 19, 20, 95,  4, 15, 14, 87, 20, 95, 11, 14, 15, 23,
        95, 23,  8,  1, 20, 95, 20, 15, 95, 19,  1, 25, 91, 97, 96, 96, 96, 96])
The first tokenized Persian sentence in this batch is: 
torch.Size([4, 36]), tensor([ 0, 28, 29, 82, 23, 24, 19, 82, 29, 28, 32, 82, 10,  1, 29, 28, 82,  7,
        31, 82,  2, 26, 30, 32, 28, 78, 84, 83, 83, 83, 83, 83, 83, 83, 83, 83])


In [12]:
########################################################################################################################
## -- testing the token embedding module -- ############################################################################
########################################################################################################################
# Settings for the embedding smoke test.
# NOTE: this rebinds the notebook-level `max_sequence_length` (set to 36 in the tokenizer cell) to 256.
stoi = data.src_stoi
max_sequence_length = 256
model_emb = 512
dropout_p = 0.1

# Build the embedding module on the CPU with the same special tokens used by the tokenizers.
token_embedding = transformer.TokenEmbedding(
    max_sequence_length,
    model_emb,
    stoi,
    dropout_p,
    SOS_TOKEN = SOS_TOKEN,
    PAD_TOKEN = PAD_TOKEN,
    EOS_TOKEN = EOS_TOKEN,
    device = 'cpu',
)

# Three raw sentences of different lengths, passed as a tuple of strings.
inp = ("Hello", "Bye Bye", "Wait for me too")
out = token_embedding(inp, sos_token = True, eos_token = True)

# Last expression of the cell: display the shape of the embedded batch.
out.shape

torch.Size([3, 256, 512])