In [1]:
import torch
from torch import nn
from torch import optim
import math
from torch.utils.data import Dataset, DataLoader
from torchtext import transforms
import numpy as np
import pathlib
import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [2]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [3]:
# read in the data from files to lists of strings
START_TOKEN = ''
PADDING_TOKEN = ''
END_TOKEN = ''

swa_sentences = []
with open("./dataset/gamayun_kit5k.swa","r") as f:
    for s in f.readlines():
      s = START_TOKEN + s.rstrip("\n") + END_TOKEN
      swa_sentences.append(s)

eng_sentences = []
with open("./dataset/gamayun_kit5k.eng","r") as f:
    for s in f.readlines():
      s =  s.rstrip("\n")
      eng_sentences.append(s)

print(f"Size of swahili dataset: {len(swa_sentences)} ")
print(f"Size of english dataset: {len(eng_sentences)} ")
print(f"Max swahili sentence: {max([len(s) for s in swa_sentences])} ")
print(f"Max english sentence: {max([len(s) for s in eng_sentences])} ")

print(swa_sentences[:5])
print(eng_sentences[:5])

# prep swa vocab
swa_vocab = list(set(''.join(swa_sentences)))
swa_vocab.insert(0,START_TOKEN)
swa_vocab.append(PADDING_TOKEN)
swa_vocab.append(END_TOKEN)

# prep eng vocab
eng_vocab = list(set(''.join(eng_sentences)))
eng_vocab.append(PADDING_TOKEN)
eng_vocab.insert(0,START_TOKEN)
eng_vocab.append(PADDING_TOKEN)
eng_vocab.append(END_TOKEN)

print(f"Eng vocab: {eng_vocab}")
print(f"Swa vocab: {swa_vocab}")

swa_vocab_size = len(swa_vocab)
eng_vocab_size = len(eng_vocab)


print(f"Eng vocab_size :{len(swa_vocab)}")
print(f"Swa vocab_size :{len(eng_vocab)}")

swa_token_to_index = {t:i for i,t in enumerate(swa_vocab)}
print(swa_token_to_index)
swa_index_to_token = {i:t for i,t in enumerate(swa_vocab)}
# print(swa_index_to_token)
eng_token_to_index = {t:i for i,t in enumerate(eng_vocab)}
print(eng_token_to_index)
eng_index_to_token = {i:t for i,t in enumerate(eng_vocab)}
# print(eng_index_to_token)

# tokenize
swahili_sentences_tokenized = [[swa_token_to_index[t] for t in s] for s in swa_sentences]
english_sentences_tokenized = [[eng_token_to_index[t] for t in s] for s in eng_sentences]

# train/test split
swahili_sentences_tokenized_train = swahili_sentences_tokenized[:4500]
swahili_sentences_tokenized_test = swahili_sentences_tokenized[4500:]
english_sentences_tokenized_train = english_sentences_tokenized[:4500]
english_sentences_tokenized_test = english_sentences_tokenized[4500:]

Size of swahili dataset: 5000 
Size of english dataset: 5000 
Max swahili sentence: 249 
Max english sentence: 233 
['Huyo ni rafiki yako mpya?', 'Job hana hamu ya mpira wa vikapu.', 'Adam aliniambia kuwa Alice alikuwa na mpenzi mpya wa kiume', 'Radio haikutanga kuhusu ajali hiyo.', 'Adamu ana wasiwasi tutapotea.']
['Is that your new friend?', "Jacob wasn't interested in baseball.", 'Adam told me that Alice had a new boyfriend.', "The radio didn't inform about the accident.", "Adam is worried we'll get lost."]
Eng vocab: ['', '°', '’', 'w', 'a', 'x', 's', 'I', "'", 'E', 'i', 'Z', 'S', '"', 'O', 'T', 'j', 'C', '!', 'H', '7', 'g', 'P', 'u', 'v', '3', 'N', ',', 'z', '8', 'm', 'A', '?', '5', 'K', '1', 'c', 'M', 'k', '(', 'é', 'b', '”', 'V', 'o', 'l', 'J', '$', '&', 'B', '4', 'Y', 'r', '“', '9', '0', 'e', 'à', '—', 'R', 'd', 'L', 'q', 'F', 'n', 'f', ')', 'G', ';', '2', 'W', '6', 'y', 'p', 't', ':', '-', '_', 'D', ' ', 'U', 'Q', '.', 'h', '', '', '']
Swa vocab: ['', 'w', 'a', 'x', 's', 'I', 

In [4]:
NEG_INFTY = -1e10

class TranslationDataset(Dataset):

    def __init__(self, swahili_sentences, english_sentences, transforms=None, eng_max_sequence_length=260, swa_max_sequence_length=260):
        self.english_sentences = english_sentences
        self.swahili_sentences = swahili_sentences
        self.transforms = transforms
        self.eng_max_sequence_length = eng_max_sequence_length
        self.swa_max_sequence_length = swa_max_sequence_length
        self.encoder_padding_mask = torch.full([self.eng_max_sequence_length, self.eng_max_sequence_length] , False) # each sentence gets a mask
        self.look_ahead_mask = torch.triu(torch.full([self.swa_max_sequence_length, self.swa_max_sequence_length] , True), diagonal=1)
        self.decoder_padding_mask_self_attention = torch.full([self.swa_max_sequence_length, self.swa_max_sequence_length] , False) # each sentence gets a mask
        self.decoder_padding_mask_cross_attention = torch.full([self.swa_max_sequence_length, self.eng_max_sequence_length] , False) # each sentence gets a mask

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        eng_sentence = self.english_sentences[idx]
        swa_sentence = self.swahili_sentences[idx]
        eng_sentence_length = len(eng_sentence)
        swa_sentence_length = len(swa_sentence)
        eng_chars_to_padding_mask = np.arange(eng_sentence_length, self.eng_max_sequence_length) # fillers
        swa_chars_to_padding_mask = np.arange(swa_sentence_length, self.swa_max_sequence_length) # fillers

        for _ in range(len(eng_sentence), self.eng_max_sequence_length):
            eng_sentence.append(eng_token_to_index[PADDING_TOKEN])
        for _ in range(len(swa_sentence), self.swa_max_sequence_length):
            swa_sentence.append(swa_token_to_index[PADDING_TOKEN])

        self.encoder_padding_mask[:, eng_chars_to_padding_mask] = True
        self.encoder_padding_mask[eng_chars_to_padding_mask, :] = True
        encoder_padding_mask = torch.where(self.encoder_padding_mask, NEG_INFTY, 0) # encoder mask

        self.decoder_padding_mask_self_attention[:, swa_chars_to_padding_mask] = True
        self.decoder_padding_mask_self_attention[swa_chars_to_padding_mask, :] = True
        decoder_self_attention_mask = torch.where(self.decoder_padding_mask_self_attention+self.look_ahead_mask, NEG_INFTY, 0) # decoder self-attention mask

        self.decoder_padding_mask_cross_attention[:, eng_chars_to_padding_mask] = True
        self.decoder_padding_mask_cross_attention[swa_chars_to_padding_mask, :] = True
        decoder_cross_attention_mask = torch.where(self.decoder_padding_mask_cross_attention, NEG_INFTY, 0) # decoder cross-attention mask

        if self.transforms:
            swa_sentence = self.transforms(swa_sentence)
            eng_sentence = self.transforms(eng_sentence)

        return eng_sentence,encoder_padding_mask, swa_sentence, decoder_self_attention_mask, decoder_cross_attention_mask

In [5]:
class EmeddingsLayer(nn.Module):
    def __init__(self, d_model:int, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.d_model)

    def forward(self, X):
        return self.embedding(X) * math.sqrt(self.d_model) # "In the embedding layers, we multiply those weights by sqrt(d_model)"


class PositionalEncoding(nn.Module):
    def __init__(self, d_model:int, context_size:int):
        super().__init__()
        self.d_model = d_model
        self.context_size = context_size
        # print(self.context_size)

        self.pe = torch.zeros(self.context_size, self.d_model,requires_grad=False)
        for pos in range(self.context_size):
            for i in range(0, self.d_model, 2):
                self.pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.d_model)))
                self.pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.d_model)))

    def forward(self):
        return self.pe.unsqueeze(0)


# embed_test = EmeddingsLayer(d_model=512,vocab_size=50000)
# pencoding = PositionalEncoding(d_model=512,context_size=1024)
# example_data = torch.randint(1,50000,(64,1024))
# print(example_data.shape)
# embed_output = embed_test(example_data)
# print(embed_output.shape,embed_output[0][0][:10])
# pe_output = pencoding()
# print(pe_output.shape,pe_output[0][0][:10])
# embed_pos_output = embed_output + pe_output
# print(embed_pos_output.shape,embed_pos_output[0][0][:10])

In [6]:
class AttentionHead(nn.Module):

    def __init__(self, head_dim:int,p_drop:float) -> None:
        super().__init__()
        self.queries= nn.Linear(in_features=head_dim,out_features=head_dim,device=device) # kaparthy set bias=False why?
        self.keys = nn.Linear(in_features=head_dim,out_features=head_dim,device=device) # kaparthy set bias=False why?
        self.values = nn.Linear(in_features=head_dim,out_features=head_dim,device=device) # kaparthy set bias=False why?
        self.dropout = nn.Dropout(p=p_drop)

    def forward(self,Q:torch.Tensor,K:torch.Tensor,V:torch.Tensor,mask:torch.Tensor = None) -> torch.Tensor:
        B,T,C = K.shape
        # print(Q.device)
        Q = self.dropout(self.queries(Q))
        K = self.dropout(self.keys(K))
        V = self.dropout(self.values(V))
        scaled_dot_product_attention = (Q @ K.transpose(2,1))/torch.sqrt(torch.tensor(C))
        if mask is not None:
            scaled_dot_product_attention = scaled_dot_product_attention + mask
        dot_product_softened = torch.softmax(scaled_dot_product_attention,dim=-1)
        return dot_product_softened @ V


class MultiHeadSelfAttention(nn.Module):
    def __init__(self,d_model:int, p_drop:float, num_heads:int = 8) -> None:
        super().__init__()
        self.head_dim = math.floor(d_model/num_heads)
        self.layer_norm = nn.LayerNorm(d_model)
        self.heads = [AttentionHead(head_dim=self.head_dim,p_drop=p_drop) for h in range(num_heads)]
        self.linear = nn.Linear(d_model,d_model)
        self.dropout = nn.Dropout(p=p_drop)

    def forward(self,X:torch.Tensor,Q:tuple,K:tuple,V:tuple, mask:torch.Tensor = None) ->torch.Tensor:
        heads_output = []
        for head_index,head in enumerate(self.heads):
            queries = Q[head_index]
            keys = K[head_index]
            values = V[head_index]
            v = head(queries,keys,values,mask) # this could be distributed to multiple devices for // processing
            heads_output.append(v) # accumulate result

        o = torch.cat(heads_output,dim=-1) #concat
        linear_output = self.linear(o) #linear
        dropped_output = self.dropout(linear_output) #dropout
        mhsa_output = self.layer_norm(X+dropped_output)
        return mhsa_output


class FeedForward(nn.Module):
    def __init__(self,d_model:int,p_drop:float,d_ff:int) -> None:
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(in_features=d_model,out_features=d_ff),
            nn.ReLU(),
            nn.Dropout(p=p_drop),
            nn.Linear(in_features=d_ff,out_features=d_model)
        )
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self,X:torch.Tensor) -> torch.Tensor:
        return self.layer_norm(X+self.ffn(X))


mhsa = MultiHeadSelfAttention(d_model=512,p_drop=0.1,num_heads=8).to(device)
sample_data = torch.randn((10,250,512)).to(device)
splits = torch.split(sample_data,64,dim=2)
mhsa(sample_data,splits,splits,splits,mask=None).shape

torch.Size([10, 250, 512])

In [7]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model:int, p_drop:float, d_ff:int, num_heads:int, **kwargs) -> None:
        super().__init__()
        self.head_dim = d_model // num_heads
        self.multihead_self_attention = MultiHeadSelfAttention(d_model=d_model,p_drop=p_drop,num_heads=num_heads)
        self.feedforward = FeedForward(d_model=d_model,p_drop=p_drop,d_ff=d_ff)

    def forward(self,X:torch.Tensor,mask:torch.Tensor=None) -> torch.Tensor:
        splits = torch.split(X,self.head_dim,dim=2)
        return self.feedforward(self.multihead_self_attention(X,splits,splits,splits,mask))


class Encoder(nn.Module):
    def __init__(self, number_of_encoder_blocks:int=6,**kwargs) -> None:
        super().__init__()
        self.encoders = nn.ModuleList([EncoderLayer(**kwargs) for n in range(number_of_encoder_blocks)])

    def forward(self,X:torch.Tensor,mask:torch.Tensor=None) -> torch.Tensor:
        outputs = X
        for encoder_layer in self.encoders:
            outputs = encoder_layer(outputs,mask)
        return outputs


class DecoderLayer(nn.Module):
    def __init__(self,d_model:int,p_drop:float,d_ff:int,num_heads:int,**kwargs) -> None:
        super().__init__()
        self.head_dim = d_model // num_heads
        self.masked_multi_head_self_attention = MultiHeadSelfAttention(d_model=d_model,p_drop=p_drop,num_heads=num_heads)
        self.masked_multi_head_cross_attention = MultiHeadSelfAttention(d_model=d_model,p_drop=p_drop,num_heads=num_heads)
        self.feedforward = FeedForward(d_model=d_model,p_drop=p_drop,d_ff=d_ff)

    def forward(self,outputs:torch.Tensor,encoded_sequence:torch.Tensor,self_attention_mask:torch.Tensor=None,cross_attention_mask:torch.Tensor=None) -> torch.Tensor:
        output_splits = torch.split(outputs,self.head_dim,dim=2)
        masked_output = self.masked_multi_head_self_attention(outputs,Q=output_splits,K=output_splits,V=output_splits,mask=self_attention_mask)

        masked_output_splits = torch.split(masked_output,self.head_dim,dim=2)
        encoded_sequence_splits = torch.split(encoded_sequence,self.head_dim,dim=2)
        mhsa_output = self.masked_multi_head_cross_attention(masked_output,Q=masked_output_splits,K=encoded_sequence_splits,V=encoded_sequence_splits,mask=cross_attention_mask)
        return self.feedforward(mhsa_output)


class Decoder(nn.Module):
    def __init__(self, number_of_decoder_blocks:int, **kwargs) -> None:
        super().__init__()
        self.decoder_layers = nn.ModuleList([DecoderLayer(**kwargs) for n in range(number_of_decoder_blocks)])

    def forward(self,outputs:torch.Tensor,encoded_sequence:torch.Tensor,self_attention_mask:torch.Tensor=None,cross_attention_mask:torch.Tensor=None) -> torch.Tensor:
        for decoder_layer in self.decoder_layers:
            outputs = decoder_layer(outputs,encoded_sequence,self_attention_mask,cross_attention_mask)
        return outputs



In [8]:
class Transformer(nn.Module):
    def __init__(self,
                 eng_vocab_size:int,
                 swa_vocab_size:int,
                 batch_size:int,
                 context_size:int,
                 d_model:int,
                 d_ff:int,
                 num_heads:int,
                 number_of_encoder_blocks:int,
                 number_of_decoder_blocks:int,
                 p_drop:float):

        super().__init__()
        self.context_size = context_size

        self.encoder_embedding = EmeddingsLayer(d_model=d_model,vocab_size=eng_vocab_size)
        self.decoder_embedding = EmeddingsLayer(d_model=d_model,vocab_size=swa_vocab_size)
        self.positional_encoding = PositionalEncoding(d_model=d_model,context_size=context_size)
        self.dropout = nn.Dropout(p=p_drop)
        self.encoder = Encoder(
                            batch_size=batch_size,
                            context_size=context_size,
                            d_model=d_model,
                            d_ff=d_ff,
                            num_heads=num_heads,
                            number_of_encoder_blocks=number_of_encoder_blocks,
                            p_drop=p_drop)

        self.decoder = Decoder(
                            batch_size=batch_size,
                            context_size=context_size,
                            d_model=d_model,
                            p_drop=p_drop,
                            d_ff=d_ff,
                            num_heads=num_heads,
                            number_of_decoder_blocks=number_of_decoder_blocks)

        self.linear = nn.Linear(in_features=d_model,out_features=swa_vocab_size)


    def forward(self,X:torch.Tensor,y:torch.Tensor,encoder_mask:torch.Tensor,decoder_self_attention_mask:torch.Tensor,decoder_cross_attention_mask:torch.Tensor) -> torch.Tensor:
        pos_encoding = self.positional_encoding().to(device) #
        # encode
        input_embeddings = self.encoder_embedding(X).to(device)
        inputs = self.dropout(input_embeddings+pos_encoding) # B*T*C
        encoded_sequence = self.encoder(inputs,encoder_mask)
        # decode
        output_embedding = self.decoder_embedding(y)
        outputs = self.dropout(output_embedding+pos_encoding) # B*T*C
        decoder_output = self.decoder(outputs,encoded_sequence,self_attention_mask=decoder_self_attention_mask,cross_attention_mask=decoder_cross_attention_mask)
        # linear
        output_logits = self.linear(decoder_output)
        # output_probs = torch.softmax(output_logits,dim=-1)

        return output_logits


training_dataset = TranslationDataset(swahili_sentences=swahili_sentences_tokenized_train,english_sentences=english_sentences_tokenized_train,transforms=transforms.ToTensor())
testing_dataset = TranslationDataset(swahili_sentences=swahili_sentences_tokenized_test,english_sentences=english_sentences_tokenized_test,transforms=transforms.ToTensor())
training_dataloader = DataLoader(training_dataset,batch_size=100,shuffle=True)
testing_dataloader = DataLoader(testing_dataset,batch_size=100,shuffle=False)

eng_sentence,encoder_mask,swa_sentence,decoder_self_attention_mask,decoder_cross_attention_mask = next(iter(training_dataloader))
# print(eng_sentence.shape,swa_sentence.shape)
# print(eng_sentence[0])

config = {
    "eng_vocab_size":eng_vocab_size,
    "swa_vocab_size":swa_vocab_size,
    "batch_size":20,
    "context_size":260,
    "d_model":512,
    "num_heads":8,
    "d_ff":2048,
    "number_of_encoder_blocks": 6,
    "number_of_decoder_blocks": 6,
    "p_drop":0.1
}

model = Transformer(**config).to(device)
# model.load_state_dict(torch.load(pathlib.Path("models/translator.pt")))

In [None]:
EPOCHS = 10
loss_fn = nn.CrossEntropyLoss(ignore_index=swa_token_to_index[PADDING_TOKEN],reduction='none').to(device)
optimizer = optim.Adam(model.parameters(),lr=1e-5)

for epoch in tqdm.tqdm(range(EPOCHS)):
    print(f"Epoch {epoch}")
    # train
    model.train()
    for batch,(eng_batch,encoder_mask,swa_batch,decoder_self_attention_mask,decoder_cross_attention_mask) in enumerate(training_dataloader):
        train_logits = model(eng_batch.to(device),swa_batch.to(device),encoder_mask.to(device),decoder_self_attention_mask.to(device),decoder_cross_attention_mask.to(device))
        loss = loss_fn(train_logits.view(-1,swa_vocab_size).to(device),swa_batch.view(-1).to(device))
        valid_indicies = torch.where(swa_batch.view(-1) == swa_token_to_index[PADDING_TOKEN], 0, 1)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            print(f"Iteration: {batch} Loss: {loss.item()}")
            print(f"English: {''.join([eng_index_to_token[t.item()] for t in eng_batch[0]])}")
            print(f"Swahili Translation: {''.join([swa_index_to_token[t.item()] for t in swa_batch[0]])}")
            swa_sentence_predicted = torch.softmax(train_logits[0],dim=1).argmax(dim=1)
            predicted_sentence = ''
            for idx in swa_sentence_predicted:
              if idx.item() == swa_token_to_index[END_TOKEN]:
                break
              elif idx.item() == swa_token_to_index[PADDING_TOKEN]:
                predicted_sentence += ''
              else:
                predicted_sentence += swa_index_to_token[idx.item()]
            print(f"Swahili Prediction: {predicted_sentence}")
            print()

    # test
    model.eval()
    with torch.inference_mode():
      for test_batch,(test_eng_batch,test_encoder_mask,test_swa_batch,test_decoder_self_attention_mask,test_decoder_cross_attention_mask) in enumerate(testing_dataloader):
          print(f"Test English: {''.join([eng_index_to_token[t.item()] for t in test_eng_batch[0]])}")
          test_logits = model(test_eng_batch.to(device),test_swa_batch.to(device),test_encoder_mask.to(device),test_decoder_self_attention_mask.to(device),test_decoder_cross_attention_mask.to(device))
          swa_sentence_predicted = torch.softmax(test_logits[0],dim=1).argmax(dim=1)
          predicted_sentence = ''
          for idx in swa_sentence_predicted:
            if idx == swa_token_to_index[END_TOKEN]:
              break
            elif idx == swa_token_to_index[PADDING_TOKEN]:
              predicted_sentence += ''
            else:
              predicted_sentence += swa_index_to_token[idx.item()]
          print(f"Test Swahili Prediction: {predicted_sentence}")
          loss = loss_fn(test_logits.view(-1,swa_vocab_size).to(device),test_swa_batch.view(-1).to(device))
          valid_indicies = torch.where(test_swa_batch.view(-1) == swa_token_to_index[PADDING_TOKEN], False, True)
          loss = loss.sum() / valid_indicies.sum()
          print(f"Testing Loss: {loss.item()}")
          print()
          # break
    print("------------------------------------------------------------------------")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0
Iteration: 0 Loss: 4.752274036407471
English: I don't have enough RAM.********************************************************************************************************************************************************************************************************************************************
Swahili Translation: <Sina RAM za kutosha.>**********************************************************************************************************************************************************************************************************************************************
Swahili Prediction: ———nr;N<6r<rh—hx”o"h”xv​———r—————E—pP———————EEE——p”—”p——————————p—————w———————p​————————————<p————​v————————​——p—E————E​———————6————————​————————————p————————​————————————————​E——————————————E—E—Pp​————————————————————E———————————E————<—p—————————<—​——————​———

Test English: My airport shuttle bus leaves at six o'clock.**************************************************************

 10%|█         | 1/10 [00:29<04:23, 29.32s/it]

Test Swahili Prediction: <nhama nhako uha kisiasa ni nisaui kanisau
Testing Loss: 1.2805752754211426

------------------------------------------------------------------------
Epoch 1
Iteration: 0 Loss: 1.4511011838912964
English: If only I'd sold that property during the economic bubble, I wouldn't have suffered such a big loss.****************************************************************************************************************************************************************
Swahili Translation: <Laiti ningekuwa nimeuza mali hiyo wakati uchumi ulipokuwa mzuri, nisingepata hasara kubwa kama hii.>***************************************************************************************************************************************************************
Swahili Prediction: <naiti ninnekuwa nimeuma mali hiyo wakati unhumi uninokuwa mmueiu nisinneoata hasaua kunwa kama hiiuoouoouoouououououuuoouoouuuuuuuououooooooooooouoouooouuuuuouuuuouuuoouuuouuoouuoouooouoouuuuoououoooouuuuuu

 20%|██        | 2/10 [00:58<03:54, 29.31s/it]

Test Swahili Prediction: <<hama  hako  ha kisiasa ni fisadi kabisa.
Testing Loss: 0.31992509961128235

------------------------------------------------------------------------
Epoch 2
Iteration: 0 Loss: 0.40784838795661926
English: There is an important alliance between these two countries.*********************************************************************************************************************************************************************************************************
Swahili Translation: <Kuna muungano muhimu kati ya nchi hizi mbili.>*********************************************************************************************************************************************************************************************************************
Swahili Prediction: <<una muungano muhimu kati ya nchi hizi mbili.

Test English: My airport shuttle bus leaves at six o'clock.**************************************************************************************************

 30%|███       | 3/10 [01:27<03:25, 29.30s/it]

Test Swahili Prediction: <<hama chako cha kisiasa ni fisadi kabisa.
Testing Loss: 0.16619783639907837

------------------------------------------------------------------------
Epoch 3
Iteration: 0 Loss: 0.19036589562892914
English: I know Jacob doesn't know why Susie is doing that.******************************************************************************************************************************************************************************************************************
Swahili Translation: <Najua Jacob hajui kwa nini Susie anafanya hivyo.>******************************************************************************************************************************************************************************************************************
Swahili Prediction: <Najua bacob hajui kwa nini 



 30%|███       | 3/10 [01:37<03:48, 32.63s/it]


KeyboardInterrupt: ignored

In [9]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()
print((param_size + buffer_size))
size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))
!mkdir models
MODEL_PATH = pathlib.Path("models/translator.pt")

torch.save(model.state_dict(),MODEL_PATH)

120338764
model size: 114.764MB
mkdir: cannot create directory ‘models’: File exists


In [10]:
def tokenize(eng_sentence,swa_sentence):
  english_sentence_tokenized = [[eng_token_to_index[t] for t in s] for s in eng_sentence]
  swahili_sentence_tokenized = [[swa_token_to_index[t] for t in s] for s in swa_sentence]
  # print(english_sentence_tokenized)
  # print(swahili_sentence_tokenized)
  eval_dataset = TranslationDataset(swahili_sentences=swahili_sentence_tokenized,english_sentences=english_sentence_tokenized,transforms=transforms.ToTensor())
  eval_dataloader = DataLoader(eval_dataset,batch_size=1,shuffle=False)
  return next(iter(eval_dataloader))


def translate(eng_sentence):
  # model = Transformer(**config).to(device)
  # model.load_state_dict(torch.load(pathlib.Path("models/translator.pt")))
  model.eval()
  with torch.inference_mode():
    eng_sentence = (eng_sentence,)
    swa_sentence = (START_TOKEN,)

    for word_counter in range(260):
      eng_batch,encoder_mask,swa_batch,decoder_self_attention_mask,decoder_cross_attention_mask = tokenize(eng_sentence,swa_sentence)
      # print(encoder_mask)
      predictions = model(eng_batch.to(device),swa_batch.to(device),encoder_mask.to(device),decoder_self_attention_mask.to(device),decoder_cross_attention_mask.to(device))
      next_token_prob_distribution = torch.softmax(predictions[0],dim=1).argmax(dim=1)
      # print(next_token_prob_distribution)
      next_token_index = next_token_prob_distribution[word_counter].item()
      next_token = swa_index_to_token[next_token_index]
      swa_sentence = (swa_sentence[0] + next_token, )
      # print(next_token)
      if next_token == END_TOKEN:
        break
      # break
    return swa_sentence[0]

In [11]:
translation = translate("I am a man")
translation

'00000000U000U0UUU00UUUUUUUUU0UUUUUUUUUUUUUUUUUUUUoUUUUUUUUoUUUUUUUUUoUoUUoUUUoUUUUoooUUoooUoUUUUUooooooUUUoooooUUUooUoUoooUooUoooooooooooooooooooooUoooooooooooooooooooooooooooooo!!ooo!oooo!ooooo!!!!o!!!!!!!o!!!!!!!!!o!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'