# Transformer Architecture
Follows the "Attention is all you need" paper architecture

# Import Libraries

In [1]:
import torch
from torch import nn
from torch import optim
import math
from torch.utils.data import Dataset, DataLoader
from torchtext import transforms
import numpy as np
import pathlib
import tqdm

# Data Preparation

In [2]:
# read in the data from files to lists of strings
swa_sentences = []
with open("./data/translate/gamayun_kit5k.swa","r") as f:
    swa_sentences = f.readlines()
eng_sentences = []
with open("./data/translate/gamayun_kit5k.eng","r") as f:
    eng_sentences = f.readlines()

# remove last \n at the end of every line
swa_sentences = [s.rstrip("\n") for s in swa_sentences]
eng_sentences = [s.rstrip("\n") for s in eng_sentences]

print(f"Size of swahili dataset: {len(swa_sentences)} ")
print(f"Size of english dataset: {len(eng_sentences)} ")
print(f"Max swahili sentence: {max([len(s) for s in swa_sentences])} ")
print(f"Max english sentence: {max([len(s) for s in eng_sentences])} ")

print(swa_sentences[:5])
print(eng_sentences[:5])

START_TOKEN = '>'
PADDING_TOKEN = '*'
END_TOKEN = '<'

# prep swa vocab
swa_vocab = list(set(''.join(swa_sentences)))
swa_vocab.insert(0,START_TOKEN)
swa_vocab.append(PADDING_TOKEN)
swa_vocab.append(END_TOKEN)

# prep eng vocab
eng_vocab = list(set(''.join(eng_sentences)))
eng_vocab.insert(0,START_TOKEN)
eng_vocab.append(PADDING_TOKEN)
eng_vocab.append(END_TOKEN)

print(f"Eng vocab: {eng_vocab}")
print(f"Swa vocab: {swa_vocab}")

swa_vocab_size = len(swa_vocab)
eng_vocab_size = len(eng_vocab)


print(f"Eng vocab_size :{len(swa_vocab)}")
print(f"Swa vocab_size :{len(eng_vocab)}")

swa_token_to_index = {t:i for i,t in enumerate(swa_vocab)}
print(swa_token_to_index)
swa_index_to_token = {i:t for i,t in enumerate(swa_vocab)}
# print(swa_index_to_token)
eng_token_to_index = {t:i for i,t in enumerate(eng_vocab)}
print(eng_token_to_index)
eng_index_to_token = {i:t for i,t in enumerate(eng_vocab)}
# print(eng_index_to_token)

# tokenize
swahili_sentences_tokenized = [[swa_token_to_index[t] for t in s] for s in swa_sentences]
english_sentences_tokenized = [[eng_token_to_index[t] for t in s] for s in eng_sentences]

# train/test split
swahili_sentences_tokenized_train = swahili_sentences_tokenized[:4500]
swahili_sentences_tokenized_test = swahili_sentences_tokenized[4500:]
english_sentences_tokenized_train = english_sentences_tokenized[:4500]
english_sentences_tokenized_test = english_sentences_tokenized[4500:]

Size of swahili dataset: 5000 
Size of english dataset: 5000 
Max swahili sentence: 249 
Max english sentence: 233 
['Huyo ni rafiki yako mpya?', 'Job hana hamu ya mpira wa vikapu.', 'Adam aliniambia kuwa Alice alikuwa na mpenzi mpya wa kiume', 'Radio haikutanga kuhusu ajali hiyo.', 'Adamu ana wasiwasi tutapotea.']
['Is that your new friend?', "Jacob wasn't interested in baseball.", 'Adam told me that Alice had a new boyfriend.', "The radio didn't inform about the accident.", "Adam is worried we'll get lost."]
Eng vocab: ['>', '.', ' ', 'D', 'O', 'g', 'l', 'c', ')', '—', 'H', '0', '?', 'E', 'F', '“', 'é', 'Y', '(', '5', 'A', 'S', 'i', '1', '8', 'Q', 'h', 'T', 'Z', 'x', 'n', 'L', 'W', 'U', '°', 'à', 'y', 'a', '’', ':', 'q', 'C', 'o', 'b', 'f', 'J', '4', '&', 'w', 'M', 'N', '-', '"', '7', '6', 's', 'G', 'd', 'P', '_', 'K', 'k', '3', 'v', 'u', 'j', ',', '”', '9', 'p', 'm', 'V', 'z', 'e', '$', 'I', '2', 't', 'r', 'B', '!', "'", ';', 'R', '*', '<']
Swa vocab: ['>', '.', ' ', 'D', 'O', 'g', 

In [3]:
NEG_INFTY = -1e9

class TranslationDataset(Dataset):

    def __init__(self, swahili_sentences, english_sentences, transforms=None, eng_max_sequence_length=250, swa_max_sequence_length=250):
        print(swa_max_sequence_length,eng_max_sequence_length)
        self.english_sentences = english_sentences
        self.swahili_sentences = swahili_sentences
        self.transforms = transforms
        self.eng_max_sequence_length = eng_max_sequence_length
        self.swa_max_sequence_length = swa_max_sequence_length
        self.encoder_padding_mask = torch.full([self.eng_max_sequence_length, self.eng_max_sequence_length] , False) # each sentence gets a mask
        self.look_ahead_mask = torch.triu(torch.full([self.swa_max_sequence_length, self.swa_max_sequence_length] , True), diagonal=1)
        self.decoder_padding_mask_self_attention = torch.full([self.swa_max_sequence_length, self.swa_max_sequence_length] , False) # each sentence gets a mask
        self.decoder_padding_mask_cross_attention = torch.full([self.swa_max_sequence_length, self.eng_max_sequence_length] , False) # each sentence gets a mask

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        eng_sentence = self.english_sentences[idx]
        swa_sentence = self.swahili_sentences[idx]
        eng_sentence_length = len(eng_sentence)
        swa_sentence_length = len(swa_sentence)
        eng_chars_to_padding_mask = np.arange(eng_sentence_length, self.eng_max_sequence_length) # fillers
        swa_chars_to_padding_mask = np.arange(swa_sentence_length, self.swa_max_sequence_length) # fillers

        for _ in range(len(eng_sentence), self.eng_max_sequence_length):
            eng_sentence.append(eng_token_to_index[PADDING_TOKEN])
        for _ in range(len(swa_sentence), self.swa_max_sequence_length):
            swa_sentence.append(swa_token_to_index[PADDING_TOKEN])

        self.encoder_padding_mask[:, eng_chars_to_padding_mask] = True
        self.encoder_padding_mask[eng_chars_to_padding_mask, :] = True
        encoder_padding_mask = torch.where(self.encoder_padding_mask, NEG_INFTY, 0) # encoder mask

        self.decoder_padding_mask_self_attention[:, swa_chars_to_padding_mask] = True
        self.decoder_padding_mask_self_attention[swa_chars_to_padding_mask, :] = True
        decoder_self_attention_mask = torch.where(self.decoder_padding_mask_self_attention+self.look_ahead_mask, NEG_INFTY, 0) # decoder self-attention mask

        self.decoder_padding_mask_cross_attention[:, eng_chars_to_padding_mask] = True
        self.decoder_padding_mask_cross_attention[swa_chars_to_padding_mask, :] = True
        decoder_cross_attention_mask = torch.where(self.decoder_padding_mask_cross_attention, NEG_INFTY, 0) # decoder cross-attention mask

        if self.transforms:
            swa_sentence = self.transforms(swa_sentence)
            eng_sentence = self.transforms(eng_sentence)

        return eng_sentence,encoder_padding_mask, swa_sentence, decoder_self_attention_mask, decoder_cross_attention_mask

# Input

Token Embeddings & Positional Encodings

In [4]:
class EmeddingsLayer(nn.Module):
    def __init__(self, d_model:int, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.d_model)

    def forward(self, X):
        return self.embedding(X) * math.sqrt(self.d_model) # "In the embedding layers, we multiply those weights by sqrt(d_model)"


class PositionalEncoding(nn.Module):
    def __init__(self, d_model:int, context_size:int):
        super().__init__()
        self.d_model = d_model
        self.context_size = context_size
        # print(self.context_size)

        self.pe = torch.zeros(self.context_size, self.d_model,requires_grad=False)
        for pos in range(self.context_size):
            for i in range(0, self.d_model, 2):
                self.pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.d_model)))
                self.pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.d_model)))
    
    def forward(self):
        return self.pe.unsqueeze(0)


embed_test = EmeddingsLayer(d_model=512,vocab_size=50000)
pencoding = PositionalEncoding(d_model=512,context_size=1024)
example_data = torch.randint(1,50000,(64,1024))
print(example_data.shape)
embed_output = embed_test(example_data)
print(embed_output.shape,embed_output[0][0][:10])
pe_output = pencoding()
print(pe_output.shape,pe_output[0][0][:10])
embed_pos_output = embed_output + pe_output
print(embed_pos_output.shape,embed_pos_output[0][0][:10])


torch.Size([64, 1024])
torch.Size([64, 1024, 512]) tensor([-25.7044,  -8.7932,  58.8619,  48.2357, -36.0688,  -3.0579, -11.2855,
         -8.4295, -44.5982, -29.2443], grad_fn=<SliceBackward0>)
torch.Size([1, 1024, 512]) tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1.])
torch.Size([64, 1024, 512]) tensor([-25.7044,  -7.7932,  58.8619,  49.2357, -36.0688,  -2.0579, -11.2855,
         -7.4295, -44.5982, -28.2443], grad_fn=<SliceBackward0>)


# Attention

In [5]:
class AttentionHead(nn.Module):
    
    def __init__(self, head_dim:int,p_drop:float) -> None:
        super().__init__()
        self.queries= nn.Linear(in_features=head_dim,out_features=head_dim) # kaparthy set bias=False why?
        self.keys = nn.Linear(in_features=head_dim,out_features=head_dim) # kaparthy set bias=False why?
        self.values = nn.Linear(in_features=head_dim,out_features=head_dim) # kaparthy set bias=False why?
        self.dropout = nn.Dropout(p=p_drop)

    def forward(self,Q:torch.Tensor,K:torch.Tensor,V:torch.Tensor,mask:torch.Tensor = None) -> torch.Tensor:
        B,T,C = K.shape
        Q = self.dropout(self.queries(Q))
        K = self.dropout(self.keys(K))
        V = self.dropout(self.values(V))
        scaled_dot_product_attention = (Q @ K.transpose(2,1))/torch.sqrt(torch.tensor(C))
        if mask is not None:
            scaled_dot_product_attention = scaled_dot_product_attention + mask
        dot_product_softened = torch.softmax(scaled_dot_product_attention,dim=-1)
        return dot_product_softened @ V


class MultiHeadSelfAttention(nn.Module):
    def __init__(self,d_model:int, p_drop:float, num_heads:int = 8) -> None:
        super().__init__()
        self.head_dim = math.floor(d_model/num_heads)
        self.layer_norm = nn.LayerNorm(d_model)
        self.heads = [AttentionHead(head_dim=self.head_dim,p_drop=p_drop) for h in range(num_heads)]
        self.linear = nn.Linear(d_model,d_model)
        self.dropout = nn.Dropout(p=p_drop)
    
    def forward(self,X:torch.Tensor,Q:tuple,K:tuple,V:tuple, mask:torch.Tensor = None) ->torch.Tensor:
        heads_output = []
        for head_index,head in enumerate(self.heads):
            queries = Q[head_index]
            keys = K[head_index]
            values = V[head_index]
            v = head(queries,keys,values,mask) # this could be distributed to multiple devices for // processing
            heads_output.append(v) # accumulate result
         
        o = torch.cat(heads_output,dim=-1) #concat
        linear_output = self.linear(o) #linear
        dropped_output = self.dropout(linear_output) #dropout
        mhsa_output = self.layer_norm(X+dropped_output)
        return mhsa_output

mhsa = MultiHeadSelfAttention(d_model=512,p_drop=0.1,num_heads=8)
sample_data = torch.randn((10,250,512))
splits = torch.split(sample_data,64,dim=2)
mhsa(sample_data,splits,splits,splits,mask=None).shape

torch.Size([10, 250, 512])

# Position-wise Feedforward Network

In [6]:
class FeedForward(nn.Module):
    def __init__(self,d_model:int,p_drop:float,d_ff:int) -> None:
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(in_features=d_model,out_features=d_ff),
            nn.ReLU(),
            nn.Dropout(p=p_drop),
            nn.Linear(in_features=d_ff,out_features=d_model)
        )
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self,X:torch.Tensor) -> torch.Tensor:
        return self.layer_norm(X+self.ffn(X))

# Encorder

In [7]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model:int, p_drop:float, d_ff:int, num_heads:int, **kwargs) -> None:
        super().__init__()
        self.head_dim = d_model // num_heads
        self.multihead_self_attention = MultiHeadSelfAttention(d_model=d_model,p_drop=p_drop,num_heads=num_heads)
        self.feedforward = FeedForward(d_model=d_model,p_drop=p_drop,d_ff=d_ff)

    def forward(self,X:torch.Tensor,mask:torch.Tensor=None) -> torch.Tensor:
        splits = torch.split(X,self.head_dim,dim=2)
        return self.feedforward(self.multihead_self_attention(X,splits,splits,splits,mask))
    
class Encoder(nn.Module):
    def __init__(self, number_of_encoder_blocks:int=6,**kwargs) -> None:
        super().__init__()
        self.encoders = nn.ModuleList([EncoderLayer(**kwargs) for n in range(number_of_encoder_blocks)])

    def forward(self,X:torch.Tensor,mask:torch.Tensor=None) -> torch.Tensor:
        outputs = X
        for encoder_layer in self.encoders:
            outputs = encoder_layer(outputs,mask)
        return outputs

# Decoder

In [8]:
class DecoderLayer(nn.Module):
    def __init__(self,d_model:int,p_drop:float,d_ff:int,num_heads:int,**kwargs) -> None:
        super().__init__()
        self.head_dim = d_model // num_heads
        self.masked_multi_head_self_attention = MultiHeadSelfAttention(d_model=d_model,p_drop=p_drop,num_heads=num_heads)
        self.masked_multi_head_cross_attention = MultiHeadSelfAttention(d_model=d_model,p_drop=p_drop,num_heads=num_heads)
        self.feedforward = FeedForward(d_model=d_model,p_drop=p_drop,d_ff=d_ff)

    def forward(self,outputs:torch.Tensor,encoded_sequence:torch.Tensor,self_attention_mask:torch.Tensor=None,cross_attention_mask:torch.Tensor=None) -> torch.Tensor:
        output_splits = torch.split(outputs,self.head_dim,dim=2)
        masked_output = self.masked_multi_head_self_attention(outputs,Q=output_splits,K=output_splits,V=output_splits,mask=self_attention_mask)
        
        masked_output_splits = torch.split(masked_output,self.head_dim,dim=2)
        encoded_sequence_splits = torch.split(encoded_sequence,self.head_dim,dim=2)
        mhsa_output = self.masked_multi_head_cross_attention(masked_output,Q=masked_output_splits,K=encoded_sequence_splits,V=encoded_sequence_splits,mask=cross_attention_mask)
        return self.feedforward(mhsa_output)


class Decoder(nn.Module):
    def __init__(self, number_of_decoder_blocks:int, **kwargs) -> None:
        super().__init__()
        self.decoder_layers = nn.ModuleList([DecoderLayer(**kwargs) for n in range(number_of_decoder_blocks)])
        
    def forward(self,outputs:torch.Tensor,encoded_sequence:torch.Tensor,self_attention_mask:torch.Tensor=None,cross_attention_mask:torch.Tensor=None) -> torch.Tensor:
        for decoder_layer in self.decoder_layers:
            outputs = decoder_layer(outputs,encoded_sequence,self_attention_mask,cross_attention_mask)
        return outputs



# Transformer

In [9]:
class Transformer(nn.Module):
    def __init__(self,
                 eng_vocab_size:int,
                 swa_vocab_size:int,
                 batch_size:int,
                 context_size:int,
                 d_model:int,
                 d_ff:int,
                 num_heads:int,
                 number_of_encoder_blocks:int,
                 number_of_decoder_blocks:int,
                 p_drop:float):
        
        super().__init__()
        self.context_size = context_size

        self.encoder_embedding = EmeddingsLayer(d_model=d_model,vocab_size=eng_vocab_size)
        self.decoder_embedding = EmeddingsLayer(d_model=d_model,vocab_size=swa_vocab_size)
        self.positional_encoding = PositionalEncoding(d_model=d_model,context_size=context_size)
        self.dropout = nn.Dropout(p=p_drop)
        self.encoder = Encoder(
                            batch_size=batch_size,
                            context_size=context_size,
                            d_model=d_model,
                            d_ff=d_ff,
                            num_heads=num_heads,
                            number_of_encoder_blocks=number_of_encoder_blocks,
                            p_drop=p_drop)
        
        self.decoder = Decoder(
                            batch_size=batch_size,
                            context_size=context_size,
                            d_model=d_model,
                            p_drop=p_drop,
                            d_ff=d_ff,
                            num_heads=num_heads,
                            number_of_decoder_blocks=number_of_decoder_blocks)
        
        self.linear = nn.Linear(in_features=d_model,out_features=swa_vocab_size)


    def forward(self,X:torch.Tensor,y:torch.Tensor,encoder_mask:torch.Tensor,decoder_self_attention_mask:torch.Tensor,decoder_cross_attention_mask:torch.Tensor) -> torch.Tensor:
        pos_encoding = self.positional_encoding() #
        # print(pos_encoding[:5])

        # encode
        input_embeddings = self.encoder_embedding(X) 
        # print(input_embeddings[:5])

        inputs = self.dropout(input_embeddings+pos_encoding) # B*T*C
        # print(inputs[:5])
        # print(encoder_mask[:5])

        encoded_sequence = self.encoder(inputs,encoder_mask)
        # print(encoded_sequence[:5])
        # decode
        output_embedding = self.decoder_embedding(y)
        outputs = self.dropout(output_embedding+pos_encoding) # B*T*C
        decoder_output = self.decoder(outputs,encoded_sequence,self_attention_mask=decoder_self_attention_mask,cross_attention_mask=decoder_cross_attention_mask)
        # linear
        output_logits = self.linear(decoder_output)
        # output_probs = torch.softmax(output_logits,dim=-1)

        return output_logits

In [10]:
training_dataset = TranslationDataset(swahili_sentences=swahili_sentences_tokenized_train,english_sentences=english_sentences_tokenized_train,transforms=transforms.ToTensor())
testing_dataset = TranslationDataset(swahili_sentences=swahili_sentences_tokenized_test,english_sentences=english_sentences_tokenized_test,transforms=transforms.ToTensor())
training_dataloader = DataLoader(training_dataset,batch_size=10,shuffle=True)
testing_dataloader = DataLoader(testing_dataset,batch_size=10,shuffle=False)

eng_sentence,encoder_mask,swa_sentence,decoder_self_attention_mask,decoder_cross_attention_mask = next(iter(training_dataloader))
print(eng_sentence.shape,swa_sentence.shape)
print(eng_sentence[0])

250 250
250 250
torch.Size([10, 250]) torch.Size([10, 250])
tensor([49, 64, 26, 37, 70, 70, 37, 57,  2, 55, 48, 22, 77,  7, 26, 73, 57,  2,
        42, 44, 44,  2, 37,  6,  6,  2, 77, 26, 73,  2,  6, 22,  5, 26, 77, 55,
         2, 37, 30, 57,  2,  6, 22, 77,  2, 77, 26, 73,  2,  7, 37, 30, 57,  6,
        73, 55,  1, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
        84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 

# Train Transformer


In [13]:
config = {
    "eng_vocab_size":eng_vocab_size,
    "swa_vocab_size":swa_vocab_size,
    "batch_size":100,
    "context_size":250,
    "d_model":512,
    "num_heads":8,
    "d_ff":2048,
    "number_of_encoder_blocks": 2,
    "number_of_decoder_blocks": 2,
    "p_drop":0.1
}
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Transformer(**config)
# model

In [15]:
EPOCHS = 10
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=1e-4)

for epoch in tqdm.tqdm(range(EPOCHS)):
    print(f"Epoch {epoch}")
    # train
    model.train()
    for batch,(eng_batch,encoder_mask,swa_batch,decoder_self_attention_mask,decoder_cross_attention_mask) in enumerate(training_dataloader):
        train_logits = model(eng_batch,swa_batch,encoder_mask,decoder_self_attention_mask,decoder_cross_attention_mask)
        loss = loss_fn(train_logits.view(-1,swa_vocab_size),swa_batch.view(-1))
        valid_indicies = torch.where(swa_batch.view(-1) == swa_token_to_index[PADDING_TOKEN], 0, 1)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            print(f"Iteration: {batch} Loss: {loss.item()}")
            print(f"English: {''.join([eng_index_to_token[t.item()] for t in eng_batch[0]])}")
            print(f"Swahili Translation: {''.join([swa_index_to_token[t.item()] for t in swa_batch[0]])}")
            swa_sentence_predicted = torch.argmax(train_logits[0], axis=1)
            predicted_sentence = ''
            for idx in swa_sentence_predicted:
              if idx.item() == swa_token_to_index[END_TOKEN]:
                break
              elif idx.item() == swa_token_to_index[PADDING_TOKEN]:
                predicted_sentence += ''
              else:
                predicted_sentence += swa_index_to_token[idx.item()]
            print(f"Swahili Prediction: {predicted_sentence}")

    # test
    model.eval()
    for test_batch,(test_eng_batch,test_encoder_mask,test_swa_batch,test_decoder_self_attention_mask,test_decoder_cross_attention_mask) in enumerate(testing_dataloader):
        print(f"Test English: {''.join([eng_index_to_token[t.item()] for t in test_eng_batch[0]])}")
        test_logits = model(test_eng_batch,test_swa_batch,test_encoder_mask,test_decoder_self_attention_mask,test_decoder_cross_attention_mask)
        swa_sentence_predicted = torch.argmax(test_logits[0], axis=1)
        predicted_sentence = ''
        for idx in swa_sentence_predicted:
          if idx == swa_token_to_index[END_TOKEN]:
            break
          elif idx == swa_token_to_index[PADDING_TOKEN]:
            predicted_sentence += ''
          else:
            predicted_sentence += swa_index_to_token[idx.item()]
        print(f"Test Swahili Prediction: {predicted_sentence}")
        loss = loss_fn(test_logits.view(-1,swa_vocab_size),test_swa_batch.view(-1))
        valid_indicies = torch.where(test_swa_batch.view(-1) == swa_token_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        print(f"Testing Loss: {loss.item()}")
        print()
    print("------------------------------------------------------------------------")


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0
Iteration: 0 Loss: 0.008157229982316494
English: Science has discovered that there are five types of human beings.*****************************************************************************************************************************************************************************************
Swahili Translation: Sayansi imegundua kuwa kuna aina tano za wanadamu.********************************************************************************************************************************************************************************************************
Swahili Prediction: ”Y)+(Ed)RL+hH(l​;&7P;;"fk;L5d(?;1;(1)”;)Vm(;1​L1O((((((4(4(4(4444:(:((444444V)4((:4(((((((((:(4:44((((((((444:4(((4(4(((((:((((((4(V4(4(4((4(4(44(:((V(((444(444(((((V((4(44((4((4((44((V((((((((s(((444V(4(44(44::4V(:s4(((((:(44(44V((((:((((4((4(444(4


In [14]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()
print((param_size + buffer_size))
size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))
!mkdir models
MODEL_PATH = pathlib.Path("models/translator.pt")

torch.save(model.state_dict(),MODEL_PATH)

40456524
model size: 38.582MB
mkdir: cannot create directory ‘models’: File exists
