In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from torchtext.vocab import build_vocab_from_iterator
import string
import random
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
file_path = "../Sample Data/english to bengali.csv"
pd.read_csv(file_path).head()

Unnamed: 0,english_caption,bengali_caption
0,a child in a pink dress is climbing up a set o...,একটি গোলাপী জামা পরা বাচ্চা মেয়ে একটি বাড়ির প্...
1,a girl going into a wooden building .,একটি মেয়ে শিশু একটি কাঠের বাড়িতে ঢুকছে
2,a little girl climbing into a wooden playhouse .,একটি বাচ্চা তার কাঠের খেলাঘরে উঠছে ।
3,a little girl climbing the stairs to her playh...,ছোট মেয়েটি তার খেলার ঘরের সিড়ি বেয়ে উঠছে
4,a little girl in a pink dress going into a woo...,গোলাপি জামা পড়া ছোট একটি মেয়ে একটি কাঠের তৈরি...


In [74]:
class BaseDataset(Dataset):
    def __init__(self, file_path: str, column: str, add_sos_eos: bool=False):
        self.df = pd.read_csv(file_path)
        self.sentences = self.df[column]
        self.vocabs = build_vocab_from_iterator(
            self.token_genarator(self.sentences)
            )
        self.add_sos_eos = add_sos_eos
        if self.add_sos_eos == True:
            extra_tokens = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]
            for token in extra_tokens:
                self.vocabs.append_token(token)
        else:
            extra_tokens = ["<PAD>", "<UNK>"]
            for token in extra_tokens:
                self.vocabs.append_token(token)

    def token_genarator(self, sentences):
        for text in sentences:
            clean_text = "".join(
                [word for word in text 
                 if word not in string.punctuation]
            )
            tokens = word_tokenize(clean_text)
            yield tokens

    def text_to_sequences(self, sentences):
        sequence = [
            self.vocabs[token] if token in self.vocabs
            else self.vocabs["<UNK>"]
            for token in word_tokenize(sentences)
            ]
        if self.add_sos_eos == True:
            sequence = [self.vocabs["<SOS>"]] + sequence + [self.vocabs["<EOS>"]]

        return sequence

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, index):
        item = self.sentences[index]
        sequence = self.text_to_sequences(item)
        return torch.tensor(sequence)
    

class CombineDataset(Dataset):
    def __init__(self, data_path, eng_column, ban_column):
        self.eng_data = BaseDataset(
            file_path= data_path,
            column = eng_column, 
            add_sos_eos=True,
        )
        self.bng_data = BaseDataset(
            file_path= data_path,
            column = ban_column,
            add_sos_eos=True
        )

    @staticmethod
    def collate_fn(batch):
        en, bn = zip(*batch)
        # en = [item[0] for item in batch]
        # bn = [item[1] for item in batch]
        en_padded = pad_sequence(en, padding_value=0, batch_first=False)
        bn_padded = pad_sequence(bn, padding_value=0, batch_first=False)
        return en_padded, bn_padded 

    def __len__(self):
        return len(self.eng_data)
    
    def __getitem__(self, index):
        eng_item = self.eng_data[index]
        bng_item = self.bng_data[index]
        return eng_item, bng_item

In [75]:
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, n_layers, dropout_rate, debugging=False):
        super(Encoder, self).__init__()
        self.debugging = debugging
        self.embed_layer = nn.Embedding(input_size, embed_size)
        self.rnn_layer = nn.GRU(embed_size, hidden_size, n_layers)
        self.dropout_layer = nn.Dropout(p=dropout_rate)

    def forward(self, source):
        # Apply dropout and get embedding
        embedding = self.dropout_layer(self.embed_layer(source))
        # Unpack RNN
        output, hidden = self.rnn_layer(embedding)

        if self.debugging:
            print("Encoder Embedding Shape", embedding.shape)
            print("Encoder Output Shape", output.shape)
            print("Encoder Hidden Shape", hidden.shape)
            
        return output, hidden

In [76]:
class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, n_layers, dropout_rate, debugging=False):
        super(Decoder, self).__init__()
        self.debugging = debugging
        self.embed_layer = nn.Embedding(output_size, embed_size)
        self.rnn_layer = nn.GRU(embed_size, hidden_size, n_layers)
        self.fc_layer = nn.Linear(hidden_size, output_size)
        self.dropout_layer = nn.Dropout(p=dropout_rate)

    def forward(self, input, encoder_hidden):
        #Input shape (batch_size) so we have to add an extra dim (1, batch_size)
        input = input.unsqueeze(0)
        embed = self.dropout_layer(self.embed_layer(input))
        #embed size (1, batch_size, embed_size)
        #encoder hidden shape (layer_size, batch_size, hidden_size)
        output, hidden = self.rnn_layer(embed, encoder_hidden)
        prediction = self.fc_layer(output)
        #prediction shape (1, batch_size, target_vocab_size) but need (batch_size, target_vocab_size)
        prediction = prediction.squeeze(0)

        if self.debugging:
            print("Decoder Embedding Shape", embed.shape)
            print("Decoder Input Shape", input.shape)
            print("Decoder Prediction Shape", prediction.shape)
            print("Decoder Hidden Shape", hidden.shape)

        return prediction, hidden

In [77]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, output_size):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.output_size = output_size

    def forward(self, source, target, tfr=0.5):
        batch_size = source.shape[1]
        seq_len = target.shape[0]

        encoder_output, encoder_hidden = self.encoder(source)
        # Start with the first word of the target "<SOS>"
        start = target[0]
        outputs = torch.zeros(seq_len, batch_size, self.output_size).to(device)
        for t in range(1, seq_len):
            decoder_output, decoder_hidden = self.decoder(start, encoder_hidden)
            outputs[t] = decoder_output

            top_pred = decoder_output.argmax(1)
            start = (target[t] if random.random() < tfr else top_pred)
            
        return outputs

In [78]:
def train_fn(model, loss_fn, dataloader, optimizer, device, clip_size=1.0):
    model.train()
    current_loss = 0.0
    
    for batch, (source, target) in enumerate(dataloader):
        source = source.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(source, target)
        output = output[1:].reshape(-1, output.shape[-1])
        target = target[1:].reshape(-1)
        loss = loss_fn(output, target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_size)
        optimizer.step()

        current_loss += loss.item()
    avg_loss = current_loss / len(dataloader)
    return avg_loss

In [79]:
def evaluate_fn(model, loss_fn, dataloader, device):
    model.eval()
    current_loss = 0.0
    
    with torch.no_grad():
        for batch, (source, target) in enumerate(dataloader):
            source = source.to(device)
            target = target.to(device)

            output = model(source, target)
            output = output[1:].reshape(-1, output.shape[-1])
            target = target[1:].reshape(-1)
            loss = loss_fn(output, target)

            current_loss += loss.item()

    avg_loss = current_loss / len(dataloader)
    return avg_loss

In [80]:
dataset = CombineDataset(
    data_path="/kaggle/input/english-to-bengali-for-machine-translation/english to bengali.csv",
    eng_column="english_caption", ban_column="bengali_caption")

dataloader = DataLoader(
    dataset,
    batch_size=512,
    shuffle=True,
    collate_fn=dataset.collate_fn)

english, bangla = next(iter(dataloader))
print(english.shape, bangla.shape)
print(f"Bangla Vocabs: {len(dataset.bng_data.vocabs)}")
print(f"English Vocabs: {len(dataset.eng_data.vocabs)}")

input_size = len(dataset.eng_data.vocabs)
output_size = len(dataset.bng_data.vocabs)
embed_size = 500
hidden_size = 128
n_layers = 1
dropout_rate = 0.1
lr_rate = 0.01

encoder = Encoder(
    input_size, 
    embed_size, 
    hidden_size, 
    n_layers, 
    dropout_rate).to(device)
decoder = Decoder(
    output_size, 
    embed_size, 
    hidden_size, 
    n_layers, 
    dropout_rate).to(device)

model = Seq2Seq(encoder, decoder, output_size).to(device)

pad_idx = dataset.bng_data.vocabs["<PAD>"]
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)

torch.Size([33, 512]) torch.Size([27, 512])
Bangla Vocabs: 16744
English Vocabs: 8748


In [81]:
epochs = 10
for epoch in tqdm(range(epochs)):
    train_loss = train_fn(model, loss_fn,dataloader,optimizer,device)
    val_loss = evaluate_fn(model, loss_fn, dataloader, device)
    print("Train Loss", train_loss)
    print("Validation Loss", val_loss)

 10%|█         | 1/10 [02:09<19:25, 129.46s/it]

Train Loss 3.0242136360762957
Validation Loss 2.543363803392881


 20%|██        | 2/10 [04:19<17:19, 129.91s/it]

Train Loss 2.602988229169474
Validation Loss 2.4377011772874115


 30%|███       | 3/10 [06:31<15:16, 130.95s/it]

Train Loss 2.4374144154709656
Validation Loss 2.3145221502749953


 40%|████      | 4/10 [08:44<13:09, 131.66s/it]

Train Loss 2.30901465787516
Validation Loss 2.2233435627701996


 50%|█████     | 5/10 [10:57<11:00, 132.00s/it]

Train Loss 2.261038986119357
Validation Loss 2.1581644562931803


 60%|██████    | 6/10 [13:10<08:49, 132.38s/it]

Train Loss 2.2437540494002306
Validation Loss 2.1386744465146745


 70%|███████   | 7/10 [15:22<06:37, 132.40s/it]

Train Loss 2.1654809759808824
Validation Loss 2.1134472125536434


 80%|████████  | 8/10 [17:33<04:23, 131.84s/it]

Train Loss 2.133637058270442
Validation Loss 2.0551144627781657


 90%|█████████ | 9/10 [19:42<02:11, 131.06s/it]

Train Loss 2.0964670475427205
Validation Loss 2.0448613352589793


100%|██████████| 10/10 [21:51<00:00, 131.14s/it]

Train Loss 2.1328918779051147
Validation Loss 2.031452330675992





In [1]:
# torch.save(model.state_dict(), "Seq2Seq_Model")

In [2]:
# model.load_state_dict(torch.load("/kaggle/working/Seq2Seq_Model", map_location=device))

In [None]:
def translate_sentence(sentence, model=model, dataset=dataset, device=device):
    model.eval()
    with torch.no_grad():
        tokens = [token for token in word_tokenize(sentence)]
        tokens = ["<SOS>"] + tokens + ["<EOS>"]
        indices = dataset.eng_data.vocabs.lookup_indices(tokens)
        
        tensor = torch.LongTensor(indices).unsqueeze(-1).to(device)
        _, hidden = model.encoder(tensor)
        inputs = dataset.bng_data.vocabs.lookup_indices(["<SOS>"])
        
        for _ in range(len(sentence)):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden = model.decoder(inputs_tensor, hidden)
            predicted_token = output.argmax(1)
            inputs.append(predicted_token)
            if predicted_token == dataset.bng_data.vocabs["<EOS>"]:
                break
        tokens = dataset.bng_data.vocabs.lookup_tokens(inputs)
    return " ".join(tokens)

sentence = "a girl"
translate_sentence(sentence)