# Imports and utils

In [3]:
from utils import *
import numpy as np
import pickle
import wandb

In [4]:
# Index -> character lookup tables for both scripts.
# NOTE: pickle executes arbitrary code on load; only use vocabulary files from a trusted source.
# The files are opened via `with` so the handles are closed as soon as loading finishes.
with open('vocab_tools/index_to_english_alphabet.pickle', 'rb') as vocab_file:
    index_to_english_alphabet = pickle.load(vocab_file)
with open('vocab_tools/index_to_hindi_alphabet.pickle', 'rb') as vocab_file:
    index_to_hindi_alphabet = pickle.load(vocab_file)

In [5]:
# Character -> index lookup tables for both scripts.
# NOTE: pickle executes arbitrary code on load; only use vocabulary files from a trusted source.
# The files are opened via `with` so the handles are closed as soon as loading finishes.
with open('vocab_tools/hindi_alphabet_to_index.pickle', 'rb') as vocab_file:
    hindi_alphabet_to_index = pickle.load(vocab_file)
with open('vocab_tools/english_alphabet_to_index.pickle', 'rb') as vocab_file:
    english_alphabet_to_index = pickle.load(vocab_file)

In [6]:
# Load the train / validation arrays (inputs X, targets y) straight from disk.
X_train, X_valid = (np.load(path) for path in ('simple_data/X_train.npy', 'simple_data/X_val.npy'))
y_train, y_valid = (np.load(path) for path in ('simple_data/y_train.npy', 'simple_data/y_val.npy'))

In [7]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

  from .autonotebook import tqdm as notebook_tqdm


# Dataloaders

In [8]:
class Eng_Hind_Dataset(Dataset):
    """Paired input/output dataset backed by two ``.npy`` files.

    Each item is a dict ``{'input': tensor, 'output': tensor}`` whose tensors
    have been moved to the device given at construction time.
    """

    def __init__(self, in_file, out_file, root_dir='simple_data',device='cuda'):
        """
        Parameters
        ----------
        in_file : str
            File name (inside ``root_dir``) of the input array.

        out_file : str
            File name (inside ``root_dir``) of the output array.

        root_dir : str
            Directory that holds both files.

        device : str or torch.device
            Device every sample is moved to in ``__getitem__``.

        Raises
        ------
        ValueError
            If the two arrays do not have the same number of rows.
        """
        self.input = torch.tensor(np.load(f'{root_dir}/{in_file}'))
        self.output = torch.tensor(np.load(f'{root_dir}/{out_file}'))
        # Keep the requested device; previously the argument was dropped and
        # __getitem__ relied on a notebook-level global named `device`.
        self.device = device

        # The earlier `assert(cond, msg)` asserted a non-empty tuple and could never fail.
        if len(self.input) != len(self.output):
            raise ValueError("Error: I/O Lengths must be same")

    def __len__(self):
        """Number of (input, output) pairs."""
        return len(self.input)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as a dict of tensors on ``self.device``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        X = self.input[idx].to(self.device)
        y = self.output[idx].to(self.device)

        return {'input': X, 'output': y}

In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [10]:
device

device(type='cuda')

In [11]:
# One dataset per split; each reads X_<split>.npy / y_<split>.npy from the default root directory.
training_data, val_data, test_data = (
    Eng_Hind_Dataset(f"X_{split}.npy", f"y_{split}.npy", device=device)
    for split in ("train", "val", "test")
)

In [12]:
# Merge the training and validation splits into a single dataset.
train_val_sets = torch.utils.data.ConcatDataset(datasets=[training_data, val_data])

In [13]:
# Shuffled mini-batches of 16 over the merged train+val dataset.
train_val_dataloader = DataLoader(
    dataset=train_val_sets,
    batch_size=16,
    shuffle=True,
)

In [14]:
# Shuffled mini-batches of 16 over the test split.
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=16,
    shuffle=True,
)

In [15]:
import torch.nn as nn

In [16]:
def cell_type(mode:str='rnn'):
    """Map a cell name to its ``torch.nn`` recurrent layer class.

    The lookup is case-insensitive: ``'rnn'`` gives ``nn.RNN`` and ``'gru'``
    gives ``nn.GRU``. Every other value falls back to ``nn.LSTM``.
    """
    known_cells = {'rnn': nn.RNN, 'gru': nn.GRU}
    return known_cells.get(mode.lower(), nn.LSTM)
        

# Model

In [17]:
class Encoder(nn.Module):
    """Recurrent encoder: embeds source token ids and runs them through an RNN/GRU/LSTM stack."""

    def __init__(self, input_size, enc_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi):
        """
        Encoder module for sequence-to-sequence model.

        Parameters
        ----------
        input_size : int
            Input size, should be equal to the source vocabulary size.

        enc_embed_size : int
            Embedding layer's dimension.

        hid_size : int
            Hidden (and, for LSTM, cell) state dimension of the recurrent layer.

        num_layers : int
            Number of stacked recurrent layers.

        cell_mode : str
            Type of cell to use: LSTM, GRU, or RNN (resolved through ``cell_type``).

        dropout : float
            Dropout rate passed to the recurrent layer.

        is_bi : bool
            Whether the recurrent layer is bidirectional or not.
        """
        super().__init__()

        # Embedding layer; the '.' entry of the notebook-level english_alphabet_to_index
        # mapping is used as the padding index.
        self.embedding = nn.Embedding(input_size, enc_embed_size, padding_idx=english_alphabet_to_index['.'])

        # Create LSTM/GRU/RNN cell
        cell = cell_type(cell_mode)

        self.cell = cell(enc_embed_size, hid_size, num_layers, dropout=dropout, bidirectional=is_bi, batch_first=True)
        self.cell_mode = cell_mode

    def forward(self, input_batch: torch.LongTensor):
        """
        Forward pass of the Encoder module.

        Parameters
        ----------
        input_batch : torch.LongTensor
            Token indices; the recurrent layer is built with ``batch_first=True``,
            so the leading dimension is treated as the batch.

        Returns
        -------
        hidden : torch.FloatTensor
            Final hidden state of the recurrent layer.

        cell : torch.FloatTensor
            Final cell state for LSTM. GRU/RNN have no cell state, so ``hidden``
            is returned in this slot as well, mirroring what ``Decoder.forward``
            does for those cell types.
        """
        embedded = self.embedding(input_batch)  # batch-first: leading dim is the batch

        if self.cell_mode.lower() == 'lstm':
            _, (hidden, cell) = self.cell(embedded)
        else:
            _, hidden = self.cell(embedded)
            # Previously the per-step `outputs` tensor was returned here, which is
            # not a recurrent state and has a different shape from `hidden`.
            cell = hidden

        return hidden, cell


In [18]:
class Decoder(nn.Module):
    """One-step recurrent decoder: embeds a target token and projects the cell output to vocabulary logits."""

    def __init__(self, output_size, dec_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi):
        """
        Build the decoder layers.

        Parameters
        ----------
        output_size : int
            Size of the target vocabulary; also the width of the output projection.

        dec_embed_size : int
            Dimension of the target-token embedding.

        hid_size : int
            Hidden state dimension of the recurrent layer.

        num_layers : int
            Number of stacked recurrent layers.

        cell_mode : str
            Which recurrent cell to use (LSTM, GRU or RNN), resolved via ``cell_type``.

        dropout : float
            Dropout rate handed to the recurrent layer.

        is_bi : bool
            Build the recurrent layer as bidirectional when True; the output
            projection then takes ``2 * hid_size`` features.
        """
        super().__init__()

        pad_index = hindi_alphabet_to_index['.']
        self.embedding = nn.Embedding(output_size, dec_embed_size, padding_idx=pad_index)

        recurrent_cls = cell_type(cell_mode)
        self.cell = recurrent_cls(
            dec_embed_size,
            hid_size,
            num_layers,
            dropout=dropout,
            bidirectional=is_bi,
            batch_first=True,
        )

        # A bidirectional layer concatenates both directions, doubling the feature width.
        direction_count = 2 if is_bi else 1
        self.out = nn.Linear(hid_size * direction_count, output_size)

        self.output_size = output_size
        self.cell_mode = cell_mode

    def forward(self, trg: torch.LongTensor, hidden: torch.FloatTensor, cell: torch.FloatTensor):
        """
        Decode a single time step.

        Parameters
        ----------
        trg : torch.LongTensor
            Current target tokens, shape [batch size].

        hidden : torch.FloatTensor
            Incoming hidden state.

        cell : torch.FloatTensor
            Incoming cell state; only consumed when the cell is an LSTM.

        Returns
        -------
        prediction : torch.FloatTensor
            Output of the linear projection for this step.

        hidden : torch.FloatTensor
            Updated hidden state.

        cell : torch.FloatTensor
            Updated cell state for LSTM; for GRU/RNN the updated hidden state
            is returned here too.
        """
        # Insert a length-1 sequence axis so the batch-first recurrent layer sees one step.
        step_embedding = self.embedding(trg.unsqueeze(1))

        if self.cell_mode.lower() != 'lstm':
            outputs, hidden = self.cell(step_embedding, hidden)
            cell = hidden
        else:
            outputs, (hidden, cell) = self.cell(step_embedding, (hidden, cell))

        logits = self.out(outputs.squeeze(1))
        return logits, hidden, cell


In [19]:
class Seq2Seq(nn.Module):
    """Couples an encoder and a step-wise decoder into one sequence-to-sequence module."""

    def __init__(self, encoder, decoder, device):
        """
        Parameters
        ----------
        encoder : nn.Module
            Module returning ``(hidden, cell)`` for a source batch.

        decoder : nn.Module
            Module mapping ``(token, hidden, cell)`` to ``(prediction, hidden, cell)``;
            must expose ``output_size``.

        device : str
            Device on which the output buffer is allocated (e.g., 'cpu', 'cuda').
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source_batch, target_batch, teacher_forcing_ratio=0.5):
        """
        Encode the source batch, then decode one position at a time.

        Parameters
        ----------
        source_batch : torch.LongTensor
            Batched tokenized source sequences.

        target_batch : torch.LongTensor
            Batched tokenized target sequences, shape [batch size, max len].

        teacher_forcing_ratio : float, optional
            Probability, drawn independently at every step, of feeding the
            ground-truth token instead of the decoder's own argmax.

        Returns
        -------
        outputs : torch.FloatTensor
            Shape [max len, batch size, target vocab size]. Position 0 is never
            written and stays all zeros.
        """
        n_rows, n_steps = target_batch.shape
        vocab_width = self.decoder.output_size

        # Buffer that collects the decoder output of every step.
        outputs = torch.zeros(n_steps, n_rows, vocab_width, device=self.device)

        # The encoder's final states seed the decoder.
        hidden, cell = self.encoder(source_batch)

        # Decoding starts from the first target column.
        trg = target_batch[:, 0]
        for step in range(1, n_steps):
            prediction, hidden, cell = self.decoder(trg, hidden, cell)
            outputs[step] = prediction

            use_ground_truth = np.random.random() < teacher_forcing_ratio
            trg = target_batch[:, step] if use_ground_truth else prediction.argmax(1)

        return outputs


In [20]:
def accuracy_calc(target_seq,seq2,mode='full',device=device):# predicted
    """Count exact-match sequences and matching characters up to each end-of-sequence token.

    Parameters
    ----------
    target_seq : torch.LongTensor
        Ground-truth token ids, one sequence per row.

    seq2 : torch.LongTensor
        Predicted token ids, laid out like ``target_seq``.

    mode : str
        Unused; kept for backward compatibility.

    device : torch.device
        Unused (counters are plain Python numbers); kept for backward compatibility.

    Returns
    -------
    correct : float
        Number of rows whose prediction equals the target on every position
        before the end-of-sequence token.

    correct_chars : float
        Number of matching positions before the end-of-sequence token, summed over rows.

    tot_chars : float
        Number of positions before the end-of-sequence token, summed over rows.

    Notes
    -----
    Only rows that contain the '>' token are scored, and only the first '>' of
    a row counts. Rows are looked up by the row index reported by ``nonzero()``;
    the earlier version used the enumeration index instead, so a row with no
    '>' (or more than one) shifted every following comparison onto the wrong row.
    """
    eos_token = hindi_alphabet_to_index['>']
    # nonzero() yields [row, column] pairs in row-major order, so the first pair
    # seen for a row is that row's earliest end-of-sequence position.
    eos_positions = (target_seq == eos_token).nonzero().tolist()

    correct = 0
    correct_chars = 0
    tot_chars = 0
    scored_rows = set()
    for row, eos_col in eos_positions:
        if row in scored_rows:
            continue
        scored_rows.add(row)

        matches = int((seq2[row][:eos_col] == target_seq[row][:eos_col]).sum().item())
        if matches == eos_col:
            correct += 1
        correct_chars += matches
        tot_chars += eos_col

    return float(correct), float(correct_chars), float(tot_chars)
            
            
        
    

In [21]:
def train(seq2seq, iterator, optimizer, criterion):
    """
    Train the Seq2Seq model for one pass over ``iterator``.

    Parameters
    ----------
    seq2seq : nn.Module
        Seq2Seq model.

    iterator : torch.utils.data.DataLoader
        Data iterator yielding dicts with 'input' and 'output' entries.

    optimizer : torch.optim.Optimizer
        Optimizer for training.

    criterion : nn.Module
        Loss function.

    Returns
    -------
    epoch_loss : float
        Average loss per batch.

    accuracy : float
        Accuracy of the model (per sequence), i.e. exact matches divided by the
        number of sequences actually seen.

    char_accuracy : float
        Accuracy of the model (per character).

    All three values are 0.0 when ``iterator`` yields no batches.
    """
    seq2seq.train()

    epoch_loss = 0.0
    correct = 0
    correct_char = 0
    tot_char = 0
    n_batches = 0
    n_sequences = 0

    for batch in iterator:
        source, target = batch['input'], batch['output']

        optimizer.zero_grad()
        outputs = seq2seq(source, target)

        # The model output is time-major, so the targets are transposed to match
        # before both are flattened for the loss.
        # NOTE(review): Seq2Seq.forward never writes position 0 of `outputs`, yet
        # that position is still part of the loss here — confirm this is intended.
        outputs_flatten = outputs.reshape(-1, outputs.shape[-1])
        trg_flatten = target.transpose(0, 1).reshape(-1)
        loss = criterion(outputs_flatten, trg_flatten)

        predicted = outputs.argmax(dim=2)
        correct_temp, correct_chars_temp, tot_chars_temp = accuracy_calc(target, predicted.transpose(0, 1))

        correct += correct_temp
        correct_char += correct_chars_temp
        tot_char += tot_chars_temp

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1
        # Count real sequences rather than assuming 16 per batch, so other batch
        # sizes and a short final batch are handled correctly.
        n_sequences += len(target)

    if n_batches == 0:
        return 0.0, 0.0, 0.0

    char_accuracy = correct_char / tot_char if tot_char else 0.0
    return epoch_loss / n_batches, correct / n_sequences, char_accuracy


In [22]:
def evaluate(seq2seq, iterator, criterion):
    """
    Evaluate the Seq2Seq model on ``iterator`` without teacher forcing or gradients.

    Parameters
    ----------
    seq2seq : nn.Module
        Seq2Seq model.

    iterator : torch.utils.data.DataLoader
        Data iterator yielding dicts with 'input' and 'output' entries.

    criterion : nn.Module
        Loss function.

    Returns
    -------
    epoch_loss : float
        Average loss per batch.

    accuracy : float
        Per-sequence accuracy: exact matches divided by the number of sequences
        actually seen.

    char_accuracy : float
        Per-character accuracy.

    All three values are 0.0 when ``iterator`` yields no batches.
    """
    seq2seq.eval()

    epoch_loss = 0.0
    correct = 0
    correct_char = 0
    tot_char = 0
    n_batches = 0
    n_sequences = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch['input'], batch['output']

            # teacher_forcing_ratio=0: the decoder always consumes its own predictions.
            outputs = seq2seq(source, target, teacher_forcing_ratio=0)

            # The model output is time-major, so the targets are transposed to match
            # before both are flattened for the loss.
            # NOTE(review): Seq2Seq.forward never writes position 0 of `outputs`, yet
            # that position is still part of the loss here — confirm this is intended.
            outputs_flatten = outputs.reshape(-1, outputs.shape[-1])
            trg_flatten = target.transpose(0, 1).reshape(-1)
            loss = criterion(outputs_flatten, trg_flatten)

            predicted = outputs.argmax(dim=2)
            correct_temp, correct_chars_temp, tot_chars_temp = accuracy_calc(target, predicted.transpose(0, 1))

            correct += correct_temp
            correct_char += correct_chars_temp
            tot_char += tot_chars_temp

            epoch_loss += loss.item()
            n_batches += 1
            # Count real sequences rather than assuming 16 per batch, so other batch
            # sizes and a short final batch are handled correctly.
            n_sequences += len(target)

    if n_batches == 0:
        return 0.0, 0.0, 0.0

    char_accuracy = correct_char / tot_char if tot_char else 0.0
    return epoch_loss / n_batches, correct / n_sequences, char_accuracy



In [23]:
def epoch_time(start_time, end_time):
    """Split the elapsed time ``end_time - start_time`` (seconds) into ``(minutes, seconds)``."""
    return divmod(end_time - start_time, 60)

In [24]:
def count_params(model):
    """Total element count over the entries of ``model.parameters()`` that have ``requires_grad`` set."""
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

In [25]:
import time
import random

In [26]:
import torch.optim as optim
def make_model(train_iterator,valid_iterator, enc_embed_size,dec_embed_size,
               hid_size, num_layers, cell_mode, dropout, is_bi, epochs=20,
               input_vocab_size=30, output_vocab_size=68, checkpoint_path='model1.pt'):
    """
    Build an Encoder/Decoder/Seq2Seq model, train it, and return it.

    Parameters
    ----------
    train_iterator : torch.utils.data.DataLoader
        Batches used for training.

    valid_iterator : torch.utils.data.DataLoader
        Batches evaluated after every epoch; its loss decides when a checkpoint is written.

    enc_embed_size, dec_embed_size : int
        Embedding dimensions of the encoder and decoder.

    hid_size : int
        Hidden state dimension shared by encoder and decoder.

    num_layers : int
        Number of stacked recurrent layers.

    cell_mode : str
        Recurrent cell type (LSTM, GRU or RNN).

    dropout : float
        Dropout rate for the recurrent layers.

    is_bi : bool
        Whether the recurrent layers are bidirectional.

    epochs : int
        Number of training epochs.

    input_vocab_size : int
        Source vocabulary size handed to the Encoder (previously hard-coded as 30).

    output_vocab_size : int
        Target vocabulary size handed to the Decoder (previously hard-coded as 68).

    checkpoint_path : str
        Where the state dict is saved whenever the validation loss improves
        (previously hard-coded as 'model1.pt').

    Returns
    -------
    S : Seq2Seq
        The model as it stands after the final epoch. This is not necessarily
        the best-validation checkpoint; that one is on disk at ``checkpoint_path``.
    """
    E=Encoder(input_vocab_size, enc_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi)
    E=E.to(device)

    D=Decoder(output_vocab_size, dec_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi)
    D=D.to(device)

    S=Seq2Seq(E,D,device)
    S.to(device)
    print(f'The model has {count_params(S):,} trainable parameters')

    optimizer = optim.Adam(S.parameters())
    # Padding positions (the '.' token) do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=hindi_alphabet_to_index['.'])
    criterion=criterion.to(device)

    best_valid_loss = float('inf')

    for epoch in range(epochs):
        start_time = time.time()
        train_loss,train_acc,train_stuff = train(S, train_iterator, optimizer, criterion)
        valid_loss,valid_acc,val_stuff = evaluate(S, valid_iterator, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Checkpoint only when the validation loss improves.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(S.state_dict(), checkpoint_path)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs:.2f}s')
        print(f'\t Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
        print(f'\t Relaxed Train. Acc: {train_stuff*100:.2f}% | Relaxed Val. Acc: {val_stuff*100:.2f}%')

    return S

    

# Make custom model 
Pass your data loaders, model configuration, and other settings to `make_model`.

In [28]:

# Train on the merged train+val loader and evaluate each epoch on the test loader.
# NOTE(review): make_model writes its checkpoint based on the loss over `valid_iterator`,
# which here is the test loader — confirm that selecting on test data is intended.
train_iterator = train_val_dataloader
valid_iterator = test_dataloader
SS = make_model(
    train_iterator,
    valid_iterator,
    enc_embed_size=128,
    dec_embed_size=128,
    hid_size=256,
    num_layers=3,
    cell_mode='lstm',
    dropout=0.3,
    is_bi=True,
    epochs=30,
)

The model has 7,936,324 trainable parameters
Epoch: 01 | Time: 1.0m 18.74s
	 Train Loss: 1.546 | Train Acc: 12.95%
	 Val. Loss: 1.521 |  Val. Acc: 27.29%
	 Relaxed Train. Acc: 64.22% | Relaxed Val. Acc: 68.21%


In [29]:
#Uncomment to Save Model

# torch.save(SS, 'noattn_model.model')



In [30]:
# # Uncomment to load a previously saved model instead of the one just trained:

# the_model = torch.load('noattn_model.model')
# By default, reuse the model trained above.
the_model = SS

In [28]:
# Loader that yields the whole test split as one unshuffled batch.
test_full_dataloader = DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
    shuffle=False,
)

In [29]:
#Run on test dataset

the_model.eval()
# Fetch the full-size test batch once instead of iterating the loader separately for inputs and targets.
test_batch = next(iter(test_full_dataloader))
# Inference only: skip autograd bookkeeping so no graph is kept for the whole test set.
with torch.no_grad():
    preds = the_model(test_batch['input'], test_batch['output'], teacher_forcing_ratio=0)

In [30]:
# Index of the highest-scoring vocabulary entry at every (step, sequence) position.
predicted = preds.argmax(dim=2)

# Utils To view the words

Pass `predicted.transpose(0, 1)` to decode the model's predictions.

Pass `batch['output']` to decode the ground-truth targets.

In [35]:
import math

In [36]:
def word_from_torchies(torchie1,index_toalp):
    """Decode one tensor of token indices into a word.

    The tensor is copied to the CPU and converted to a NumPy array, then handed
    to ``word_from_vecs`` together with the index->character table
    ``index_toalp`` and the flag ``False``.
    ``word_from_vecs`` is not defined in this notebook; presumably it comes from ``utils``.
    """
    return word_from_vecs(torchie1.cpu().numpy(), index_toalp, False)

In [37]:
def word_from_batch(batch):
    """Decode every row of ``batch`` with the Hindi index->character table and return the words as a list."""
    return [
        word_from_torchies(batch[row], index_to_hindi_alphabet)
        for row in range(len(batch))
    ]
        

In [38]:
def word_from_batch_eng(batch):
    """Decode every row of ``batch`` with the English index->character table and return the words as a list."""
    return [
        word_from_torchies(batch[row], index_to_english_alphabet)
        for row in range(len(batch))
    ]

In [39]:
# Predictions are time-major; swap the two axes so each row is one sequence, then decode to Hindi words.
test_preds = word_from_batch(predicted.permute(1, 0))

In [41]:
# Decode the ground-truth targets of the first batch from the full-size test loader.
target_batch = next(iter(test_full_dataloader))['output']
test_actual = word_from_batch(target_batch)

In [42]:
# Decode the source inputs of the first batch from the full-size test loader.
source_batch = next(iter(test_full_dataloader))['input']
test_input = word_from_batch_eng(source_batch)

In [44]:
#view in table form
import pandas as pd
# Two columns (target word, predicted word), indexed by the decoded source word.
datas = dict([('Ground truth', test_actual), ('Predictions', test_preds)])
df = pd.DataFrame(datas, index=test_input)


In [45]:
# Rows where the prediction matches the ground truth exactly.
is_exact_match = df['Ground truth'].eq(df['Predictions'])
df_correct = df.loc[is_exact_match]

In [46]:
df_correct

Unnamed: 0,Ground truth,Predictions
sikhaaega,सिखाएगा,सिखाएगा
twitters,ट्विटर्स,ट्विटर्स
tirunelveli,तिरुनेलवेली,तिरुनेलवेली
independence,इंडिपेंडेंस,इंडिपेंडेंस
speshiyon,स्पेशियों,स्पेशियों
...,...,...
seho,सेहो,सेहो
belcha,बेलचा,बेलचा
shbana,शबाना,शबाना
khaatootolaa,खातूटोला,खातूटोला


In [47]:
# Rows where the prediction differs from the ground truth.
is_mismatch = df['Ground truth'].ne(df['Predictions'])
df_incorrect = df.loc[is_mismatch]

In [48]:
[df_incorrect.iloc[-1]['Predictions']]

['शिवास्तवा']

In [49]:
# Character-by-character comparison of one mispredicted word; zip stops at the shorter string.
mismatch_row = df_incorrect.iloc[-5]
for predicted_char, expected_char in zip(mismatch_row['Predictions'], mismatch_row['Ground truth']):
    print(predicted_char == expected_char)
    

True
True
True
True
False
False


In [50]:
[*df_incorrect.iloc[-5]['Predictions']]

['अ', 'फ', 'स', 'र', 'ो', 'ं']

In [51]:
[*df_incorrect.iloc[-5]['Ground truth']]

['अ', 'फ', 'स', 'र', 'ा', 'े', 'ं']

In [52]:
df_incorrect.iloc[-5]['Ground truth']

'अफसराें'

In [53]:
df_incorrect

Unnamed: 0,Ground truth,Predictions
learn,लर्न,लीरन
shurooh,शुरूः,शुरूह
ajhar,अजहर,अझर
karaar,क़रार,करार
anka,अंक,एएनकका
...,...,...
aphasaron,अफसराें,अफसरों
chabate,चबाते,चबते
miti,मिति,मिटी
saflata,सफ़लता,सफलता
