In [1]:
from utils import *
import numpy as np
import pickle
import wandb

In [2]:
# Lookup tables that are later handed to the word-decoding helpers.
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
# `with` guarantees the file handles are closed (the bare open() calls leaked them).
with open('vocab_tools/index_to_english_alphabet.pickle', 'rb') as vocab_file:
    index_to_english_alphabet = pickle.load(vocab_file)
with open('vocab_tools/index_to_hindi_alphabet.pickle', 'rb') as vocab_file:
    index_to_hindi_alphabet = pickle.load(vocab_file)

In [3]:
# Character -> index tables; the notebook reads the '.' (padding) and '>' entries from them.
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
# `with` guarantees the file handles are closed (the bare open() calls leaked them).
with open('vocab_tools/hindi_alphabet_to_index.pickle', 'rb') as vocab_file:
    hindi_alphabet_to_index = pickle.load(vocab_file)
with open('vocab_tools/english_alphabet_to_index.pickle', 'rb') as vocab_file:
    english_alphabet_to_index = pickle.load(vocab_file)

In [4]:
# Raw training / validation arrays read directly from the .npy dumps.
X_train, y_train = np.load('simple_data/X_train.npy'), np.load('simple_data/y_train.npy')
X_valid, y_valid = np.load('simple_data/X_val.npy'), np.load('simple_data/y_val.npy')

In [5]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
# Installs a blanket "ignore" filter: every warning raised after this point is hidden.
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
class Eng_Hind_Dataset(Dataset):
    """Paired input/output sequences loaded from two ``.npy`` files.

    Parameters
    ----------
    in_file : str
        File name (inside ``root_dir``) of the array holding the inputs.
    out_file : str
        File name (inside ``root_dir``) of the array holding the outputs.
    root_dir : str
        Directory that contains both files.
    device : str or torch.device
        Device every returned sample is moved to.

    Raises
    ------
    ValueError
        If the two arrays do not have the same number of rows.
    """

    def __init__(self, in_file, out_file, root_dir='simple_data', device='cuda'):
        self.input = torch.tensor(np.load(root_dir + '/' + in_file))
        self.output = torch.tensor(np.load(root_dir + '/' + out_file))
        # Remember the requested device; previously the argument was dropped and
        # __getitem__ silently relied on a module-level `device` variable.
        self.device = device

        # `assert(cond, msg)` asserts a non-empty tuple and therefore never fails,
        # so the length check is done explicitly.
        if len(self.input) != len(self.output):
            raise ValueError("Error: I/O Lengths must be same")

    def __len__(self):
        """Number of (input, output) pairs."""
        return len(self.input)

    def __getitem__(self, idx):
        """Return ``{'input': ..., 'output': ...}`` for ``idx``, moved to ``self.device``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {
            'input': self.input[idx].to(self.device),
            'output': self.output[idx].to(self.device),
        }

In [7]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [8]:
device

device(type='cuda')

In [9]:
# One dataset object per split, all read from the default root directory.
training_data = Eng_Hind_Dataset(in_file="X_train.npy", out_file="y_train.npy", device=device)
val_data = Eng_Hind_Dataset(in_file="X_val.npy", out_file="y_val.npy", device=device)
test_data = Eng_Hind_Dataset(in_file="X_test.npy", out_file="y_test.npy", device=device)

In [10]:
# Training and validation splits merged into a single dataset.
train_val_sets = torch.utils.data.ConcatDataset(
    datasets=[training_data, val_data]
)

In [11]:
# Shuffled mini-batches of 16 over the merged train+validation data.
train_val_dataloader = DataLoader(
    dataset=train_val_sets,
    batch_size=16,
    shuffle=True,
)

In [12]:
# Mini-batches of 16 over the test split (shuffling is enabled here as well).
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=16,
    shuffle=True,
)

In [13]:
import torch.nn as nn

In [14]:
def cell_type(mode:str='rnn'):
    """Map a cell name to the matching ``torch.nn`` recurrent layer class.

    ``'rnn'`` and ``'gru'`` (case-insensitive) select ``nn.RNN`` and ``nn.GRU``;
    every other value falls back to ``nn.LSTM``.
    """
    known_cells = {'rnn': nn.RNN, 'gru': nn.GRU}
    return known_cells.get(mode.lower(), nn.LSTM)
        

In [15]:
class Encoder(nn.Module):
    """
    Input :
        - source batch
    Layer :
        source batch -> Embedding -> RNN / GRU / LSTM
    Output :
        - final hidden state of the recurrent cell
        - final cell state (LSTM) or the hidden state again (RNN / GRU)

    Parameters
    ----------
    input_size : int
        Number of rows in the embedding table; should equal the source vocab size.

    enc_embed_size : int
        Embedding layer's dimension.

    hid_size : int
        Hidden (and, for LSTM, cell) state dimension.

    num_layers : int
        Number of stacked recurrent layers.

    cell_mode : str
        Cell name resolved through ``cell_type`` ('rnn', 'gru' or 'lstm').

    dropout : float
        Value forwarded to the recurrent layer's ``dropout`` argument.

    is_bi : bool
        Whether the recurrent layer is bidirectional.
    """

    def __init__(self, input_size, enc_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi):
        super().__init__()

        # The '.' entry of the English table is used as the padding index.
        self.embedding = nn.Embedding(input_size, enc_embed_size,
                                      padding_idx=english_alphabet_to_index['.'])

        # Resolve the recurrent layer class (RNN / GRU / LSTM) and build it batch-first.
        cell = cell_type(cell_mode)
        self.cell = cell(enc_embed_size, hid_size, num_layers, dropout=dropout,
                         bidirectional=is_bi, batch_first=True)
        self.cell_mode = cell_mode

    def forward(self, input_batch: torch.LongTensor):
        """
        Parameters
        ----------
        input_batch : 2d torch.LongTensor
            Batched tokenized source sequences. The cell is built with
            ``batch_first=True``, so the expected layout is [batch size, sent len].

        Returns
        -------
        hidden, cell : 3d torch.FloatTensor
            Final states of the recurrent layer, each of shape
            [n layers * n directions, batch size, hidden dim].
            For RNN / GRU there is no separate cell state, so ``cell`` is ``hidden``.
        """
        embedded = self.embedding(input_batch)  # [batch size, sent len, emb dim]

        if self.cell_mode.lower() == 'lstm':
            outputs, (hidden, cell) = self.cell(embedded)
        else:
            outputs, hidden = self.cell(embedded)
            # Previously the per-step `outputs` were returned in the cell slot, which
            # contradicted the documented shape; mirror the Decoder's convention instead.
            cell = hidden
        return hidden, cell

In [16]:
class Decoder(nn.Module):
    """
    Single-step decoder: embeds one target token per sequence, advances the
    recurrent cell by one step and projects the result onto the target vocab.

    Input :
        - current target token of every sequence in the batch
        - hidden state (initially the encoder's)
        - cell state (initially the encoder's; only consumed in LSTM mode)
    Output :
        - prediction scores over the target vocabulary
        - updated hidden state
        - updated cell state (the hidden state again for RNN / GRU)

    Parameters
    ----------
    output_size : int
        Size of the embedding table and of the projection output; should equal
        the target vocab size.

    dec_embed_size : int
        Embedding layer's dimension.

    hid_size : int
        Hidden (and, for LSTM, cell) state dimension.

    num_layers : int
        Number of stacked recurrent layers.

    cell_mode : str
        Cell name resolved through ``cell_type`` ('rnn', 'gru' or 'lstm').

    dropout : float
        Value forwarded to the recurrent layer's ``dropout`` argument.

    is_bi : bool
        Whether the recurrent layer is bidirectional (doubles the projection input).
    """

    def __init__(self, output_size, dec_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi):
        super().__init__()

        self.output_size = output_size
        self.cell_mode = cell_mode

        # The '.' entry of the Hindi table is used as the padding index.
        self.embedding = nn.Embedding(output_size, dec_embed_size,
                                      padding_idx=hindi_alphabet_to_index['.'])

        recurrent_class = cell_type(cell_mode)
        self.cell = recurrent_class(dec_embed_size, hid_size, num_layers, dropout=dropout,
                                    bidirectional=is_bi, batch_first=True)

        # A bidirectional layer concatenates both directions in its output.
        num_directions = 2 if is_bi else 1
        self.out = nn.Linear(hid_size * num_directions, output_size)

    def forward(self, trg: torch.LongTensor, hidden: torch.FloatTensor, cell: torch.FloatTensor):
        """
        Parameters
        ----------
        trg : 1d torch.LongTensor
            Current target token of every sequence, shape [batch size].

        hidden, cell : 3d torch.FloatTensor
            Recurrent states, each of shape
            [n layers * n directions, batch size, hidden dim].
            ``cell`` is ignored unless the cell mode is 'lstm'.

        Returns
        -------
        prediction : 2d torch.FloatTensor
            Scores over the target vocabulary, shape [batch size, output dim].

        hidden, cell : 3d torch.FloatTensor
            Updated states with the same shapes as the inputs; for RNN / GRU
            ``cell`` is the new hidden state.
        """
        # [batch size, 1, emb dim] -- the 1 is the (single-step) sequence length
        step_input = self.embedding(trg.unsqueeze(1))

        if self.cell_mode.lower() != 'lstm':
            outputs, hidden = self.cell(step_input, hidden)
            cell = hidden
        else:
            outputs, (hidden, cell) = self.cell(step_input, (hidden, cell))

        return self.out(outputs.squeeze(1)), hidden, cell

In [17]:
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes the target one step at a time.

    ``forward`` returns a time-major tensor of shape
    [target len, batch size, target vocab size]; row 0 is left at zero because
    decoding starts from the first target token rather than predicting it.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source_batch, target_batch, teacher_forcing_ratio=0.5):
        """Run the encoder once, then unroll the decoder over the target length.

        At every step the next decoder input is the ground-truth token with
        probability ``teacher_forcing_ratio`` and the decoder's own argmax otherwise.
        """
        num_sequences, num_steps = target_batch.shape
        vocab_size = self.decoder.output_size

        # Buffer for the per-step decoder scores.
        step_scores = torch.zeros(num_steps, num_sequences, vocab_size).to(self.device)

        # The encoder's final states seed the decoder.
        hidden, cell = self.encoder(source_batch)

        decoder_input = target_batch[:, 0]
        for step in range(1, num_steps):
            prediction, hidden, cell = self.decoder(decoder_input, hidden, cell)
            step_scores[step] = prediction

            use_ground_truth = np.random.random() < teacher_forcing_ratio
            decoder_input = target_batch[:, step] if use_ground_truth else prediction.argmax(1)

        return step_scores


        
        

In [18]:
hindi_alphabet_to_index['>']

1

In [19]:
# E=Encoder(30, embed_size, hid_size, num_layers, cell_mode, dropout, is_bi)
# E=E.to(device)

# D=Decoder(68, embed_size, hid_size, num_layers, cell_mode, dropout, is_bi)

# D=D.to(device)
# S=Seq2Seq(E,D,device)
# S.to(device)    
# print(f'The model has {count_params(S):,} trainable parameters')

In [20]:
# One-layer bidirectional LSTM encoder/decoder pair; the closing expression
# displays the assembled module tree.
E = Encoder(30, 64, 256, 1, 'lstm', 0.1, True).to(device)
D = Decoder(68, 128, 256, 1, 'lstm', 0.1, True).to(device)

S = Seq2Seq(E, D, device)
S.to(device)


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(30, 64, padding_idx=2)
    (cell): LSTM(64, 256, batch_first=True, dropout=0.1, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(68, 128, padding_idx=2)
    (cell): LSTM(128, 256, batch_first=True, dropout=0.1, bidirectional=True)
    (out): Linear(in_features=512, out_features=68, bias=True)
  )
)

In [21]:
def accuracy_calc(target_seq,seq2,mode='full',device=device):# predicted
    """Compare predictions with targets up to (not including) the end token.

    Parameters
    ----------
    target_seq : 2d tensor
        Ground-truth index sequences, one row per sample.
    seq2 : 2d tensor
        Predicted index sequences, laid out like ``target_seq``.
    mode, device :
        Unused; kept so existing calls keep working.

    Returns
    -------
    (correct, correct_chars, tot_chars) : tuple of float
        Number of rows whose compared prefix matches exactly, number of matching
        positions, and number of compared positions.

    Notes
    -----
    Each row is cut at its own first '>' token. The earlier implementation paired
    the i-th '>' found anywhere in the batch with the i-th row, so one row with
    no '>' (or with several) misaligned every row after it. A row without '>' is
    now compared over its full length.
    """
    eos_token = hindi_alphabet_to_index['>']

    correct = 0
    correct_chars = 0
    tot_chars = 0
    for target_row, predicted_row in zip(target_seq, seq2):
        eos_positions = (target_row == eos_token).nonzero()
        if len(eos_positions) > 0:
            length = int(eos_positions[0])
        else:
            length = len(target_row)

        matches = int((predicted_row[:length] == target_row[:length]).sum())
        if matches == length:
            correct += 1
        correct_chars += matches
        tot_chars += length

    # Floats, as before (the counters used to be float tensors read with .item()).
    return float(correct), float(correct_chars), float(tot_chars)
            
            
        
    

In [22]:
def train(seq2seq, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Parameters
    ----------
    seq2seq : nn.Module
        Model called as ``seq2seq(inputs, targets)``; its output is treated as
        time-major scores ([target len, batch size, vocab]).
    iterator : iterable of dict
        Batches with ``'input'`` and ``'output'`` entries.
    optimizer, criterion :
        Optimiser to step and loss applied to the flattened scores / labels.

    Returns
    -------
    (mean_loss, word_accuracy, char_accuracy) : tuple
        Loss averaged over batches, exact-match count divided by the number of
        samples actually seen, and matched / compared characters.
    """
    seq2seq.train()

    epoch_loss = 0
    correct = 0
    correct_char = 0
    tot_char = 0
    num_samples = 0
    num_batches = 0

    for batch in iterator:
        optimizer.zero_grad()
        outputs = seq2seq(batch['input'], batch['output'])
        # Labels are batch-major; transpose them to line up with the time-major scores.
        batch_label = batch['output'].transpose(0, 1)

        _, predicted = torch.max(outputs, dim=2)
        outputs_flatten = outputs.view(-1, outputs.shape[-1])
        trg_flatten = batch_label.reshape(-1)

        # NOTE(review): position 0 of `outputs` is never written by Seq2Seq.forward,
        # so it presumably adds a constant term to the loss -- confirm whether the
        # first step should be sliced off before computing the loss.
        loss = criterion(outputs_flatten, trg_flatten)
        correct_temp, correct_chars_temp, tot_chars_temp = accuracy_calc(
            batch['output'], predicted.transpose(0, 1))

        correct += correct_temp
        correct_char += correct_chars_temp
        tot_char += tot_chars_temp

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        # Count real samples instead of assuming every batch holds exactly 16.
        num_samples += len(batch['output'])
        num_batches += 1

    mean_loss = epoch_loss / num_batches if num_batches else 0.0
    word_accuracy = correct / num_samples if num_samples else 0.0
    char_accuracy = correct_char / tot_char if tot_char else 0.0
    return mean_loss, word_accuracy, char_accuracy

In [23]:
def evaluate(seq2seq, iterator, criterion):
    """Score the model on ``iterator`` without teacher forcing or gradient tracking.

    Parameters
    ----------
    seq2seq : nn.Module
        Model called as ``seq2seq(inputs, targets, teacher_forcing_ratio=0)``; its
        output is treated as time-major scores ([target len, batch size, vocab]).
    iterator : iterable of dict
        Batches with ``'input'`` and ``'output'`` entries.
    criterion :
        Loss applied to the flattened scores / labels.

    Returns
    -------
    (mean_loss, word_accuracy, char_accuracy) : tuple
        Loss averaged over batches, exact-match count divided by the number of
        samples actually seen, and matched / compared characters.
    """
    seq2seq.eval()

    epoch_loss = 0
    correct = 0
    correct_char = 0
    tot_char = 0
    num_samples = 0
    num_batches = 0

    with torch.no_grad():
        for batch in iterator:
            outputs = seq2seq(batch['input'], batch['output'], teacher_forcing_ratio=0)
            # Labels are batch-major; transpose them to line up with the time-major scores.
            batch_label = batch['output'].transpose(0, 1)

            _, predicted = torch.max(outputs, dim=2)

            outputs_flatten = outputs.view(-1, outputs.shape[-1])
            trg_flatten = batch_label.reshape(-1)

            loss = criterion(outputs_flatten, trg_flatten)

            correct_temp, correct_chars_temp, tot_chars_temp = accuracy_calc(
                batch['output'], predicted.transpose(0, 1))

            correct += correct_temp
            correct_char += correct_chars_temp
            tot_char += tot_chars_temp

            epoch_loss += loss.item()
            # Count real samples instead of assuming every batch holds exactly 16.
            num_samples += len(batch['output'])
            num_batches += 1

    mean_loss = epoch_loss / num_batches if num_batches else 0.0
    word_accuracy = correct / num_samples if num_samples else 0.0
    char_accuracy = correct_char / tot_char if tot_char else 0.0
    return mean_loss, word_accuracy, char_accuracy



In [24]:
def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps into (minutes, seconds).

    Minutes are the floor-divided whole minutes, seconds the remainder; both
    keep the numeric type produced by the subtraction.
    """
    return divmod(end_time - start_time, 60)

In [25]:
def count_params(model):
    """Total number of elements across the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [26]:
import time
import random

In [27]:
import torch.optim as optim
def make_model(train_iterator,valid_iterator, enc_embed_size,dec_embed_size,
               hid_size, num_layers, cell_mode, dropout, is_bi, epochs=20,
               input_vocab_size=30, output_vocab_size=68, checkpoint_path='model1.pt'):
    """Build an Encoder/Decoder/Seq2Seq model, train it and return it.

    Parameters
    ----------
    train_iterator, valid_iterator :
        Batch iterables passed to ``train`` and ``evaluate`` respectively.
    enc_embed_size, dec_embed_size : int
        Embedding sizes of the encoder and the decoder.
    hid_size, num_layers, cell_mode, dropout, is_bi :
        Forwarded unchanged to both ``Encoder`` and ``Decoder``.
    epochs : int
        Number of train/evaluate rounds.
    input_vocab_size, output_vocab_size : int
        Vocabulary sizes handed to the encoder / decoder. The defaults are the
        values that used to be hard-coded here.
    checkpoint_path : str
        Where the state dict is written whenever the validation loss improves.

    Returns
    -------
    Seq2Seq
        The model after the final epoch (not necessarily the best checkpoint).
    """
    E=Encoder(input_vocab_size, enc_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi)
    E=E.to(device)

    D=Decoder(output_vocab_size, dec_embed_size, hid_size, num_layers, cell_mode, dropout, is_bi)
    D=D.to(device)

    S=Seq2Seq(E,D,device)
    S.to(device)
    print(f'The model has {count_params(S):,} trainable parameters')

    optimizer = optim.Adam(S.parameters())
    # Padding positions ('.') do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=hindi_alphabet_to_index['.'])
    criterion=criterion.to(device)

    best_valid_loss = float('inf')

    for epoch in range(epochs):
        start_time = time.time()
        train_loss,train_acc,train_stuff = train(S, train_iterator, optimizer, criterion)
        valid_loss,valid_acc,val_stuff = evaluate(S, valid_iterator, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep the weights of the epoch with the lowest validation loss on disk.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(S.state_dict(), checkpoint_path)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs:.2f}s')
        print(f'\t Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
        print(f'\t Relaxed Train. Acc: {train_stuff*100:.2f}% | Relaxed Val. Acc: {val_stuff*100:.2f}%')
#         wandb.log({'epoch':epoch, 'train loss':train_loss, 'train acc':train_acc, 'valid loss': valid_loss,
#                   'valid acc': valid_acc, 'relxd train acc': train_stuff, 'relxd valid acc': val_stuff})

    return S

    

In [28]:
# train_iterator=train_val_dataloader
# valid_iterator=test_dataloader
# SS=make_model(train_iterator,valid_iterator,enc_embed_size=128, dec_embed_size=128,
#                hid_size=256, num_layers=3, cell_mode='lstm', dropout=0.3, is_bi=True, epochs=30)

In [29]:
# # The second saves and loads the entire model:

# torch.save(SS, 'noattn_model.model')



In [30]:
# Then later:

# NOTE: this unpickles a complete model object, which can run arbitrary code --
# only load checkpoints you produced yourself. The Encoder / Decoder / Seq2Seq
# classes must already be defined in the session. On PyTorch >= 2.6 a full-model
# pickle additionally requires passing weights_only=False.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
the_model = torch.load('noattn_model.model', map_location=device)
# Seq2Seq allocates its output buffer on `self.device`; keep that attribute in
# sync with the device the weights were just mapped to.
the_model.device = device


In [31]:
# A single, unshuffled batch that spans the entire test split.
test_full_dataloader = DataLoader(
    dataset=test_data,
    batch_size=len(test_data),
    shuffle=False,
)

In [32]:
the_model.eval()
# Materialise the full-size test batch once (it was previously rebuilt for each
# argument) and skip autograd bookkeeping, which inference does not need.
test_batch = next(iter(test_full_dataloader))
with torch.no_grad():
    preds = the_model(test_batch['input'], test_batch['output'], teacher_forcing_ratio=0)

In [33]:
# Index of the highest score along the vocabulary axis for every step / sequence.
predicted = torch.max(preds, dim=2)[1]

In [34]:
predicted

tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [29, 35, 21,  ..., 47, 25, 18],
        [ 4, 20,  6,  ...,  9,  4,  6],
        ...,
        [ 1,  1,  1,  ...,  1,  1,  1],
        [ 1,  1,  1,  ...,  1,  1,  1],
        [ 1,  1,  1,  ...,  1,  1,  1]], device='cuda:0')

In [35]:
import math

In [36]:
def word_from_torchies(torchie1,index_toalp):
    """Copy the tensor to the CPU as a NumPy array and pass it, together with the
    lookup table ``index_toalp``, to ``word_from_vecs`` (third argument ``False``)."""
    return word_from_vecs(torchie1.cpu().numpy(), index_toalp, False)

In [37]:
def word_from_batch(batch):
    """Convert every row of ``batch`` with the Hindi index table; returns a list."""
    return [
        word_from_torchies(batch[row], index_to_hindi_alphabet)
        for row in range(len(batch))
    ]
        

In [38]:
def word_from_batch_eng(batch):
    """Convert every row of ``batch`` with the English index table; returns a list."""
    return [
        word_from_torchies(batch[row], index_to_english_alphabet)
        for row in range(len(batch))
    ]

In [39]:
# `predicted` is time-major; swap the axes so each row is one predicted word.
test_preds = word_from_batch(torch.transpose(predicted, 0, 1))

In [40]:
# accuracy_calc returns (exact matches, matching characters, compared characters);
# dividing by the number of test words gives the exact-match rate followed by
# two per-word averages.
np.array(
    accuracy_calc(
        next(iter(test_full_dataloader))['output'],
        predicted.transpose(0, 1),
    )
) / len(next(iter(test_full_dataloader))['output'])

array([0.41074481, 5.97655678, 7.85201465])

In [41]:
# Target strings of the full test batch.
test_actual = word_from_batch(
    next(iter(test_full_dataloader))['output']
)

In [42]:
# Source strings of the full test batch.
test_input = word_from_batch_eng(
    next(iter(test_full_dataloader))['input']
)

In [43]:
test_input

['sikhaaega',
 'learn',
 'twitters',
 'tirunelveli',
 'independence',
 'speshiyon',
 'shurooh',
 'kolhapur',
 'ajhar',
 'karaar',
 'anka',
 'wpd',
 'haashie',
 'glendale',
 'udhed',
 'ekthi',
 'idea',
 'ambikapur',
 'makerere',
 'saboodaane',
 'foohadta',
 'sequent',
 'shueb',
 'panihati',
 'sametati',
 'ukhrul',
 'brahmlin',
 'utaraadhikaaree',
 'iqbal',
 'dayaalapuraa',
 'sohrai',
 'takreeban',
 'farrukhnagar',
 'theinga',
 'tyoiharon',
 'karneshvardhaam',
 'umanath',
 'daanshil',
 'saahityotsav',
 'shantiniketan',
 'shikayatkarta',
 'andarkhane',
 'panter',
 'leedaron',
 'galgand',
 'kaarniyaan',
 'murgipaalan',
 'mushahid',
 'modules',
 'rajouri',
 'sushrushaa',
 'shringaar',
 'holt',
 'laigikata',
 'ijaajat',
 'vankshetra',
 'bhutal',
 'swaadpremiyon',
 'nineteez',
 'frektar',
 'likhkar',
 'eyarkandeeshnar',
 'nabz',
 'quess',
 'bouni',
 'kaaragujaariyaan',
 'gaangnam',
 'tapia',
 'tezpur',
 'talve',
 'seemaai',
 'darshnaarthi',
 'rivas',
 'tarkvaad',
 'anusaarakaa',
 'coachella',

In [44]:
import pandas as pd
# One row per test word, indexed by its source string.
datas = dict([('Ground truth', test_actual), ('Predictions', test_preds)])
df = pd.DataFrame(datas, index=test_input)


In [45]:
# Rows whose prediction equals the ground truth exactly.
df_correct = df.loc[df['Ground truth'].eq(df['Predictions'])]

In [46]:
df_correct

Unnamed: 0,Ground truth,Predictions
sikhaaega,सिखाएगा,सिखाएगा
twitters,ट्विटर्स,ट्विटर्स
tirunelveli,तिरुनेलवेली,तिरुनेलवेली
independence,इंडिपेंडेंस,इंडिपेंडेंस
speshiyon,स्पेशियों,स्पेशियों
...,...,...
seho,सेहो,सेहो
belcha,बेलचा,बेलचा
shbana,शबाना,शबाना
khaatootolaa,खातूटोला,खातूटोला


In [47]:
# Rows whose prediction differs from the ground truth.
df_incorrect = df.loc[df['Ground truth'].ne(df['Predictions'])]

In [48]:
[df_incorrect.iloc[-1]['Predictions']]

['शिवास्तवा']

In [49]:
# Position-by-position comparison for one mispredicted word (zip stops at the shorter string).
for predicted_char, truth_char in zip(df_incorrect.iloc[-5]['Predictions'],
                                      df_incorrect.iloc[-5]['Ground truth']):
    print(predicted_char == truth_char)
    

True
True
True
True
False
False


In [50]:
[*df_incorrect.iloc[-5]['Predictions']]

['अ', 'फ', 'स', 'र', 'ो', 'ं']

In [51]:
[*df_incorrect.iloc[-5]['Ground truth']]

['अ', 'फ', 'स', 'र', 'ा', 'े', 'ं']

In [52]:
df_incorrect.iloc[-5]['Ground truth']

'अफसराें'

In [53]:
df_incorrect

Unnamed: 0,Ground truth,Predictions
learn,लर्न,लीरन
shurooh,शुरूः,शुरूह
ajhar,अजहर,अझर
karaar,क़रार,करार
anka,अंक,एएनकका
...,...,...
aphasaron,अफसराें,अफसरों
chabate,चबाते,चबते
miti,मिति,मिटी
saflata,सफ़लता,सफलता


In [58]:
# Write the predicted strings to a UTF-8 text file, one per line.
np.savetxt(
    'predictions_vanilla.txt',
    test_preds,
    fmt='%s',
    delimiter=',',
    encoding='utf-8',
)

In [55]:
the_model.decoder

Decoder(
  (embedding): Embedding(68, 128, padding_idx=2)
  (cell): LSTM(128, 256, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  (out): Linear(in_features=512, out_features=68, bias=True)
)