In [1]:
#JV

In [2]:
import numpy as np

import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset,TensorDataset, DataLoader, RandomSampler

import os
import gc
import time
import math

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

from tqdm.notebook import tqdm

In [3]:
class LanguageProcessor:

    def __init__(self,language_directory,target_lang_name,mode="train",meta_tokens=True):

        """
        Default Constructor for this class.

        Params:

            language_directory : ex : "aksharantar_sampled/tel/"
            mode : "train" or "test" or "valid", accordingly the appropriate dataset is read.
            meta_tokens : If true creates the first three tokens of the dictionary as <start>,<end>,<pad>.
            
        """

        self.meta_tokens = meta_tokens
        self.language_directory = language_directory
        self.target_lang_name = target_lang
        self.mode = mode
    
        self.source_lang = 0
        self.target_lang = 1

        self.source_max_len = self.find_max_len(self.source_lang)
        self.target_max_len = self.find_max_len(self.target_lang)

        self.max_len = max(self.source_max_len,self.target_max_len)

        self.source_char2id,self.source_id2char = self.build_char_vocab(self.source_lang,self.source_max_len)
        self.target_char2id,self.target_id2char = self.build_char_vocab(self.target_lang,self.target_max_len)


    def find_max_len(self,lang):

        """
        Method to find the maximum length of a word across train/test and validation data.

        This would help in padding, the embedding accordingly.

        Params:

            lang : 0/1 (source/target) language for which the length of the longest word must be found.
        
        """

        train_df = pd.read_csv(self.language_directory+self.target_lang_name+"_train.csv",header=None)
        test_df = pd.read_csv(self.language_directory+self.target_lang_name+"_test.csv",header=None)
        valid_df = pd.read_csv(self.language_directory+self.target_lang_name+"_valid.csv",header=None)

        train_max_len = len(max(list(train_df[lang]), key = len))
        test_max_len = len(max(list(test_df[lang]), key = len))
        valid_max_len = len(max(list(valid_df[lang]), key = len))

        del train_df
        del test_df
        del valid_df

        gc.collect()

        return max(train_max_len,test_max_len,valid_max_len)

    def build_char_vocab(self,lang_id,max_len=None):

        """
        Method to create a vocabulary of characters in language corresponding to lang_id.
        """

        df = pd.read_csv(self.language_directory+self.target_lang_name+"_"+self.mode+".csv",header=None)

        self.data = df.to_numpy()

        lang_chars = []
        lang_words = df[lang_id].to_numpy()
    
        for word in lang_words:
            lang_chars += list(word)
    
        unique_lang_chars =  sorted(list(set(lang_chars)))
    
        start = 0
        
        if self.meta_tokens:
            char2id_dict = {'<start>':0,'<end>':1,'<pad>': 2}
            id2char_dict = {0:'<start>',1:'<end>',2:'<pad>'}
            start = 3
        else:
            char2id_dict = {}
            id2char_dict = {}
    
        for i in range(len(unique_lang_chars)):
            char2id_dict[unique_lang_chars[i]] = i+start
            id2char_dict[i+start] = unique_lang_chars[i]
    
        del df
        del lang_chars
        del unique_lang_chars

        gc.collect()
    
        return char2id_dict,id2char_dict

    def encode_word(self,word,lang_id,padding=False,append_eos = True):

        """
        Method to encode characters of a given word.

        Params:

            word: The word to be encoded.
            lang_id : 0/1 for source/target lang.
            padding : If true, the word encoding would be padded upto max len.
        
        """

        if lang_id == self.source_lang:
            char2id_dict = self.source_char2id
            
        else:
            char2id_dict = self.target_char2id
        
        max_len = self.max_len
        
        word_encoding = []
        
        for i in word.lower():
            word_encoding.append(char2id_dict[i])

        if padding:
            word_encoding += [char2id_dict['<pad>']] * (max_len - len(word_encoding))

        if append_eos:
            word_encoding.append(char2id_dict['<end>'])
        
        return np.array(word_encoding)

    def decode_word(self,code_word,lang_id):

        """
        Method to decode an encoded word.

        Params:

            code_word : The encoded word.
            lang_id : 0/1 for source/target lang.
        """
    
        word = []

        if lang_id == self.source_lang:
            id2char_dict = self.source_id2char
            char2id_dict = self.source_char2id
            
        else:
            id2char_dict = self.target_id2char
            char2id_dict = self.target_char2id
        
        for i in code_word:
            ## if we reached padding, then stop decoding
            if self.meta_tokens and i == char2id_dict['<pad>']:
                break
            
            word.append(id2char_dict[i])
            
        return np.array(word)
            

In [4]:
class WordDataset(Dataset):
    
    def __init__(self, language_processor,append_eos=True,device='cpu'):

        self.lp = language_processor
        self.data = self.lp.data
        self.device = device
        self.append_eos = append_eos

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_word, output_word = self.data[idx]
        
        input_sequence = self.lp.encode_word(input_word,self.lp.source_lang,padding=True,append_eos=True)
        output_sequence = self.lp.encode_word(output_word,self.lp.target_lang,padding=True,append_eos=True)
        
        return torch.tensor(input_sequence).to(device), torch.tensor(output_sequence).to(device)


In [5]:
batch_size = 32

base_dir = "aksharantar_sampled/"
target_lang = "tel"

use_meta_tokens = True
append_eos = 1

lang_dir = base_dir + target_lang + "/"

In [6]:
device = torch.device("mps")

##creating train loader
train_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="train",meta_tokens=use_meta_tokens)
train_dataset = WordDataset(train_lp,device=device)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

## creating test loader
test_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="test",meta_tokens=use_meta_tokens)
test_dataset = WordDataset(test_lp,device=device)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

## creating validation loader
valid_lp = LanguageProcessor(language_directory=lang_dir,target_lang_name=target_lang,mode="valid",meta_tokens=use_meta_tokens)
valid_dataset = WordDataset(valid_lp,device=device)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [7]:
##in principle these are all fixed across train/test/valid data

start_token_id = train_lp.source_char2id['<start>']
end_token_id = train_lp.source_char2id['<end>']
pad_token_id = train_lp.source_char2id['<pad>']

source_max_len = train_lp.source_max_len
target_max_len = train_lp.target_max_len

In [8]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size,padding_idx = None ,dropout_p=0.1,num_layers = 1,bidirectional = False):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size,padding_idx = padding_idx)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        
        return output, hidden

In [9]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size,max_len,start_token_id,num_layers = 1,bidirectional = False):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True,num_layers = num_layers,bidirectional = bidirectional)
        self.out = nn.Linear(hidden_size, output_size)
        self.max_len = max_len
        self.start_token_id = start_token_id

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(self.start_token_id)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(self.max_len):
            ## print(i,self.max_len)
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            ## print(decoder_output.shape)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [10]:
"""
One thing to take care of:

    In encoder we set no grad for pad token in nn.Embed
    
    In decoder, 
        => need to see what to do after actual word is generated. How to deal with padding?

"""

'\nOne thing to take care of:\n\n    In encoder we set no grad for pad token in nn.Embed\n    \n    In decoder, \n        => need to see what to do after actual word is generated. How to deal with padding?\n\n'

In [11]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        ## print(f"Data Shape : {data[0].shape,data[1].shape}")
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        #print(decoder_outputs.shape,decoder_outputs.view(-1, decoder_outputs.size(-1)).shape,target_tensor.shape,target_tensor.view(-1).shape)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [12]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [13]:
def compute_accuracy(dataloader,encoder,decoder,padding_token_id,ignore_padding = True,device='cpu'):

    """
    Method to compute the accuracy using the model (encoder-decoder) using dataloader.

    This method returns word and character level accuracy.

        Word Level Accuracy : Accuracy is computed at the word level and a word is right iff every character is predicted correctly.
        Char Level Accuracy : Accuracy is computed by comparing each predicted character wrt the correct char.

    Params:

        dataloader : The train/test/valid dataloader.
        encoder : The encoder 
        decoder : The decoder
        padding_token_id : The id of the padding token.
        ignore_padding : If True, then in word level accuracy, the padding characters are ignored in computing the word level accuracy.
                        char level accuracy, the padding characters are not considered at all.

                        If false, padding is considered to be a part of the word (for word level accuracy) and 
    """

    char_lvl_accuracy = 0
    word_level_accuracy = 0

    tot_chars = 0
    tot_words = 0

    tot_correct_char_preds = 0
    tot_correct_word_preds = 0

    for data in dataloader:
        
        input_tensor, target_tensor = data

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)


        ## For a batch, for each character find the most probable output word.
        multi_step_preds = torch.argmax(decoder_outputs,dim=2)
        multi_step_pred_correctness = (multi_step_preds ==  target_tensor)
        num_chars = multi_step_preds.numel() ##find the total number of characters in the current batch
        num_words = multi_step_preds.shape[0] ##find the total number of words in the current batch.

        if ignore_padding: ## if padding has to be ignored.

            ## for each word, based on the padding token ID, find the first occurance of the padding token, marking the begining of padding.
            padding_start = torch.argmax(target_tensor == padding_token_id,dim=1).to(device)
            ## Creating a mask with 1's in each position of a padding token
            mask = (torch.arange(target_tensor.size(1)).unsqueeze(0).to(device) >= padding_start.unsqueeze(1))

            ##doing a logical OR with the mask makes sure that the padding tokens do not affect the correctness of the word
            tot_correct_word_preds += (torch.all(torch.logical_or(multi_step_pred_correctness,mask),dim=1).int().sum()).item()
            tot_words += num_words

            ##creating a complement of the mask so to mark padding tokens as irrelevant
            complement_mask = (1-mask.int()).bool()
            num_pad_chars = mask.int().sum().item()
            ##counting number of non_pad_chars to compute accuracy.
            num_non_pad_chars = num_chars - num_pad_chars

            tot_correct_char_preds += (torch.logical_and(multi_step_pred_correctness,complement_mask).int().sum()).item()
            tot_chars += num_non_pad_chars
            
    
        else: ##otherwise.

            tot_correct_word_preds += (torch.all(multi_step_pred_correctness,dim=1).int().sum()).item()
            tot_words += num_words
            
            tot_correct_char_preds += (multi_step_pred_correctness.int().sum()).item()
            tot_chars += num_chars

    #print(tot_correct_char_preds,tot_chars)
    #print(tot_correct_word_preds,tot_words)

    char_lvl_accuracy = round(tot_correct_char_preds*100/tot_chars,2)
    word_lvl_accuracy = round(tot_correct_word_preds*100/tot_words,2)

    return char_lvl_accuracy,word_lvl_accuracy

In [14]:
def train(train_dataloader,valid_loader, encoder, decoder, n_epochs, learning_rate=0.001,print_every=100, plot_every=100,device='cpu'):
    
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    lp = train_loader.dataset.lp
    
    criterion = nn.NLLLoss().to(device)

    for epoch in tqdm(range(1, n_epochs + 1)):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        val_char_lvl_accuracy,val_word_level_accuracy = compute_accuracy(valid_loader,encoder,decoder,padding_token_id = lp.source_char2id['<pad>'],ignore_padding=True,device=device)

        print(f"Epoch {epoch}\t C-Val Acc : {val_char_lvl_accuracy}%\t W-Val Acc : {val_word_level_accuracy}%")

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            #print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),epoch, epoch / n_epochs * 100, print_loss_avg))
            train_char_lvl_accuracy,train_word_level_accuracy = compute_accuracy(train_loader,encoder,decoder,padding_token_id = lp.source_char2id['<pad>'],ignore_padding=True,device=device)
            print(f"Epoch {epoch}\t C-Train Acc : {train_char_lvl_accuracy}%\t W-Train Acc : {train_word_level_accuracy}%")
            

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [16]:
"""multi_step_preds = torch.argmax(decoder_ops,dim=2)

(multi_step_preds ==  target_tensor).int().sum()

num_chars = multi_step_preds.numel()
num_words = multi_step_preds.shape[0]

multi_step_pred_correctness = (multi_step_preds == target_tensor)


adding_start = torch.argmax(target_tensor == 2,dim=1).to(device)
mask = (torch.arange(target_tensor.size(1)).unsqueeze(0).to(device) >= padding_start.unsqueeze(1))
word_level_accuracy = round(((torch.all(torch.logical_or(multi_step_pred_correctness,mask),dim=1).int().sum())/num_words).item()*100,2)

complement_mask = (1-mask.int()).bool()
num_pad_chars = mask.int().sum()
num_non_pad_chars = num_chars - num_pad_chars

multi_step_pred_correctness_for_non_pad_chars = torch.logical_and(multi_step_pred_correctness,complement_mask).int().sum()/num_non_pad_chars
char_lvl_accuracy = round(multi_step_pred_correctness_for_non_pad_chars.item()*100, 2)"""


'multi_step_preds = torch.argmax(decoder_ops,dim=2)\n\n(multi_step_preds ==  target_tensor).int().sum()\n\nnum_chars = multi_step_preds.numel()\nnum_words = multi_step_preds.shape[0]\n\nmulti_step_pred_correctness = (multi_step_preds == target_tensor)\n\n\nadding_start = torch.argmax(target_tensor == 2,dim=1).to(device)\nmask = (torch.arange(target_tensor.size(1)).unsqueeze(0).to(device) >= padding_start.unsqueeze(1))\nword_level_accuracy = round(((torch.all(torch.logical_or(multi_step_pred_correctness,mask),dim=1).int().sum())/num_words).item()*100,2)\n\ncomplement_mask = (1-mask.int()).bool()\nnum_pad_chars = mask.int().sum()\nnum_non_pad_chars = num_chars - num_pad_chars\n\nmulti_step_pred_correctness_for_non_pad_chars = torch.logical_and(multi_step_pred_correctness,complement_mask).int().sum()/num_non_pad_chars\nchar_lvl_accuracy = round(multi_step_pred_correctness_for_non_pad_chars.item()*100, 2)'

In [17]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [18]:
def evaluate(encoder, decoder, word, language_processor,device = "cpu"):

    lp = language_processor
    
    with torch.no_grad():
        input_tensor = torch.tensor(lp.encode_word(word,lp.source_lang,padding=True,append_eos = True)).to(device).view(1,-1)

        print(input_tensor.shape)
        
        encoder_outputs, encoder_hidden = encoder(input_tensor)

        print(encoder_outputs.shape, encoder_hidden.shape)
        
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_chars = []
        for idx in decoded_ids:
            if idx.item() == end_token_id:
                decoded_chars.append('<end>')
                break
            decoded_chars.append(lp.target_id2char[idx.item()])
    return decoded_chars, decoder_attn

In [19]:
input_size = len(list(train_lp.source_char2id.keys()))
output_size = len(list(train_lp.target_char2id.keys()))

hidden_size = 64
batch_size = 32

num_encoder_layers = 1
num_decoder_layers = 1

bidirectional = False

encoder = EncoderRNN(input_size, hidden_size,num_encoder_layers,bidirectional).to(device)
decoder = DecoderRNN(hidden_size, output_size,train_lp.max_len+append_eos,start_token_id,num_decoder_layers,bidirectional).to(device)

train(train_loader,valid_loader, encoder, decoder, 3, print_every=3, plot_every=3,device=device)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1	 C-Val Acc : 3.18%	 W-Val Acc : 0.0%
Epoch 2	 C-Val Acc : 3.75%	 W-Val Acc : 0.0%
Epoch 3	 C-Val Acc : 5.0%	 W-Val Acc : 0.0%
Epoch 3	 C-Train Acc : 48.11%	 W-Train Acc : 0.07%


In [20]:
op,_ = evaluate(encoder, decoder, word="srirama", language_processor=train_lp,device = device)
#op,_ = evaluate(encoder, decoder, word="mamidi", language_processor=train_lp,device = device)

op_string = ""

for char in op:
    if char == '<pad>':
        break
    op_string += char

print(op_string)

torch.Size([1, 29])
torch.Size([1, 29, 64]) torch.Size([1, 1, 64])
స్రారియ్


In [None]:
"""
1) Push Current Encoder-Decoder Code.
2) Implement accuracy computation (wordlevel and charlevel)
3) Teacher forcing
4) Support for LSTM, GRU & RNN
5) Multiple layers : GRU etc
6) Bi-directional
7) Beam Search


"""

In [None]:
train_dataset.data

In [None]:
input_tensor = torch.tensor(train_lp.encode_word("srirama",train_lp.source_lang,padding=True,append_eos = True)).to(device)

In [None]:
len(list(train_lp.source_char2id.keys()))

In [None]:
len(list(train_lp.target_char2id.keys()))

In [None]:
train_lp.max_len

In [None]:
train_lp.target_char2id

In [None]:
device = torch.device("mps")