In [1]:
!pip install indic-nlp-library

Collecting indic-nlp-library
  Downloading https://files.pythonhosted.org/packages/f0/58/8d1e621f87bbc4217fb8ce6628a2eb08b65a64582c5531becf41da5d721c/indic_nlp_library-0.6-py3-none-any.whl
Collecting morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: morfessor, indic-nlp-library
Successfully installed indic-nlp-library-0.6 morfessor-2.0.6


In [0]:
import math
import os
import pickle
import random
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from indicnlp.tokenize import indic_tokenize
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset
from tqdm import tqdm

In [0]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the concatenated final forward/backward states to the decoder's
    hidden size.

    Args:
        input_vocab_sz: number of rows in the source embedding table.
        input_embedding_dim: size of each source embedding vector.
        encoder_dim: hidden size of the GRU (per direction).
        decoder_dim: size of the initial decoder hidden state that is produced.
    """

    def __init__(self,input_vocab_sz,input_embedding_dim,encoder_dim,decoder_dim):
        super(Encoder,self).__init__()
        self.embedding_layer=nn.Embedding(input_vocab_sz,input_embedding_dim)
        self.rnn = nn.GRU(input_embedding_dim,encoder_dim,bidirectional=True)
        self.forward_net = nn.Linear(encoder_dim * 2, decoder_dim)

    def forward(self,input,input_len):
        """Encode a padded batch of source sentences.

        Args:
            input: LongTensor of token ids laid out as [src len, batch size]
                (the GRU is not batch_first).
            input_len: unpadded length of every sentence in the batch, as a
                tensor or a list of ints.

        Returns:
            (hidden_states, last_hidden_state) where hidden_states is
            [src len, batch size, encoder_dim * 2] (zero at padded positions)
            and last_hidden_state is [batch size, decoder_dim].
        """
        #embed input
        embeddings = self.embedding_layer(input)

        #pack_padded_sequence requires the lengths to live on the CPU, even
        #when the embeddings are on an accelerator
        if torch.is_tensor(input_len):
            input_len = input_len.cpu()
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(embeddings,input_len,enforce_sorted=False)

        #feed into rnn to get all hidden states
        packed_hidden_states , last_hidden_state = self.rnn(packed_embeddings)

        #unpack; total_length keeps the time axis equal to the input's so the
        #result always lines up with a padding mask built from `input`
        hidden_states, _ = nn.utils.rnn.pad_packed_sequence(
            packed_hidden_states, total_length=embeddings.shape[0])

        #compute first hidden state for decoder from the final forward ([-2])
        #and backward ([-1]) GRU states
        final_states = torch.cat((last_hidden_state[-2,:,:], last_hidden_state[-1,:,:]), dim = 1)
        last_hidden_state = torch.tanh(self.forward_net(final_states))

        return hidden_states, last_hidden_state

# Smoke test for Encoder (kept commented out):
# import torch
# encoder = Encoder(12,18,24,24)
# inp = torch.tensor([[2,3,4,5,7],[4,5,6,1,1],[6,5,1,1,1]]).long()
# inpsz = torch.tensor([5,3,2])
# hids, hid = encoder(inp.permute(1,0),inpsz)
# print(hids.shape, hid.shape)
# print(hid)




In [0]:
class Attention(nn.Module):
    """Additive attention over the encoder states.

    A linear layer is applied to the concatenation of the decoder state and
    each encoder state, followed by tanh; the result is reduced to one score
    per source position by a dot product with the learned vector `v`.
    """

    def __init__(self,encoder_dim,decoder_dim):
        super(Attention,self).__init__()
        self.a = nn.Linear(encoder_dim*2 + decoder_dim,decoder_dim)
        self.v = nn.Parameter(torch.rand(decoder_dim))

    def forward(self,decoder_hidden,encoder_hiddens,mask):
        """Return softmax-normalised attention weights, one row per batch item.

        decoder_hidden is [batch, decoder_dim], encoder_hiddens is
        [src len, batch, encoder_dim * 2] and mask is [batch, src len] with
        zeros at the positions that must receive no attention.
        """
        src_len, n_batch = encoder_hiddens.shape[0], encoder_hiddens.shape[1]

        # Put both inputs in [batch, src len, features] layout.
        tiled_state = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        states_batch_first = encoder_hiddens.permute(1, 0, 2)

        # Linear layer + tanh on the concatenated features.
        joined = torch.cat((tiled_state, states_batch_first), dim=2)
        energy = torch.tanh(self.a(joined))

        # Collapse the feature axis with v: [batch,1,dim] x [batch,dim,src len].
        v_rows = self.v.repeat(n_batch, 1).unsqueeze(1)
        scores = torch.bmm(v_rows, energy.permute(0, 2, 1)).squeeze(1)

        # Masked positions get a very negative score, so softmax sends them to ~0.
        scores = scores.masked_fill(mask == 0, -1e10)

        return F.softmax(scores, dim=1)

In [0]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder states."""

    def __init__(self,output_vocab_sz,output_embedding_dim,encoder_dim,decoder_dim):
        super(Decoder,self).__init__()
        self.embedding_layer = nn.Embedding(output_vocab_sz,output_embedding_dim)
        # f: recurrent update fed with [previous-token embedding ; context]
        self.f = nn.GRU(output_embedding_dim + encoder_dim * 2  , decoder_dim)
        # g: output projection over [new state ; context ; previous-token embedding]
        self.g = nn.Linear(output_embedding_dim + decoder_dim + encoder_dim * 2,output_vocab_sz)
        self.attention = Attention(encoder_dim,decoder_dim)

    def forward(self,input,decoder_hidden,encoder_hiddens,mask):
        """Run one decoding step.

        input is the previous token y_{i-1} (one id per batch item),
        decoder_hidden the previous state s_{i-1}, encoder_hiddens the encoder
        states h, and mask the padding mask handed to the attention module.

        Returns (vocabulary logits y_i, new decoder state s_i, attention weights).
        """
        # Attention weights, reshaped to [batch, 1, src len] for the bmm below.
        weights = self.attention(decoder_hidden, encoder_hiddens, mask).unsqueeze(1)

        # Context vector c_i: attention-weighted sum of the encoder states,
        # moved to [1, batch, features] for the GRU.
        states_batch_first = encoder_hiddens.permute(1, 0, 2)
        context = torch.bmm(weights, states_batch_first).permute(1, 0, 2)

        # New decoder state from the embedded previous token and the context.
        embedded = self.embedding_layer(input.unsqueeze(0))
        gru_out, gru_hidden = self.f(
            torch.cat((embedded, context), dim=2),
            decoder_hidden.unsqueeze(0),
        )

        # Single time step, single layer: output and final hidden must agree.
        assert (gru_out==gru_hidden).all()

        # Logits for the next token.
        features = torch.cat(
            (gru_out.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim=1)
        logits = self.g(features)

        return logits, gru_hidden.squeeze(0), weights.squeeze(1)


In [0]:
class Seq2Seq(nn.Module):
    """Encoder/decoder translation model with attention.

    Args:
        args: dict providing 'input_embedding_dim', 'output_embedding_dim',
            'encoder_dim', 'decoder_dim' and 'device'.
        input_vocab_sz: size of the source embedding table.
        output_vocab_sz: size of the target embedding table / output layer.
        pad_idx: id of the padding token (masked out in attention).
        sos_idx: id fed to the decoder as its first input at inference.
        eos_idx: id that ends decoding at inference.
    """

    # Upper bound on the number of decoding steps when no target is given.
    MAX_DECODE_LEN = 100

    def __init__(self,args,input_vocab_sz,output_vocab_sz,pad_idx, sos_idx, eos_idx):
        super(Seq2Seq,self).__init__()
        self.input_vocab_sz = input_vocab_sz
        self.output_vocab_sz = output_vocab_sz
        self.encoder = Encoder(input_vocab_sz,args['input_embedding_dim'],args['encoder_dim'],args['decoder_dim'])
        self.decoder = Decoder(output_vocab_sz,args['output_embedding_dim'],args['encoder_dim'],args['decoder_dim'])
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.device = args['device']

    def create_mask(self, src):
        """Return a [batch size, src sent len] mask that is False at pad positions."""
        mask = (src != self.pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):
        """Translate a batch.

        Args:
            src: [src sent len, batch size] source token ids.
            src_len: [batch size] unpadded source lengths.
            trg: [trg sent len, batch size] target token ids, or None for
                inference (greedy decoding from <sos>).
            teacher_forcing_ratio: probability of feeding the ground-truth
                token instead of the model's own prediction at each step,
                e.g. 0.75 uses teacher forcing 75% of the time. Must be 0
                when trg is None.

        Returns:
            (outputs, attentions): logits [steps, batch size, output vocab]
            and attention weights [steps, batch size, src sent len]. Row 0 of
            both is left at zero (it corresponds to the <sos> input). At
            inference the tensors are cut just before the step at which every
            sentence in the batch has produced <eos>.
        """
        inference = trg is None
        if inference:
            assert teacher_forcing_ratio == 0, "Must be zero during inference"
            #dummy target filled with <sos>; only row 0 is ever read
            trg = torch.full((self.MAX_DECODE_LEN, src.shape[1]), self.sos_idx,
                             dtype=torch.long, device=self.device)

        batch_size = src.shape[1]
        max_len = trg.shape[0]

        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, self.output_vocab_sz).to(self.device)

        #tensor to store attention
        attentions = torch.zeros(max_len, batch_size, src.shape[0]).to(self.device)

        #encoder_hiddens is all hidden states of the input sequence, back and forwards
        #hidden_last is the final forward and backward hidden states, passed through a linear layer
        encoder_hiddens, hidden_last = self.encoder(src, src_len)

        #first input to the decoder is the <sos> tokens
        output = trg[0,:]

        #mask = [batch size, src sent len]
        mask = self.create_mask(src)

        #which sentences have already emitted <eos> (inference only)
        finished = torch.zeros(batch_size, dtype=torch.bool, device=output.device)

        for t in range(1, max_len):
            output, hidden_last, attention = self.decoder(output, hidden_last, encoder_hiddens, mask)
            outputs[t] = output
            attentions[t] = attention

            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if (teacher_force and not inference) else top1)

            if inference:
                #works for any batch size; for a single sentence this is the
                #same as stopping on the first <eos>
                finished = finished | (output == self.eos_idx)
                if bool(finished.all()):
                    return outputs[:t], attentions[:t]

        return outputs, attentions

In [0]:
def en_preprocessor(text):
    """Whitespace-tokenize English text, lower-casing each token and removing every '.'."""
    tokens = []
    for word in text.split():
        tokens.append(word.lower().replace('.',''))
    return tokens

def hi_preprocessor(text):
    """Tokenize Hindi text with indic_tokenize.trivial_tokenize and return the tokens as a list."""
    return list(indic_tokenize.trivial_tokenize(text))

def collator(batch,PAD_IDX):
    """Pad a list of (source ids, target ids) pairs into time-major tensors.

    Every source (target) sequence is right-padded with PAD_IDX to the longest
    source (target) sequence in the batch.

    Returns ((X, X_len), Y): X is [max src len, batch], X_len holds the
    unpadded source lengths, Y is [max trg len, batch].
    """
    longest_src = max((len(src) for src, _ in batch), default=0)
    longest_trg = max((len(trg) for _, trg in batch), default=0)

    src_lengths = [len(src) for src, _ in batch]
    padded_src = [src + [PAD_IDX] * (longest_src - len(src)) for src, _ in batch]
    padded_trg = [trg + [PAD_IDX] * (longest_trg - len(trg)) for _, trg in batch]

    # Lists are batch-major; the model expects [seq len, batch].
    Y = torch.tensor(padded_trg).permute(1,0).contiguous()
    X = torch.tensor(padded_src).permute(1,0).contiguous()
    X_len = torch.tensor(src_lengths)
    return (X,X_len),Y

class Vocab:
    """Token <-> index mappings for the source and target language.

    `*_stoi` map token -> index and `*_itos` map index -> token. Indices
    missing from an itos mapping resolve to '<UNK>'; tokens missing from a
    dictionary produced by build_dic resolve to index 0.
    """

    def __init__(self,src_dic=None,trg_dic=None):
        self.src_stoi = None
        self.src_itos = defaultdict(self.ret_unk)
        self.trg_stoi = None
        self.trg_itos = defaultdict(self.ret_unk)

        if src_dic is not None:
            self.add_src_dic(src_dic)
        if trg_dic is not None:
            self.add_trg_dic(trg_dic)

    def ret_z(self):
        """Default index for unknown tokens."""
        return 0

    def ret_unk(self):
        """Default token for unknown indices."""
        return '<UNK>'

    def build_dic(self,path,preprocessor,encoding='utf-8'):
        """Build a token -> index dictionary from a text file.

        Args:
            path: text file with one sentence per line.
            preprocessor: callable turning a line into a list of tokens.
            encoding: encoding used to read the file. Given explicitly so the
                result does not depend on the platform's default locale.

        Returns:
            defaultdict mapping '<UNK>', '<sos>', '<eos>', '<pad>' to 0..3 and
            each new token to the next free index, in order of first
            appearance. Looking up a missing token yields 0.
        """
        dic=defaultdict(self.ret_z)
        #index 0 is what unknown tokens resolve to; registering it explicitly
        #makes len(dic) equal to the number of indices (max index + 1), which
        #is what an embedding table sized from the dictionary needs
        dic['<UNK>']=0
        dic['<sos>']=1
        dic['<eos>']=2
        dic['<pad>']=3
        ctr =  4
        with open(path,'r',encoding=encoding) as corpus:
            for line in corpus:
                for token in preprocessor(line):
                    if token not in dic:
                        dic[token]=ctr
                        ctr+=1
        return dic

    def add_src_dic(self,dic):
        """Install `dic` as the source token->index map and extend the reverse map."""
        self.src_stoi = dic
        for k,v in self.src_stoi.items():
            self.src_itos[v]=k

    def add_trg_dic(self,dic):
        """Install `dic` as the target token->index map and extend the reverse map."""
        self.trg_stoi = dic
        for k,v in self.trg_stoi.items():
            self.trg_itos[v]=k

class DataReader(IterableDataset):
    """Streams a parallel corpus as (source ids, target ids) pairs.

    Args:
        args: run configuration (not used by this class).
        paths: (source file path, target file path); line i of one file is
            paired with line i of the other.
        src_preprocessor: callable turning a source line into tokens.
        trg_preprocessor: callable turning a target line into tokens.
        DIC: vocabulary object exposing `src_stoi` / `trg_stoi` to reuse;
            when None a new one is built from the two files.
    """

    #index used for tokens absent from the vocabulary (same value Vocab.ret_z returns)
    UNK_IDX = 0

    def __init__(self,args,paths,src_preprocessor,trg_preprocessor,DIC=None):
        self.src_path = paths[0]
        self.trg_path = paths[1]

        if DIC is None:
            self.vocab = Vocab()
            src_dic = self.vocab.build_dic(self.src_path,src_preprocessor)
            trg_dic = self.vocab.build_dic(self.trg_path,trg_preprocessor)
            self.vocab.add_src_dic(src_dic)
            self.vocab.add_trg_dic(trg_dic)
        else:
            self.vocab=DIC

        self.src_preprocessor = src_preprocessor
        self.trg_preprocessor = trg_preprocessor

    def line_mapper(self, line, is_src):
        """Convert one line of text to [<sos>, token ids..., <eos>].

        Unknown tokens map to UNK_IDX. `.get` is used instead of indexing so
        that looking up an unseen token does not insert it into a defaultdict
        vocabulary (which would silently grow it on every pass over the data).
        """
        if is_src:
            stoi, preprocessor = self.vocab.src_stoi, self.src_preprocessor
        else:
            stoi, preprocessor = self.vocab.trg_stoi, self.trg_preprocessor

        tokens = [stoi['<sos>']]
        tokens.extend(stoi.get(token, self.UNK_IDX) for token in preprocessor(line))
        tokens.append(stoi['<eos>'])
        return tokens

    def __iter__(self):
        """Yield (source ids, target ids) for each pair of lines.

        Stops at the end of the shorter file. Both files are opened as UTF-8
        and closed again once iteration finishes or the iterator is discarded.
        """
        with open(self.src_path, encoding='utf-8') as src_file, \
                open(self.trg_path, encoding='utf-8') as trg_file:
            for src_line, trg_line in zip(src_file, trg_file):
                yield self.line_mapper(src_line, True), self.line_mapper(trg_line, False)

# #TEST
# import config
# args,unparsed = config.get_args()
# test_dataset = DataReader(args,('./Data/dev_test/dev.en','./Data/dev_test/dev.hi'),en_preprocessor,hi_preprocessor)
# print('built vocab')
# dataloader = DataLoader(test_dataset, batch_size = 4, drop_last=True,collate_fn= lambda b: collator(b,3))

# for X, y in dataloader:
#     print(X)
#     print()
#     print(y)
#     break


In [0]:
def train(model, iterator, optimizer, criterion, clip, args):
    """Run one training epoch and return the average loss per batch.

    Args:
        model: callable as model(src, src_len, trg) -> (logits, attention).
        iterator: yields ((src, src_len), trg) batches.
        optimizer: optimizer over model.parameters().
        criterion: loss taking (logits [N, vocab], targets [N]). It is assumed
            to already average over the tokens of a batch (the default
            reduction of nn.CrossEntropyLoss), so the epoch loss is averaged
            over batches only.
        clip: maximum gradient norm.
        args: dict with the target 'device'.

    Returns:
        Mean of the per-batch losses, or nan if the iterator produced no batch.
    """
    device=args['device']
    model.train()

    epoch_loss = 0.0
    batch_ctr=0
    for batch in tqdm(iterator):

        src, src_len = batch[0]
        trg = batch[1]
        src=src.to(device)
        trg = trg.to(device)
        #src_len is deliberately left on the CPU: it is only used to pack the
        #padded batch, and pack_padded_sequence requires CPU lengths

        optimizer.zero_grad()

        output, _ = model(src, src_len, trg)

        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]

        #drop position 0 (<sos>), which the model never predicts
        output = output[1:].reshape(-1, output.shape[-1])
        trg = trg[1:].reshape(-1)

        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        batch_ctr+=1

    if batch_ctr == 0:
        #nothing to average (e.g. fewer samples than one full batch)
        return float('nan')
    return epoch_loss / batch_ctr

def evaluate(model, iterator, criterion, args):
    """Compute the average loss per batch without updating the model.

    Args:
        model: callable as model(src, src_len, trg, teacher_forcing_ratio).
        iterator: yields ((src, src_len), trg) batches.
        criterion: loss taking (logits [N, vocab], targets [N]). It is assumed
            to already average over the tokens of a batch (the default
            reduction of nn.CrossEntropyLoss).
        args: dict with the target 'device'.

    Returns:
        Mean of the per-batch losses, or nan if the iterator produced no batch.
    """
    device=args['device']
    model.eval()

    epoch_loss = 0.0
    batch_ctr=0
    with torch.no_grad():

        for batch in tqdm(iterator):

            src, src_len = batch[0]
            trg = batch[1]

            src=src.to(device)
            trg = trg.to(device)
            #src_len is deliberately left on the CPU: pack_padded_sequence
            #requires CPU lengths

            output, _ = model(src, src_len, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            #drop position 0 (<sos>), which the model never predicts
            output = output[1:].reshape(-1, output.shape[-1])
            trg = trg[1:].reshape(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
            batch_ctr+=1

    if batch_ctr == 0:
        #nothing to average (e.g. fewer samples than one full batch)
        return float('nan')
    return epoch_loss / batch_ctr

def translate_sentence(model,vocab,sentence,args):
    """Greedily translate one source sentence.

    Args:
        model: translation model; called with trg=None to decode from <sos>.
        vocab: object exposing `src_stoi` (token -> id) and `trg_itos`
            (id -> token).
        sentence: raw source-language text.
        args: dict with the target 'device'.

    Returns:
        (translation, attention): the predicted target tokens and the
        attention weights of the corresponding decoding steps. The first row
        of the model output (the <sos> slot, never predicted) is dropped from
        both.
    """
    model.eval()
    device = args['device']
    tokenized = ['<sos>'] + en_preprocessor(sentence) + ['<eos>']
    numericalized = [vocab.src_stoi[t] for t in tokenized]
    #lengths stay on the CPU: pack_padded_sequence requires CPU lengths
    sentence_length = torch.LongTensor([len(numericalized)])
    #[src len, 1]: a batch holding the single sentence
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device)
    with torch.no_grad():
        #the model requires teacher forcing to be off when no target is given
        translation_tensor_logits, attention = model(tensor, sentence_length, None, 0)
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    #convert to Python ints first: tensors hash by identity, so using them as
    #dictionary keys would never match an index in trg_itos
    translation = [vocab.trg_itos[idx] for idx in translation_tensor.tolist()]
    translation, attention = translation[1:], attention[1:]
    return translation, attention

def display_attention(candidate, translation, attention):
    """Plot the attention weights as a heat map.

    Args:
        candidate: raw source sentence; its tokens (wrapped in <sos>/<eos>)
            label the columns.
        translation: list of predicted target tokens; they label the rows.
        attention: attention tensor with a singleton axis 1, as returned by
            translate_sentence.
    """
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)

    attention = attention.squeeze(1).cpu().detach().numpy()

    ax.matshow(attention, cmap='bone')

    ax.tick_params(labelsize=15)

    #pin one tick per token explicitly, so labels cannot drift relative to
    #the cells whatever locator Matplotlib would pick by default
    source_labels = ['<sos>'] + en_preprocessor(candidate) + ['<eos>']
    ax.set_xticks(list(range(len(source_labels))))
    ax.set_xticklabels(source_labels, rotation=45)
    ax.set_yticks(list(range(len(translation))))
    ax.set_yticklabels(translation)

    plt.show()
    plt.close(fig)

def translation_mode(args):
    """Load a trained model and its vocabulary, then translate one typed sentence.

    Args:
        args: dict with 'load_dic_path', 'load_model_path', 'device' and the
            model hyper-parameters read by Seq2Seq.

    Raises:
        TypeError: if the pickled object is not a full vocabulary (it must
            expose src_stoi, trg_stoi and trg_itos).
    """
    #SECURITY: pickle.load can execute arbitrary code; only load dictionary
    #files that were produced by this notebook
    with open(args['load_dic_path'], 'rb') as dic_file:
        vocab = pickle.load(dic_file)

    if not all(hasattr(vocab, name) for name in ('src_stoi', 'trg_stoi', 'trg_itos')):
        raise TypeError(
            "Expected a pickled vocabulary with src_stoi, trg_stoi and trg_itos, got "
            + type(vocab).__name__)

    device = args['device']

    #take the vocabulary sizes from the checkpoint itself so the rebuilt model
    #always matches the saved weights, even if the pickled dictionaries
    #gained entries after the model was created
    state_dict = torch.load(args['load_model_path'], map_location=device)
    INPUT_DIM = state_dict['encoder.embedding_layer.weight'].shape[0]
    OUTPUT_DIM = state_dict['decoder.embedding_layer.weight'].shape[0]

    PAD_IDX = vocab.src_stoi['<pad>']
    SOS_IDX = vocab.src_stoi['<sos>']
    EOS_IDX = vocab.src_stoi['<eos>']

    model = Seq2Seq(args,INPUT_DIM,OUTPUT_DIM, PAD_IDX, SOS_IDX, EOS_IDX).to(device)
    model.load_state_dict(state_dict)

    sentence=input('Enter sentence in source language')
    translation,attention = translate_sentence(model,vocab,sentence,args)
    print('Translated: ',' '.join(translation))
    display_attention(sentence,translation,attention)

def train_mode(args):
    """Train the translation model, keeping the checkpoint with the best validation loss.

    Args:
        args: dict with 'training_data' and 'validation_data' (each a
            (source path, target path) pair), 'batch', 'epochs', 'device',
            'save_model_path', 'save_dic_path' and the model hyper-parameters
            read by Seq2Seq.

    Whenever the validation loss improves, the model weights are written to
    'save_model_path' and the complete vocabulary object (source and target
    mappings) is pickled to 'save_dic_path'.
    """
    device = args['device']

    #Get Data
    training_dataset = DataReader(args,args['training_data'],en_preprocessor,hi_preprocessor)
    validation_dataset = DataReader(args,args['validation_data'],en_preprocessor,hi_preprocessor,training_dataset.vocab)
    # testing_dataset = DataReader(args,args['testing_data'],en_preprocessor,hi_preprocessor,training_dataset.vocab)

    vocab = training_dataset.vocab

    #size the embedding tables by the largest index in use (+1 because indices
    #start at 0), not by the number of dictionary entries, so that every
    #assigned index has a row
    INPUT_DIM = max(vocab.src_stoi.values()) + 1
    OUTPUT_DIM = max(vocab.trg_stoi.values()) + 1

    PAD_IDX = vocab.src_stoi['<pad>']
    SOS_IDX = vocab.src_stoi['<sos>']
    EOS_IDX = vocab.src_stoi['<eos>']

    def collate(batch):
        return collator(batch,PAD_IDX)

    training_dataloader = DataLoader(training_dataset, batch_size = args['batch'], drop_last=True, collate_fn=collate)
    validation_dataloader = DataLoader(validation_dataset,batch_size = args['batch'], drop_last=True, collate_fn=collate)
    # testing_dataloader = DataLoader(testing_dataset,batch_size = args['batch'], drop_last=True, collate_fn=collate)

    #Get model
    model = Seq2Seq(args,INPUT_DIM,OUTPUT_DIM, PAD_IDX, SOS_IDX, EOS_IDX).to(device)
    #NOTE(review): this used to call model.apply(utils.init_weights), but no
    #`utils` module is defined or imported in this notebook. PyTorch's default
    #initialisation is used instead; add model.apply(<init fn>) here if a
    #custom scheme is wanted.
    print(model)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Number of trainable parameters: "+str(trainable_params))

    #Train and Evaluate model
    N_EPOCHS = args['epochs']
    CLIP = 1
    best_valid_loss = float('inf')

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

    #make sure the output directories exist before spending an epoch on training
    for out_path in (args['save_model_path'], args['save_dic_path']):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss = train(model, training_dataloader, optimizer, criterion, CLIP, args)
        valid_loss = evaluate(model, validation_dataloader, criterion, args)

        end_time = time.time()

        epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), args['save_model_path'])
            #save the whole vocabulary: translating needs the target-side
            #mappings as well as the source ones
            with open(args['save_dic_path'],'wb') as dic_file:
                pickle.dump(vocab,dic_file)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [13]:
# Run configuration: model sizes, training schedule, device and file locations.
args = dict(
    # batching / model dimensions
    batch=16,
    input_embedding_dim=128,
    output_embedding_dim=128,
    encoder_dim=512,
    decoder_dim=512,
    # training schedule and device
    epochs=10,
    device='cpu',
    # parallel corpora as (source path, target path)
    training_data=('./Data/parallel/IITB.en-hi.en','./Data/parallel/IITB.en-hi.hi'),
    testing_data=('./Data/dev_test/test.en','./Data/dev_test/test.hi'),
    validation_data=('./Data/dev_test/dev.en','./Data/dev_test/dev.hi'),
    # where checkpoints and the vocabulary are written / read
    save_model_path='./trained_models/seq2seq.pt',
    save_dic_path='./trained_models/dictionary.pkl',
    load_model_path='./trained_models/seq2seq.pt',
    load_dic_path='./trained_models/dictionary.pkl',
)
train_mode(args)

FileNotFoundError: ignored