In [1]:
CUDA_LAUNCH_BLOCKING="1"

In [2]:
#!pip install torchtext==0.6.0

In [3]:
from __future__ import unicode_literals, print_function, division
import torch
torch.cuda.empty_cache()
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
print(torch.__version__)
import torchtext
from torchtext import data
from torchtext.data import Field,BucketIterator,TabularDataset
print(torchtext.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

from io import open
import unicodedata
import string
import re
import random
import pandas as pd
from unicodedata import normalize
from nltk.tokenize import word_tokenize
from numpy import array
import numpy as np

1.8.1+cu101
0.6.0
cuda


In [4]:
!pip install indic-nlp-library



In [5]:
import spacy
from spacy.lang.hi import Hindi

In [6]:
from indicnlp.tokenize import indic_tokenize 

In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
spacy_en = spacy.load('en_core_web_sm')

In [9]:
# tokenizing hindi data 
def tokenize_hin(article):
    nlp = Hindi()
    doc = nlp(article)
    tokens = [token.text for token in doc]
    
    return tokens

# loading the simple spacy tokenizer for english 
def tokenize_en(sentence):
    return [tok.text for tok in spacy_en.tokenizer(sentence)]

In [10]:
# Special Tokens
BOS_WORD = '<sos?'
EOS_WORD = '<eos>'
BLANK_WORD = "<blank>"

In [11]:
HI_TEXT = Field(tokenize=tokenize_hin, pad_token=BLANK_WORD)
EN_TEXT = Field(tokenize=tokenize_en, init_token = "<sos>", eos_token = "<eos>", pad_token=BLANK_WORD)

In [12]:
# associate the text in the 'English' column with the EN_TEXT field
data_fields = [('hindi', HI_TEXT), ('english', EN_TEXT)]

In [13]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
import os
import sys
path = '/content/gdrive/My Drive/'
os.chdir(path)
print(os.getcwd())

/content/gdrive/My Drive


In [15]:
df = pd.read_csv("Dataset/train.csv")
#adding columns with the length of the sentences
df['hin_len'] = df['hindi'].str.count(' ')
df['eng_len'] = df['english'].str.count(' ')

In [16]:
df.shape

(101322, 5)

In [17]:
#remove long sentences and others with large difference in the translations 
df = df.query('hin_len < 80 & eng_len < 80')

In [18]:
df = df.drop([df.columns[0]], axis=1)

In [19]:
df.head()

Unnamed: 0,hindi,english,hin_len,eng_len
0,"एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...","In El Salvador, both sides that withdrew from ...",21,22
1,मैं उनके साथ कोई लेना देना नहीं है.,I have nothing to do with them.,7,6
2,-हटाओ रिक.,"Fuck them, Rick.",1,2
3,क्योंकि यह एक खुशियों भरी फ़िल्म है.,Because it's a happy film.,6,4
4,The thought reaching the eyes...,The thought reaching the eyes...,4,4


In [20]:
from sklearn.model_selection import train_test_split
train,val=train_test_split(df,test_size=0.1)

In [21]:
train.shape

(91112, 4)

In [22]:
train.to_csv("Dataset/train1.csv", index=False)
val.to_csv("Dataset/val1.csv", index=False)

In [23]:
train,val = TabularDataset.splits(path='Dataset/', train='train1.csv', validation='val1.csv', format='csv', fields=data_fields, skip_header = True)

In [24]:
for i, example in enumerate([(x.hindi,x.english) for x in train[0:5]]):
  print(f"Example_{i}:{example}")

Example_0:(['हम', 'भगवान', 'Tyrion', 'के', 'चैम्बर', 'के', 'लिए', 'मोमबत्ती', 'की', 'बहुत', 'आवश्यकता', 'होगी', '।'], ['We', "'ll", 'need', 'plenty', 'of', 'candles', 'for', 'Lord', 'Tyrion', "'s", 'chamber', '.'])
Example_1:(['और', 'मुझे', 'लगता', 'है', ',', 'इन', 'दिनों', 'समाचार', 'में', 'यह', 'बहुत', 'कुछ', 'चलता', 'है', '।'], ['And', 'this', 'is', 'something', 'that', "'s", ',', 'I', 'think', ',', 'in', 'the', 'news', 'a', 'lot', 'these', 'days', '.'])
Example_2:(['तकनीशियन', ',', 'यह', 'आपके', 'नियंत्रण', 'है'], ['Technician', ',', 'this', 'is', 'your', 'Control', '.'])
Example_3:(['याद', 'रखें', ',', 'यही', 'हम', 'पूरा', 'करना', 'चाहते', 'हैं', '।'], ['Remember', ',', 'that', "'s", 'what', 'we', 'want', 'to', 'have', 'accomplished', '.'])
Example_4:(['अब', 'समापन', 'में', ',', 'मुझे', 'लगता', 'है', 'कि', 'हम', 'सब', 'को', 'इस', 'बारे', 'में', 'सोचने', 'की', 'जरूरत', 'है', 'अगर', 'हम', 'चाहते', 'हैं', 'कि', 'यह', 'वास्तविकता', 'बने', '-', 'और', 'यदि', 'हां', ',', 'यह', 'जीवन', 'क

In [25]:
HI_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

In [26]:
BATCH_SIZE = 64
# Create iterators to process text in batches of approx. the same length by sorting on sentence lengths
train_iter = data.BucketIterator(train, batch_size=BATCH_SIZE, repeat=False, sort_key=lambda x: len(x.hindi), shuffle=True )
val_iter = data.BucketIterator(val, batch_size=1, repeat=False, sort_key=lambda x: len(x.hindi))

In [27]:
batch = next(iter(train_iter))
src_matrix = batch.hindi.T
print(src_matrix, src_matrix.size())

tensor([[  18,   10,  746,  ...,    1,    1,    1],
        [  13,  231,   51,  ...,    1,    1,    1],
        [ 390, 1991,    8,  ...,    1,    1,    1],
        ...,
        [  42,   85,  624,  ...,    1,    1,    1],
        [4475,   73,    1,  ...,    1,    1,    1],
        [  19,  179,   67,  ...,    1,    1,    1]]) torch.Size([64, 53])


In [28]:
trg_matrix = batch.english.T
print(trg_matrix, trg_matrix.size())

tensor([[    2,   101,    17,  ...,     1,     1,     1],
        [    2,     8,   104,  ...,     1,     1,     1],
        [    2,  2079,  7578,  ...,     1,     1,     1],
        ...,
        [    2,    78,    30,  ...,     1,     1,     1],
        [    2, 18296,    75,  ...,     1,     1,     1],
        [    2,    36,   218,  ...,     1,     1,     1]]) torch.Size([64, 51])


In [29]:
import math

In [30]:
# class TransformerModel(nn.Module):
#   def __init__(self, vocab_size_en,vocab_size_fr, dim_input, nos_head, fd_frwd, nlayers, dropout=0.5):
#     super(TransformerModel, self).__init__()
#     from torch.nn import TransformerEncoderLayer,TransformerEncoder,TransformerDecoder,TransformerDecoderLayer, Embedding
#     self.src_mask=None
#     self.embed_en =Embedding(vocab_size_en,dim_input)
#     self.pos_encoder=PositionalEncoding(dim_input)
#     encoder_layers=TransformerEncoderLayer(dim_input,nos_head,fd_frwd,dropout)
#     self.encoder=TransformerEncoder(encoder_layers,nlayers)
#     self.dim_input=dim_input
#     self.nlayers=nlayers
#     self.embed_fr=Embedding(vocab_size_fr,dim_input)
#     self.pos_decoder=PositionalEncoding(dim_input)
#     dec_layers=TransformerDecoderLayer(dim_input,nos_head,fd_frwd,dropout)
#     self.decoder=TransformerDecoder(dec_layers,nlayers)
#     self.decoder_out_layer=nn.Linear(dim_input,vocab_size_fr)
#     self.output_final=nn.Softmax()
#     self.init_weights()
#   def _generate_square_subsequent_mask(self, sz):
#     mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
#     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
#     return mask
  
#   def init_weights(self):
#     initrange = 0.1
#     self.embed_en.weight.data.uniform_(-initrange, initrange)
#     self.embed_fr.weight.data.uniform_(-initrange, initrange)
#     self.decoder_out_layer.bias.data.zero_()
#     self.decoder_out_layer.weight.data.uniform_(-initrange, initrange)
  
#   def forward(self,inp,target):
#     inp=self.embed_en(inp)
#     inp=self.pos_encoder(inp)
#     encoder_output = self.encoder(inp)
#     #print(encoder_output.shape)
#     #print(self.src_mask == None or self.src_mask.size(0) != len(target))
#     if (self.src_mask == None or self.src_mask.size(0) != len(target)): 
#       device = target.device
#       #print(len(target))
#       mask = self._generate_square_subsequent_mask(len(target)).to(device)
#       src_mask = mask
#       #print(src_mask)
#     target = self.embed_fr(target) 
#     target = self.pos_decoder(target)
#     #print(target.shape)
#     output = self.decoder(target,encoder_output,src_mask,memory_mask=None,tgt_key_padding_mask=None, memory_key_padding_mask=None)
#     output = self.decoder_out_layer(output)
#     out=self.output_final(output)
#     return out

In [31]:
# vocab_size_hi=len(HI_TEXT.vocab.stoi)
# vocab_size_en=len(EN_TEXT.vocab.stoi)
# dim_input=256
# nos_head=4
# nlayers=4
# fd_frwd=512

In [32]:
# class PositionalEncoding1(nn.Module):
#     def __init__(self, d_model, max_seq_len=500):
#         super(PositionalEncoding, self).__init__()
#         pe = torch.zeros((max_seq_len, d_model))
#         position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         return (x)

In [33]:
from torch import Tensor
import torchvision

import copy
from typing import Optional, Any
from torch.nn.init import xavier_uniform_
import math, copy, time
from torch.autograd import Variable

In [34]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * math.sqrt(self.d_model)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

    
class MyTransformer(nn.Module):
    def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: str = "relu",source_vocab_length: int = 60000,target_vocab_length: int = 60000) -> None:
        super(MyTransformer, self).__init__()
        self.source_embedding = nn.Embedding(source_vocab_length, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
        self.target_embedding = nn.Embedding(target_vocab_length, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
        self.out = nn.Linear(512, target_vocab_length)
        self._reset_parameters()
        self.d_model = d_model
        self.nhead = nhead

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        if src.size(1) != tgt.size(1):
            raise RuntimeError("the batch number of src and tgt must be equal")
        src = self.source_embedding(src)
        src = self.pos_encoder(src)
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        tgt = self.target_embedding(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        output = self.out(output)
        return output


    def _reset_parameters(self):
        r"""Initiate parameters in the transformer model."""
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

In [35]:
source_vocab_length = len(HI_TEXT.vocab)
target_vocab_length = len(EN_TEXT.vocab)

model = MyTransformer(source_vocab_length=source_vocab_length,target_vocab_length=target_vocab_length)
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
model = model.cuda()

In [36]:
use_gpu = True

In [37]:
def train(train_iter, val_iter, model, optim, num_epochs,use_gpu=True): 
    train_losses = []
    valid_losses = []
    for epoch in range(num_epochs):
        train_loss = 0
        valid_loss = 0
        # Train model
        model.train()
        for i, batch in enumerate(train_iter):
            src = batch.hindi.cuda() if use_gpu else batch.hindi
            trg = batch.english.cuda() if use_gpu else batch.english
            #change to shape (bs , max_seq_len)
            src = src.transpose(0,1)
            #change to shape (bs , max_seq_len+1) , Since right shifted
            trg = trg.transpose(0,1)
            trg_input = trg[:, :-1]
            targets = trg[:, 1:].contiguous().view(-1)
            src_mask = (src != 0)
            src_mask = src_mask.float().masked_fill(src_mask == 0, float('-inf')).masked_fill(src_mask == 1, float(0.0))
            src_mask = src_mask.cuda() if use_gpu else src_mask
            trg_mask = (trg_input != 0)
            trg_mask = trg_mask.float().masked_fill(trg_mask == 0, float('-inf')).masked_fill(trg_mask == 1, float(0.0))
            trg_mask = trg_mask.cuda() if use_gpu else trg_mask
            size = trg_input.size(1)
            #print(size)
            np_mask = torch.triu(torch.ones(size, size)==1).transpose(0,1)
            np_mask = np_mask.float().masked_fill(np_mask == 0, float('-inf')).masked_fill(np_mask == 1, float(0.0))
            np_mask = np_mask.cuda() if use_gpu else np_mask   
            # Forward, backprop, optimizer
            optim.zero_grad()
            preds = model(src.transpose(0,1), trg_input.transpose(0,1), tgt_mask = np_mask)#, src_mask = src_mask)#, tgt_key_padding_mask=trg_mask)
            preds = preds.transpose(0,1).contiguous().view(-1, preds.size(-1))
            loss = F.cross_entropy(preds,targets, ignore_index=0,reduction='sum')
            loss.backward()
            optim.step()
            train_loss += loss.item()/BATCH_SIZE
        
        model.eval()
        with torch.no_grad():
            for i, batch in enumerate(val_iter):
                src = batch.hindi.cuda() if use_gpu else batch.hindi
                trg = batch.english.cuda() if use_gpu else batch.english
                #change to shape (bs , max_seq_len)
                src = src.transpose(0,1)
                #change to shape (bs , max_seq_len+1) , Since right shifted
                trg = trg.transpose(0,1)
                trg_input = trg[:, :-1]
                targets = trg[:, 1:].contiguous().view(-1)
                src_mask = (src != 0)
                src_mask = src_mask.float().masked_fill(src_mask == 0, float('-inf')).masked_fill(src_mask == 1, float(0.0))
                src_mask = src_mask.cuda() if use_gpu else src_mask
                trg_mask = (trg_input != 0)
                trg_mask = trg_mask.float().masked_fill(trg_mask == 0, float('-inf')).masked_fill(trg_mask == 1, float(0.0))
                trg_mask = trg_mask.cuda() if use_gpu else trg_mask
                size = trg_input.size(1)
                #print(size)
                np_mask = torch.triu(torch.ones(size, size)==1).transpose(0,1)
                np_mask = np_mask.float().masked_fill(np_mask == 0, float('-inf')).masked_fill(np_mask == 1, float(0.0))
                np_mask = np_mask.cuda() if use_gpu else np_mask

                preds = model(src.transpose(0,1), trg_input.transpose(0,1), tgt_mask = np_mask)#, src_mask = src_mask)#, tgt_key_padding_mask=trg_mask)
                preds = preds.transpose(0,1).contiguous().view(-1, preds.size(-1))         
                loss = F.cross_entropy(preds,targets, ignore_index=0,reduction='sum')
                valid_loss += loss.item()/1
            
        # Log after each epoch
        print(f'''Epoch [{epoch+1}/{num_epochs}] complete. Train Loss: {train_loss/len(train_iter):.3f}. Val Loss: {valid_loss/len(val_iter):.3f}''')
        
        #Save best model till now:
        if valid_loss/len(val_iter)<min(valid_losses,default=1e9): 
            print("saving state dict")
            torch.save(model.state_dict(), f"checkpoint_best_epoch.pt")
        
        train_losses.append(train_loss/len(train_iter))
        valid_losses.append(valid_loss/len(val_iter))
        
        # Check Example after each epoch:
        sentences = ["तुमने उनकी बात सुनी"]
        for sentence in sentences:
            print(f"Original Sentence: {sentence}")
            print(f"Translated Sentence: {greeedy_decode_sentence(model,sentence)}")
    return train_losses,valid_losses

In [38]:
def greeedy_decode_sentence(model,sentence):
    model.eval()
    sentence = HI_TEXT.preprocess(sentence)
    indexed = []
    for tok in sentence:
        if HI_TEXT.vocab.stoi[tok] != 0 :
            indexed.append(HI_TEXT.vocab.stoi[tok])
        else:
            indexed.append(0)
    sentence = Variable(torch.LongTensor([indexed])).cuda()
    trg_init_tok = EN_TEXT.vocab.stoi[BOS_WORD]
    trg = torch.LongTensor([[trg_init_tok]]).cuda()
    translated_sentence = ""
    maxlen = 80
    for i in range(maxlen):
        size = trg.size(0)
        np_mask = torch.triu(torch.ones(size, size)==1).transpose(0,1)
        np_mask = np_mask.float().masked_fill(np_mask == 0, float('-inf')).masked_fill(np_mask == 1, float(0.0))
        np_mask = np_mask.cuda()
        pred = model(sentence.transpose(0,1), trg, tgt_mask = np_mask)
        add_word = EN_TEXT.vocab.itos[pred.argmax(dim=2)[-1]]
        translated_sentence+=" "+add_word
        if add_word==EOS_WORD:
            break
        trg = torch.cat((trg,torch.LongTensor([[pred.argmax(dim=2)[-1]]]).cuda()))
        #print(trg)
    return translated_sentence

In [39]:
train_losses,valid_losses = train(train_iter, val_iter, model, optim, 20)

Epoch [1/20] complete. Train Loss: 85.280. Val Loss: 68.915
saving state dict
Original Sentence: तुमने उनकी बात सुनी
Translated Sentence:  I 'm a little time , I 'm a little time , I 'm a little way . <eos>


RuntimeError: ignored

In [None]:
#model=TransformerModel(vocab_size_hi,vocab_size_en,dim_input,nos_head,fd_frwd,nlayers)

In [None]:
# import time
# criterion = nn.CrossEntropyLoss()
# lr = 5.0 # learning rate
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) 
# start_time=time.time()
# tot_loss=0
# for i,batch in enumerate(train_iter):
#   #print("batch",i)
#   tr_hi=batch.hindi
#   tr_target=batch.english
#   optimizer.zero_grad()                     
#   model.train() 
#   output = model(tr_hi,tr_target)
#   loss = criterion(output.view(-1, vocab_size_en), tr_target.view(-1))
#   loss.backward()
#   optimizer.step()
#   log_interval = 200
#   tot_loss+=loss
#   #print the loss and time per 200 batches
#   if i%2 ==0 :
#     elapsed = time.time() - start_time
#     print("loss:",tot_loss,"\t","time:",elapsed)
#     tot_loss=0

In [None]:
print(greeedy_decode_sentence(model,"तुमने उनकी बात सुनी"))



 <unk> परेशान l'मेरे <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
