In [None]:
import collections
import csv
import math
import numpy as np
import os
import pandas as pd
import random
import re
import time
import torch

from functools import partial
from operator import is_not

from torch.utils import data
from torch import nn

# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
def train_validation_split(csv_file, train_fraction=0.8):
    """Split a Hindi/English CSV into train and validation text files.

    Column 1 of every kept row is treated as the Hindi sentence and column 2
    as the English sentence. Rows whose first column is empty are skipped
    (presumably the header row of the exported CSV -- confirm against the
    data file).

    The split sizes are derived from the number of rows actually read, so
    the function works for a corpus of any size.

    Args:
        csv_file: Path of the CSV file to read.
        train_fraction: Fraction of the rows written to the training files;
            the remainder is sampled at random (via `random.sample`) for the
            validation files.

    Side effects:
        Writes `X_train.txt`, `y_train.txt`, `X_validate.txt` and
        `y_validate.txt` (one sentence per line) to the current directory.
    """
    hindi_sts = []
    eng_sts = []
    # The csv module expects files opened with newline=''; the encoding is
    # explicit so reading Devanagari text does not depend on the locale.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row or row[0] == '':
                continue
            hindi_sts.append(row[1])
            eng_sts.append(row[2])

    total = len(hindi_sts)
    trn_len = int(total * train_fraction)
    tst_len = total - trn_len
    tst_idxs = random.sample(range(total), tst_len)

    # Everything that was not sampled for validation is training data, kept
    # in file order.
    held_out = set(tst_idxs)
    trn_idxs = [idx for idx in range(total) if idx not in held_out]

    splits = (
        ('X_train.txt', 'y_train.txt', trn_idxs),
        ('X_validate.txt', 'y_validate.txt', tst_idxs),
    )
    for x_path, y_path, idxs in splits:
        with open(x_path, 'w', encoding='utf-8') as fx, \
                open(y_path, 'w', encoding='utf-8') as fy:
            for idx in idxs:
                fx.write(hindi_sts[idx] + '\n')
                fy.write(eng_sts[idx] + '\n')

# Location of the training CSV that is split below.
csv_file = '../input/hineng/train/train.csv'
# Writes the train/validation sentence files into the current working directory.
train_validation_split(csv_file)

Reference for the code below:
http://d2l.ai/

In [None]:
def tokenize_hin(X_train):
    """Tokenize Hindi sentences into lists of Devanagari words.

    Every run of characters outside U+0900-U+0963 and U+0970-U+097F is
    treated as a separator (this leaves out U+0964-U+096F, i.e. the danda
    marks and Devanagari digits).

    Args:
        X_train: Iterable of sentence strings.

    Returns:
        A list with one list of tokens per input sentence; a sentence with
        no Devanagari characters yields an empty list.
    """
    source_tokenize = []
    for s in X_train:
        cleaned = re.sub(r'[^\u0970-\u097f\u0900-\u0963]+', ' ', s)
        # split() without arguments never yields empty strings, so no
        # identity comparison against '' is needed to filter them out.
        source_tokenize.append(cleaned.split())
    return source_tokenize

def tokenize_eng(y_train):
    """Tokenize English sentences into lists of lower-case words.

    Each sentence is lower-cased and every run of characters other than
    ASCII letters is treated as a separator, so digits and punctuation are
    dropped.

    Args:
        y_train: Iterable of sentence strings.

    Returns:
        A list with one list of tokens per input sentence.
    """
    target_tokenize = []
    for s in y_train:
        cleaned = re.sub(r'[^a-zA-Z]+', ' ', s.lower())
        # split() without arguments never yields empty strings, so no
        # identity comparison against '' is needed to filter them out.
        target_tokenize.append(cleaned.split())
    return target_tokenize

In [None]:
class Accumulator:
    """Keeps `n` running float totals that are updated together."""

    def __init__(self, n):
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional argument to the total at the same position."""
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every total back to zero, keeping the current length."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
class Vocab:
    """Vocabulary mapping between text tokens and integer indices.

    Index 0 is always the unknown token `'<unk>'`, followed by the reserved
    tokens in the order given, followed by the corpus tokens in descending
    frequency order.

    Attributes:
        token_freqs: List of `(token, count)` pairs, most frequent first.
        unk: Index returned for tokens that are not in the vocabulary (0).
        idx_to_token: List mapping an index to its token.
        token_to_idx: Dict mapping a token to its index.
    """
    def __init__(self, tokens=None, min_freq=0 , reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens or list of token lists; counted with
                `count_corpus`.
            min_freq: Corpus tokens occurring fewer times than this are left
                out (and therefore map to `'<unk>'`).
            reserved_tokens: List of special tokens placed right after
                `'<unk>'`.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []

        # Sort according to frequencies
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # The index for the unknown token is 0
        self.unk = 0
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in ['<unk>'] + reserved_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1
        # Membership is checked against the dict, not the growing list, so
        # construction stays linear in the number of distinct tokens.
        for token, freq in self.token_freqs:
            if freq >= min_freq and token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Return the index of a token, or a list of indices for a list/tuple."""
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Return the token for an index, or a list of tokens for a list/tuple."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(tokens):
    """Count token frequencies in a flat token list or a list of token lists."""
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not is_nested:
        return collections.Counter(tokens)
    # Flatten the per-line lists into one sequence before counting.
    flat = []
    for line in tokens:
        flat.extend(line)
    return collections.Counter(flat)

In [None]:
def try_gpu(i=0):
    """Return the `i`-th CUDA device when it exists, else the CPU device."""
    available = torch.cuda.device_count()
    if i + 1 > available:
        return torch.device('cpu')
    return torch.device(f'cuda:{i}')

In [None]:
def truncate_pad(line, num_steps, padding_token):
    """Return `line` cut or right-padded to exactly `num_steps` items."""
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # Too long: keep only the first `num_steps` items.
        return line[:num_steps]
    # Short enough: append as many padding tokens as are missing.
    return line + [padding_token] * shortfall

def load_array(data_arrays, batch_size, is_train=True):
    """Wrap the given tensors in a `DataLoader`; shuffles only when `is_train`."""
    return data.DataLoader(
        data.TensorDataset(*data_arrays),
        batch_size,
        shuffle=is_train,
    )

def build_array_nmt(lines, vocab, num_steps):
    """Convert tokenized sentences into a fixed-width index tensor.

    Each sentence is mapped to indices, gets `'<eos>'` appended and is then
    truncated or padded with `'<pad>'` to `num_steps`. Returns the index
    tensor and, per row, the number of entries that are not `'<pad>'`.
    """
    eos_idx = vocab['<eos>']
    pad_idx = vocab['<pad>']
    rows = []
    for line in lines:
        indices = vocab[line] + [eos_idx]
        rows.append(truncate_pad(indices, num_steps, pad_idx))
    array = torch.tensor(rows)
    not_padding = (array != pad_idx).type(torch.int32)
    valid_len = not_padding.sum(1)
    return array, valid_len

In [None]:
def load_data_nmt(X_train, y_train, batch_size, num_steps, num_examples=600):
    """Tokenize the sentence pairs and build vocabularies plus a batch iterator.

    `num_examples` is accepted but not used by this function.

    Returns:
        `(data_iter, src_vocab, tgt_vocab)` where each batch of `data_iter`
        holds source indices, source valid lengths, target indices and
        target valid lengths, in that order.
    """
    special_tokens = ['<pad>', '<bos>', '<eos>']

    source = tokenize_hin(X_train)
    target = tokenize_eng(y_train)

    # Tokens seen fewer than twice are left out of both vocabularies.
    src_vocab = Vocab(source, min_freq=2, reserved_tokens=list(special_tokens))
    tgt_vocab = Vocab(target, min_freq=2, reserved_tokens=list(special_tokens))

    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)

    data_iter = load_array(
        (src_array, src_valid_len, tgt_array, tgt_valid_len), batch_size)
    return data_iter, src_vocab, tgt_vocab

In [None]:
class DotProductAttention(nn.Module):
    """Scaled dot-product attention with dropout on the attention weights."""
    def __init__(self, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # `queries`:    (`batch_size`, no. of queries, `d`)
    # `keys`:       (`batch_size`, no. of key-value pairs, `d`)
    # `values`:     (`batch_size`, no. of key-value pairs, value dimension)
    # `valid_lens`: (`batch_size`,) or (`batch_size`, no. of queries)
    def forward(self, queries, keys, values, valid_lens=None):
        scale = math.sqrt(queries.shape[-1])
        # Swap the last two dimensions of `keys` so bmm yields query-key scores
        keys_t = keys.transpose(1, 2)
        scores = torch.bmm(queries, keys_t) / scale
        # The weights are kept on the module so callers can inspect them
        self.attention_weights = masked_softmax(scores, valid_lens)
        dropped = self.dropout(self.attention_weights)
        return torch.bmm(dropped, values)

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention whose keys/values are twice as wide as the queries.

    The key and value projections take inputs of size `2*key_size` /
    `2*value_size`, and the output projection maps `num_hiddens` to
    `2*num_hiddens`.
    """
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 num_heads, dropout, bias=False, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = DotProductAttention(dropout)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=bias)
        self.W_k = nn.Linear(key_size * 2, num_hiddens, bias=bias)
        self.W_v = nn.Linear(value_size * 2, num_hiddens, bias=bias)
        self.W_o = nn.Linear(num_hiddens, num_hiddens * 2, bias=bias)

    def forward(self, queries, keys, values, valid_lens):
        heads = self.num_heads

        # Project, then fold the heads into the batch axis:
        # (`batch_size` * `num_heads`, no. of queries or key-value pairs,
        #  `num_hiddens` / `num_heads`)
        q_heads = transpose_qkv(self.W_q(queries), heads)
        k_heads = transpose_qkv(self.W_k(keys), heads)
        v_heads = transpose_qkv(self.W_v(values), heads)

        if valid_lens is not None:
            # Repeat every entry along axis 0 `num_heads` times so each head
            # of a sample sees that sample's valid length
            valid_lens = torch.repeat_interleave(valid_lens, repeats=heads, dim=0)

        # (`batch_size` * `num_heads`, no. of queries, `num_hiddens` / `num_heads`)
        per_head = self.attention(q_heads, k_heads, v_heads, valid_lens)

        # Heads are concatenated back to (`batch_size`, no. of queries,
        # `num_hiddens`) before the output projection
        merged = transpose_output(per_head, heads)
        return self.W_o(merged)

def transpose_qkv(X, num_heads):
    """Split the last axis into heads and fold the heads into the batch axis.

    Args:
        X: Tensor of shape (`batch_size`, no. of queries or key-value pairs,
            `num_hiddens`); `num_hiddens` must be divisible by `num_heads`.
        num_heads: Number of attention heads.

    Returns:
        Tensor of shape (`batch_size` * `num_heads`, no. of queries or
        key-value pairs, `num_hiddens` / `num_heads`), where row
        `b * num_heads + h` holds head `h` of sample `b`.
    """
    # (`batch_size`, no. of items, `num_heads`, `num_hiddens` / `num_heads`)
    X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)

    # (`batch_size`, `num_heads`, no. of items, `num_hiddens` / `num_heads`)
    X = X.permute(0, 2, 1, 3)

    return X.reshape(-1, X.shape[2], X.shape[3])

def transpose_output(X, num_heads):
    """Undo `transpose_qkv`: move the heads from the batch axis back to the last axis."""
    rows, depth = X.shape[1], X.shape[2]
    per_head = X.reshape(-1, num_heads, rows, depth)
    heads_last = per_head.permute(0, 2, 1, 3)
    return heads_last.reshape(heads_last.shape[0], heads_last.shape[1], -1)

In [None]:
class EncoderDecoder(nn.Module):
    """Container that runs an encoder and feeds its result to a decoder."""
    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        # Extra positional arguments go to both the encoder and `init_state`
        encoded = self.encoder(enc_X, *args)
        enc_seq, hidden, valid_lens = self.decoder.init_state(encoded, *args)
        dec_state = (enc_seq, hidden, valid_lens)
        return self.decoder(dec_X, dec_state)

class Seq2SeqEncoder(nn.Module):
    """Bidirectional GRU encoder for sequence to sequence learning.

    The final forward and backward hidden states of every layer are
    concatenated and projected back to `num_hiddens` with a tanh layer.
    """
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers,
                          dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_hiddens * 2, num_hiddens)

    def forward(self, X, *args):
        # (`batch_size`, `num_steps`, `embed_size`) after embedding
        embedded = self.dropout(self.embedding(X))
        # The GRU expects the time-step axis first
        steps_first = embedded.permute(1, 0, 2)
        # No initial state is given, so the GRU starts from zeros
        output, raw_hidden = self.rnn(steps_first)
        # Even rows hold the forward direction, odd rows the backward one
        forward_dir = raw_hidden[0::2]
        backward_dir = raw_hidden[1::2]
        both_dirs = torch.cat((forward_dir, backward_dir), dim=2)
        hidden_st = torch.tanh(self.fc(both_dirs))
        # `output`:    (`num_steps`, `batch_size`, `2*num_hiddens`)
        # `hidden_st`: (`num_layers`, `batch_size`, `num_hiddens`)
        return output, hidden_st

class Seq2SeqAttentionDecoder(nn.Module):
    """GRU decoder that attends over the encoder outputs at every time step.

    For each target position the top-layer hidden state is used as the query
    of a two-head `MultiHeadAttention` over the encoder outputs; the
    resulting context is concatenated with the token embedding and fed to
    the GRU, whose outputs are projected to vocabulary logits.
    """
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(num_hiddens, num_hiddens, num_hiddens, num_hiddens, 2, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size + 2*num_hiddens, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)
        # Filled by `forward`; empty until the decoder has been run once
        self._attention_weights = []

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        """Build the decoder state from the encoder's `(outputs, hidden_st)`.

        Returns `(outputs, hidden_st, enc_valid_lens)` with `outputs`
        permuted to batch-first.
        """
        # Shape of `outputs`: (`num_steps`, `batch_size`, `2*num_hiddens`).
        # Shape of `hidden_st`: (`num_layers`, `batch_size`, `num_hiddens`)
        outputs, hidden_st = enc_outputs
        return (outputs.permute(1, 0, 2), hidden_st, enc_valid_lens)

    def forward(self, X, state):
        """Decode `X` step by step.

        Args:
            X: Token indices, (`batch_size`, `num_steps`).
            state: `(enc_outputs, hidden_st, enc_valid_lens)` as returned by
                `init_state` or by a previous call.

        Returns:
            Logits of shape (`batch_size`, `num_steps`, `vocab_size`) and the
            updated state list.
        """
        # Shape of `enc_outputs`: (`batch_size`, `num_steps`, `2*num_hiddens`).
        # Shape of `hidden_st`: (`num_layers`, `batch_size`, `num_hiddens`)
        enc_outputs, hidden_st, enc_valid_lens = state
        # Shape of the output `X`: (`num_steps`, `batch_size`, `embed_size`)
        X = self.embedding(X).permute(1, 0, 2)
        outputs, self._attention_weights = [], []
        for x in X:
            # Shape of `query`: (`batch_size`, 1, `num_hiddens`)
            query = torch.unsqueeze(hidden_st[-1], dim=1)
            # Shape of `context`: (`batch_size`, 1, `2*num_hiddens`)
            context = self.attention(query, enc_outputs, enc_outputs, enc_valid_lens)
            # Record this step's weights; they live on the dot-product
            # attention wrapped inside the multi-head module. Without this
            # `attention_weights()` would always return an empty list.
            self._attention_weights.append(self.attention.attention.attention_weights)
            # Concatenate on the feature dimension
            x = torch.cat((context, torch.unsqueeze(x, dim=1)), dim=-1)
            # Reshape `x` as (1, `batch_size`, `embed_size` + `2*num_hiddens`)
            out, hidden_st = self.rnn(x.permute(1, 0, 2), hidden_st)
            outputs.append(out)

        # After fully-connected layer transformation, shape of `outputs`:
        # (`num_steps`, `batch_size`, `vocab_size`)
        outputs = self.dense(torch.cat(outputs, dim=0))
        return outputs.permute(1, 0, 2), [
            enc_outputs, hidden_st, enc_valid_lens]

    def attention_weights(self):
        """Return the per-step attention weights of the most recent `forward`.

        This is a method, not a property: it has to be called.
        """
        return self._attention_weights

In [None]:
def sequence_mask(X, valid_len, value=0):
    """Overwrite, in place, the entries of each row of `X` past its valid length.

    Row `i` keeps its first `valid_len[i]` entries; the rest are set to
    `value`. The same tensor `X` is returned.
    """
    positions = torch.arange(X.size(1), dtype=torch.float32, device=X.device)
    # Broadcast positions (1, maxlen) against lengths (rows, 1)
    keep = positions[None, :] < valid_len[:, None]
    X[~keep] = value
    return X

class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Cross-entropy loss that zeroes the contribution of padded positions."""

    # `pred`:      (`batch_size`, `num_steps`, `vocab_size`)
    # `label`:     (`batch_size`, `num_steps`)
    # `valid_len`: (`batch_size`,)
    def forward(self, pred, label, valid_len):
        # 1 for positions inside the valid length, 0 for the rest
        mask = sequence_mask(torch.ones_like(label), valid_len)
        # Per-position losses are needed so the mask can be applied
        self.reduction = 'none'
        # The parent loss expects the class axis in position 1
        per_position = super().forward(pred.permute(0, 2, 1), label)
        # Mean over the step axis gives one loss value per sequence
        return (per_position * mask).mean(dim=1)

In [None]:
def masked_softmax(X, valid_lens):
    """Softmax over the last axis that ignores entries past the valid length."""
    # `X`: 3D tensor, `valid_lens`: None, 1D or 2D tensor
    if valid_lens is None:
        return nn.functional.softmax(X, dim=-1)

    shape = X.shape
    if valid_lens.dim() == 1:
        # One length per sample: repeat it for every query row of the sample
        flat_lens = torch.repeat_interleave(valid_lens, shape[1])
    else:
        flat_lens = valid_lens.reshape(-1)
    # Masked entries get a very large negative value, so their
    # exponentiation is 0
    masked = sequence_mask(X.reshape(-1, shape[-1]), flat_lens, value=-1e6)
    return nn.functional.softmax(masked.reshape(shape), dim=-1)

In [None]:
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a sequence to sequence model with teacher forcing.

    Args:
        net: Model called as `net(X, dec_input, X_valid_len)` and returning
            `(Y_hat, state)`.
        data_iter: Iterable of batches `(X, X_valid_len, Y, Y_valid_len)`.
        lr: Learning rate for Adam.
        num_epochs: Number of passes over `data_iter`.
        tgt_vocab: Target vocabulary; only `tgt_vocab['<bos>']` is read.
        device: Device the model and the batches are moved to.

    Prints the loss per target token after every epoch and once more at the
    end. The model is re-initialized (Xavier) and left in training mode.
    """
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        # The recurrent layers of this notebook are GRUs; LSTM is kept so a
        # model built with LSTMs is initialized the same way.
        if type(m) in (nn.GRU, nn.LSTM):
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    def loss_per_token(metric):
        # No tokens seen (empty iterator): report NaN instead of dividing by 0
        return metric[0] / metric[1] if metric[1] else float('nan')

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()

    metric = Accumulator(2)  # Sum of training loss, no. of tokens
    for epoch in range(num_epochs):
        metric.reset()

        for batch in data_iter:
            # backward() adds to existing gradients, so clear them per batch
            optimizer.zero_grad()

            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0], device=device).reshape(-1, 1)

            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = net(X, dec_input, X_valid_len)

            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`

            torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        print(epoch, f'loss {loss_per_token(metric):.3f}')
    if num_epochs > 0:
        print(f'final loss {loss_per_token(metric):.3f}')

In [None]:
# Model and optimization hyper-parameters
embed_size, num_hiddens, num_layers, dropout = 64, 128, 2, 0.25
batch_size, num_steps = 64, 15
lr, num_epochs, device = 0.001, 200, try_gpu()

# This was used to load train data which was saved using function 'train_validation_split':
# with open('../input/train-validate/Data/X_train.txt', 'r') as f, open('../input/train-validate/Data/y_train.txt', 'r') as f1:
#     X_train = f.read().splitlines()
#     y_train = f1.read().splitlines()

# To load all train data
csv_file = '../input/hineng/train/train.csv'
X_train = []
y_train = []
# newline='' is what the csv module expects; the encoding is explicit so
# reading the Hindi text does not depend on the platform default.
with open(csv_file, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip blank lines and rows with an empty first column
        if not row or row[0] == '':
            continue
        X_train.append(row[1])
        y_train.append(row[2])

train_iter, src_vocab, tgt_vocab = load_data_nmt(X_train, y_train, batch_size, num_steps)

encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
net = EncoderDecoder(encoder, decoder)
print("Training Starts...")

st = time.time()
train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
ed = time.time()
print('Time taken by model : ', ed-st)
# Persist the trained weights next to the notebook
torch.save(net.state_dict(), './train_model_fnl.pt')

In [None]:
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps, device, save_attention_weights=False):
    """Greedily translate one Hindi sentence.

    Args:
        net: Model exposing `encoder` and `decoder` (with `init_state`).
        src_sentence: Source sentence as a string.
        src_vocab: Source vocabulary.
        tgt_vocab: Target vocabulary.
        num_steps: Source length after truncation/padding and the maximum
            number of generated tokens.
        device: Device on which the input tensors are created.
        save_attention_weights: When true, collect the decoder's attention
            weights after every generated token.

    Returns:
        `(translation, attention_weight_seq)`: the generated tokens joined
        by spaces, and the collected attention weights (empty list unless
        requested).

    Side effects:
        Leaves `net` in eval mode.
    """
    # Set `net` to eval mode for inference
    net.eval()
    tkns = tokenize_hin([src_sentence])[0]
    src_tokens = src_vocab[tkns] + [src_vocab['<eos>']]
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])

    output_seq, attention_weight_seq = [], []

    # Inference only: do not record operations for autograd
    with torch.no_grad():
        # Add the batch axis
        enc_X = torch.unsqueeze(torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
        enc_outputs = net.encoder(enc_X, enc_valid_len)
        dec_st = net.decoder.init_state(enc_outputs, enc_valid_len)

        # Add the batch axis
        dec_X = torch.unsqueeze(torch.tensor([tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)

        for _ in range(num_steps):
            Y, dec_st = net.decoder(dec_X, dec_st)

            # We use the token with the highest prediction likelihood as the
            # input of the decoder at the next time step
            dec_X = Y.argmax(dim=2)
            pred = dec_X.squeeze(dim=0).type(torch.int32).item()

            if save_attention_weights:
                weights = net.decoder.attention_weights
                # `attention_weights` may be a method or a property; store
                # the weights themselves, never the bound method
                if callable(weights):
                    weights = weights()
                attention_weight_seq.append(weights)

            # Once the end-of-sequence token is predicted, the generation of
            # the output sequence is complete
            if pred == tgt_vocab['<eos>']:
                break
            output_seq.append(pred)

    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq


In [None]:
# Load validation set
X_validate = []
y_validate = []
with open('../input/train-validate/Data/X_validate.txt', 'r') as f, open('../input/train-validate/Data/y_validate.txt', 'r') as f1:
    X_validate = f.read().splitlines()
    y_validate = f1.read().splitlines()

# Prediction on validation set 
predict = []
for hind, eng in zip(X_validate[:20],y_validate[:20]):
    translation, attention_weight_seq = predict_seq2seq(net, hind, src_vocab, tgt_vocab, num_steps, device)
    predict.append(translation)

# Printing sample predictions
i = 0
while i < 20:
    print(X_validate[i])
    print(y_validate[i])
    print(predict[i])
    print("\n")
    i += 1

# Save predictions of validation set
with open('predict.txt', 'w') as f:
    for row1 in predict:
        f.write(row1 + '\n')

In [None]:
# Predicting on test data and save in answer.txt
hindi_tst = []
csv_file = '../input/tstphase/testhindistatements.csv'
# newline='' is what the csv module expects; the encoding is explicit so
# reading the Hindi text does not depend on the platform default.
with open(csv_file, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip blank lines and rows with an empty first column
        if not row or row[0] == '':
            continue
        # The sentence is read from the third column of the test file
        hindi_tst.append(row[2])

answer = []
for hind in hindi_tst:
    translation, attention_weight_seq = predict_seq2seq(net, hind, src_vocab, tgt_vocab, num_steps, device)
    answer.append(translation)

# One translation per line, in the order of the test file
with open('answer.txt', 'w', encoding='utf-8') as f:
    for row1 in answer:
        f.write(row1 + '\n')