In [1]:
import pandas as pd

import re
import string
from string import digits

import os
import spacy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset


from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
from preprocessing import preprocessing

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### 1. Load Data

In [3]:
# Read the Hindi-English corpus CSV, renumber the rows from zero and discard
# the 'source' column.
data = (
    pd.read_csv('../Hindi_English_Truncated_Corpus.csv')
    .reset_index(drop=True)
    .drop('source', axis=1)
)

In [4]:
# Preview the first rows to check which columns remain after loading.
data.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [5]:
# Run the project's preprocessing routine (imported from preprocessing.py) over the corpus.
# NOTE(review): `data` is rebound to the result, so re-running this cell without
# first re-running the load cell feeds already-preprocessed data back in.
# Confirm preprocessing() is idempotent, or always restart from the load cell.
data = preprocessing(data)

### 2. Create Dataset and Dataloaders
* We create a dataset for train. 
* The vocab will only be based upon train.
* We'll have to write separate prediction code for val and test

In [6]:
# Fraction of the rows held out for validation.
val_frac = 0.1

In [7]:
# Number of rows that go to validation (truncated towards zero); also the
# position at which the shuffled index list is cut.
val_split_idx = int(val_frac * len(data))

In [8]:
# Shuffle the row positions with a fixed seed so the train/validation split
# (and therefore the train-only vocabulary) is the same on every
# Restart & Run All. A local Generator is used so numpy's global random
# state is left untouched.
split_seed = 42
data_idx = list(range(len(data)))
np.random.default_rng(split_seed).shuffle(data_idx)

In [9]:
# The first `val_split_idx` shuffled positions form the validation set;
# everything after them is used for training.
val_idx = data_idx[:val_split_idx]
train_idx = data_idx[val_split_idx:]

# Report the size of each split.
for split_label, split_positions in (('len of train: ', train_idx), ('len of val: ', val_idx)):
    print(split_label, len(split_positions))

len of train:  112343
len of val:  12482


In [10]:
# Materialise the two splits as independent frames with a fresh 0..n-1 index.
# reset_index(drop=True) discards the old index directly instead of first
# turning it into an 'index' column and then dropping that column again,
# which would also fail with a KeyError if the index ever carried a name.
train = data.iloc[train_idx].reset_index(drop=True)
val = data.iloc[val_idx].reset_index(drop=True)

### 3. Seq to Seq modelling
#### Strategy
* We'll create the main vocab using train and create train_dataset
* We'll reuse the train vocab for val when creating val_dataset, because we want to see how the model handles out-of-vocabulary (OOV) words
* We'll not touch the test set. We'll use this set like in production. We'll pass one test sentence to the model and it will give us a translated sentence

In [11]:
### Configuration for the Seq2Seq model ###

# --- Optimisation settings ---
num_epochs = 100
learning_rate = 0.001
batch_size = 64

# --- Hardware ---
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- Vocabulary sizes ---
# These are passed below both as the maximum vocabulary sizes of the dataset
# and as the embedding input sizes of the encoder / decoder.
input_size_encoder = 10000
input_size_decoder = 10000
output_size = input_size_decoder  # must be defined: decoder output size

# --- Network shape ---
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # has to match between the encoder and decoder RNNs
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# --- Currently disabled options ---
#load_model = False
# Tensorboard writer for a loss plot
#writer = SummaryWriter(f"runs/loss_plot")

transforms = False

In [12]:
from data_layer import Train_Dataset, Validation_Dataset, get_train_loader, get_valid_loader

# Training dataset: built from the training frame, with the vocabulary sizes
# capped at the encoder / decoder input sizes configured above.
train_dataset = Train_Dataset(
    train,
    'english_sentence',
    'hindi_sentence',
    source_vocab_max_size=input_size_encoder,
    target_vocab_max_size=input_size_decoder,
)

# Validation dataset: receives the training dataset as its first argument
# (per the notebook's strategy, so that the training vocabulary is reused).
val_dataset = Validation_Dataset(train_dataset, val, 'english_sentence', 'hindi_sentence')

# Training batches are reshuffled every epoch.
train_loader = get_train_loader(train_dataset, batch_size, num_workers=0,
                                shuffle=True, pin_memory=True)

# Validation batches are NOT shuffled: evaluation does not benefit from a
# random order, and a fixed order keeps the batch composition (and hence the
# mean of per-batch losses) comparable from one epoch to the next.
val_loader = get_valid_loader(val_dataset, train_dataset, batch_size, num_workers=0,
                              shuffle=False, pin_memory=True)

#### An Experiment
We check how much padding, on average, is needed when batches are created randomly. Creating random batches of larger sizes leads to inefficient padding.

In [13]:
# Walk once through the training loader and record the first dimension of
# every source and target batch.
# NOTE(review): dimension 0 is presumably the padded sequence length
# (sequence-first batches) — confirm against the collate function in data_layer.
loader_src_padding_list, loader_trg_padding_list = [], []
for src, trg in train_loader:
    loader_src_padding_list.append(src.shape[0])
    loader_trg_padding_list.append(trg.shape[0])
    

In [14]:
# Show the recorded batch sizes along dimension 0, sorted ascending, to expose
# the spread between the smallest and largest batches.
for side_label, side_lengths in (('src: ', loader_src_padding_list),
                                 ('trg: ', loader_trg_padding_list)):
    print(side_label, np.sort(side_lengths))

src:  [ 29  30  32 ... 350 371 400]
trg:  [ 35  35  35 ... 314 419 419]


In [15]:
# Average recorded size per batch, truncated to an integer for display.
for side_label, side_lengths in (('src avg: ', loader_src_padding_list),
                                 ('trg avg: ', loader_trg_padding_list)):
    print(side_label, np.mean(side_lengths).astype(int))

src avg:  72
trg avg:  86


### 4. Define Encoder, Decoder and Seq2Seq

In [16]:
from seq_to_seq import Encoder, Decoder, Seq2Seq

# Encoder: source vocabulary size -> embedding -> stacked RNN.
encoder_net = Encoder(
    input_size_encoder,
    encoder_embedding_size,
    hidden_size,
    num_layers,
    enc_dropout,
).to(device)

# Decoder: same hidden size and depth as the encoder, plus an output layer of
# size `output_size`.
decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

# Combine both halves into the full model; the final expression moves it to
# the selected device and displays its structure as the cell output.
model = Seq2Seq(encoder_net, decoder_net, output_size, device)
model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10000, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10000, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=10000, bias=True)
  )
)

In [17]:
# Index of the padding token; positions holding it are excluded from the loss.
# NOTE(review): this index is looked up in the *source* vocabulary, while the
# loss is computed over *target* tokens. That is only correct if both
# vocabularies map '<PAD>' to the same index — confirm in data_layer.
pad_idx = train_dataset.source_vocab.stoi['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
# Running count of processed batches; the training/validation loop below
# increments it once per batch.
step=0

In [19]:
#import function to test on sentence
from prediction import test_on_sentence

In [None]:
def _flatten_for_loss(output, target):
    """Prepare model output and target for nn.CrossEntropyLoss.

    The model output has shape (trg_len, batch_size, output_dim), but the loss
    expects (N, output_dim) predictions and (N,) class indices. The first time
    step (the start token) is dropped from both tensors and the remaining
    sequence and batch dimensions are merged into one.

    Returns:
        (flat_output, flat_target) with shapes (N, output_dim) and (N,).
    """
    output_dim = output.shape[2]
    return output[1:].reshape(-1, output_dim), target[1:].reshape(-1)


for epoch in range(num_epochs):
    # Running means of the per-batch losses for this epoch.
    train_loss = 0
    valid_loss = 0
    print(f"[Epoch {epoch} / {num_epochs}]")

    # ---- Training pass ----
    model.train()
    for batch_idx, (src, trg) in enumerate(train_loader):
        # Move the batch to the training device.
        inp_data = src.to(device)
        target = trg.to(device)

        # Forward pass.
        output = model(inp_data, target)
        output, target = _flatten_for_loss(output, target)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Backward pass.
        loss.backward()

        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Parameter update.
        optimizer.step()

        # Incremental update of the running mean: mean += (x - mean) / n.
        train_loss += (1 / (batch_idx + 1)) * (loss.item() - train_loss)

        if batch_idx % 100 == 0:
            print('Avg train loss for last {} steps: {:.2f}'.format(batch_idx, train_loss))
            print(test_on_sentence('and the doctors told his parents', model, train_dataset, device, max_len = 20))
            # test_on_sentence is defined outside this notebook; in case it
            # switches the model to eval mode, restore training mode so that
            # dropout stays active for the remaining batches of the epoch.
            model.train()

        step += 1

    # ---- Validation pass ----
    model.eval()
    # No parameters are updated here, so gradient tracking is disabled to
    # avoid building autograd graphs and to save memory and time.
    with torch.no_grad():
        for batch_idx, (src, trg) in enumerate(val_loader):
            inp_data = src.to(device)
            target = trg.to(device)

            output = model(inp_data, target)
            output, target = _flatten_for_loss(output, target)

            loss = criterion(output, target)

            valid_loss += (1 / (batch_idx + 1)) * (loss.item() - valid_loss)

            step += 1

    # Print training/validation statistics for the epoch.
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))



