In [8]:
import pandas as pd
from pandarallel import pandarallel #for parallel apply function

# Initialization
pandarallel.initialize()

import re
import string
from string import digits

import os
import spacy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### 1. Load data and preprocess
We'll first load the data. Then we'll preprocess and clean the text

In [2]:
# Hindi-English parallel corpus, read from the current working directory.
# Source: https://www.kaggle.com/aiswaryaramachandran/hindienglish-corpora/kernels
data = pd.read_csv('Hindi_English_Truncated_Corpus.csv')

In [3]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127607 entries, 0 to 127606
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   source            127607 non-null  object
 1   english_sentence  127605 non-null  object
 2   hindi_sentence    127607 non-null  object
dtypes: object(3)
memory usage: 2.9+ MB


In [5]:
#Preprocess and clean
print(len(data))  # row count before cleaning

# Drop rows with a missing value, then exact duplicate rows
data = data.dropna().drop_duplicates()

exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', digits) # translation table deleting 0-9
devanagari_digits = re.compile("[२३०८१५७९४६]")
multiple_spaces = re.compile(" +")


def clean_sentence(text, drop_devanagari_digits=False):
    """Normalize one sentence for vocabulary building.

    Steps, in order: remove apostrophes, lowercase, delete every character in
    string.punctuation, delete ASCII digits, optionally delete Devanagari
    digits, strip leading/trailing whitespace, and collapse runs of spaces
    into a single space.

    text : the raw sentence (a str)
    drop_devanagari_digits : when True, Devanagari digits are removed as well
    returns : the cleaned sentence
    """
    text = re.sub("'", '', text).lower()
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.translate(remove_digits)
    if drop_devanagari_digits:
        text = devanagari_digits.sub("", text)
    return multiple_spaces.sub(" ", text.strip())


# One parallel pass per column instead of one pass per cleaning step
data['english_sentence'] = data['english_sentence'].parallel_apply(clean_sentence)
data['hindi_sentence'] = data['hindi_sentence'].parallel_apply(
    lambda x: clean_sentence(x, drop_devanagari_digits=True))

len(data)

127607


124827

Great! Now we have clean text. Since some rows were dropped from our data, the index labels now have gaps, so we'll reset the index to make the labels contiguous again (0 … n-1), which is what the integer lookups in the dataset class expect.

In [6]:
# Rows were dropped during cleaning, so the index labels have gaps: renumber
# them 0..n-1 and discard the old labels. The 'source' column is not needed.
# errors='ignore' keeps this cell re-runnable once 'source' is already gone.
data = data.reset_index(drop=True).drop(columns='source', errors='ignore')

In [7]:
data.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...
1,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी
2,this percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,the ending portion of these vedas is called up...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


Ok, so our data is ready. Now we'll create the custom dataset class

### 2. Create the dataset class
Takes in dataframe, source and target text columns and returns numericalized texts one index at a time

In [9]:
class Language_Dataset(Dataset):

    '''
    Map-style dataset that pairs each source sentence with its target sentence
    and returns both as tensors of vocabulary indexes.

    Initiating Variables
    df : the dataframe holding the text columns
    source_column : the name of source text column in the dataframe
    target_column : the name of target text column in the dataframe
    transform : optional callable applied to the source text only, before it is numericalized
    freq_threshold : frequency threshold passed to both the source and target Vocabulary
    source_vocab_max_size : max source vocab size
    target_vocab_max_size : max target vocab size
    '''
    def __init__(self, df, source_column, target_column, transform = None, freq_threshold = 5,
                 source_vocab_max_size= 10000, target_vocab_max_size = 10000):
        self.df = df
        self.transform = transform

        #get source and target texts
        self.source_texts = self.df[source_column]
        self.target_texts = self.df[target_column]

        ##VOCABULARY class is defined separately in this file
        #create source vocab
        self.source_vocab = Vocabulary(freq_threshold,source_vocab_max_size)
        self.source_vocab.build_vocabulary(self.source_texts.tolist())
        #create target vocab
        self.target_vocab = Vocabulary(freq_threshold,target_vocab_max_size)
        self.target_vocab.build_vocabulary(self.target_texts.tolist())

    #used by data loader when creating batches
    def __len__(self):
        return len(self.df)

    #getitem gets 1 example at a time. This is done before a batch is created
    def __getitem__(self, index):
        '''
        Return (source_tensor, target_tensor) for the row at position `index`.
        Each tensor is <SOS> + word indexes + <EOS>.
        '''
        # .iloc looks rows up by position, so this works even when the
        # dataframe index has gaps (e.g. after dropna/drop_duplicates).
        source_text = self.source_texts.iloc[index]
        target_text = self.target_texts.iloc[index]
        if self.transform is not None:
            source_text = self.transform(source_text)

        #numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,5,9,24,2]
        numericalized_source = [self.source_vocab.stoi["<SOS>"]]
        numericalized_source += self.source_vocab.numericalize(source_text)
        numericalized_source.append(self.source_vocab.stoi["<EOS>"])

        numericalized_target = [self.target_vocab.stoi["<SOS>"]]
        numericalized_target += self.target_vocab.numericalize(target_text)
        numericalized_target.append(self.target_vocab.stoi["<EOS>"])

        return torch.tensor(numericalized_source), torch.tensor(numericalized_target)
    
    


In [13]:
class Vocabulary:
    '''
    Word <-> index mapping for one language.

    Indexes 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and <UNK>;
    corpus words are numbered from 4 upwards by build_vocabulary.

    freq_threshold : minimum number of occurrences a word needs to enter the vocab
    max_size : max vocab size, special tokens included
    '''
    SPECIAL_TOKENS = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")

    def __init__(self, freq_threshold, max_size):
        self._reset()
        self.freq_threshold = freq_threshold #minimum word frequency needed to be included in vocab
        self.max_size = max_size #max vocab size

    def _reset(self):
        #(re)create both dicts holding only the pad, start, end and unknown tokens
        self.itos = dict(enumerate(self.SPECIAL_TOKENS)) #index to string dict
        self.stoi = {k:j for j,k in self.itos.items()} #string to index dict

    def __len__(self):
        return len(self.itos)

    '''
    a simple tokenizer that splits on space and converts the sentence to list of words
    '''
    @staticmethod #static method is independent of a class instance
    def tokenizer(text):
        lowered = (tok.lower().strip() for tok in text.split(' '))
        #empty text or repeated spaces yield '' pieces; they are not words, so drop them
        return [tok for tok in lowered if tok]

    '''
    build the vocab
    '''
    def build_vocabulary(self,sentence_list):
        #start from the special tokens only, so calling this twice leaves no stale words
        self._reset()

        #calculate freq of words
        frequencies = {}
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

        #limit vocab by removing low freq words (the threshold itself is allowed)
        kept = [(word, count) for word, count in frequencies.items()
                if count >= self.freq_threshold]

        #limit vocab to the max size specified; the special tokens use part of it
        budget = max(self.max_size - len(self.itos), 0)
        if len(kept) > budget:
            #stable sort: words with equal counts keep their first-seen order
            kept = sorted(kept, key = lambda item: -item[1])[:budget]

        #create vocab
        idx = len(self.itos)
        for word, _ in kept:
            self.stoi[word] = idx
            self.itos[idx] = word
            idx+=1

    '''
    convert the list of words to a list of corresponding indexes
    '''
    def numericalize(self, text):
        unk_idx = self.stoi['<UNK>']
        #words missing from the vocab map to the <UNK> index
        return [self.stoi.get(token, unk_idx) for token in self.tokenizer(text)]
                    

> Let us test the dataset

In [14]:
lang_dataset = Language_Dataset(data, 'english_sentence', 'hindi_sentence', source_vocab_max_size=10000, target_vocab_max_size=8000)

In [17]:
#let us check one example by passing an index of dataframe
print(lang_dataset.source_texts[0])
print(lang_dataset.target_texts[0])

politicians do not have permission to do what needs to be done
राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करने कि अनुमति नहीं है


In [24]:
#let us check the vocabs English and hindi
print('len of source vocab : ',len(lang_dataset.source_vocab.stoi)) #len of the source vocab dictionary
print('len of target vocab : ',len(lang_dataset.target_vocab.stoi)) #len of the target vocab dictionary

len of source vocab :  10000
len of target vocab :  8000


In [27]:
#let us check out the numericalized texts
print('source text:\n', lang_dataset.source_texts[0])
# lang_dataset[0] is a (source, target) pair of tensors for row 0:
# lang_dataset[0][0] -> numericalized source, lang_dataset[0][1] -> numericalized target
print('numericalized text:\n', lang_dataset[0][0])

source text:
 politicians do not have permission to do what needs to be done
numericalized text:
 tensor([   1, 2841,   69,   25,   29, 1616,    7,   69,   66,  570,    7,   21,
         245,    2])


In [29]:
#let us confirm that the indexes match the vocab
print('index of politicians: ', lang_dataset.source_vocab.stoi['politicians'])

index of politicians:  2841


* We can see the index for politicians matches to the vocab. 
* Also note that 1 -> start and 2 -> end is present in the numericalized texts

### 3. Create a class for adding padding to the batch

* Dataset class returns the text one example at a time
* DataLoader is used to get a batch from the dataset
* Now, if we want to post-process a batch, we use the dataloader's collate_fn. It applies a function to the whole batch. We'll use this to 0-pad the batches.
  So, if the max length in a batch is 12, the other sentences will be padded to length 12.

In [30]:
# Collate helper that pads every sequence of a batch to a common length.
## DataLoader's collate_fn post-processes one whole batch, the same way
## __getitem__ in the dataset class handles one single example.

class MyCollate:
    """Callable that turns a list of (source, target) tensor pairs into two padded tensors."""

    def __init__(self, pad_idx):
        # value written into the positions added by padding
        self.pad_idx = pad_idx

    def _pad(self, tensors):
        # batch_first=False -> the result is laid out as (longest length, batch size)
        return pad_sequence(tensors, batch_first=False, padding_value=self.pad_idx)

    # __call__ lets an instance be used like a function:
    ##    obj = MyCollate(pad_idx) creates it, obj(batch) runs __call__
    ## https://www.geeksforgeeks.org/__call__-in-python/
    def __call__(self, batch):
        sources, targets = [], []
        for example in batch:
            sources.append(example[0])  # source tensor of this example
            targets.append(example[1])  # target tensor of this example
        return self._pad(sources), self._pad(targets)

### 4. Create the DataLoader

In [32]:
#get batches of data from dataset and pad them
def get_loader(dataset, transform, batch_size, num_workers=0, shuffle=True, pin_memory=False):
    """Build a DataLoader that yields padded (source, target) batches.

    dataset : dataset exposing `source_vocab.stoi`; its '<PAD>' index is used as the padding value
    transform : unused here; kept so existing calls keep working
    batch_size : number of examples per batch
    num_workers : number of DataLoader worker subprocesses; 0 loads in the main process.
                  This argument used to be ignored, so 0 keeps the previous behavior by default.
    shuffle : whether the DataLoader shuffles the examples
    pin_memory : forwarded to the DataLoader
    returns : the configured DataLoader
    """
    # the source vocab's pad index is used to pad both source and target batches
    pad_idx = dataset.source_vocab.stoi['<PAD>']
    loader = DataLoader(dataset, batch_size = batch_size, num_workers = num_workers,
                        shuffle=shuffle,
                       pin_memory=pin_memory, collate_fn = MyCollate(pad_idx=pad_idx))
    return loader

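> Each iteration of the loader yields one batch of `batch_size` samples (the last batch may be smaller). `num_workers` only sets how many subprocesses load data in parallel; it does not change the batch size.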

In [33]:
loader = get_loader(lang_dataset,False, 64)

In [39]:
# Pull one padded batch from the loader so this cell runs on a fresh kernel.
# The loader was created with shuffle=True, so the contents vary between runs.
batch = next(iter(loader))
#batch[0] -> source batch
#batch[1] -> target batch
print('batch: ', batch[0])
print('batch shape: ', batch[0].shape)

batch:  tensor([[   1,    1,    1,  ...,    1,    1,    1],
        [  54,   15,    4,  ...,   36,   17, 9702],
        [  86, 5243,  113,  ...,  579,  175,    8],
        ...,
        [   0,    0,    0,  ...,    0,    0,    5],
        [   0,    0,    0,  ...,    0,    0, 1615],
        [   0,    0,    0,  ...,    0,    0,    2]])
batch shape:  torch.Size([69, 64])


* The shape of the batch is (length of longest sentence in batch, batch_size)
* 1st Column is 1st sentence 
* We can see the "0" for padding smaller sentences

#### The above code can be used for creating a custom dataset. We can reuse the same code for Image->Caption as well

In [44]:
lang_dataset[0]

(tensor([   1, 2841,   69,   25,   29, 1616,    7,   69,   66,  570,    7,   21,
          245,    2]),
 tensor([  1,   3,   4, 120,  24, 168,  75, 110,  37,  30,  14, 736,  18,   6,
           2]))