In [1]:
# References:
# https://gmihaila.medium.com/better-batches-with-pytorchtext-bucketiterator-12804a545e2a

In [2]:
import torch
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
import spacy
import os

In [3]:
# Dataset root directory. It defaults to the original local location but can be
# overridden with the ENG_FR_DATA_DIR environment variable, so the notebook is
# not tied to one machine's drive layout.
path = os.environ.get("ENG_FR_DATA_DIR", "D:/Datasets/Eng-French Translation")
# The dataset is later read through the relative "inputs/" folder, so the
# dataset root has to become the working directory.
os.chdir(path)

In [4]:
# One-time setup: the two spaCy pipelines must be installed before loading, e.g.
#   !python -m spacy download en_core_web_sm
#   !python -m spacy download fr_core_news_sm
spacy_eng = spacy.load('en_core_web_sm')
spacy_fr = spacy.load('fr_core_news_sm')

In [5]:
# tokenizers
def french_tokenizer(text):
    """Return the text of every token the `spacy_fr` tokenizer yields for `text`."""
    pieces = []
    for token in spacy_fr.tokenizer(text):
        pieces.append(token.text)
    return pieces

def english_tokenizer(text):
    """Return the text of every token the `spacy_eng` tokenizer yields for `text`."""
    pieces = []
    for token in spacy_eng.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [6]:
# One Field per language. Both lowercase their tokens and wrap every sentence
# in <sos> ... <eos> markers; they differ only in the tokenizer they use.
ENGLISH_TEXT = Field(
    tokenize=english_tokenizer,
    sequential=True,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

FRENCH_TEXT = Field(
    tokenize=french_tokenizer,
    sequential=True,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

In [7]:
# Pair each dataset attribute name with the Field that processes it.
# NOTE(review): the order presumably has to match the column order of the CSV
# files - confirm against inputs/train.csv.
datafields = [
    ("english", ENGLISH_TEXT),
    ("french", FRENCH_TEXT),
]

# Load the training and validation splits from inputs/, skipping the header row.
train, valid = TabularDataset.splits(
    path="inputs/",
    format="csv",
    train="train.csv",
    validation="valid.csv",
    fields=datafields,
    skip_header=True,
)

In [8]:
# Report how many examples each split contains.
print("No of training examples : ", len(train.examples))
print("No of validation examples : ", len(valid.examples))

No of training examples :  11250
No of validation examples :  3750


In [9]:
# Inspect the first training example; vars() exposes its per-field token lists.
print(vars(train.examples[0]))

{'english': ['let', 'me', 'get', 'back', 'to', 'you', '.'], 'french': ['laissez', 'moi', 'revenir', 'vers', 'vous', '!']}


In [10]:
# Build one vocabulary per language from the training split only; validation
# tokens missing from it decode as <unk> when a batch is converted back to text.
ENGLISH_TEXT.build_vocab(train)
FRENCH_TEXT.build_vocab(train)

In [11]:
# Report the size of each vocabulary.
print("Unique tokens in english vocabulary : ", len(ENGLISH_TEXT.vocab))
print("Unique tokens in french vocabulary : ", len(FRENCH_TEXT.vocab))

Unique tokens in english vocabulary :  4521
Unique tokens in french vocabulary :  6347


In [12]:
# Batch configuration.
BATCH_SIZE = 8
DEVICE = "cpu"

# Build the train/validation iterators. English sentence length is the sort
# key, and the sentences inside every batch are sorted by that same key.
train_iterator, valid_iterator = BucketIterator.splits(
    (train, valid),
    device=DEVICE,
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.english),
    sort_within_batch=True,
)

In [13]:
# Preview the raw (text) examples that make up the first validation batch.
valid_iterator.create_batches()

for batch in valid_iterator.batches:
    # end="\n\n" leaves one blank line between the header and the examples.
    print("batch size : ", len(batch), end="\n\n")
    for example in batch:
        print(vars(example))
    break  # only the first batch is shown

batch size :  8

{'english': ['bring', 'wine', '.'], 'french': ['apporte', 'du', 'vin', '.']}
{'english': ['i', 'understood', '.'], 'french': ['j', 'ai', 'compris', '.']}
{'english': ['be', 'friendly', '.'], 'french': ['soyez', 'amicaux', '!']}
{'english': ['excuse', 'me', '?'], 'french': ['pardon', '?']}
{'english': ['ignore', 'them', '.'], 'french': ['ignorez', 'les', '.']}
{'english': ['goodnight', 'mother', '.'], 'french': ['bonne', 'nuit', 'mere', '!']}
{'english': ['continue', 'digging', '.'], 'french': ['continue', 'de', 'creuser', '.']}
{'english': ['take', 'it', '.'], 'french': ['prends', 'le', '!']}


In [14]:
# What a batch actually looks like in numerical form when it is passed on for processing.
temp = next(iter(valid_iterator))

In [15]:
# Each column corresponds to one sentence, which is why there are 8 columns (BATCH_SIZE = 8).
# The word indices of a sentence run down its column, presumably because the RNN takes one word at a time.
english_tensor = temp.english
english_tensor

tensor([[   2,    2,    2,    2,    2,    2,    2,    2],
        [  96, 1486,    0, 1267, 1064,   34,    5,  494],
        [  14,    0,  215,  169,   21,  943, 4403,  550],
        [   4,    4,    4,    4,    8,    4,    4,    4],
        [   3,    3,    3,    3,    3,    3,    3,    3]])

In [16]:
# French side of the same batch. The sort key only looks at English length, so
# the French sentences differ in length and the shorter columns end in padding
# (index 1, which decodes to <pad>).
french_tensor = temp.french
french_tensor

tensor([[   2,    2,    2,    2,    2,    2,    2,    2],
        [ 309,  993,  198,    0, 5251,  224,   19,  872],
        [  13,    8,  245,   31,    7,    0,   22,   44],
        [  34,    0,  208,    4,    3,   34,  793,  571],
        [   3,    4,   34,    3,    1,    3,    4,    4],
        [   1,    3,    3,    1,    1,    1,    3,    3]])

In [17]:
# Convert the numerical matrices back into sentences to see what they look like as text.

In [18]:
# List the attributes stored on the vocabulary object; `itos` is the one used
# to map indices back to tokens.
vars(ENGLISH_TEXT.vocab).keys()

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])

In [19]:
def get_batch_string(language_tensor, language_field):
    """Convert a batch of vocabulary indices back into token strings.

    Args:
        language_tensor: 2-D integer tensor whose columns are sentences and
            whose rows are token positions. It may live on any device.
        language_field: Object exposing ``vocab.itos``, the index-to-token
            lookup (e.g. ``ENGLISH_TEXT`` or ``FRENCH_TEXT``).

    Returns:
        A list with one entry per sentence (column); each entry is the list of
        token strings for that sentence, special tokens included.
    """
    itos = language_field.vocab.itos
    # .numpy() only works on CPU tensors that do not track gradients, so detach
    # and move to the CPU first; both calls are harmless for a plain CPU batch.
    # Transposing turns the columns (sentences) into rows to iterate over.
    sentences = language_tensor.detach().cpu().numpy().T
    return [[itos[index] for index in sentence] for sentence in sentences]

In [20]:
# Decode both sides of the inspected batch back into lists of token strings.
text_english_tensor = get_batch_string(english_tensor, ENGLISH_TEXT)
text_french_tensor = get_batch_string(french_tensor, FRENCH_TEXT)

In [21]:
# Print each decoded English sentence of the batch on its own line.
for s in text_english_tensor:
    print(" ".join(s))

<sos> take it . <eos>
<sos> continue <unk> . <eos>
<sos> <unk> mother . <eos>
<sos> ignore them . <eos>
<sos> excuse me ? <eos>
<sos> be friendly . <eos>
<sos> i understood . <eos>
<sos> bring wine . <eos>


In [22]:
# Print each decoded French sentence of the batch on its own line.
for s in text_french_tensor:
    print(" ".join(s))

<sos> prends le ! <eos> <pad>
<sos> continue de <unk> . <eos>
<sos> bonne nuit mere ! <eos>
<sos> <unk> les . <eos> <pad>
<sos> pardon ? <eos> <pad> <pad>
<sos> soyez <unk> ! <eos> <pad>
<sos> j ai compris . <eos>
<sos> apporte du vin . <eos>
