In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Loading data

In [69]:
# Upper bound on how many lines of the corpus file the loading cell reads.
NUM_SAMPLES = 10_000

# Three parallel lists, filled line by line by the loading cell:
#   input_texts        - source sentence
#   target_texts       - translation followed by ' <eos>'
#   target_texts_input - translation preceded by '<sos> '
input_texts, target_texts, target_texts_input = [], [], []

In [70]:
# Read the tab-separated parallel corpus: the first column is the source
# sentence, the second its translation; any further columns are ignored.

# Rebind to fresh lists so that re-running this cell does not append the
# same sentences a second time.
input_texts = []
target_texts = []
target_texts_input = []

# The translations are Devanagari text, so decode explicitly as UTF-8 rather
# than relying on the platform's default encoding.
with open("../neural_machine_translation/data/hin.txt", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):

        # Stop after NUM_SAMPLES lines of the file (skipped lines count too).
        if line_no > NUM_SAMPLES:
            break

        fields = line.rstrip().split('\t')

        # Skip lines that do not carry both a source and a translation column.
        # This also covers a line whose only tab was trailing whitespace,
        # which previously failed to unpack.
        if len(fields) < 2:
            continue

        input_text, translation = fields[0], fields[1]

        # The target ends with '<eos>'; the decoder-input copy of the same
        # translation starts with '<sos>'.
        target_text = translation + ' <eos>'
        target_text_input = '<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_input.append(target_text_input)
    

In [71]:
# Peek at the first 15 decoder-input strings; each is a translation prefixed with '<sos> '.
target_texts_input[:15]

['<sos> वाह!',
 '<sos> झुको!',
 '<sos> बतख़!',
 '<sos> बचाओ!',
 '<sos> उछलो.',
 '<sos> कूदो.',
 '<sos> छलांग.',
 '<sos> नमस्ते।',
 '<sos> नमस्कार।',
 '<sos> वाह-वाह!',
 '<sos> चियर्स!',
 '<sos> सांस छोड़।',
 '<sos> सांस छोड़ो।',
 '<sos> समझे कि नहीं?',
 '<sos> मैं ठीक हूँ।']

### Tokenizers and word to index mappings

#### There are two languages to deal with, so two separate tokenizers are needed

In [72]:
# Show the first five source-language sentences; these are what the input tokenizer is fitted on.
print(input_texts[:5])

['Wow!', 'Duck!', 'Duck!', 'Help!', 'Jump.']


In [73]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from itertools import islice

# Vocabulary cap passed as `num_words` to both the input and the output Tokenizer.
MAX_NUM_WORDS = 20000
# NOTE(review): not referenced in the visible cells (padding uses the measured
# maximum lengths instead) — confirm it is used further down, or remove it.
MAX_SEQ_LEN = 100

In [74]:
# --- Source-language tokenizer ---
# Fit on the raw input sentences, then encode each sentence as a list of
# integer word indices.
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)
print(input_sequences[:7])

# Word -> index lookup learned during fitting.
word_to_idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens' % len(word_to_idx_inputs))

# Keep a 10-entry sample of the mapping for inspection.
# NOTE: the bare expression below is not the last line of the cell, so it
# displays nothing; the sample is only stored.
first_10_mapping_inputs = dict(islice(word_to_idx_inputs.items(), 10))
first_10_mapping_inputs

# Length of the longest encoded input sentence; used later as the padding
# width for the encoder inputs.
max_len_inputs = max(map(len, input_sequences))
print("Maximum length of input sequences:", max_len_inputs)

[[1326], [949], [949], [83], [582], [582], [582]]
Found 2463 unique input tokens
Maximum length of input sequences: 22


In [75]:
# --- Target-language tokenizer ---
# filters='' turns off the Tokenizer's character filtering — presumably so
# that the '<sos>' / '<eos>' markers are kept intact (TODO confirm).
# It is fitted on both target variants so that both markers are in the
# vocabulary.
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
tokenizer_outputs.fit_on_texts(target_texts + target_texts_input)

# Encode the '<eos>'-terminated targets and the '<sos>'-prefixed decoder inputs.
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_input)
print(target_sequences[:5])
print(target_sequences_inputs[:5])

# Word -> index lookup learned during fitting.
word_to_idx_outputs = tokenizer_outputs.word_index
print("Found %s unique output tokens" % len(word_to_idx_outputs))

# Keep a 10-entry sample of the mapping for inspection.
# NOTE: the bare expression below is not the last line of the cell, so it
# displays nothing; the sample is only stored.
first_10_mappings_outputs = dict(islice(word_to_idx_outputs.items(), 10))
first_10_mappings_outputs

# Length of the longest encoded target sentence; used later as the padding
# width for the decoder sequences.
max_len_outputs = max(map(len, target_sequences))
print("Maximum length of output sequences:", max_len_outputs)


[[1538, 1], [1539, 1], [1540, 1], [1541, 1], [1542, 1]]
[[2, 1538], [2, 1539], [2, 1540], [2, 1541], [2, 1542]]
Found 3265 unique output tokens
Maximum length of output sequences: 26


In [76]:
# Number of distinct output tokens plus one. The encoded sequences printed
# above use indices starting at 1 and the padded arrays are filled with 0,
# so the extra slot presumably accounts for index 0 — TODO confirm against
# the layer that consumes this value.
num_words_output = len(word_to_idx_outputs) + 1
num_words_output

3266

### Pad each sequence

In [82]:
# Pad every encoded input sentence to the longest input length. No `padding`
# argument is given, so zeros are added at the front and the real words end
# up at the end of each row (visible in the displayed first row).
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_inputs)
print("Encoder input shape is", encoder_inputs.shape)
# Display the first padded row as a sanity check.
encoder_inputs[0]

Encoder input shape is (3061, 22)


array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1326],
      dtype=int32)

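#### The encoder inputs are pre-padded, so the real words come last, right before the encoder's final state. The decoder sequences are post-padded instead: upon seeing the encoder state (i.e. the last word of the input sequence), the decoder can produce output immediately rather than having to step through a run of zeros first.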

In [83]:
# Pad the decoder inputs and the decoder targets to the longest target
# length. padding='post' appends the zeros after the real tokens.
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_outputs, padding='post')
print("Decoder input shape is", decoder_inputs.shape)

# Keep the padded targets in a variable: the original call computed them
# but discarded the result, so no later cell could use them.
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_outputs, padding='post')

# Display the first padded decoder-input row as a sanity check.
decoder_inputs[0]

Decoder input shape is (3061, 26)


array([   2, 1538,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0], dtype=int32)