Creating a basic tokenizer for the English language

In [1]:
import collections
import numpy as np
import json
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

Verify access to the GPU

In [2]:
# List every local device TensorFlow reports so the presence of a GPU can be
# checked by eye in the printed output.
from tensorflow.python.client import device_lib

visible_devices = device_lib.list_local_devices()
print(visible_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6642074215873930628
xla_global_id: -1
]


Load Data

In [3]:
def load_data(path):
    """Read an Excel workbook and return its first column as a Python list.

    Args:
        path: Location of the Excel file, passed straight to ``pd.read_excel``.

    Returns:
        list: One entry per row of the first column. Values are returned
        exactly as pandas parsed them; no conversion or filtering happens here.
    """
    frame = pd.read_excel(path)
    first_column = frame.iloc[:, 0]
    return first_column.tolist()

# Workbook whose first column is loaded as the English sentences.
ENGLISH_DATA_PATH = 'data/english_parallel.xlsx'
english_sentences = load_data(ENGLISH_DATA_PATH)

Sample Data

In [4]:
# Show the first five loaded entries as a quick sanity check of the load step.
english_sentences[:5]

['Give your application an accessibility workout',
 'Accerciser Accessibility Explorer',
 'The default plugin layout for the bottom panel',
 'The default plugin layout for the top panel',
 'A list of plugins that are disabled by default']

Structure of the Dataset

In [5]:
# Split every string sentence on whitespace exactly once and reuse the counter
# for all statistics below; non-string entries are skipped.
english_words_counter = collections.Counter(
    word
    for sentence in english_sentences
    if isinstance(sentence, str)
    for word in sentence.split()
)

# The total number of words is the sum of the per-word counts, so the corpus
# does not have to be split a second time just to measure its length.
print('{} English words.'.format(sum(english_words_counter.values())))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
# Iterating the (word, count) pairs directly also works when the counter is
# empty, where indexing the unzipped result would raise IndexError.
print('"' + '" "'.join(word for word, _count in english_words_counter.most_common(10)) + '"')

10780353 English words.
298005 unique English words.
10 Most common words in the English dataset:
"the" "of" "and" "to" "a" "is" "in" "you" "-" "for"


Preprocess

Tokenize the words into ids

In [6]:
def tokenize(x):
    """Fit a fresh ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: Collection of texts; it is used both to fit the tokenizer and as
            the input that gets encoded.

    Returns:
        tuple: ``(sequences, tokenizer)`` where ``sequences`` is the result of
        ``tokenizer.texts_to_sequences(x)`` and ``tokenizer`` is the fitted
        ``Tokenizer`` (constructed with its default arguments).
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)
    sequences = fitted_tokenizer.texts_to_sequences(x)
    return sequences, fitted_tokenizer

# Three short sentences used to show what tokenize() produces.
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .',
]

text_tokenized, text_tokenizer = tokenize(text_sentences)

# Print the learned word -> id mapping, then each sentence next to its ids.
print(text_tokenizer.word_index)
print()
for sample_number, (sentence, token_ids) in enumerate(zip(text_sentences, text_tokenized), start=1):
    print('Sequence {} in x'.format(sample_number))
    print('  Input:  {}'.format(sentence))
    print('  Output: {}'.format(token_ids))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


Add padding to make all the sequences the same length.

In [7]:
def pad(x, length=None):
    """Pad every sequence in ``x`` at the end so all share one length.

    Args:
        x: List of id sequences.
        length: Target length. When omitted, the length of the longest
            sequence in ``x`` is used.

    Returns:
        The result of ``pad_sequences(x, maxlen=<target length>, padding='post')``.
    """
    target_length = length
    if target_length is None:
        target_length = max(len(sequence) for sequence in x)
    return pad_sequences(x, maxlen=target_length, padding='post')

# Pad the demo sequences and print each one before and after padding.
test_pad = pad(text_tokenized)
for sample_number, (token_ids, padded_ids) in enumerate(zip(text_tokenized, test_pad), start=1):
    print('Sequence {} in x'.format(sample_number))
    print('  Input:  {}'.format(np.array(token_ids)))
    print('  Output: {}'.format(padded_ids))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [12]:
def _to_text(item):
    """Return ``item`` as text, mapping missing values to an empty string."""
    if isinstance(item, str):
        return item
    # NaN is the only float that is not equal to itself. Converting it (or
    # None) with str() would inject the bogus words "nan"/"None" into the
    # vocabulary, so missing values become empty text instead.
    if item is None or (isinstance(item, float) and item != item):
        return ''
    return str(item)


def preprocess(x, y=None):
    """Tokenize and pad ``x`` (and optionally ``y``).

    Args:
        x: Collection of source sentences. Non-string entries are converted
            to text first; missing values (``None``/NaN) become empty strings,
            so every row is kept and no placeholder word is added to the
            vocabulary.
        y: Optional collection of target sentences. Defaults to ``None`` so
            the function can be used on a single corpus. It is handed to
            ``tokenize`` unchanged.

    Returns:
        ``(preprocess_x, x_tk)`` when ``y`` is ``None``; otherwise
        ``(preprocess_x, preprocess_y, x_tk, y_tk)``, where ``preprocess_y``
        has a trailing axis of size 1 appended.
    """
    x = [_to_text(item) for item in x]

    preprocess_x, x_tk = tokenize(x)
    preprocess_x = pad(preprocess_x)

    if y is None:
        return preprocess_x, x_tk

    preprocess_y, y_tk = tokenize(y)
    preprocess_y = pad(preprocess_y)
    # Append a trailing axis of size 1 to the padded target ids.
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    # NOTE(review): tokenizer ids appear to start at 1, so the largest id
    # equals len(word_index); this clip would then fold that id into the one
    # below it. Kept unchanged because downstream consumers are not visible
    # from here -- confirm whether this is intended.
    preprocess_y = np.clip(preprocess_y, 0, len(y_tk.word_index) - 1)
    return preprocess_x, preprocess_y, x_tk, y_tk

# Only the English sentences are preprocessed here, so preprocess() returns
# the padded id matrix together with the fitted tokenizer.
preproc_english_sentences, english_tokenizer = preprocess(english_sentences)

# word_index has one entry per distinct word the tokenizer kept; the second
# axis of the padded matrix is the common (padded) sequence length.
english_vocab_size = len(english_tokenizer.word_index)
max_english_sequence_length = preproc_english_sentences.shape[1]

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("English vocabulary size:", english_vocab_size)

Data Preprocessed
Max English sentence length: 657
English vocabulary size: 146847


In [13]:
# Serialize the English tokenizer to JSON.
# to_json() already returns a JSON string; dumping that string again stores it
# as one JSON string literal, so a reader has to json.load() the file first
# and then pass the resulting string on to the tokenizer loader.
tokenizer_json = english_tokenizer.to_json()
with open('english_tokenizer.json', 'w', encoding='utf8') as tokenizer_file:
    json.dump(tokenizer_json, tokenizer_file, ensure_ascii=False)
