### Tokenizer

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus: five short documents.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
]

# Build a tokenizer with its default settings, then learn the
# vocabulary from the corpus.
t = Tokenizer()
t.fit_on_texts(docs)

### Word Counts

In [26]:
# How many times each token was seen while fitting, listed in order of first
# appearance; tokens come back lowercased with punctuation removed.
t.word_counts

OrderedDict([('well', 1),
             ('done', 1),
             ('good', 1),
             ('work', 2),
             ('great', 1),
             ('effort', 1),
             ('nice', 1),
             ('excellent', 1)])

### Word index

In [27]:
# Token -> integer index lookup. Indices start at 1 and the most frequent
# token ('work') receives the lowest index.
t.word_index

{'work': 1,
 'well': 2,
 'done': 3,
 'good': 4,
 'great': 5,
 'effort': 6,
 'nice': 7,
 'excellent': 8}

### Document Count

In [28]:
# Number of documents the tokenizer has been fitted on.
t.document_count

5

### Text to Word Sequence

In [29]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# define the document
text = 'The quick brown fox jumped over the lazy dog dog.'

# Split the text into lowercase, punctuation-free tokens and keep only the
# distinct ones: this is the document's vocabulary (use len(words) for its size).
words = set(text_to_word_sequence(text))

# A set has no defined order, so the printed order can differ between runs.
print(words)

{'over', 'quick', 'lazy', 'jumped', 'brown', 'fox', 'dog', 'the'}


### Index Word

In [3]:
# Example : using Keras Tokenizer

from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus: five short documents.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
]

# Fit a default tokenizer on the corpus.
t = Tokenizer()
t.fit_on_texts(docs)

# Reverse lookup: integer index -> token.
t.index_word

{1: 'work',
 2: 'well',
 3: 'done',
 4: 'good',
 5: 'great',
 6: 'effort',
 7: 'nice',
 8: 'excellent'}

### One Hot Encoding: sequences_to_matrix (Binary)

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# num_words=10 fixes the width of the encoded matrix at 10 columns.
tokenizer = Tokenizer(num_words=10)

# Three already-encoded sequences of word indices. They get their own name so
# that `x_train` only ever refers to the encoded matrix.
sequences = [
    [1, 2, 3, 4],
    [4, 5],
    [6, 7, 8],
]

# Binary mode: column j of a row is 1.0 when index j occurs in that sequence
# and 0.0 otherwise.
x_train = tokenizer.sequences_to_matrix(sequences, mode='binary')

x_train

array([[0., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1., 0.]])

### Padding Sequence : pre and post

In [6]:
# Example 1 : using Keras

import numpy as np
from tensorflow.keras.preprocessing import sequence

# Six variable-length sequences whose entries are digit strings.
data_vec = [
    ['3', '18', '9', '3', '11', '5', '20'],
    ['3', '8', '1', '12'],
    ['18', '1', '8', '1'],
    ['8', '1', '9', '14'],
    ['25', '1', '8', '1'],
    ['9'],
]

# Length of the longest sequence; every row is padded up to this length.
maxl = np.max([len(seq) for seq in data_vec])

print('maxl:{0}'.format(maxl))

# No `padding` argument is passed, so the library default is used; the
# displayed result has the zeros in front of the data.
x_data_vec = sequence.pad_sequences(data_vec, maxlen=maxl)

x_data_vec

maxl:7


array([[ 3, 18,  9,  3, 11,  5, 20],
       [ 0,  0,  0,  3,  8,  1, 12],
       [ 0,  0,  0, 18,  1,  8,  1],
       [ 0,  0,  0,  8,  1,  9, 14],
       [ 0,  0,  0, 25,  1,  8,  1],
       [ 0,  0,  0,  0,  0,  0,  9]])

In [9]:
# padding = 'pre' (stated explicitly): zeros are inserted in front of each
# sequence, so the data ends up right-aligned in rows of length maxl.

x_data_vec = sequence.pad_sequences(data_vec, maxlen=maxl, padding='pre')

x_data_vec

array([[ 3, 18,  9,  3, 11,  5, 20],
       [ 0,  0,  0,  3,  8,  1, 12],
       [ 0,  0,  0, 18,  1,  8,  1],
       [ 0,  0,  0,  8,  1,  9, 14],
       [ 0,  0,  0, 25,  1,  8,  1],
       [ 0,  0,  0,  0,  0,  0,  9]])

In [10]:
# Fixed maximum length of 10 combined with padding='post': zeros are appended
# after each sequence until every row has 10 entries.

x_data_vec = sequence.pad_sequences(
    data_vec,
    maxlen=10,
    padding='post',
)

x_data_vec

array([[ 3, 18,  9,  3, 11,  5, 20,  0,  0,  0],
       [ 3,  8,  1, 12,  0,  0,  0,  0,  0,  0],
       [18,  1,  8,  1,  0,  0,  0,  0,  0,  0],
       [ 8,  1,  9, 14,  0,  0,  0,  0,  0,  0],
       [25,  1,  8,  1,  0,  0,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0,  0,  0,  0]])