In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding


In [2]:
# Toy corpus: three short sentences that the rest of the notebook
# tokenizes, pads and embeds.
sample_text = [
    'I love machine learning',
    'It is an interesting subject',
    'Machine Learning is a boring subject',
]

In [3]:
# Configuration shared by the tokenizing / padding cells below.
vocab_size = 20         # passed to the Tokenizer as `num_words`
embedding_dim = 5       # embedding vector size
max_length = 5          # target length of every padded sequence
trunc_type = "post"     # drop tokens from the end of over-long sequences
padding_type = "post"   # append padding at the end of short sequences
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

In [4]:
# Step 1 of "tokenize -> convert to sequences -> pad": build the vocabulary.

# Create an instance of a Tokenizer. `num_words` limits the vocabulary used
# when texts are converted, and `oov_token` is the stand-in for any word the
# tokenizer has not seen.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)

# Learn the word -> integer mapping from the sample sentences, i.e. every
# distinct word in `sample_text` is assigned its own number.
tokenizer.fit_on_texts(sample_text)

# Keep the learned mapping and display it as the cell output.
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'machine': 2,
 'learning': 3,
 'is': 4,
 'subject': 5,
 'i': 6,
 'love': 7,
 'it': 8,
 'an': 9,
 'interesting': 10,
 'a': 11,
 'boring': 12}

In [5]:

# Step 2: replace every word of each sample sentence with its integer id
# from the fitted tokenizer, producing one list of ids per sentence.
sample_sequences = tokenizer.texts_to_sequences(texts=sample_text)
sample_sequences


[[6, 7, 2, 3], [8, 4, 9, 10, 5], [2, 3, 4, 11, 12, 5]]

In [6]:
# Step 3: pad (with zeros) or truncate each sequence so that every row has
# exactly `max_length` token ids.
# The length and the padding/truncating sides all come from the configuration
# cell (max_length, padding_type, trunc_type) instead of being repeated here
# as literals, so changing the config is enough to change this step.
sample_padded = pad_sequences(
    sample_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
sample_padded

array([[ 6,  7,  2,  3,  0],
       [ 8,  4,  9, 10,  5],
       [ 2,  3,  4, 11, 12]], dtype=int32)

In [7]:

# Create an embedding layer (Embedding is already imported in the first cell).
# input_dim=1000 is the number of distinct token ids the layer accepts and
# output_dim=10 is the length of each embedding vector.
# NOTE(review): the configuration cell defines vocab_size and embedding_dim,
# but this demo uses its own sizes; switching to them would change the
# output shape below to (3, 5, embedding_dim).
embedding_layer = Embedding(input_dim=1000, output_dim=10)

# Pass the padded token ids through the embedding layer: every id is replaced
# by its (randomly initialised, untrained) embedding vector.
embedded_text = embedding_layer(sample_padded)

# Print the shape of the output: (number of texts, sequence length, output_dim)
print('The embedded text data shape is',embedded_text.shape)
print('An example of embedding: First embedded text sequence data is:')
# Index 0 is the first sequence, matching the message printed above.
embedded_text[0, :, :]

The embedded text data shape is (3, 5, 10)
An example of embedding: First embedded text sequence data is:


<tf.Tensor: shape=(5, 10), dtype=float32, numpy=
array([[-0.03834301,  0.00479092, -0.01012387,  0.03657177, -0.01715965,
        -0.02272273, -0.02656649,  0.0487006 , -0.0238972 ,  0.04372754],
       [ 0.0383485 ,  0.02869339, -0.02715058,  0.009262  ,  0.02195105,
        -0.02190059, -0.02965981,  0.00258789, -0.04203169,  0.00319447],
       [ 0.03820566, -0.0099265 ,  0.03273856,  0.00055615,  0.01259836,
         0.02301652, -0.01502926, -0.00572432,  0.00874148, -0.0195071 ],
       [ 0.01847352,  0.02642194,  0.02731682, -0.04977697, -0.0393496 ,
        -0.0372509 , -0.04752994, -0.00558472, -0.0310897 ,  0.01497186],
       [ 0.03563477, -0.00690212,  0.04435058,  0.01988745, -0.04891148,
         0.0187173 ,  0.04279529, -0.03194039,  0.03858398, -0.04339191]],
      dtype=float32)>

In [8]:
# What happens if we encounter words that are not in our vocabulary?
# Note in the output that the word "applications", which is not in our
# vocabulary, is represented by 1, the index of the <OOV> token.
new_text = ['I love Machine Learning applications']
new_text_sequence = tokenizer.texts_to_sequences(texts=new_text)
new_text_sequence

[[6, 7, 2, 3, 1]]