In [3]:
## Import tokenizer from keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [24]:
sentences = [ "My name is Samay Sah",
             "I am a NLP Engineer",
             "I love Dogs",
             "I love playing cricket",
             "I like travelling",
             "I am trying to learn tokenizer here"
            ]

In [25]:
print(sentences)

['My name is Samay Sah', 'I am a NLP Engineer', 'I love Dogs', 'I love playing cricket', 'I like travelling', 'I am trying to learn tokenizer here']


In [26]:
## num_words = 100 means keep top 100 frequency words
tokenizer = Tokenizer(num_words = 100)
print(tokenizer)

<keras_preprocessing.text.Tokenizer object at 0x000002548FCAE408>


In [27]:
## building the vocabulary
tokenizer.fit_on_texts(sentences)
print(tokenizer)

<keras_preprocessing.text.Tokenizer object at 0x000002548FCAE408>


In [28]:
## word_index gives index to each words
## word_index gives result in order which occurs maximum times in vocabulary comes first
## for eg : here "i" comes 5 times, "am" comes 2 times, "love" comes 2 times .
## if 2 words comes equal times than it will index as alphatecial order
word_index = tokenizer.word_index
word_index

{'i': 1,
 'am': 2,
 'love': 3,
 'my': 4,
 'name': 5,
 'is': 6,
 'samay': 7,
 'sah': 8,
 'a': 9,
 'nlp': 10,
 'engineer': 11,
 'dogs': 12,
 'playing': 13,
 'cricket': 14,
 'like': 15,
 'travelling': 16,
 'trying': 17,
 'to': 18,
 'learn': 19,
 'tokenizer': 20,
 'here': 21}

In [31]:
## create sequences to sentences
sequences = tokenizer.texts_to_sequences(sentences)
print(len(sequences))
print(sequences)

6
[[4, 5, 6, 7, 8], [1, 2, 9, 10, 11], [1, 3, 12], [1, 3, 13, 14], [1, 15, 16], [1, 2, 17, 18, 19, 20, 21]]


#### Decode the sequence:
#### [4, 5, 6, 7, 8] = 4:my, 5: name, 6:is, 7: samay, 8:sah
#### for each sentence encoding is done.

### case : If there is one new sentence in test data , in which there are word which is not present in early vocabulary

In [35]:
test_data = ["i like beer and mountains"]

In [37]:
print(word_index)

{'i': 1, 'am': 2, 'love': 3, 'my': 4, 'name': 5, 'is': 6, 'samay': 7, 'sah': 8, 'a': 9, 'nlp': 10, 'engineer': 11, 'dogs': 12, 'playing': 13, 'cricket': 14, 'like': 15, 'travelling': 16, 'trying': 17, 'to': 18, 'learn': 19, 'tokenizer': 20, 'here': 21}


In [36]:
## beer , and, mountain are not present in vocabulary , so it is getting ignored 
## only "i" "like" index is coming
test_sequence = tokenizer.texts_to_sequences(test_data)
test_sequence

[[1, 15]]

In [40]:
## to deal with these cases , we generally gives oov (out of vocabulary value) 
tokenizer = Tokenizer(num_words = 100 , oov_token = "<OOV>")

In [42]:
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'i': 2, 'am': 3, 'love': 4, 'my': 5, 'name': 6, 'is': 7, 'samay': 8, 'sah': 9, 'a': 10, 'nlp': 11, 'engineer': 12, 'dogs': 13, 'playing': 14, 'cricket': 15, 'like': 16, 'travelling': 17, 'trying': 18, 'to': 19, 'learn': 20, 'tokenizer': 21, 'here': 22}


In [44]:
### Now in place of beer , and, mountain = 1 will come
test_sequence = tokenizer.texts_to_sequences(test_data)
test_sequence

[[2, 16, 1, 1, 1]]

### PAD_SEQUENCE

In [46]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [47]:
print(sequences)

[[4, 5, 6, 7, 8], [1, 2, 9, 10, 11], [1, 3, 12], [1, 3, 13, 14], [1, 15, 16], [1, 2, 17, 18, 19, 20, 21]]


In [48]:
## it will left pad the sequence
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  4  5  6  7  8]
 [ 0  0  1  2  9 10 11]
 [ 0  0  0  0  1  3 12]
 [ 0  0  0  1  3 13 14]
 [ 0  0  0  0  1 15 16]
 [ 1  2 17 18 19 20 21]]


In [50]:
## padding ="post" it will right padded the sequence
padded = pad_sequences(sequences, padding = "post")
print(padded)

[[ 4  5  6  7  8  0  0]
 [ 1  2  9 10 11  0  0]
 [ 1  3 12  0  0  0  0]
 [ 1  3 13 14  0  0  0]
 [ 1 15 16  0  0  0  0]
 [ 1  2 17 18 19 20 21]]


In [52]:
## it will truncate max_lenth from end
padded = pad_sequences(sequences, padding = "post", maxlen = 5)
print(padded)

[[ 4  5  6  7  8]
 [ 1  2  9 10 11]
 [ 0  0  1  3 12]
 [ 0  1  3 13 14]
 [ 0  0  1 15 16]
 [17 18 19 20 21]]


In [53]:
## truncating = "post" will give max_lenth from start
padded = pad_sequences(sequences, padding = "post", maxlen = 5, truncating = "post")
print(padded)

[[ 4  5  6  7  8]
 [ 1  2  9 10 11]
 [ 1  3 12  0  0]
 [ 1  3 13 14  0]
 [ 1 15 16  0  0]
 [ 1  2 17 18 19]]


In [55]:
### Now we have sequence, we can make TDM (term document matrix or bag of words from it)
tdm = tokenizer.sequences_to_matrix(sequences, mode = "binary")
tdm

array([[0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0.