In [1]:
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Two-sentence toy corpus used to fit the first tokenizer.
sentences = [
    "I Love my dog",
    "I Love my cat",
]

In [3]:
sentences

['I Love my dog', 'I Love my cat']

In [4]:
# num_words limits how many of the most frequent words are used when text is
# converted to sequences (see the Keras Tokenizer docs). The toy corpus has
# only a handful of distinct words, so nothing is dropped here.
tokenizer = Tokenizer(
    num_words=100,
)

In [5]:
# Build the tokenizer's vocabulary from the corpus; the resulting
# word -> index mapping is read from tokenizer.word_index afterwards.
tokenizer.fit_on_texts(sentences)

In [6]:
# Word -> integer index mapping produced by fit_on_texts. As the displayed
# output shows, keys are lowercased ('I' -> 'i', 'Love' -> 'love').
word_index = tokenizer.word_index

In [7]:
word_index

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}

# Sequences

# Sequencing converts each sentence (a sequence of words) into a sequence of numbers, using the word index learned by the tokenizer. Other data processing techniques, such as padding, can then be applied to the numeric sequences.

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
# Four sentences of varying length; used to show how text is turned into
# integer sequences.
sentences = [
    "I Love my dog",
    "I love my cat",
    "you love my dog very much",
    "do you think my dog is awesome",
]

In [11]:
# Fresh tokenizer for the sequencing example. No oov_token is set, so words
# that are not in the fitted vocabulary are dropped by texts_to_sequences.
tokenizer = Tokenizer(
    num_words=100,
)

In [16]:
# Learn the vocabulary from the four training sentences.
tokenizer.fit_on_texts(sentences)

In [18]:
# Word -> integer index mapping learned from the training sentences
# (12 distinct lowercased words in the displayed output).
word_index = tokenizer.word_index

In [19]:
word_index

{'my': 1,
 'love': 2,
 'dog': 3,
 'i': 4,
 'you': 5,
 'cat': 6,
 'very': 7,
 'much': 8,
 'do': 9,
 'think': 10,
 'is': 11,
 'awesome': 12}

In [20]:
# Replace every word with its word_index value, giving one list of ints per
# sentence. The lists keep the sentences' differing lengths.
sequences = tokenizer.texts_to_sequences(sentences)

In [21]:
sequences

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3, 7, 8], [9, 5, 10, 1, 3, 11, 12]]

In [22]:
# Held-out sentences. 'really' and 'brother' do not occur in the training
# sentences, so they are absent from the fitted vocabulary.
test_data = [
    "i really love my dog",
    "my dog love my brother",
]

In [23]:
test_data

['i really love my dog', 'my dog love my brother']

In [26]:
# Encode the held-out sentences with the already-fitted tokenizer. Because no
# oov_token was configured, unseen words ('really', 'brother') are silently
# dropped, so the resulting sequences are shorter than the input sentences.
test_seq = tokenizer.texts_to_sequences(test_data)

In [27]:
test_seq

[[4, 2, 1, 3], [1, 3, 2, 1]]

# Out of Vocabulary

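# Out-of-vocabulary (OOV) words are unknown words that appear in the testing speech but not in the recognition vocabulary. They are usually important content words such as names and locations which contain information crucial to the success of many speech recognition tasks. The same idea applies to text: with the Keras `Tokenizer`, passing `oov_token` maps words that were not seen during fitting to a reserved index instead of dropping them.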

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [29]:
# Training corpus for the out-of-vocabulary example.
# The list must be bound to `sentences`: that is the name the display,
# fit_on_texts and texts_to_sequences cells below read. The previous
# `serntences` spelling left those cells silently reusing the list defined in
# the earlier section (hidden kernel state).
sentences = [
    'I love my dog',
    'i love my cat',
    'you love my dog very much',  # was 'verty'; the recorded word_index contains 'very'
    'do you think my dog is awesome',
]

In [30]:
sentences

['I Love my dog',
 'I love my cat',
 'you love my dog very much',
 'do you think my dog is awesome']

In [31]:
# Tokenizer with an explicit out-of-vocabulary token: words that were not
# seen during fitting are encoded as '<oov>' instead of being dropped.
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<oov>',
)

In [33]:
# Fit the OOV-aware tokenizer on the training sentences.
tokenizer.fit_on_texts(sentences)

In [35]:
# Word -> index mapping. In the displayed output '<oov>' takes index 1, so
# the real words start at index 2.
word_index = tokenizer.word_index

In [36]:
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'very': 8,
 'much': 9,
 'do': 10,
 'think': 11,
 'is': 12,
 'awesome': 13}

In [40]:
# Encode the training sentences. Every word is in the vocabulary, so the OOV
# index (1) does not appear in the displayed result.
sequences = tokenizer.texts_to_sequences(sentences)

In [41]:
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4, 8, 9], [10, 6, 11, 2, 4, 12, 13]]

In [42]:
test_data

['i really love my dog', 'my dog love my brother']

In [44]:
# Encode the held-out sentences. Unseen words ('really', 'brother') are now
# mapped to the '<oov>' index (1) rather than dropped, so each sequence keeps
# the same length as its sentence.
test_seq = tokenizer.texts_to_sequences(test_data)

In [45]:
test_seq

[[5, 1, 3, 2, 4], [2, 4, 3, 2, 1]]

# Padding

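# Padding is the process of adding layers of zeros or other values outside the actual data in an input matrix. For text sequences, this means adding zeros before (`padding='pre'`, the default) or after (`padding='post'`) each sequence so that all sequences have the same length; `maxlen` fixes that length and `truncating` controls which end of an over-long sequence is cut.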

In [48]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from  tensorflow.keras.preprocessing.sequence import pad_sequences

In [49]:
# Training sentences for the padding example; their lengths range from four
# to seven words.
sentences = [
    "I love my dog",
    "i love my cat",
    "you love my dog very much",
    "do you think my dog is awesome",
]

In [51]:
sentences

['I love my dog',
 'i love my cat',
 'you love my dog very much',
 'do you think my dog is awesome']

In [53]:
# OOV-aware tokenizer for the padding example.
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<oov>',
)

In [54]:
# Learn the vocabulary from the padding example's sentences.
tokenizer.fit_on_texts(sentences)

In [56]:
# Word -> index mapping ('<oov>' is index 1 in the displayed output).
word_index = tokenizer.word_index

In [57]:
word_index

{'<oov>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'very': 8,
 'much': 9,
 'do': 10,
 'think': 11,
 'is': 12,
 'awesome': 13}

In [60]:
# Encode the sentences as integer sequences.
# The result is bound to `sequences` because that is the name every
# pad_sequences call below reads; previously only `sequence` was assigned
# here, so the padding cells depended on a `sequences` variable left over
# from an earlier section. `sequence` is kept as an alias for the display
# cell that follows.
sequences = tokenizer.texts_to_sequences(sentences)
sequence = sequences

In [61]:
sequence

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4, 8, 9], [10, 6, 11, 2, 4, 12, 13]]

In [63]:
# Default padding: as the displayed array shows, shorter sequences are
# left-padded with zeros up to the length of the longest sequence (7).
padded = pad_sequences(sequences)

In [64]:
padded

array([[ 0,  0,  0,  5,  3,  2,  4],
       [ 0,  0,  0,  5,  3,  2,  7],
       [ 0,  6,  3,  2,  4,  8,  9],
       [10,  6, 11,  2,  4, 12, 13]])

In [65]:
# Same as the default call, but zeros are appended after the data instead of
# being inserted in front of it.
padded1 = pad_sequences(
    sequences,
    padding='post',
)

In [66]:
padded1

array([[ 5,  3,  2,  4,  0,  0,  0],
       [ 5,  3,  2,  7,  0,  0,  0],
       [ 6,  3,  2,  4,  8,  9,  0],
       [10,  6, 11,  2,  4, 12, 13]])

In [67]:
# Fix the output length at 5. Short sequences are zero-padded at the front;
# longer ones are truncated, and with the default truncating setting the
# leading elements are the ones removed (see the displayed array).
padded2 = pad_sequences(
    sequences,
    padding='pre',
    maxlen=5,
)

In [68]:
padded2

array([[ 0,  5,  3,  2,  4],
       [ 0,  5,  3,  2,  7],
       [ 3,  2,  4,  8,  9],
       [11,  2,  4, 12, 13]])

In [69]:
# Fixed length of 5 with truncating='post': over-long sequences keep their
# first five elements and lose the tail. Padding is left at its default, so
# zeros still go in front of short sequences.
padded3 = pad_sequences(
    sequences,
    truncating='post',
    maxlen=5,
)

In [70]:
padded3

array([[ 0,  5,  3,  2,  4],
       [ 0,  5,  3,  2,  7],
       [ 6,  3,  2,  4,  8],
       [10,  6, 11,  2,  4]])