In [1]:
import os

# This must be set BEFORE TensorFlow is imported: the native runtime reads
# TF_CPP_MIN_LOG_LEVEL while it is being loaded, so assigning it after the
# import does not silence the start-up logs.
# '3' filters out INFO, WARNING and ERROR messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

In [2]:
# Toy corpus: three short sentences that share most of their vocabulary.
sentences = ["I love my dog", "I love my cat", "You love my dog!"]

In [3]:
# Show the raw corpus before any tokenization is applied.
print(sentences)

['I love my dog', 'I love my cat', 'You love my dog!']


## Tokenizer 🎫

In [4]:
# Build a vocabulary from the corpus. num_words caps how many of the most
# frequent words are used when texts are encoded later on.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50)
tokenizer.fit_on_texts(sentences)

# Statistics collected by fit_on_texts:
# - word_index:     word -> unique integer index
# - word_counts:    word -> number of occurrences across the whole corpus
# - word_docs:      word -> number of texts in which the word appears
# - document_count: number of texts passed to the tokenizer
word_index = tokenizer.word_index
word_counts = tokenizer.word_counts
word_in_doc = tokenizer.word_docs
documents = tokenizer.document_count

# Print the four statistics separated by a blank line.
fitted_stats = (word_index, word_counts, word_in_doc, documents)
print(*fitted_stats, sep="\n\n")

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}

OrderedDict([('i', 2), ('love', 3), ('my', 3), ('dog', 2), ('cat', 1), ('you', 1)])

defaultdict(<class 'int'>, {'my': 3, 'i': 2, 'love': 3, 'dog': 2, 'cat': 1, 'you': 1})

3


## texts_to_sequences 📚

- The `texts_to_sequences` method converts each text in the corpus into a sequence of integers, using the word index learned by `fit_on_texts`.

In [5]:
# Extend the corpus with a longer sentence that introduces new words.
# The membership check keeps this cell idempotent: re-running it must not
# append a duplicate, which would skew the counts of tokenizers fitted later.
extra_sentence = "Do you think my dog is amazing?"
if extra_sentence not in sentences:
    sentences.append(extra_sentence)
print(sentences)

['I love my dog', 'I love my cat', 'You love my dog!', 'Do you think my dog is amazing?']


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
# Fit a fresh tokenizer on the extended corpus.
toks = tf.keras.preprocessing.text.Tokenizer(num_words=100)
toks.fit_on_texts(sentences)

# Word -> integer index dictionary learned from the corpus.
word_idx = toks.word_index

# Every sentence encoded as a list of those indices.
sequences = toks.texts_to_sequences(sentences)

# Report both the vocabulary and the encoded sentences.
summary = (
    f"word index: {word_idx}\n"
    f"token sequences: {sequences}"
)
print(summary)

word index: {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
token sequences: [[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [8]:
# - Test the tokenizer on sentences containing words it never saw while
#   fitting ("really", "loves", "manatee").
test_data = ["i really love my dog", "my dog loves my manatee"]

# Without an OOV token the unknown words are simply left out of the result.
test_seq = toks.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]


In [9]:
# Vocabulary lookup: "really", "loves" and "manatee" have no entry here.
print(word_idx)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [10]:
test_seq[1] # <= decodes to 'my dog my': the unseen words "loves" and "manatee" were dropped

[1, 3, 1]

- __Note:__
- To tackle this problem we need more training data, so that the tokenizer's vocabulary covers more of the words it will encounter.

## Try to resolve above issue 🧪

- The first solution is to gather much more data.
- The second solution is to map every word the tokenizer has not seen to a special out-of-vocabulary (OOV) token.

In [11]:
# - Let's look at our data once more:
print(sentences)

['I love my dog', 'I love my cat', 'You love my dog!', 'Do you think my dog is amazing?']


In [12]:
# - Apply the second solution: reserve an out-of-vocabulary (OOV) token.
#   NOTE: the token text is spelled with zeros ("<00V>"), not the letters "OO";
#   it is only a label, so the encoding is unaffected.
new_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=100,
    oov_token="<00V>",
)
new_tokenizer.fit_on_texts(sentences)

# Rebinds `word_index` to the vocabulary of the OOV-aware tokenizer.
word_index = new_tokenizer.word_index

# Encode the training corpus first, then the sentences with unseen words.
sequences_new, test_seq_new = (
    new_tokenizer.texts_to_sequences(texts) for texts in (sentences, test_data)
)

print(test_data, test_seq_new, sep="\n\n")

['i really love my dog', 'my dog loves my manatee']

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [13]:
# - The word_index dictionary also serves as a lookup table (word -> index);
#   the OOV token now occupies index 1.
print(word_index)

{'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


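- __Still not syntactically correct!__ Unseen words are no longer dropped, but they are all replaced by the same OOV index (1), so the original sentence cannot be recovered.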


## padding 📏

- Furthermore, when we feed data to a neural network model (training), every input must have the same size.
- Therefore __padding__ is used to bring all sequences to a uniform length.
- By default, every sequence is padded to the length of the longest sequence in the list.

In [14]:
# The encoded sentences have different lengths (4, 4, 4 and 7 tokens).
print(sentences, sequences_new, sep="\n\n")

['I love my dog', 'I love my cat', 'You love my dog!', 'Do you think my dog is amazing?']

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


In [15]:
# Pad every sequence with leading zeros ("pre" is the default mode) up to the
# length of the longest sequence.
# NOTE: the result is stored under the name `pad_sequences`, the same name as
# the Keras function, so do not import that function unqualified in this notebook.
pad_sequences = tf.keras.utils.pad_sequences(sequences_new, padding="pre")

In [16]:
# Compare the raw sentences, their integer sequences, the zero-padded matrix
# and the vocabulary side by side.
print(sentences, sequences_new, pad_sequences, word_index, sep='\n\n')

['I love my dog', 'I love my cat', 'You love my dog!', 'Do you think my dog is amazing?']

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]

{'<00V>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [None]:
# TODO: ongoing, more sections to follow...