In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Fit a tokenizer on a tiny two-sentence corpus and show the word -> index
# mapping it learned. Words are lowercased ('I' is stored as 'i').
sentence = [
    'I love my dog',
    'I love my cat',
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


In [4]:
# Same experiment with a third sentence added. The trailing '!' does not
# create a separate 'dog!' entry, and the words that occur in every sentence
# ('love', 'my') receive the lowest indices.
sentence = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [5]:
# Encode unseen sentences with the tokenizer fitted in the previous cell.
# Words that were not in the fitted vocabulary ('really', 'loves', 'mantee')
# are simply dropped, so the sequences come out shorter than the sentences.
test_data = [
    'I really love my dog',
    'my dog loves my mantee',
]

test = tokenizer.texts_to_sequences(test_data)
print(test)

[[3, 1, 2, 4], [2, 4, 2]]


In [6]:
# Handling missing tokens: with an `oov_token`, words that were not seen
# while fitting are encoded as the '<OOV>' index (1) instead of being dropped,
# so the encoded sequences keep the same length as the input sentences.
sentence = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'do you think my dog is amazing',
]

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
print(word_index)

# 'really', 'loves' and 'mantee' are out of vocabulary -> encoded as 1.
test_data = [
    'I really love my dog',
    'my dog loves my mantee',
]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [14]:
# Padding: bring every encoded sentence to a common length.
# `pad_sequences` is imported in the notebook's import cell.
sentence = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'do you think my dog is amazing',
]

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentence)
seq = tokenizer.texts_to_sequences(sentence)

# Default behaviour: pad to the length of the longest sequence, filling the
# missing positions with 0 at the beginning of each row.
padded = pad_sequences(seq)

# Fixed length of 5: shorter rows get their zeros at the end
# (padding='post'), longer rows lose tokens from the end (truncating='post').
padded1 = pad_sequences(seq, maxlen=5, padding='post', truncating='post')

print(seq)
print(padded)
print(padded1)

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]
[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]


Word embedding

In [21]:
# using flatten
# Embedding -> Flatten -> Dense binary classifier. Flatten turns the
# (max_length, embedding_dim) embedding output into one long feature vector,
# so the first Dense layer sees max_length * embedding_dim inputs.
vocab_size = 1000    # number of token ids the embedding table holds
embedding_dim = 10   # size of each embedding vector
max_length = 100     # number of tokens in each (padded) input sequence

model = tf.keras.Sequential([
    # Declare the input shape explicitly so the model is built immediately and
    # summary() can report output shapes and parameter counts. This replaces
    # the Embedding `input_length` argument, which is deprecated in Keras 3.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 10)           10000     
_________________________________________________________________
flatten_4 (Flatten)          (None, 1000)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 6)                 6006      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 7         
Total params: 16,013
Trainable params: 16,013
Non-trainable params: 0
_________________________________________________________________


In [23]:
# using GlobalAveragePooling1D
# Embedding -> GlobalAveragePooling1D -> Dense binary classifier. The pooling
# layer averages the embeddings over the sequence axis, so the first Dense
# layer sees only embedding_dim inputs (far fewer weights than with Flatten).
vocab_size = 1000    # number of token ids the embedding table holds
embedding_dim = 10   # size of each embedding vector
max_length = 100     # number of tokens in each (padded) input sequence

model = tf.keras.Sequential([
    # Declare the input shape explicitly so the model is built immediately and
    # summary() can report output shapes and parameter counts. This replaces
    # the Embedding `input_length` argument, which is deprecated in Keras 3.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 10)           10000     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 10)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 6)                 66        
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 7         
Total params: 10,073
Trainable params: 10,073
Non-trainable params: 0
_________________________________________________________________
