In [28]:
# Install TensorFlow
# !pip install -q tensorflow

try:
  %tensorflow_version 2.x  # Colab only.
except Exception:
  pass

import tensorflow as tf
print(tf.__version__)

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x  # Colab only.`. This will be interpreted as: `2.x`.


TensorFlow is already loaded. Please restart the runtime to change versions.
2.2.0


In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create Data

In [0]:
# Toy corpus: three sentences of unequal length, so the padding and
# truncation examples further down have something to act on.
sentences = [
    "I like eggs and ham.",           # 5 tokens
    "I love chocolate and bunnies.",  # 5 tokens
    "I hate onions.",                 # 3 tokens
]

# Tokenize Data

In [31]:
# Vocabulary cap handed to the tokenizer. This corpus has only 10 distinct
# words, so the cap does not drop anything here.
MAX_VOCAB_SIZE = 20000

# Learn the word -> integer-id mapping from the corpus, then encode every
# sentence as a list of those ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Print the tokenizer object's repr, then the encoded sentences.
print(tokenizer, sequences, sep="\n")

<keras_preprocessing.text.Tokenizer object at 0x7f3d0f142588>
[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


In [32]:
# Inspect the learned vocabulary: a dict mapping each word to its integer id.
# As the output below shows, words are lower-cased with punctuation removed,
# and ids start at 1 (no word is assigned id 0).
tokenizer.word_index

{'and': 2,
 'bunnies': 8,
 'chocolate': 7,
 'eggs': 4,
 'ham': 5,
 'hate': 9,
 'i': 1,
 'like': 3,
 'love': 6,
 'onions': 10}

# Padding and Truncating

In [33]:
# Pre padding (the default, spelled out): with no maxlen given, shorter rows
# get leading zeros up to the length of the longest sequence.
data = pad_sequences(sequences, padding='pre')
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [34]:
# Post padding: shorter rows get their zeros appended on the right instead
# of the left.
data = pad_sequences(sequences, padding='post')
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 1  9 10  0  0]]


In [35]:
# Pre padding and pre truncating (both are the defaults, spelled out here):
# rows longer than maxlen lose tokens from the front, shorter rows get
# leading zeros.
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)
print(data)

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 0  1  9 10]]


In [36]:
# Post padding and pre truncating: rows longer than maxlen lose tokens from
# the front (the default, spelled out), shorter rows get trailing zeros.
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='pre',
)
print(data)

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 1  9 10  0]]


In [37]:
# Pre padding and post truncating: rows longer than maxlen lose tokens from
# the end, shorter rows get leading zeros (the default, spelled out).
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='post',
)
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 0  1  9 10]]


In [40]:
# Post padding and post truncating: rows longer than maxlen lose tokens from
# the end, shorter rows get trailing zeros.
MAX_SEQUENCE_LENGTH = 4
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 1  9 10  0]]
