In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
# Upper bound, in megabytes, on the memory TensorFlow may allocate on the first GPU.
GPU_MEMORY_LIMIT_MB = 1500

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to allocate at most GPU_MEMORY_LIMIT_MB on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(
            memory_limit=GPU_MEMORY_LIMIT_MB)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [20]:
import os
import tensorflow_datasets as tfds


In [9]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch each file with tf.keras.utils.get_file and keep the local path it returns.
downloaded_paths = [
    tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)
    for name in FILE_NAMES
]

# Later cells join parent_dir with every file name, so all files are expected
# to sit in the directory of the last downloaded one.
parent_dir = os.path.dirname(downloaded_paths[-1])

parent_dir

'/root/.keras/datasets'

In [10]:
# Load text into datasets
# Iterate through the files, loading each one into its own dataset.

# Each example needs to be labeled individually, so use tf.data.Dataset.map to apply a labeler function to each one. This will iterate over every example in the dataset, returning (example, label) pairs.

In [11]:
def labeler(example, index):
  """Return the (example, label) pair, with `index` cast to an int64 tensor."""
  label = tf.cast(index, tf.int64)
  return example, label

labeled_data_sets = []

# One line-based dataset per file; each line is paired with the file's
# position in FILE_NAMES as its label.
for label_index, file_name in enumerate(FILE_NAMES):
  file_path = os.path.join(parent_dir, file_name)
  per_file_lines = tf.data.TextLineDataset(file_path)
  labeled_data_sets.append(
      per_file_lines.map(lambda line: labeler(line, label_index)))

In [12]:
# Combine these labeled datasets into a single dataset, and shuffle it

In [13]:
# Tunable sizes used by the dataset pipeline in the cells below.
BUFFER_SIZE = 50000  # buffer size passed to Dataset.shuffle
BATCH_SIZE = 64  # batch size passed to Dataset.padded_batch
TAKE_SIZE = 5000  # number of examples taken for the test split (the rest is skipped into train)

In [14]:
# Chain the per-file datasets end to end, starting from the first one.
combined = labeled_data_sets[0]
for extra_dataset in labeled_data_sets[1:]:
  combined = combined.concatenate(extra_dataset)

# Shuffle with reshuffle_each_iteration=False; the later take/skip split
# is applied on top of this dataset.
all_labeled_data = combined.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)

In [15]:
# You can use tf.data.Dataset.take and print to see what the (example, label) pairs look like. The numpy property shows each Tensor's value.

In [16]:
# Print the first five (example, label) pairs.
preview = all_labeled_data.take(5)
for example_pair in preview:
  print(example_pair)

(<tf.Tensor: id=74, shape=(), dtype=string, numpy=b"The soil, there Pallas tripp'd him. Ordure foul">, <tf.Tensor: id=75, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=78, shape=(), dtype=string, numpy=b"The Thunderer's throne with admiration view'd,">, <tf.Tensor: id=79, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=82, shape=(), dtype=string, numpy=b'Wounded, and in his nether bowels deep'>, <tf.Tensor: id=83, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=86, shape=(), dtype=string, numpy=b'He said, and Venus with excess of pain'>, <tf.Tensor: id=87, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=90, shape=(), dtype=string, numpy=b'Thou most severe! I never search thy thoughts,'>, <tf.Tensor: id=91, shape=(), dtype=int64, numpy=0>)


In [17]:
# Encode text lines as numbers
# Machine learning models work on numbers, not words, so the string values need to be converted into lists of numbers. To do that, map each unique word to a unique integer.

In [18]:
# Build vocabulary
# First, build a vocabulary by tokenizing the text into a collection of individual unique words. There are a few ways to do this in both TensorFlow and Python. For this tutorial:

# Iterate over each example's numpy value.
# Use tfds.features.text.Tokenizer to split it into tokens.
# Collect these tokens into a Python set, to remove duplicates.
# Get the size of the vocabulary for later use.

In [21]:
tokenizer = tfds.features.text.Tokenizer()

# Accumulate every distinct token seen across all labeled lines.
vocabulary_set = set()
for line_tensor, _label in all_labeled_data:
  vocabulary_set.update(tokenizer.tokenize(line_tensor.numpy()))

vocab_size = len(vocabulary_set)
vocab_size

12595

In [23]:
# Encode examples
# Create an encoder by passing the vocabulary_set to tfds.features.text.TokenTextEncoder. The encoder's encode method takes in a string of text and returns a list of integers.

In [24]:
# Build the text encoder from the vocabulary collected above; it is reused by
# the single-line demo and by the dataset-wide encode() function.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [25]:
# You can try this on a single line to see what the output looks like.
# Take the first (text, label) pair and show the raw value of its text.
first_pair = next(iter(all_labeled_data))
example_text = first_pair[0].numpy()
print(example_text)

b"The soil, there Pallas tripp'd him. Ordure foul"


In [26]:
# Encode the sample line with the encoder and print the resulting list of integers.
encoded_example = encoder.encode(example_text)
print(encoded_example)

[9110, 10086, 8374, 10346, 11284, 733, 3789, 11292, 989]


In [28]:
# Now run the encoder on the dataset by wrapping it in tf.py_function and passing that to the dataset's map method.

In [30]:
def encode(text_tensor, label):
  """Encode one text tensor with the module-level `encoder`.

  Reads the tensor's value via `.numpy()`, so it must run eagerly (it is
  invoked through tf.py_function below). The label is passed through unchanged.
  """
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Dataset.map adapter: run `encode` via tf.py_function, yielding two int64 outputs."""
  output_types = (tf.int64, tf.int64)
  return tf.py_function(encode, inp=[text, label], Tout=output_types)

all_encoded_data = all_labeled_data.map(encode_map_fn)

In [31]:
# Split the dataset into test and train batches

In [32]:
# Use tf.data.Dataset.take and tf.data.Dataset.skip to create a small test dataset and a larger training set.

In [33]:
# Before being passed into the model, the datasets need to be batched. Typically, the examples inside of a batch need to be the same size and shape. But, the examples in these datasets are not all the same size — each line of text had a different number of words. So use tf.data.Dataset.padded_batch (instead of batch) to pad the examples to the same size.

In [34]:
# Train: everything that remains after skipping the first TAKE_SIZE examples,
# shuffled and then padded into batches.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))
)

# Test: only the first TAKE_SIZE examples, padded into batches.
test_data = (
    all_encoded_data
    .take(TAKE_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))
)

In [35]:
# Now, test_data and train_data are not collections of (example, label) pairs, but collections of batches. Each batch is a pair of (many examples, many labels) represented as arrays.

In [37]:
# To illustrate: pull the first batch from test_data and show its first
# example together with that example's label.
first_batch = next(iter(test_data))
sample_text, sample_labels = first_batch

sample_text[0], sample_labels[0]

W0723 02:00:55.536967 139715680397056 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0723 02:00:55.538956 139715680397056 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int64
W0723 02:00:55.541317 139715688789760 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0723 02:00:55.543113 139715688789760 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int64
W0723 02:00:55.547410 139715680397056 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


(<tf.Tensor: id=65717, shape=(15,), dtype=int64, numpy=
 array([ 9110, 10086,  8374, 10346, 11284,   733,  3789, 11292,   989,
            0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: id=65721, shape=(), dtype=int64, numpy=0>)

In [38]:
# Since we have introduced a new token encoding (the zero used for padding), the vocabulary size has increased by one.

In [39]:
# Account for the extra padding id (0) on top of the vocabulary tokens.
# Recomputed from vocabulary_set instead of incremented in place, so re-running
# this cell does not keep growing vocab_size (and the Embedding input size).
vocab_size = len(vocabulary_set) + 1

In [40]:
# Build the model

In [41]:
# Start from an empty Sequential model; the following cells add its layers one by one.
model = tf.keras.Sequential()

In [42]:
# The first layer converts integer representations to dense vector embeddings. See the Word Embeddings tutorial for more details.
# vocab_size (which includes the padding id) is the number of distinct input ids; 64 is the embedding width.
model.add(tf.keras.layers.Embedding(vocab_size, 64))

In [43]:
# The next layer is a Long Short-Term Memory layer, which lets the model understand words in their context with other words. A bidirectional wrapper on the LSTM helps it to learn about the datapoints in relationship to the datapoints that came before it and after it.
# The wrapped LSTM is created with 64 units.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [44]:
# Finally we'll have a series of one or more densely connected layers, with the last one being the output layer. The output layer produces a probability for all the labels. The one with the highest probability is the model's prediction of an example's label.
# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels: one per source
# file, since each line is labeled with its file's index in FILE_NAMES.
model.add(tf.keras.layers.Dense(len(FILE_NAMES), activation='softmax'))

In [45]:
# Finally, compile the model. For a softmax categorization model, use sparse_categorical_crossentropy as the loss function. You can try other optimizers, but adam is very common.
# The labels produced by labeler() are integer class indices (int64), not one-hot vectors,
# which is the label format the sparse variant of the loss expects.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [46]:
# Train the model
# The original note quoted "about 83%" accuracy for this model; the evaluation
# output recorded in this notebook shows roughly 0.89 accuracy on test_data.
# test_data doubles as the validation set during training.
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3


W0723 02:04:39.050826 139719798712064 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f128e29df98>

In [47]:
# Evaluate on the held-out batches and report both returned metrics.
eval_metrics = model.evaluate(test_data)
eval_loss, eval_acc = eval_metrics

report_template = '\nEval loss: {}, Eval accuracy: {}'
print(report_template.format(eval_loss, eval_acc))

     79/Unknown - 5s 64ms/step - loss: 0.3387 - accuracy: 0.8882
Eval loss: 0.3387248304825795, Eval accuracy: 0.8881999850273132
