In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os

In [None]:
#本教程为你提供了一个如何使用 tf.data.TextLineDataset 来加载文本文件的示例。TextLineDataset 通常被用来以文本文件构建数据集（原文件中的一行为一个样本) 。这适用于大多数的基于行的文本数据（例如，诗歌或错误日志) 。下面我们将使用相同作品（荷马的伊利亚特）三个不同版本的英文翻译，然后训练一个模型来通过单行文本确定译者。

In [3]:
# 下载数据集
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']
for name in FILE_NAMES:
  text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)
parent_dir = os.path.dirname(text_dir)
parent_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


'C:\\Users\\Administrator\\.keras\\datasets'

In [4]:
def labeler(example,index):
    return example,tf.cast(index,tf.int64)
labeled_data_sets=[]
for i,file_name in enumerate(FILE_NAMES):
    line_dataset=tf.data.TextLineDataset(os.path.join(parent_dir,file_name))
    labeled_dataset=line_dataset.map(lambda ex:labeler(ex,i))
    labeled_data_sets.append(labeled_dataset)

In [11]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [12]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [13]:
for ex in all_labeled_data.take(5):
  print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b"Exciting, though he mourn'd his comrade slain.">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"Xanthus answer'd from beneath his yoke,">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'He ceased, and still proceeding, next arrived'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Within my walls, in Argos, far from home,'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"Then were great deeds achiev'd; nor thro' the breach">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [24]:
# 对文本进行切分
tokenizer=tfds.deprecated.text.Tokenizer()
vocabulary_set=set()
for text_tensor,_ in all_labeled_data:
    some_tokens=tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)


TypeError: 'set' object is not subscriptable

In [31]:
len(vocabulary_set)

17178

In [33]:
# 构建编码器
encoder=tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

In [35]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b"Exciting, though he mourn'd his comrade slain."


In [36]:
encoded_example=encoder.encode("Exciting, though he mourn'd his comrade slain.")
encoded_example

[1023, 11978, 11892, 8705, 10995, 7878, 6540, 7918]

In [37]:
def encode(text_tensor,label):
    encoded_text=encoder.encode(text_tensor.numpy())
    return encoded_text,label
def encode_map_fn(text,label):
    encoded_text,label=tf.py_function(encode,inp=[text,label],Tout=(tf.int64,tf.int64))
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text,label
all_encoded_data = all_labeled_data.map(encode_map_fn)

<MapDataset shapes: ((None,), ()), types: (tf.int64, tf.int64)>