In [1]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf

tf.executing_eagerly()

True

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt

In [3]:
!ls

dataset  dataset.ipynb	dataset.py  en.subwords


In [4]:
examples, metadata = tfds.load(name="ted_hrlr_translate/pt_to_en",
                               with_info=True,
                               as_supervised=True,
                               data_dir="./dataset/")
train_examples, val_examples = examples["train"], examples["validation"]
print(len(list(train_examples)), len(list(val_examples)))

51785 1193


In [7]:
for i, (pt, en) in enumerate(train_examples):
    if i > 0:
        break
    print(pt)
    print(pt.numpy())
    print(en)
    print(en.numpy())

tf.Tensor(b'os astr\xc3\xb3nomos acreditam que cada estrela da gal\xc3\xa1xia tem um planeta , e especulam que at\xc3\xa9 um quinto deles tem um planeta do tipo da terra que poder\xc3\xa1 ter vida , mas ainda n\xc3\xa3o vimos nenhum deles .', shape=(), dtype=string)
b'os astr\xc3\xb3nomos acreditam que cada estrela da gal\xc3\xa1xia tem um planeta , e especulam que at\xc3\xa9 um quinto deles tem um planeta do tipo da terra que poder\xc3\xa1 ter vida , mas ainda n\xc3\xa3o vimos nenhum deles .'
tf.Tensor(b"astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth-like planet that might be able to harbor life , but we have n't seen any of them .", shape=(), dtype=string)
b"astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth-like planet that might be able to harbor life , but we have n't seen any of them ."


In [8]:
## 迭代器 
train_examples_small = iter(list(train_examples)[:5])

In [9]:
tokenizer_en_small = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=(en.numpy() for pt, en in train_examples_small),   # generator yielding `str`
    target_vocab_size=2 ** 13,
    max_subword_length=20,
    max_corpus_chars=None,
    reserved_tokens=None
)

In [10]:
sample_string = 'Transformer is awesome.'
print(tokenizer_en_small.encode(sample_string))

[162, 192, 175, 188, 193, 180, 189, 192, 187, 179, 192, 110, 183, 193, 110, 175, 197, 179, 29, 187, 179, 124]


In [11]:
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=(en.numpy() for pt, en in train_examples),   # generator yielding `str`
    target_vocab_size=2 ** 13,
    max_subword_length=20,
    max_corpus_chars=None,
    reserved_tokens=None
)

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2**13)

sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [7915, 1248, 7946, 7194, 13, 2799, 7877]
The original string: Transformer is awesome.


In [12]:
tokenizer_en.save_to_file("en")

In [13]:
!ls
!wc -l en.subwords  # 奇怪为什么 subword　的词表大小不一样呢。。

dataset  dataset.ipynb	dataset.py  en.subwords
7832 en.subwords


In [14]:
for ts in tokenized_string:
    print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

7915 ----> T
1248 ----> ran
7946 ----> s
7194 ----> former 
13 ----> is 
2799 ----> awesome
7877 ----> .


- tokenizer.encode 是把 string 先做 bpe,　分割成 subwords,　然后转换成　index  
- tokenizer.decode 是把 index 转换成　subwords.

#### Add a start and end token to the input and target.

In [16]:
tokenizer_en.vocab_size, tokenizer_pt.vocab_size

(8087, 8214)

In [17]:
def encode(lang1, lang2):
    lang1 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
      lang1.numpy()) + [tokenizer_pt.vocab_size+1]   # add start and end token

    lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
      lang2.numpy()) + [tokenizer_en.vocab_size+1]

    return lang1, lang2

In [18]:
pt_example = 'Transformer é incrível.'
en_example = 'Transformer is awesome.'
pt_example = tf.convert_to_tensor(pt_example, dtype=tf.string)
en_example = tf.convert_to_tensor(en_example, dtype=tf.string)

In [19]:
pt_enc, en_enc = encode(pt_example, en_example)
print(pt_enc)
print(en_enc)

[8214, 8042, 2883, 8073, 2266, 383, 8, 1917, 8004, 8215]
[8087, 7915, 1248, 7946, 7194, 13, 2799, 7877, 8088]


In [20]:
MAX_LENGTH = 40

#### 使用 tf.logical_and 删除掉长度超过 40 的

In [21]:
def filter_max_length(x, y, max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [26]:
print(tf.size(pt_enc), tf.size(pt_enc) <= 40) 

tf.Tensor(10, shape=(), dtype=int32) tf.Tensor(True, shape=(), dtype=bool)


In [30]:
tf.logical_and(tf.size(pt_enc) <= MAX_LENGTH,
             tf.size(en_enc) <= MAX_LENGTH)

<tf.Tensor: id=520474, shape=(), dtype=bool, numpy=True>

In [31]:
def tf_encode(pt, en):
    return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])

In [32]:
tf_encode(pt_example, en_example)

W0509 17:47:57.726433 140119709894464 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0509 17:47:57.728044 140119709894464 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


[<tf.Tensor: id=520482, shape=(10,), dtype=int64, numpy=array([8214, 8042, 2883, 8073, 2266,  383,    8, 1917, 8004, 8215])>,
 <tf.Tensor: id=520483, shape=(9,), dtype=int64, numpy=array([8087, 7915, 1248, 7946, 7194,   13, 2799, 7877, 8088])>]

In [33]:
train_dataset = train_examples.map(tf_encode)
train_dataset

<MapDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>

In [34]:
train_dataset = train_dataset.filter(filter_max_length)
train_dataset

<FilterDataset shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>

In [35]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [36]:
# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))

In [47]:
print(len(list(train_dataset)))

703


In [37]:
pt_batch, en_batch = next(iter(val_dataset))
pt_batch, en_batch

W0509 17:51:00.207847 140117419202304 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0509 17:51:00.208775 140117419202304 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0509 17:51:00.210186 140117419202304 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


(<tf.Tensor: id=521021, shape=(64, 40), dtype=int64, numpy=
 array([[8214, 1259,    5, ...,    0,    0,    0],
        [8214,  299,   13, ...,    0,    0,    0],
        [8214,   59,    8, ...,    0,    0,    0],
        ...,
        [8214,   95,    3, ...,    0,    0,    0],
        [8214, 5157,    1, ...,    0,    0,    0],
        [8214, 4479, 7990, ...,    0,    0,    0]])>,
 <tf.Tensor: id=521022, shape=(64, 40), dtype=int64, numpy=
 array([[8087,   18,   12, ...,    0,    0,    0],
        [8087,  634,   30, ...,    0,    0,    0],
        [8087,   16,   13, ...,    0,    0,    0],
        ...,
        [8087,   12,   20, ...,    0,    0,    0],
        [8087,   17, 4981, ...,    0,    0,    0],
        [8087,   12, 5453, ...,    0,    0,    0]])>)