In [1]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf

import tensorflow_text

In [2]:
# Snapshot the active conda environment's package list into requirements.txt.
# NOTE(review): `conda list --export` writes conda's `name=version=build`
# spec format, not pip's requirements syntax -- confirm this file is meant
# for `conda create --file requirements.txt` rather than `pip install -r`.
!conda list --export > requirements.txt

In [3]:
# Load the Portuguese-to-English TED talks translation dataset.
# `with_info=True` makes the call return the dataset metadata as a second
# value; `as_supervised=True` yields each element as a 2-tuple, which the
# following cells unpack as (portuguese, english).
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    as_supervised=True,
    with_info=True,
)

# Keep handles on the two splits used by the rest of the notebook.
train_examples = examples['train']
val_examples = examples['validation']



In [5]:
# Peek at one batch of three sentence pairs from the training split.
# `pt_examples` / `en_examples` stay bound after the loop; later cells read
# `en_examples` to demonstrate the tokenizer.
sample_batches = train_examples.batch(3).take(1)
for pt_examples, en_examples in sample_batches:
  print('> Examples in Portuguese:')
  for pt in pt_examples.numpy():
    # Each element is a bytes object; decode it for display.
    print(str(pt, encoding='utf-8'))
  print()

  print('> Examples in English:')
  for en in en_examples.numpy():
    print(str(en, encoding='utf-8'))

> Examples in Portuguese:
o problema é que nunca vivi lá um único dia .
os astrónomos acreditam que cada estrela da galáxia tem um planeta , e especulam que até um quinto deles tem um planeta do tipo da terra que poderá ter vida , mas ainda não vimos nenhum deles .
agora aqui temos imagens sendo extraídas em tempo real diretamente do feed ,

> Examples in English:
except , i 've never lived one day of my life there .
astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth-like planet that might be able to harbor life , but we have n't seen any of them .
now here are live images being pulled straight from the feed .


In [6]:
# Download the pre-built pt/en tokenizer archive into the current directory
# (cache_dir='.' with an empty cache_subdir) and extract it. The next cell
# loads the SavedModel from the directory named `model_name`.
# NOTE(review): where `extract=True` unpacks the archive, and what path the
# call returns, differs between Keras versions -- confirm the SavedModel still
# ends up in ./ted_hrlr_translate_pt_en_converter on the version in use.
model_name = 'ted_hrlr_translate_pt_en_converter'
archive_name = f'{model_name}.zip'
archive_url = f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip'
tf.keras.utils.get_file(
    archive_name,
    archive_url,
    extract=True,
    cache_dir='.',
    cache_subdir='',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip


'.\\ted_hrlr_translate_pt_en_converter.zip'

In [7]:
# Load the extracted SavedModel; it exposes one tokenizer per language
# (`tokenizers.pt` and `tokenizers.en` are used below).
tokenizers = tf.saved_model.load(export_dir=model_name)

In [8]:
# Show the public attributes of the English tokenizer (names that do not
# start with an underscore).
[name for name in dir(tokenizers.en) if name[:1] != '_']

['detokenize',
 'get_reserved_tokens',
 'get_vocab_path',
 'get_vocab_size',
 'lookup',
 'tokenize',
 'tokenizer',
 'vocab']

In [9]:
# Re-display the English half of the sample batch as plain text before it is
# tokenized in the next cell.
print('> This is a batch of strings:')
for en in en_examples.numpy():
  print(str(en, encoding='utf-8'))

> This is a batch of strings:
except , i 've never lived one day of my life there .
astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth-like planet that might be able to harbor life , but we have n't seen any of them .
now here are live images being pulled straight from the feed .


In [10]:
# Convert the batch of English strings into token IDs. `encoded` is reused by
# the detokenize and lookup cells that follow.
encoded = tokenizers.en.tokenize(en_examples)

print('> This is a padded-batch of token IDs:')
token_id_rows = encoded.to_list()
for row in token_id_rows:
  print(row)

> This is a padded-batch of token IDs:
[2, 1533, 13, 45, 9, 142, 243, 752, 103, 204, 74, 99, 183, 96, 15, 3]
[2, 3946, 110, 321, 75, 198, 1452, 77, 71, 2662, 144, 37, 580, 13, 72, 83, 5848, 5939, 1970, 75, 130, 73, 103, 3339, 74, 124, 89, 111, 462, 14, 106, 580, 75, 242, 97, 264, 73, 3487, 183, 13, 87, 78, 89, 50, 9, 56, 464, 225, 74, 124, 15, 3]
[2, 110, 137, 86, 301, 722, 222, 2404, 1473, 109, 71, 1559, 15, 3]


In [11]:
# Map the token IDs back to text to inspect what survives a
# tokenize -> detokenize round trip.
round_trip = tokenizers.en.detokenize(encoded)

print('> This is human-readable text:')
decoded_lines = round_trip.numpy()
for line in decoded_lines:
  print(str(line, encoding='utf-8'))

> This is human-readable text:
except , i ' ve never lived one day of my life there .
astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth - like planet that might be able to harbor life , but we have n ' t seen any of them .
now here are live images being pulled straight from the feed .


In [12]:
# Translate each token ID into its vocabulary entry so the individual
# (sub)word pieces become visible.
tokens = tokenizers.en.lookup(encoded)
print('> This is the text split into tokens:')
# Trailing expression: rendered as the cell's output.
tokens

> This is the text split into tokens:


<tf.RaggedTensor [[b'[START]', b'except', b',', b'i', b"'", b've', b'never', b'lived',
  b'one', b'day', b'of', b'my', b'life', b'there', b'.', b'[END]']   ,
 [b'[START]', b'astronomers', b'now', b'believe', b'that', b'every',
  b'star', b'in', b'the', b'galaxy', b'has', b'a', b'planet', b',', b'and',
  b'they', b'sp', b'##ec', b'##ulate', b'that', b'up', b'to', b'one',
  b'fifth', b'of', b'them', b'have', b'an', b'earth', b'-', b'like',
  b'planet', b'that', b'might', b'be', b'able', b'to', b'harbor', b'life',
  b',', b'but', b'we', b'have', b'n', b"'", b't', b'seen', b'any', b'of',
  b'them', b'.', b'[END]']                                                 ,
 [b'[START]', b'now', b'here', b'are', b'live', b'images', b'being',
  b'pulled', b'straight', b'from', b'the', b'feed', b'.', b'[END]'] ]>

In [13]:
# Collect the number of tokens per sentence, for both languages, over the
# whole training split. Each entry appended to `lengths` holds the row
# lengths of one tokenized batch.
lengths = []

# The loop variables are deliberately NOT named `pt_examples` / `en_examples`:
# those names hold the three-sentence sample batch that earlier cells print
# and tokenize. Rebinding them here would make those cells show different
# data when re-run after this one.
for pt_batch, en_batch in train_examples.batch(1024):
  pt_tokens = tokenizers.pt.tokenize(pt_batch)
  lengths.append(pt_tokens.row_lengths())

  en_tokens = tokenizers.en.tokenize(en_batch)
  lengths.append(en_tokens.row_lengths())
  # One dot per processed batch as a lightweight progress indicator.
  print('.', end='', flush=True)

...................................................

In [None]:
# Merge the per-batch length vectors (Portuguese and English together) into
# a single array and plot their distribution.
all_lengths = np.concatenate(lengths)

# 101 evenly spaced edges over [0, 500] -> 100 bins, each 5 tokens wide.
plt.hist(all_lengths, np.linspace(0, 500, 101))
# Re-applying the current limits pins the y-axis, so the vertical marker
# drawn below spans the plot without rescaling the histogram.
plt.ylim(plt.ylim())
# Vectorised maximum; avoids iterating the array element by element in Python.
max_length = all_lengths.max()
plt.plot([max_length, max_length], plt.ylim())
plt.xlabel('Tokens per example')
plt.ylabel('Number of examples')
plt.title(f'Maximum tokens per example: {max_length}');