In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as vocab
import os

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [2]:
# Load the TED-talk Portuguese -> English corpus.
# `as_supervised=True` makes every example a (pt, en) pair, and
# `with_info=True` makes the call return the dataset metadata as well.
data, infos = tfds.load(
    "ted_hrlr_translate/pt_to_en",
    with_info=True,
    as_supervised=True,
)


In [3]:
# Validate on the dataset's dedicated "validation" split. The "test" split is
# deliberately left untouched so it can still be used for a final, unbiased
# evaluation once model selection is finished.
train_data, val_data = data["train"], data["validation"]

In [4]:
# Show the first two sentence pairs: the Portuguese text followed by its
# English translation. Examples are stored as UTF-8 bytes, hence the decode.
for pt, en in train_data.batch(1).take(2):
    for batch in (pt, en):
        for example in batch.numpy():
            print(example.decode("utf-8"))

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
mas e se estes fatores fossem ativos ?
but what if it were active ?


## Set up the tokenizers

In [5]:
# Split the (pt, en) pairs into two single-language datasets so that a
# separate vocabulary can be learned for each language.
train_pt = train_data.map(lambda pt, en: pt)
train_en = train_data.map(lambda pt, en: en)

# Sanity check: print the first three English sentences.
for en in train_en.take(3):
    sentence = en.numpy()
    print(sentence.decode("utf-8"))



and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .


In [6]:
def vocab_to_file(path,vocab):
    """Write a vocabulary to a UTF-8 text file, one token per line.

    Args:
        path: Destination file path. Missing parent directories are created,
            so a fresh checkout does not need a pre-existing output folder.
        vocab: Iterable of tokens. Each token is written with ``print`` and is
            therefore followed by a newline. (Inside this function the name
            shadows the module-level ``vocab`` import alias.)

    An existing file at ``path`` is overwritten.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager closes the handle even if writing a token raises.
    with open(path, "w", encoding="utf-8") as file:
        for token in vocab:
            print(token, file=file)
    
# Options forwarded to the BERT tokenizer while learning the vocabularies.
bert_tokenizer_params = {"lower_case": True}
# Special tokens passed to the vocabulary builders; a token's position in this
# list is used as its id when start/end markers are added.
reserved_tokens = [
    "[PAD]",
    "[UNK]",
    "[START]",
    "[END]",
]


In [7]:
# Learn the English WordPiece vocabulary from the training sentences.
# 8000 is the vocabulary-size argument; sentences are fed in batches of 1000.
en_vocab = vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(tf.data.AUTOTUNE),
    8000,
    reserved_tokens=reserved_tokens,
    bert_tokenizer_params=bert_tokenizer_params,
)

In [8]:
# Inspect two windows of the English vocabulary (the reserved tokens come
# first), report its size, then persist it for the tokenizer to load.
for window in (slice(0, 10), slice(200, 210)):
    print(en_vocab[window])
print(f"there is {len(en_vocab)} token in english")

vocab_to_file("tokens/en_tokens.txt", en_vocab)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['different', 'around', 'thank', 'say', 'day', 'good', 'her', 'through', 'today', 'same']
there is 7010 token in english


In [9]:
# Learn the Portuguese WordPiece vocabulary with the same settings as English.
pt_vocab = vocab.bert_vocab_from_dataset(
    train_pt.batch(1000).prefetch(tf.data.AUTOTUNE),
    8000,
    reserved_tokens=reserved_tokens,
    bert_tokenizer_params=bert_tokenizer_params,
)

In [10]:
# Inspect two windows of the Portuguese vocabulary, report its size, then
# persist it for the tokenizer to load.
for window in (slice(0, 10), slice(200, 210)):
    print(pt_vocab[window])
print(f"there is {len(pt_vocab)} token in portuguese")

vocab_to_file("tokens/pt_tokens.txt", pt_vocab)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['verdade', 'falar', 'todas', 'vou', 'portanto', 'pela', 'sem', 'aos', 'realmente', 'vezes']
there is 7765 token in portuguese


In [11]:
# Build the tokenizer with the same options that were used to learn the
# vocabulary (`bert_tokenizer_params`, i.e. lower_case=True). Without them the
# tokenizer would not lowercase its input, so text that differs from the
# lowercased vocabulary entries could not be matched.
en_tokenizer = text.BertTokenizer("tokens/en_tokens.txt", **bert_tokenizer_params)

# Tokenize one batch of two English sentences and print the ids per sentence.
for pt, en in train_data.batch(2).take(1):
    print(en.numpy())
    token_batch = en_tokenizer.tokenize(en)
    # Merge the last two ragged axes so each sentence becomes one flat id list.
    token_batch = token_batch.merge_dims(-2, -1)
    for ex in token_batch.to_list():
        print(ex)



[b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .'
 b'but what if it were active ?']
[72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308, 74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15]
[87, 90, 107, 76, 129, 1852, 30]


In [12]:
import re

def add_start_end(token_batch):
    """Wrap every row of a ragged token batch in [START] ... [END] ids.

    Args:
        token_batch: Ragged batch of token ids, one row per sentence. It must
            provide ``bounding_shape()``, and its values must be concatenable
            with int64 columns.

    Returns:
        The batch with one extra leading column holding the id of "[START]"
        and one extra trailing column holding the id of "[END]". Both ids are
        the tokens' positions in the module-level ``reserved_tokens`` list.
    """
    batch_size = token_batch.bounding_shape()[0]
    column_shape = [batch_size, 1]
    start_id = reserved_tokens.index("[START]")
    end_id = reserved_tokens.index("[END]")
    start_column = tf.cast(tf.fill(column_shape, start_id), dtype=tf.int64)
    end_column = tf.cast(tf.fill(column_shape, end_id), dtype=tf.int64)
    return tf.concat([start_column, token_batch, end_column], axis=1)

# Round-trip a sample sentence: tokenize it, add the start/end markers, then
# map the ids back to words.
token_batch = en_tokenizer.tokenize("hello is ramzey").merge_dims(-2, -1)
tokens = add_start_end(token_batch)
print(tokens)

words = en_tokenizer.detokenize(tokens)
print(words)
def remove_reserved(token_text,reserved_tokens):
    """Drop reserved tokens, except "[UNK]", from a ragged batch of strings.

    Args:
        token_text: Ragged string tensor of detokenized words.
        reserved_tokens: Reserved token strings. Every entry other than
            "[UNK]" is removed wherever it makes up a whole cell.

    Returns:
        ``token_text`` without the cells that fully match a removed token.
    """
    # Escape each token so characters such as "[" are matched literally.
    pattern = "|".join(
        re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"
    )
    keep_mask = tf.logical_not(tf.strings.regex_full_match(token_text, pattern))
    return tf.ragged.boolean_mask(token_text, keep_mask)

#print(remove_reserved(words,reserved_tokens))

<tf.RaggedTensor [[2, 4006, 80, 54, 2199, 2302, 240, 3]]>
<tf.RaggedTensor [[b'[START]', b'hello', b'is', b'ramzey', b'[END]']]>


In [25]:
class MyTokenizer(tf.Module):
    """WordPiece tokenizer wrapped in a ``tf.Module`` so it can be exported.

    The constructor traces every public method with the input signatures that
    should be callable after ``tf.saved_model.save`` / ``tf.saved_model.load``.
    """

    def __init__(self,vocab_path,reserved_tokens,lower_case=True):
        """Build the tokenizer from a vocabulary file.

        Args:
            vocab_path: Path to a text file holding one token per line.
            reserved_tokens: List of special tokens; ``detokenize`` strips all
                of them except "[UNK]" from its output.
            lower_case: Forwarded to ``text.BertTokenizer``. Defaults to True
                so the tokenizer matches vocabularies that were learned with
                ``lower_case=True``, as the notebook's vocabularies are.
        """
        super().__init__()
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=lower_case)
        self._reserved_tokens = reserved_tokens
        # Registering the file as an asset lets SavedModel export copy it.
        self.vocab_path = tf.saved_model.Asset(vocab_path)
        # The context manager closes the file even if reading fails.
        with open(vocab_path, "r", encoding="utf-8") as f:
            tokens = f.read()
        self.vocab = tf.Variable(tokens.splitlines())

        # Trace tokenize for a 1-D batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))

        # Trace detokenize and lookup for both dense and ragged 2-D int64 ids.
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        # The getters take no arguments, so one trace each is enough.
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self,strings):
        """Convert a batch of strings to ragged token ids with start/end markers."""
        encode = self.tokenizer.tokenize(strings)
        # Merge the last two ragged axes so each sentence is one flat id list.
        encode = encode.merge_dims(-2,-1)
        return add_start_end(encode)

    @tf.function
    def detokenize(self,encoded) :
        """Convert token ids back to words and strip reserved tokens except "[UNK]"."""
        decode = self.tokenizer.detokenize(encoded)
        return remove_reserved(decode,self._reserved_tokens)

    @tf.function
    def lookup(self,tokens_ids):
        """Return the vocabulary string stored at each token id."""
        words = tf.gather(self.vocab,tokens_ids)
        return words

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return self.vocab.shape[0]

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens given to the constructor."""
        return self._reserved_tokens

    @tf.function
    def get_vocab_path(self):
        """Return the vocabulary file asset."""
        return self.vocab_path


In [28]:

# Collect both language tokenizers under a single module so that one
# SavedModel export contains the pair.
tokenizers = tf.Module()
tokenizers.pt = MyTokenizer("tokens/pt_tokens.txt", reserved_tokens)
tokenizers.en = MyTokenizer("tokens/en_tokens.txt", reserved_tokens)

# Export to the "tokenizers" directory.
tf.saved_model.save(tokenizers, "tokenizers")

INFO:tensorflow:Assets written to: tokenizers\assets


INFO:tensorflow:Assets written to: tokenizers\assets


In [29]:
# Reload the export and confirm that the Portuguese tokenizer survived the
# round trip by asking it for its vocabulary size.
loaded_model = tf.saved_model.load("tokenizers")
print(
    loaded_model.pt.get_vocab_size()
)

tf.Tensor(7765, shape=(), dtype=int32)


In [32]:
# End-to-end check of the reloaded English tokenizer:
# text -> token ids -> vocabulary entries -> detokenized words.
words = "hello my name is tensor"
print("text :", words)

tokens = loaded_model.en.tokenize([words])
tf.print("encoded to: ", tokens)
tf.print("encoded words: ", loaded_model.en.lookup(tokens))
tf.print("decoded to: ", loaded_model.en.detokenize(tokens))

text : hello my name is tensor
encoded to:  <tf.RaggedTensor [[2, 4006, 99, 571, 80, 2358, 687, 3]]>
encoded words:  <tf.RaggedTensor [[b'[START]', b'hello', b'my', b'name', b'is', b'tens', b'##or', b'[END]']]>
decoded to:  [['[UNK]', 'hello', 'my', 'name', 'is', 'tensor']]
