## **Full word** tokenization

In [1]:
# from tensorflow.keras.preprocessing.text import tokenizer_from_json

# with open('tokenizer_en.json', 'r', encoding='utf-8') as f:
#     tok_en_json_str = f.read()

# tokenizer_en = tokenizer_from_json(tok_en_json_str)

# with open('tokenizer_ru.json', 'r', encoding='utf-8') as f:
#     tok_ru_json_str = f.read()

# tokenizer_ru = tokenizer_from_json(tok_ru_json_str)

In [2]:
# from tensorflow.keras.models import load_model
# import tensorflow as tf

# start_id = tokenizer_ru.word_index['<start>']
# pad_id   = 0

# base_scc = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

# def masked_scc(y_true, y_pred):
#     per_token = base_scc(y_true, y_pred)
#     valid = tf.logical_and(
#         tf.not_equal(y_true, start_id),
#         tf.not_equal(y_true, pad_id)
#     )
#     mask = tf.cast(valid, tf.float32)
#     return tf.reduce_sum(per_token * mask) / tf.reduce_sum(mask)

# def masked_accuracy(y_true, y_pred):
#     y_true_int = tf.cast(y_true, tf.int64)
#     pred_ids   = tf.argmax(y_pred, axis=-1)
#     matches = tf.equal(y_true_int, pred_ids)
#     valid_positions = tf.logical_and(
#         tf.not_equal(y_true_int, start_id),
#         tf.not_equal(y_true_int, pad_id)
#     )
#     mask = tf.cast(valid_positions, tf.float32)
#     correct = tf.cast(matches, tf.float32) * mask
#     return tf.reduce_sum(correct) / tf.reduce_sum(mask)

# model = load_model(
#     'my_transformer_model.keras',
#     custom_objects={
#         'masked_scc': masked_scc,
#         'masked_accuracy': masked_accuracy
#     }
# )

In [3]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# import numpy as np

# max_len = 25

# def idx_to_word(idx):
#     w = tokenizer_ru.index_word.get(idx, '<UNK>')
#     if w == '<start>': return '<S>'
#     if w == '<end>':   return '<E>'
#     return w

# def decode_sequence(input_sentence):
#     seq = tokenizer_en.texts_to_sequences([input_sentence])
#     seq = pad_sequences(seq, maxlen=max_len, padding='post')
#     decoder_input = [tokenizer_ru.word_index['<start>']]
#     for _ in range(max_len):
#         dec_seq = pad_sequences([decoder_input], maxlen=max_len, padding='post')
#         preds = model.predict([seq, dec_seq], verbose=0)
#         next_id = np.argmax(preds[0][len(decoder_input)-1])
#         if next_id == tokenizer_ru.word_index['<end>']:
#             break
#         decoder_input.append(next_id)
#     return input_sentence + ' -> ' + ' '.join(idx_to_word(i) for i in decoder_input[1:])

# print(decode_sequence("what are you going to do this morning?"))

## **Subword** tokenization

In [4]:
# !pip install sentencepiece

In [5]:
# File names of the two saved artifacts used by this notebook: the
# SentencePiece model and the Keras model.
bpe_model_filename = "bpe_subword_bugfixed.model"
keras_model_filename = "my_transformer_model_subword_bugfixed.keras"

In [6]:
import sentencepiece as spm
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Layer

# Load the SentencePiece model, then look up the ids of the three special
# pieces in one pass.
sp = spm.SentencePieceProcessor()
sp.load(bpe_model_filename)
PAD_ID, START_ID, END_ID = (
    sp.piece_to_id(piece) for piece in ('<pad>', '<start>', '<end>')
)

def masked_scc(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-special targets.

    Positions whose target id equals ``START_ID`` or ``PAD_ID`` are excluded
    from the average.

    Args:
        y_true: Target token ids (assumed shape ``(batch, seq_len)`` --
            TODO confirm against the training pipeline).
        y_pred: Model output; treated as probabilities (``from_logits=False``).

    Returns:
        Scalar tensor: sum of the per-token losses at valid positions divided
        by the number of valid positions, or 0 when no position is valid.
    """
    base = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
    per_token = base(y_true, y_pred)
    valid = tf.logical_and(
        tf.not_equal(y_true, START_ID),
        tf.not_equal(y_true, PAD_ID)
    )
    # Cast to the loss dtype so the product below never mixes float types.
    mask = tf.cast(valid, per_token.dtype)
    # divide_no_nan yields 0 instead of NaN when every target is masked out.
    return tf.math.divide_no_nan(
        tf.reduce_sum(per_token * mask),
        tf.reduce_sum(mask),
    )

def masked_accuracy(y_true, y_pred):
    """Token-level accuracy that ignores ``START_ID`` and ``PAD_ID`` targets.

    Args:
        y_true: Target token ids (assumed shape ``(batch, seq_len)`` --
            TODO confirm against the training pipeline).
        y_pred: Model output; the predicted id is the argmax over the last
            axis.

    Returns:
        Scalar float32 tensor: the fraction of valid positions where the
        predicted id equals the target, or 0 when no position is valid.
    """
    y_true_int = tf.cast(y_true, tf.int64)
    pred_ids = tf.argmax(y_pred, axis=-1)
    matches = tf.equal(y_true_int, pred_ids)
    # Build the mask from the same integer view used for the match test so
    # both comparisons operate on one dtype.
    valid = tf.logical_and(
        tf.not_equal(y_true_int, START_ID),
        tf.not_equal(y_true_int, PAD_ID)
    )
    mask = tf.cast(valid, tf.float32)
    correct = tf.cast(matches, tf.float32) * mask
    # divide_no_nan yields 0 instead of NaN when every target is masked out.
    return tf.math.divide_no_nan(tf.reduce_sum(correct), tf.reduce_sum(mask))

class PositionalEncoding(Layer):
    """Layer that adds a fixed sinusoidal position table to its input.

    The table is computed once in ``__init__`` with NumPy (sine on even
    feature indices, cosine on odd ones) and stored as a float32 constant of
    shape ``(1, seq_len, d_model)``.
    """

    def __init__(self, seq_len, d_model, **kwargs):
        """Build the constant encoding table.

        Args:
            seq_len: Number of positions (rows) in the table.
            d_model: Number of features (columns) per position.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        super().__init__(**kwargs)
        positions = np.arange(seq_len)[:, np.newaxis]    # (seq_len, 1)
        feature_idx = np.arange(d_model)[np.newaxis, :]  # (1, d_model)
        # `// 2` makes each even/odd column pair share one exponent.
        exponents = (2 * (feature_idx // 2)) / np.float32(d_model)
        inv_freq = 1 / np.power(10000, exponents)
        table = positions * inv_freq
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])
        self.pos_encoding = tf.constant(table[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        """Return ``x`` plus the stored table.

        ``x`` is expected to be ``(batch, seq_len, d_model)`` so that it
        lines up with the ``(1, seq_len, d_model)`` constant.
        """
        return x + self.pos_encoding

    def get_config(self):
        """Return the layer config, including the constructor arguments.

        ``seq_len`` and ``d_model`` are read back from the stored table's
        shape so that ``load_model`` can re-create the layer.
        """
        table_shape = self.pos_encoding.shape
        config = super().get_config()
        config["seq_len"] = int(table_shape[1])
        config["d_model"] = int(table_shape[2])
        return config

class PaddingMask(Layer):
    """Layer that turns a batch of token ids into a float padding mask."""

    def __init__(self, pad_id, **kwargs):
        """Store the id that marks padding.

        Args:
            pad_id: Token id treated as padding.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        super().__init__(**kwargs)
        self.pad_id = pad_id

    def call(self, x):
        """Return 1.0 where ``x != pad_id`` and 0.0 elsewhere.

        For ``x`` of shape ``(batch, seq_len)`` the result has two singleton
        axes inserted after the batch axis: ``(batch, 1, 1, seq_len)``.
        """
        is_token = tf.not_equal(x, self.pad_id)
        as_float = tf.cast(is_token, tf.float32)
        return as_float[:, tf.newaxis, tf.newaxis, :]

    def get_config(self):
        """Return the layer config with ``pad_id`` included."""
        config = super().get_config()
        config["pad_id"] = self.pad_id
        return config

# Length to which encoder and decoder inputs are padded.
max_len = 25

# The saved model references a custom loss, metric and two custom layers, so
# they must be supplied by name. The 'Custom>' prefix is presumably the name
# the layers were registered under when the model was saved -- verify against
# the training notebook.
# NOTE: safe_mode=False turns off Keras' safe-deserialization checks; only
# load model files you trust.
model = load_model(
    keras_model_filename,
    custom_objects={
        'Custom>PositionalEncoding': PositionalEncoding,
        'Custom>PaddingMask':        PaddingMask,
        'masked_scc':                masked_scc,
        'masked_accuracy':           masked_accuracy,
    },
    safe_mode=False,
)

def decode_sequence(input_sentence):
    """Greedily decode ``input_sentence`` with the loaded model.

    The sentence is encoded with SentencePiece and padded to ``max_len``.
    Starting from ``START_ID``, the model is run once per step and the
    highest-scoring id at the current position is appended, until ``END_ID``
    is predicted or ``max_len - 1`` ids have been generated.

    Args:
        input_sentence: Text to feed to the encoder.

    Returns:
        The generated ids (without the leading ``START_ID``) decoded back to
        text by SentencePiece.
    """
    source_ids = sp.encode(input_sentence, out_type=int)
    encoder_batch = pad_sequences(
        [source_ids], maxlen=max_len, padding='post', value=PAD_ID
    )

    generated = [START_ID]
    for step in range(max_len - 1):
        decoder_batch = pad_sequences(
            [generated], maxlen=max_len, padding='post', value=PAD_ID
        )
        scores = model.predict([encoder_batch, decoder_batch], verbose=0)
        # `generated` holds step + 1 ids at this point, so row `step` is the
        # prediction that follows the last generated id.
        best_id = int(np.argmax(scores[0][step]))
        if best_id == END_ID:
            break
        generated.append(best_id)

    return sp.decode(generated[1:])

# Run the decoder on one sample sentence and print its output.
print(decode_sequence("What are you going to do this morning?"))


Что вы будете делать сегодня утром?


In [7]:
# Run the decoder on another sample sentence and print its output.
print(decode_sequence('I have just left the bathroom.'))

Я только что оставил ванную.


## Make predictions

In [8]:
# Sample English inputs; each one is passed to decode_sequence by the loop
# that follows this list.
sentences = [
    "The cat sits on the sofa.",
    "She loves to play soccer.",
    "He is reading a book.",
    "The sun is shining brightly.",
    "I enjoy listening to music.",
    "They are going to the park.",
    "What is your favorite color?",
    "Can you help me with my homework?",
    "I would like a cup of coffee.",
    "Where did you put my keys?",
    "She is cooking for the party.",
    "He plays the guitar very well.",
    "What time does the movie start?",
    "I have a meeting at noon today.",
    "The flowers are blooming in spring.",
    "Can you tell me a joke?",
    "I need to buy groceries after work.",
    "The children are playing in the garden.",
    "What are you doing this weekend?",
    "She traveled to Paris last summer.",
    "What is the capital of France?",
    "I would like to learn a new language.",
    "The weather is nice today, isn't it?",
    "Can you recommend a good restaurant?",
    "I saw a movie that was really interesting.",
    "She has been working here for five years.",
    "He is studying for his final exams this week.",
    "I would love to visit Japan someday.",
    "The book was better than the movie.",
    "How do you feel about public speaking?",
    "Can you explain the theory of relativity?",
    "He believes, technology can solve many problems."
]

# Length of the longest input, used to left-align every sentence so the
# arrows line up in one column.
mx = max(map(len, sentences))

for s in sentences:
    print(f"{s:<{mx}}-> {decode_sequence(s)}")

The cat sits on the sofa.                       -> Кот сидень на диване.
She loves to play soccer.                       -> Она любит играть в футбол.
He is reading a book.                           -> Он читает книгу.
The sun is shining brightly.                    -> Солнце светло ярно.
I enjoy listening to music.                     -> Мне нравится слушать музыку.
They are going to the park.                     -> Они едут в парк.
What is your favorite color?                    -> Какой твой любимый цвет?
Can you help me with my homework?               -> Можешь помочь мне с домашним заданием?
I would like a cup of coffee.                   -> Я бы хотел чашку кофе.
Where did you put my keys?                      -> Куда ты положил ключи?
She is cooking for the party.                   -> Она готовит на вечеринке.
He plays the guitar very well.                  -> Он очень хорошо играет на гитаре.
What time does the movie start?                 -> Во сколько начнётся фильм?
I have a