In [1]:
import random
import string
import re
import numpy as np
import tensorflow as tf
import keras

In [2]:
with open('ukr.txt', 'r', encoding = 'utf-8') as f:
  lines = f.read().split('\n')[:-1]

for _ in range(3):
    print(random.choice(lines))

Do you need to drink wine?	Вам потрібно випити вина?	CC-BY 2.0 (France) Attribution: tatoeba.org #1122357 (cntrational) & #5763235 (deniko)
Tom gave Mary a ring.	Том дав Мері обручку.	CC-BY 2.0 (France) Attribution: tatoeba.org #5822260 (CK) & #7499658 (deniko)
Tom confided in Mary.	Том поклався на Мері.	CC-BY 2.0 (France) Attribution: tatoeba.org #5105749 (CK) & #5805055 (deniko)


In [3]:
text_pairs = []
for line in lines:
  eng, ukr, _ = line.split('\t')
  ukr = '[s] ' + ukr + ' [e]'
  text_pairs.append((eng, ukr))

for t in range(5):
    print(random.choice(text_pairs))

("Tom hasn't yet paid.", '[s] Том ще не платив. [e]')
('I think Tom is a little shy.', "[s] Мені здається, Том трохи сором'язливий. [e]")
('I worked in Boston.', '[s] Я працював у Бостоні. [e]')
("Look at her. She's beautiful.", '[s] Подивись на неї. Вона прекрасна. [e]')
("It's quiet.", '[s] Вона тиха. [e]')


In [4]:
random.shuffle(text_pairs)
text_pairs = text_pairs[:50000]
num_val = int(0.15 * len(text_pairs))
num_train = len(text_pairs) - 2 * num_val
train_pairs = text_pairs[:num_train]
val_pairs = text_pairs[num_train : num_train + num_val]
test_pairs = text_pairs[num_train + num_val :]

print(f'all pairs: {len(text_pairs)}')
print(f'train pairs: {len(train_pairs)}')
print(f'validation pairs: {len(val_pairs)}')
print(f'test pairs: {len(test_pairs)}')

all pairs: 50000
train pairs: 35000
validation pairs: 7500
test pairs: 7500


In [5]:
strip_chars = string.punctuation.replace('[', '')
strip_chars = strip_chars.replace(']', '')

vocabulary_size = 15000
sequence_length = 20
batch_size = 64

def ukr_standardization(input_string):
    return tf.strings.regex_replace(tf.strings.lower(input_string), '[%s]' % re.escape(strip_chars), '')

eng_vector = keras.layers.TextVectorization(
    max_tokens=vocabulary_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)
ukr_vector = keras.layers.TextVectorization(
    max_tokens=vocabulary_size,
    output_mode='int',
    output_sequence_length=sequence_length + 1,
    standardize=ukr_standardization,
)
train_eng = [pair[0] for pair in train_pairs]
train_ukr = [pair[1] for pair in train_pairs]
eng_vector.adapt(train_eng)
ukr_vector.adapt(train_ukr)

In [6]:
for v in [eng_vector, ukr_vector]:
  print(len(v.get_vocabulary()))
  print(v.get_vocabulary()[10:20])

6422
[np.str_('do'), np.str_('have'), np.str_('im'), np.str_('mary'), np.str_('was'), np.str_('dont'), np.str_('he'), np.str_('me'), np.str_('in'), np.str_('it')]
15000
[np.str_('на'), np.str_('Мері'), np.str_('ти'), np.str_('Тома'), np.str_('я'), np.str_('з'), np.str_('у'), np.str_('Це'), np.str_('У'), np.str_('мене')]


In [7]:
def format_dataset(eng, ukr):
  eng = eng_vector(eng)
  ukr = ukr_vector(ukr)
  return ({ "encoder_inputs": eng, "decoder_inputs": ukr[:, :-1]}, ukr[:, 1:])

def make_dataset(pairs):
  eng_texts, ukr_texts = zip(*pairs)
  eng_texts = list(eng_texts)
  ukr_texts = list(ukr_texts)
  dataset = tf.data.Dataset.from_tensor_slices((eng_texts, ukr_texts))
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(format_dataset)
  return dataset.cache().shuffle(2048).prefetch(16)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [8]:
for inputs, targets in train_ds.take(1):
    print(f'encoder inputs shape: {inputs["encoder_inputs"].shape}')
    print(f'decoder inputs shape: {inputs["decoder_inputs"].shape}')
    print(f"targets shape: {targets.shape}")

encoder inputs shape: (64, 20)
decoder inputs shape: (64, 20)
targets shape: (64, 20)


In [9]:
class TransformerEncoder(keras.layers.Layer):
  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super(TransformerEncoder, self).__init__()
    self.attention = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.dense_proj = tf.keras.Sequential([keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim)])
    self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = keras.layers.Dropout(rate)
    self.dropout2 = keras.layers.Dropout(rate)

  def call(self, inputs, training):
    attn_output = self.attention(inputs, inputs)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(inputs + attn_output)
    ffn_output = self.dense_proj(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.layernorm2(out1 + ffn_output)

In [10]:
class TransformerDecoder(keras.layers.Layer):
  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super(TransformerDecoder, self).__init__()
    self.attention1 = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.attention2 = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.dense_proj = tf.keras.Sequential([keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim)])
    self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = keras.layers.Dropout(rate)
    self.dropout2 = keras.layers.Dropout(rate)
    self.dropout3 = keras.layers.Dropout(rate)

  def call(self, inputs, enc_output, training):
    attn_output1 = self.attention1(inputs, inputs)
    attn_output1 = self.dropout1(attn_output1, training=training)
    out1 = self.layernorm1(inputs + attn_output1)
    attn_output2 = self.attention2(out1, enc_output)
    attn_output2 = self.dropout2(attn_output2, training=training)
    out2 = self.layernorm2(out1 + attn_output2)
    ffn_output = self.dense_proj(out2)
    ffn_output = self.dropout3(ffn_output, training=training)
    return self.layernorm3(out2 + ffn_output)

In [12]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

# Вхідні дані для енкодера
encoder_inputs = keras.layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = keras.layers.Embedding(input_dim=vocabulary_size, output_dim=embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, num_heads, latent_dim)(x, training=True)

# Вхідні дані для декодера
decoder_inputs = keras.layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")
x = keras.layers.Embedding(input_dim=vocabulary_size, output_dim=embed_dim)(decoder_inputs)

# Підключення декодера до виходів енкодера
x = TransformerDecoder(embed_dim, num_heads, latent_dim)(x, encoder_outputs, training=True)
decoder_outputs = keras.layers.Dense(vocabulary_size, activation="softmax")(x)

# Створення моделі
transformer = keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")





In [13]:
epochs = 5

transformer.compile("rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/5




[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m502s[0m 903ms/step - accuracy: 0.7304 - loss: 2.4729 - val_accuracy: 0.7678 - val_loss: 1.5149
Epoch 2/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 899ms/step - accuracy: 0.7685 - loss: 1.5252 - val_accuracy: 0.7712 - val_loss: 1.4156
Epoch 3/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m488s[0m 891ms/step - accuracy: 0.7689 - loss: 1.4428 - val_accuracy: 0.7749 - val_loss: 1.3663
Epoch 4/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m490s[0m 895ms/step - accuracy: 0.7786 - loss: 1.2959 - val_accuracy: 0.7769 - val_loss: 1.3333
Epoch 5/5
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m486s[0m 888ms/step - accuracy: 0.7806 - loss: 1.2295 - val_accuracy: 0.7803 - val_loss: 1.2763


<keras.src.callbacks.history.History at 0x2d0ee4d5910>

In [15]:
ukr_vocab = ukr_vector.get_vocabulary()
ukr_index_lookup = dict(zip(range(len(ukr_vocab)), ukr_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
  tokenized_input_sentence = eng_vector([input_sentence])
  decoded_sentence = "[s]"
  for i in range(max_decoded_sentence_length):
    tokenized_target_sentence = ukr_vector([decoded_sentence])[:, :-1]
    predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
    sampled_token_index = np.argmax(predictions[0, i, :])
    sampled_token = ukr_index_lookup[sampled_token_index]
    decoded_sentence += " " + sampled_token
    if sampled_token == "[e]":
      break
  return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(3):
  input_sentence = random.choice(test_eng_texts)
  translated = decode_sequence(input_sentence)
  print(f'--input: {input_sentence}')
  print(f'--output: {translated}')

--input: Do you think Tom will call?
--output: [s] Том [e]
--input: Did you speak to Tom yesterday?
--output: [s] Том [e]
--input: I'm very glad to see you.
--output: [s] Том [e]
