In [92]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import ops
import random
import string
import re



In [93]:
!pip install -q gdown
import gdown

file_id = "1AsV1_sa3T1EQY2Nf2r-ISFOJLScjcZ_V"
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "ukr.txt", quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1AsV1_sa3T1EQY2Nf2r-ISFOJLScjcZ_V
To: /content/ukr.txt
100%|██████████| 25.0M/25.0M [00:00<00:00, 64.7MB/s]


'ukr.txt'

In [94]:
with open("ukr.txt", "r", encoding="utf-8") as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    eng, ukr, _ = line.split("\t")
    ukr = "[start] " + ukr + " [end]"
    text_pairs.append((eng, ukr))

random.shuffle(text_pairs)


text_pairs = text_pairs[:50000]


In [95]:
num_val = int(0.15 * len(text_pairs))
train_pairs = text_pairs[:-num_val]
val_pairs = text_pairs[-num_val:]


In [96]:
vocabulary_size = 10000
sequence_length = 20
batch_size = 64

embed_dim = 128
latent_dim = 512
num_heads = 8

epochs = 5


In [97]:
strip_chars = string.punctuation.replace("[", "").replace("]", "")

def ukr_standardization(input_string):
    return tf.strings.regex_replace(
        tf.strings.lower(input_string),
        "[%s]" % re.escape(strip_chars),
        ""
    )


In [98]:
eng_vector = layers.TextVectorization(
    max_tokens=vocabulary_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

ukr_vector = layers.TextVectorization(
    max_tokens=vocabulary_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=ukr_standardization,
)

train_eng = [pair[0] for pair in train_pairs]
train_ukr = [pair[1] for pair in train_pairs]

eng_vector.adapt(train_eng)
ukr_vector.adapt(train_ukr)


In [99]:
def format_dataset(eng, ukr):
    eng = eng_vector(eng)
    ukr = ukr_vector(ukr)
    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": ukr[:, :-1],
        },
        ukr[:, 1:]
    )

def make_dataset(pairs):
    eng_texts, ukr_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(ukr_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.cache().shuffle(2048).prefetch(tf.data.AUTOTUNE)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)


In [100]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocabulary_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocabulary_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )

    def call(self, inputs):
        length = ops.shape(inputs)[-1]
        positions = ops.arange(0, length, 1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return ops.not_equal(inputs, 0)


In [101]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], "int32")
        else:
            padding_mask = None

        attention_output = self.attention(
            inputs, inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


In [102]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential([
            layers.Dense(latent_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_causal_attention_mask(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        i = ops.arange(seq_len)[:, None]
        j = ops.arange(seq_len)
        mask = ops.cast(i >= j, "int32")
        mask = ops.reshape(mask, (1, seq_len, seq_len))
        return ops.tile(mask, [batch_size, 1, 1])

    def call(self, inputs, mask=None):
        inputs, encoder_outputs = inputs
        causal_mask = self.get_causal_attention_mask(inputs)

        attention_output_1 = self.attention_1(
            inputs, inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            out_1, encoder_outputs
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)


In [103]:
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocabulary_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
x = PositionalEmbedding(sequence_length, vocabulary_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)([x, encoder_outputs])
x = layers.Dropout(0.3)(x)
decoder_outputs = layers.Dense(vocabulary_size, activation="softmax")(x)

transformer = keras.Model(
    {"encoder_inputs": encoder_inputs, "decoder_inputs": decoder_inputs},
    decoder_outputs,
    name="transformer",
)


In [104]:
transformer.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

transformer.summary()


In [105]:
transformer.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
)


Epoch 1/5
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1251s[0m 2s/step - accuracy: 0.1476 - loss: 4.7072 - val_accuracy: 0.2018 - val_loss: 2.5454
Epoch 2/5
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1255s[0m 2s/step - accuracy: 0.2050 - loss: 2.4612 - val_accuracy: 0.2205 - val_loss: 1.9446
Epoch 3/5
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1232s[0m 2s/step - accuracy: 0.2307 - loss: 1.7033 - val_accuracy: 0.2314 - val_loss: 1.6197
Epoch 4/5
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1228s[0m 2s/step - accuracy: 0.2465 - loss: 1.2831 - val_accuracy: 0.2370 - val_loss: 1.5014
Epoch 5/5
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1232s[0m 2s/step - accuracy: 0.2586 - loss: 1.0198 - val_accuracy: 0.2388 - val_loss: 1.4645


<keras.src.callbacks.history.History at 0x7b71c46630b0>

In [109]:
import numpy as np

# Словник індекс → слово
ukr_vocab = ukr_vector.get_vocabulary()
ukr_index_lookup = dict(enumerate(ukr_vocab))

max_decoded_sentence_length = sequence_length


def decode_sequence(input_sentence):
    # 1️⃣ Векторизуємо англійське речення
    tokenized_input_sentence = eng_vector([input_sentence])

    # 2️⃣ Стартовий токен
    decoded_sentence = "[start]"

    for i in range(max_decoded_sentence_length):
        # 3️⃣ Векторизуємо поточний вихід декодера
        tokenized_target_sentence = ukr_vector([decoded_sentence])[:, :-1]

        # 4️⃣ Прогноз
        predictions = transformer(
            {
                "encoder_inputs": tokenized_input_sentence,
                "decoder_inputs": tokenized_target_sentence,
            },
            training=False
        )

        # 5️⃣ Беремо токен з максимальною ймовірністю
        sampled_token_index = int(np.argmax(predictions[0, i]))
        sampled_token = ukr_index_lookup[sampled_token_index]

        # 6️⃣ Додаємо слово до результату
        decoded_sentence += " " + sampled_token

        # 7️⃣ Умова завершення
        if sampled_token == "[end]":
            break

    return decoded_sentence


In [113]:
test_eng_texts = [pair[0] for pair in val_pairs]


for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)

    print(f"<I>: {input_sentence}")
    print(f"<O>: {translated}")
    print()


<I>: I haven't paid yet.
<O>: [start] Я ще не заплатив [end]

<I>: I thought Tom would be interested in this.
<O>: [start] Я думав що Том це цікаво [end]

<I>: You must be Tom.
<O>: [start] Ти мабуть Том [end]

<I>: They have gone to Europe.
<O>: [start] Вони пішли в Європі [end]

<I>: He lost his life in an accident.
<O>: [start] Він втратив життя у аварії [end]

