In [9]:
!pip install transformers torch
!pip install sentencepiece
!pip install tensorflow



Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,

In [1]:
import json

# Функция для загрузки данных из файла
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = [json.loads(line) for line in file]
    src_texts = [item['src'] for item in data]
    dst_texts = [item['dst'] for item in data if 'dst' in item]  # Убедимся, что есть перевод
    return src_texts, dst_texts

# Загрузка данных из файлов
train_src, train_dst = load_data('ml_trainings.alien_translation/train')  
val_src, val_dst = load_data('ml_trainings.alien_translation/val')        
test_src, _ = load_data('ml_trainings.alien_translation/test_no_reference')  


In [2]:
import sentencepiece
print("SentencePiece успешно импортирован!")


SentencePiece успешно импортирован!


In [6]:
val_dst[0]

'The hosts regrouped, and Bouchard evened the score again, scoring a goal with a 27-37 man advantage.'

In [20]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Инициализируем токенизаторы для исходных и целевых текстов
src_tokenizer = Tokenizer(num_words=10000, filters='')  # Используем только 10,000 наиболее частых слов
dst_tokenizer = Tokenizer(num_words=10000, filters='')



In [22]:
# Обучаем токенизаторы на наших текстах
src_tokenizer.fit_on_texts(train_src)
dst_tokenizer.fit_on_texts(train_dst)

In [23]:
# Преобразуем текст в последовательности индексов
train_src_seqs = src_tokenizer.texts_to_sequences(train_src)
train_dst_seqs = dst_tokenizer.texts_to_sequences(train_dst)

In [24]:
# Паддинг последовательностей до одинаковой длины
max_src_len = max([len(seq) for seq in train_src_seqs])
max_dst_len = max([len(seq) for seq in train_dst_seqs])

train_src_seqs = pad_sequences(train_src_seqs, maxlen=max_src_len, padding='post')
train_dst_seqs = pad_sequences(train_dst_seqs, maxlen=max_dst_len, padding='post')


In [25]:
# Преобразуем в тензоры
train_src_seqs = tf.convert_to_tensor(train_src_seqs)
train_dst_seqs = tf.convert_to_tensor(train_dst_seqs)

Создание модели

In [38]:
from tensorflow.keras import layers

# Параметры модели
# Уменьшаем размер векторов, чтобы уменьшить ресурсына обучение
embedding_dim = 128  
hidden_units = 256   

#vocab_size_src = len(src_tokenizer.word_index) + 1  # Добавляем 1 для padding
#vocab_size_dst = len(dst_tokenizer.word_index) + 1  # Добавляем 1 для padding
vocab_size = 10000  # Размер словаря

In [39]:
# Энкодер
encoder_inputs = layers.Input(shape=(max_src_len,))
encoder_embedding = layers.Embedding(vocab_size_src, embedding_dim)(encoder_inputs)
encoder_lstm = layers.LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Декодер
decoder_inputs = layers.Input(shape=(max_dst_len,))
decoder_embedding = layers.Embedding(vocab_size_dst, embedding_dim)(decoder_inputs)
decoder_lstm = layers.LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = layers.Dense(vocab_size_dst, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [40]:
import tensorflow as tf

class Seq2SeqModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(Seq2SeqModel, self).__init__()

        # Создаем слои в конструкторе, они не будут создаваться каждый раз в методе call
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    @tf.function
    def call(self, inputs):
        src_seq, dst_seq_input = inputs

        # Процесс кодирования (encoder)
        x = self.embedding(src_seq)

        # Процесс декодирования (decoder)
        x, state_h, state_c = self.lstm(x)

        # Применяем выходной слой
        return self.dense(x)

In [42]:
# Модель Seq2Seq
#model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Создаем модель
model = Seq2SeqModel(vocab_size, embedding_dim, hidden_units)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Обучение модели

In [32]:
# Создадим целевые данные для обучения (сдвиг последовательностей для декодера)
train_dst_seqs_input = train_dst_seqs[:, :-1]
train_dst_seqs_output = train_dst_seqs[:,:-1]

print("Shape of train_src_seqs:", train_src_seqs.shape)
print("Shape of train_dst_seqs_input:", train_dst_seqs_input.shape)
print("Shape of train_dst_seqs_output:", train_dst_seqs_output.shape)


Shape of train_src_seqs: (300000, 30)
Shape of train_dst_seqs_input: (300000, 57)
Shape of train_dst_seqs_output: (300000, 57)


In [34]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Добавим заполнение (padding) в конец последовательностей
train_dst_seqs_input = pad_sequences(train_dst_seqs_input, maxlen=58, padding='post')
train_dst_seqs_output = pad_sequences(train_dst_seqs_output, maxlen=58, padding='post')

print("Shape of train_dst_seqs_input after padding:", train_dst_seqs_input.shape)
print("Shape of train_dst_seqs_output after padding:", train_dst_seqs_output.shape)


Shape of train_dst_seqs_input after padding: (300000, 58)
Shape of train_dst_seqs_output after padding: (300000, 58)


In [35]:
# Обучение модели
model.fit(
    [train_src_seqs, train_dst_seqs_input],
    tf.expand_dims(train_dst_seqs_output, -1),  # Удостоверимся, что выходная форма верна
    batch_size=64,
    epochs=10,
    validation_data=([train_src_seqs, train_dst_seqs_input], tf.expand_dims(train_dst_seqs_output, -1))
)


Epoch 1/10




ValueError: Creating variables on a non-first call to a function decorated with tf.function.