In [36]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import random

In [112]:
# Hyperparameters

# --- data preparation ---
max_length = 10  # reduced; maximum number of tokens kept per sentence
min_freq = 1  # lowered; minimum token count required to enter a vocabulary
test_size = 0.2
random_seed = 42

# --- model architecture ---
embedding_size = 128
hidden_size = 350
num_layers = 1
dropout = 0.2

# --- optimisation ---
batch_size = 8  # reduced
learning_rate = 1e-3
num_epochs = 50
clip = 1  # max gradient norm passed to clip_grad_norm_


In [113]:
# 2. Data (examples)
# Each entry is an (English, Russian) pair; keeping the two sides together
# guarantees the parallel lists below stay aligned.
_sentence_pairs = [
    ("Hello world", "Привет мир"),
    ("How are you?", "Как дела?"),
    ("What is your name?", "Как тебя зовут?"),
    ("I am learning machine translation", "Я изучаю машинный перевод"),
    ("This is a simple example", "Это простой пример"),
    ("The quick brown fox jumps over the lazy dog",
     "Быстрая коричневая лиса перепрыгивает через ленивую собаку"),
    ("Coding is fun", "Программировать это весело"),
    ("Machine learning is interesting", "Машинное обучение это интересно"),
    ("Python is a powerful language", "Python это мощный язык"),
    ("I love data science", "Я люблю науку о данных"),
    ("What can you tell us about yourself?", "Что ты можешь о себе сообщить?"),
    ("I can program in many languages", "Я умею программировать на многих языках"),
    ("Artificial intelligence helps me", "Мне помогает искусственный интеллект"),
]

english_sentences = [english for english, _ in _sentence_pairs]

russian_sentences = [russian for _, russian in _sentence_pairs]

In [138]:
# 3. Vocabulary construction
def create_vocabularies(english_texts, russian_texts, min_freq=1):
    """Build token-to-index vocabularies for the English and Russian corpora.

    Both vocabularies start with the same four special tokens:
    '<pad>' (0, padding), '<sos>' (1, start of sequence),
    '<eos>' (2, end of sequence) and '<unk>' (3, unknown word).
    Corpus tokens follow, numbered in order of first appearance.

    Args:
        english_texts: iterable of English sentences (strings).
        russian_texts: iterable of Russian sentences (strings).
        min_freq: minimum number of occurrences a token needs in its corpus
            to be included in the vocabulary (default 1).

    Returns:
        Tuple ``(english_vocab, russian_vocab)`` of dicts mapping token -> index.
    """
    reserved = ('<pad>', '<sos>', '<eos>', '<unk>')

    def build(texts):
        # Count every token produced by the tokenizer across the corpus.
        frequencies = {}
        for sentence in texts:
            for word in nltk.word_tokenize(sentence):
                if word in frequencies:
                    frequencies[word] += 1
                else:
                    frequencies[word] = 1

        vocab = {token: position for position, token in enumerate(reserved)}
        # Drop rare words, then number the survivors right after the reserved tokens.
        frequent_words = [word for word, seen in frequencies.items() if seen >= min_freq]
        for position, word in enumerate(frequent_words, start=len(vocab)):
            vocab[word] = position
        return vocab

    return build(english_texts), build(russian_texts)

# Build both vocabularies from the complete sentence lists.
english_vocab, russian_vocab = create_vocabularies(
    english_texts=english_sentences,
    russian_texts=russian_sentences,
    min_freq=min_freq,
)
# Remember the Russian vocabulary size (number of entries, special tokens included).
russian_vocab_size = len(russian_vocab)


In [139]:
# 4. Dataset
class TranslationDataset(Dataset):
    """Parallel-sentence dataset yielding fixed-length index tensors.

    Every item is a pair ``(english_indices, russian_indices)``; each tensor has
    ``max_length + 2`` entries: '<sos>', up to ``max_length`` tokens, '<eos>',
    then '<pad>' up to the fixed length.
    """

    def __init__(self, english_texts, russian_texts, english_vocab, russian_vocab, max_length=10):
        self.english_texts, self.russian_texts = english_texts, russian_texts
        self.english_vocab, self.russian_vocab = english_vocab, russian_vocab
        self.max_length = max_length

    def __len__(self):
        # The English side defines the number of samples.
        return len(self.english_texts)

    def _encode(self, text, vocab):
        """Tokenize, frame with <sos>/<eos>, map to indices and pad one sentence."""
        # Tokenize with nltk and cut to the configured maximum length.
        framed = ['<sos>', *nltk.word_tokenize(text)[:self.max_length], '<eos>']
        # Tokens missing from the vocabulary fall back to <unk>.
        ids = [vocab.get(token, vocab['<unk>']) for token in framed]
        # Right-pad so every sample has exactly max_length + 2 positions.
        missing = self.max_length + 2 - len(ids)
        ids.extend([vocab['<pad>']] * missing)
        return torch.tensor(ids)

    def __getitem__(self, idx):
        source = self._encode(self.english_texts[idx], self.english_vocab)
        target = self._encode(self.russian_texts[idx], self.russian_vocab)
        return source, target

# Split the sentence pairs into training and test subsets
_split_parts = train_test_split(
    english_sentences,
    russian_sentences,
    test_size=test_size,
    random_state=random_seed,
)
english_train, english_test, russian_train, russian_test = _split_parts

# Both datasets share the same vocabularies and maximum sentence length.
train_dataset = TranslationDataset(
    english_texts=english_train,
    russian_texts=russian_train,
    english_vocab=english_vocab,
    russian_vocab=russian_vocab,
    max_length=max_length,
)
test_dataset = TranslationDataset(
    english_texts=english_test,
    russian_texts=russian_test,
    english_vocab=english_vocab,
    russian_vocab=russian_vocab,
    max_length=max_length,
)


In [140]:
# DataLoader
def collate_fn(batch):
    """Stack a list of (english, russian) tensor pairs into two batch tensors.

    Args:
        batch: sequence of ``(en_sample, ru_sample)`` tensor pairs, all samples
            on each side having the same shape.

    Returns:
        Tuple ``(english_batch, russian_batch)`` with a new leading batch dimension.
    """
    english_samples = [en_sample for en_sample, _ in batch]
    russian_samples = [ru_sample for _, ru_sample in batch]
    return torch.stack(english_samples), torch.stack(russian_samples)

# Training batches are reshuffled on every pass; test batches keep dataset order.
train_iterator = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
test_iterator = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [141]:
# Encoder/Decoder
class Encoder(nn.Module):
    """Embedding + LSTM encoder for the source sentence.

    Args:
        input_size: number of entries in the source vocabulary.
        embedding_size: dimensionality of the token embeddings.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        # nn.LSTM applies its dropout only between stacked layers; with a single
        # layer it has no effect and PyTorch emits a UserWarning, so pass 0 then.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=lstm_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Encode a batch of token-index sequences.

        Args:
            x: integer tensor of token indices, batch dimension first
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tuple ``(outputs, hidden, cell)`` as produced by the LSTM:
            per-step outputs plus the final hidden and cell states.
        """
        embedded = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that predicts the next target token.

    Args:
        output_size: number of entries in the target vocabulary (also the size
            of the prediction layer's output).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embedding_size)
        # nn.LSTM applies its dropout only between stacked layers; with a single
        # layer it has no effect and PyTorch emits a UserWarning, so pass 0 then.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=lstm_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: 1-D integer tensor with one input token index per batch element.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` holds one
            row of vocabulary scores per batch element and ``hidden``/``cell``
            are the updated LSTM states.
        """
        # Insert a length-1 sequence dimension so the batch-first LSTM sees one step.
        x = x.unsqueeze(1)
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Drop the sequence dimension again before projecting to vocabulary scores.
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Produce vocabulary scores for every target position.

        Args:
            source: batch-first tensor of source token indices.
            target: batch-first tensor of target token indices; column 0 is
                used as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape ``(batch, target_len, vocab)``; position 0 along the
            time axis is never written and stays zero.
        """
        n_samples, n_steps = source.shape[0], target.shape[1]
        # The decoder's projection layer defines the output vocabulary size.
        vocab_size = self.decoder.fc.out_features
        scores = torch.zeros(n_samples, n_steps, vocab_size).to(self.device)

        # Only the final encoder states are used to initialise the decoder.
        _, state_h, state_c = self.encoder(source)

        decoder_input = target[:, 0].to(self.device)
        for step in range(1, n_steps):
            step_scores, state_h, state_c = self.decoder(decoder_input, state_h, state_c)
            scores[:, step] = step_scores

            # One coin flip per step decides for the whole batch.
            if random.random() < teacher_forcing_ratio:
                next_input = target[:, step]
            else:
                next_input = step_scores.argmax(1)
            decoder_input = next_input.to(self.device)

        return scores

In [142]:
# Training
def train(model, iterator, optimizer, criterion, clip, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module called as ``model(source, target)`` returning scores of
            shape ``(batch, seq_len, vocab)``.
        iterator: sized iterable of ``(source, target)`` tensor batches.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss taking ``(flat_scores, flat_targets)``.
        clip: maximum gradient norm for clipping.
        device: device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for source, target in iterator:
        source = source.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        scores = model(source, target)

        # Skip position 0 (the <sos> slot) and flatten batch and time together.
        _, _, vocab_size = scores.shape
        flat_scores = scores[:, 1:].reshape(-1, vocab_size)
        flat_target = target[:, 1:].reshape(-1)

        batch_loss = criterion(flat_scores, flat_target)
        batch_loss.backward()

        # Limit the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [143]:
# Translation and evaluation
def translate_sentence(model, sentence, english_vocab, russian_vocab, device, max_length=10):
    """Greedily translate one English sentence into a Russian string.

    The sentence is tokenized, truncated to ``max_length`` tokens, framed with
    '<sos>'/'<eos>' and padded exactly like the training samples. Decoding
    starts from '<sos>' and stops at the first '<eos>' or after ``max_length``
    steps.

    Args:
        model: object exposing ``eval()``, ``encoder(source)`` and
            ``decoder(x, hidden, cell)``; it is switched to eval mode.
        sentence: English source sentence.
        english_vocab: source token -> index mapping with the special tokens.
        russian_vocab: target token -> index mapping with the special tokens.
        device: device on which tensors are created.
        max_length: maximum number of source tokens and of decoding steps.

    Returns:
        The predicted words joined by single spaces, in the order they were
        generated (repeated words are kept). '<sos>', '<eos>' and '<pad>' are
        never part of the result.
    """
    model.eval()
    tokens = ['<sos>'] + nltk.word_tokenize(sentence)[:max_length] + ['<eos>']
    indices = [english_vocab.get(token, english_vocab['<unk>']) for token in tokens]
    indices += [english_vocab['<pad>']] * (max_length + 2 - len(indices))

    source = torch.tensor(indices).unsqueeze(0).to(device)

    sos_index = russian_vocab['<sos>']
    eos_index = russian_vocab['<eos>']
    predicted_indices = []

    with torch.no_grad():
        _, hidden, cell = model.encoder(source)
        x = torch.tensor([sos_index]).to(device)

        for _ in range(max_length):
            output, hidden, cell = model.decoder(x, hidden, cell)
            predicted_token = output.argmax(1).item()

            if predicted_token == eos_index:
                break
            predicted_indices.append(predicted_token)
            x = torch.tensor([predicted_token]).to(device)

    # Decode through an inverse vocabulary so generation order and repeated
    # words are preserved (scanning the vocabulary itself would return the
    # words in vocabulary order and collapse duplicates).
    index_to_word = {index: word for word, index in russian_vocab.items()}
    non_words = {sos_index, eos_index, russian_vocab['<pad>']}
    predicted_words = [
        index_to_word.get(index, '<unk>')
        for index in predicted_indices
        if index not in non_words
    ]

    return ' '.join(predicted_words)

def calculate_bleu(model, data, english_vocab, russian_vocab, device, max_length=10):
    """Average smoothed sentence-level BLEU of the model over a dataset.

    Args:
        model: translation model forwarded to ``translate_sentence``.
        data: object exposing parallel ``english_texts`` and ``russian_texts``.
        english_vocab: source token -> index mapping.
        russian_vocab: target token -> index mapping.
        device: device forwarded to ``translate_sentence``.
        max_length: length limit forwarded to ``translate_sentence``.

    Returns:
        Mean of the per-sentence BLEU scores, or ``0.0`` when ``data`` contains
        no sentence pairs (instead of failing with a division by zero).
    """
    sentence_pairs = list(zip(data.english_texts, data.russian_texts))
    if not sentence_pairs:
        # Nothing to evaluate: avoid dividing by zero below.
        return 0.0

    smoothing = SmoothingFunction().method1
    bleu_scores = []
    for english, russian in sentence_pairs:
        candidate = translate_sentence(model, english, english_vocab, russian_vocab, device, max_length)
        reference = nltk.word_tokenize(russian)
        candidate_list = candidate.split()

        try:
            bleu_score = sentence_bleu([reference], candidate_list, smoothing_function=smoothing)
        except ZeroDivisionError:
            # Score a sentence as 0 if BLEU itself hits a division by zero.
            bleu_score = 0
        bleu_scores.append(bleu_score)

    return sum(bleu_scores) / len(bleu_scores)

In [145]:
# 9. Main
if __name__ == '__main__':
    # Seed Python's and PyTorch's RNGs so weight initialisation, batch shuffling
    # and the teacher-forcing coin flips are repeatable between runs.
    random.seed(random_seed)
    torch.manual_seed(random_seed)

    # Model definition
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    encoder = Encoder(len(english_vocab), embedding_size, hidden_size, num_layers, dropout).to(device)
    # The decoder's output layer must cover the whole Russian vocabulary.
    decoder = Decoder(russian_vocab_size, embedding_size, hidden_size, num_layers, dropout).to(device)
    model = Seq2Seq(encoder, decoder, device).to(device)

    # Optimizer and loss function; padded target positions do not contribute to the loss.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=russian_vocab['<pad>'])

    # Model training
    for epoch in range(num_epochs):
        train_loss = train(model, train_iterator, optimizer, criterion, clip, device)
        print(f'Эпоха: {epoch + 1:02}, Потери: {train_loss:.4f}')

    # BLEU evaluation on the held-out sentences
    test_bleu = calculate_bleu(model, test_dataset, english_vocab, russian_vocab, device, max_length)
    print(f'BLEU score на тестовом наборе: {test_bleu:.4f}')

    # Translation example
    example_sentence = "How are you?"
    translated_sentence = translate_sentence(model, example_sentence, english_vocab, russian_vocab, device, max_length)
    print(f"English: {example_sentence}")
    print(f"Russian: {translated_sentence}")


Эпоха: 01, Потери: 3.9469
Эпоха: 02, Потери: 3.8347
Эпоха: 03, Потери: 3.6404
Эпоха: 04, Потери: 3.6049
Эпоха: 05, Потери: 3.1442
Эпоха: 06, Потери: 3.0035
Эпоха: 07, Потери: 2.8918
Эпоха: 08, Потери: 2.9180
Эпоха: 09, Потери: 2.7944
Эпоха: 10, Потери: 2.4644
Эпоха: 11, Потери: 2.2333
Эпоха: 12, Потери: 2.1550
Эпоха: 13, Потери: 2.1841
Эпоха: 14, Потери: 2.1250
Эпоха: 15, Потери: 1.8878
Эпоха: 16, Потери: 1.6246
Эпоха: 17, Потери: 1.8827
Эпоха: 18, Потери: 1.6712
Эпоха: 19, Потери: 1.4884
Эпоха: 20, Потери: 1.4869
Эпоха: 21, Потери: 1.0557
Эпоха: 22, Потери: 1.1349
Эпоха: 23, Потери: 1.0094
Эпоха: 24, Потери: 1.7777
Эпоха: 25, Потери: 0.9319
Эпоха: 26, Потери: 0.8641
Эпоха: 27, Потери: 0.6909
Эпоха: 28, Потери: 0.7835
Эпоха: 29, Потери: 0.5934
Эпоха: 30, Потери: 0.9792
Эпоха: 31, Потери: 0.4916
Эпоха: 32, Потери: 0.3951
Эпоха: 33, Потери: 0.5188
Эпоха: 34, Потери: 0.4341
Эпоха: 35, Потери: 0.3372
Эпоха: 36, Потери: 0.5186
Эпоха: 37, Потери: 0.6933
Эпоха: 38, Потери: 0.5262
Эпоха: 39, П