In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random


In [2]:
target = ['верить',	
'говорить',
'держать',	
'думать',
'желать',
'играть',
'искать',
'курить',
'лежать',
'просить',
'сидеть',
'слушать',
'смотреть',
'ставить',
'стоять',
'строить',
'считать',
'терять',
'тратить']

prime = ['поверить',
'поговорить',
'подержать',
'подумать',
'пожелать',
'поиграть',
'поискать',
'покурить',
'полежать',
'попросить',
'посидеть',
'послушать',
'посмотреть',
'поставить',
'постоять',
'построить',
'посчитать',
'потерять',
'потратить']

pairs = list(zip(target, prime))

In [22]:
pairs

[('верить', 'поверить'),
 ('говорить', 'поговорить'),
 ('держать', 'подержать'),
 ('думать', 'подумать'),
 ('желать', 'пожелать'),
 ('играть', 'поиграть'),
 ('искать', 'поискать'),
 ('курить', 'покурить'),
 ('лежать', 'полежать'),
 ('просить', 'попросить'),
 ('сидеть', 'посидеть'),
 ('слушать', 'послушать'),
 ('смотреть', 'посмотреть'),
 ('ставить', 'поставить'),
 ('стоять', 'постоять'),
 ('строить', 'построить'),
 ('считать', 'посчитать'),
 ('терять', 'потерять'),
 ('тратить', 'потратить')]

In [3]:
all_text = [char for src, tgt in pairs for char in src + tgt]
chars = sorted(set(all_text))
char2idx = {c: i+4 for i, c in enumerate(chars)}
char2idx["<PAD>"] = 0
char2idx["<SOS>"] = 1
char2idx["<EOS>"] = 2
char2idx["<UNK>"] = 3
idx2char = {i: c for c, i in char2idx.items()}
vocab_size = len(char2idx)


In [4]:
class VerbPairDataset(Dataset):
    def __init__(self, pairs, char2idx, max_len=20):
        self.pairs = pairs
        self.char2idx = char2idx
        self.max_len = max_len

    def encode(self, word, add_sos=False):
        seq = [char2idx.get(c, char2idx["<UNK>"]) for c in word]
        if add_sos:
            seq = [char2idx["<SOS>"]] + seq
        seq = seq + [char2idx["<EOS>"]]
        seq += [char2idx["<PAD>"]] * (self.max_len - len(seq))
        return seq[:self.max_len]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_encoded = self.encode(src)
        tgt_encoded = self.encode(tgt, add_sos=True)
        return torch.tensor(src_encoded), torch.tensor(tgt_encoded)


In [5]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell


In [6]:
def train_seq2seq(encoder, decoder, dataloader, epochs=20, teacher_forcing_ratio=0.5):
    criterion = nn.CrossEntropyLoss(ignore_index=char2idx["<PAD>"])
    enc_opt = optim.Adam(encoder.parameters(), lr=0.001)
    dec_opt = optim.Adam(decoder.parameters(), lr=0.001)

    for epoch in range(epochs):
        total_loss = 0
        for src, tgt in dataloader:
            enc_opt.zero_grad()
            dec_opt.zero_grad()
            hidden, cell = encoder(src)
            input_token = tgt[:, 0]
            loss = 0
            for t in range(1, tgt.shape[1]):
                output, hidden, cell = decoder(input_token, hidden, cell)
                loss += criterion(output, tgt[:, t])
                teacher_force = random.random() < teacher_forcing_ratio
                top1 = output.argmax(1)
                input_token = tgt[:, t] if teacher_force else top1
            loss.backward()
            enc_opt.step()
            dec_opt.step()
            total_loss += loss.item() / tgt.shape[1]
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")


In [7]:
def predict(encoder, decoder, word, max_len=20):
    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        input_seq = torch.tensor([dataset.encode(word)], dtype=torch.long)
        hidden, cell = encoder(input_seq)
        input_token = torch.tensor([char2idx["<SOS>"]])
        output_seq = []

        for _ in range(max_len):
            output, hidden, cell = decoder(input_token, hidden, cell)
            top1 = output.argmax(1)
            if top1.item() == char2idx["<EOS>"]:
                break
            output_seq.append(idx2char[top1.item()])
            input_token = top1

        return "".join(output_seq)


In [8]:
dataset = VerbPairDataset(pairs, char2idx)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

encoder = Encoder(vocab_size, emb_dim=64, hidden_dim=128)
decoder = Decoder(vocab_size, emb_dim=64, hidden_dim=128)

train_seq2seq(encoder, decoder, dataloader, epochs=30)

# Test



Epoch 1, Loss: nan
Epoch 2, Loss: nan
Epoch 3, Loss: nan
Epoch 4, Loss: nan
Epoch 5, Loss: nan
Epoch 6, Loss: nan
Epoch 7, Loss: nan
Epoch 8, Loss: nan
Epoch 9, Loss: nan
Epoch 10, Loss: nan
Epoch 11, Loss: nan
Epoch 12, Loss: nan
Epoch 13, Loss: nan
Epoch 14, Loss: nan
Epoch 15, Loss: nan
Epoch 16, Loss: nan
Epoch 17, Loss: nan
Epoch 18, Loss: nan
Epoch 19, Loss: nan
Epoch 20, Loss: nan
Epoch 21, Loss: nan
Epoch 22, Loss: nan
Epoch 23, Loss: nan
Epoch 24, Loss: nan
Epoch 25, Loss: nan
Epoch 26, Loss: nan
Epoch 27, Loss: nan
Epoch 28, Loss: nan
Epoch 29, Loss: nan
Epoch 30, Loss: nan


In [9]:
print(predict(encoder, decoder, "думать"))
print(predict(encoder, decoder, "желать"))
print(predict(encoder, decoder, "строить"))
print(predict(encoder, decoder, "говорить"))
print(predict(encoder, decoder, "строить"))
print(predict(encoder, decoder, "считать"))

посерать
посерать
пострать
пострать
пострать
посерать


In [10]:
def exact_match_accuracy(predictions, targets):
    correct = sum([pred == tgt for pred, tgt in zip(predictions, targets)])
    return correct / len(predictions)


In [11]:
def character_accuracy(predictions, targets):
    total_chars = 0
    correct_chars = 0
    for pred, tgt in zip(predictions, targets):
        for pc, tc in zip(pred, tgt):
            total_chars += 1
            if pc == tc:
                correct_chars += 1
    return correct_chars / total_chars if total_chars > 0 else 0


In [12]:
import editdistance

def average_edit_distance(predictions, targets):
    total_distance = 0
    for pred, tgt in zip(predictions, targets):
        total_distance += editdistance.eval(pred, tgt)
    return total_distance / len(predictions)


In [13]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def average_bleu(predictions, targets):
    smoothie = SmoothingFunction().method4
    scores = [
        sentence_bleu([list(tgt)], list(pred), weights=(1.0, 0, 0, 0), smoothing_function=smoothie)
        for pred, tgt in zip(predictions, targets)
    ]
    return sum(scores) / len(scores)


In [14]:
predictions = ["поговорить", "поиграть", "покурить"]
targets = ["поговорить", "поиграть", "полежать"]

print("Exact Match Accuracy:", exact_match_accuracy(predictions, targets))
print("Character Accuracy:", character_accuracy(predictions, targets))
print("Average Edit Distance:", average_edit_distance(predictions, targets))
print("BLEU Score:", average_bleu(predictions, targets))


Exact Match Accuracy: 0.6666666666666666
Character Accuracy: 0.8461538461538461
Average Edit Distance: 1.3333333333333333
BLEU Score: 0.8333333333333334
