In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd


In [3]:
data = pd.read_csv("teacher_learner.csv")
df = pd.DataFrame(data)

In [4]:
teacher = df["teacher"]
learner = df["learner"]

pairs = list(zip(teacher, learner))

In [5]:
pairs

[('помочь', 'помогать'),
 ('злоупотреблять', 'злоупотребить'),
 ('пойти', 'идти'),
 ('проконтролировать', 'контролировать'),
 ('покупать', 'купить'),
 ('начинать', 'начать'),
 ('передавать', 'передать'),
 ('возвращаться', 'вернуться'),
 ('приходить', 'прийти'),
 ('выражать', 'выразить'),
 ('разрушать', 'разрушить'),
 ('уделять', 'удалить'),
 ('достигнуть', 'достигать'),
 ('заменить', 'заменять'),
 ('заметить', 'замечать'),
 ('пообщаться', 'общаться'),
 ('оживить', 'оживлять'),
 ('помочь', 'помогать'),
 ('помочь', 'помогать'),
 ('помочь', 'помогать'),
 ('поступить', 'поступать'),
 ('представить', 'представлять'),
 ('принимать', 'принять'),
 ('проверить', 'проверять'),
 ('разуверить', 'разуверять'),
 ('сократить', 'сокращать'),
 ('улучшить', 'улучшать'),
 ('уступить', 'уступать'),
 ('увидеть', 'видеть'),
 ('увидеть', 'видеть'),
 ('увидеть', 'видеть'),
 ('увидеть', 'видеть'),
 ('поговорить', 'говорить'),
 ('приготовить', 'готовить'),
 ('сделать', 'делать'),
 ('сделать', 'делать'),
 ('сдел

In [6]:
all_text = [char for src, tgt in pairs for char in src + tgt]
chars = sorted(set(all_text))
char2idx = {c: i+4 for i, c in enumerate(chars)}
char2idx["<PAD>"] = 0
char2idx["<SOS>"] = 1
char2idx["<EOS>"] = 2
char2idx["<UNK>"] = 3
idx2char = {i: c for c, i in char2idx.items()}
vocab_size = len(char2idx)


In [7]:
class VerbPairDataset(Dataset):
    def __init__(self, pairs, char2idx, max_len=20):
        self.pairs = pairs
        self.char2idx = char2idx
        self.max_len = max_len

    def encode(self, word, add_sos=False):
        seq = [char2idx.get(c, char2idx["<UNK>"]) for c in word]
        if add_sos:
            seq = [char2idx["<SOS>"]] + seq
        seq = seq + [char2idx["<EOS>"]]
        seq += [char2idx["<PAD>"]] * (self.max_len - len(seq))
        return seq[:self.max_len]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_encoded = self.encode(src)
        tgt_encoded = self.encode(tgt, add_sos=True)
        return torch.tensor(src_encoded), torch.tensor(tgt_encoded)


In [8]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell


In [9]:
def train_seq2seq(encoder, decoder, dataloader, epochs=20, teacher_forcing_ratio=0.5):
    criterion = nn.CrossEntropyLoss(ignore_index=char2idx["<PAD>"])
    enc_opt = optim.Adam(encoder.parameters(), lr=0.001)
    dec_opt = optim.Adam(decoder.parameters(), lr=0.001)

    for epoch in range(epochs):
        total_loss = 0
        for src, tgt in dataloader:
            enc_opt.zero_grad()
            dec_opt.zero_grad()
            hidden, cell = encoder(src)
            input_token = tgt[:, 0]
            loss = 0
            for t in range(1, tgt.shape[1]):
                output, hidden, cell = decoder(input_token, hidden, cell)
                loss += criterion(output, tgt[:, t])
                teacher_force = random.random() < teacher_forcing_ratio
                top1 = output.argmax(1)
                input_token = tgt[:, t] if teacher_force else top1
            loss.backward()
            enc_opt.step()
            dec_opt.step()
            total_loss += loss.item() / tgt.shape[1]
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")


In [10]:
def predict(encoder, decoder, word, max_len=20):
    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        input_seq = torch.tensor([dataset.encode(word)], dtype=torch.long)
        hidden, cell = encoder(input_seq)
        input_token = torch.tensor([char2idx["<SOS>"]])
        output_seq = []

        for _ in range(max_len):
            output, hidden, cell = decoder(input_token, hidden, cell)
            top1 = output.argmax(1)
            if top1.item() == char2idx["<EOS>"]:
                break
            output_seq.append(idx2char[top1.item()])
            input_token = top1

        return "".join(output_seq)


In [11]:
dataset = VerbPairDataset(pairs, char2idx)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

encoder = Encoder(vocab_size, emb_dim=64, hidden_dim=128)
decoder = Decoder(vocab_size, emb_dim=64, hidden_dim=128)

train_seq2seq(encoder, decoder, dataloader, epochs=30)

# Test



Epoch 1, Loss: nan
Epoch 2, Loss: nan
Epoch 3, Loss: nan
Epoch 4, Loss: nan
Epoch 5, Loss: nan
Epoch 6, Loss: nan
Epoch 7, Loss: nan
Epoch 8, Loss: nan
Epoch 9, Loss: nan
Epoch 10, Loss: nan
Epoch 11, Loss: nan
Epoch 12, Loss: nan
Epoch 13, Loss: nan
Epoch 14, Loss: nan
Epoch 15, Loss: nan
Epoch 16, Loss: nan
Epoch 17, Loss: nan
Epoch 18, Loss: nan
Epoch 19, Loss: nan
Epoch 20, Loss: nan
Epoch 21, Loss: nan
Epoch 22, Loss: nan
Epoch 23, Loss: nan
Epoch 24, Loss: nan
Epoch 25, Loss: nan
Epoch 26, Loss: nan
Epoch 27, Loss: nan
Epoch 28, Loss: nan
Epoch 29, Loss: nan
Epoch 30, Loss: nan


In [12]:
df["learner_pred"] = df["teacher"].apply(lambda word: predict(encoder, decoder, word))


In [13]:
df["is_correct"] = df["learner_pred"] == df["learner"]
accuracy = df["is_correct"].mean()
print(f"Exact match accuracy: {accuracy:.2f}")


Exact match accuracy: 0.87


In [14]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import editdistance


In [15]:
smoothie = SmoothingFunction().method4

def compute_metrics(row):
    pred = row["learner_pred"]
    truth = row["learner"]

    # Exact match
    is_exact = pred == truth

    # Edit distance
    edit_dist = editdistance.eval(pred, truth)

    # Character accuracy
    max_len = max(len(pred), len(truth))
    char_acc = 1 - (edit_dist / max_len) if max_len > 0 else 1.0

    # BLEU score (character-level)
    bleu = sentence_bleu(
        [list(truth)], list(pred), weights=(1, 0, 0, 0), smoothing_function=smoothie
    )

    return pd.Series([is_exact, edit_dist, char_acc, bleu],
                     index=["is_exact", "edit_distance", "char_accuracy", "bleu"])


In [16]:
df[["is_exact", "edit_distance", "char_accuracy", "bleu"]] = df.apply(compute_metrics, axis=1)


In [17]:
print("Exact Match Accuracy:", df["is_exact"].mean())
print("Average Edit Distance:", df["edit_distance"].mean())
print("Average Character Accuracy:", df["char_accuracy"].mean())
print("Average BLEU Score:", df["bleu"].mean())


Exact Match Accuracy: 0.8653061224489796
Average Edit Distance: 0.2873469387755102
Average Character Accuracy: 0.9667192950662339
Average BLEU Score: 0.9701638379886588


In [18]:
import pandas as pd

# Show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# To avoid cutting off wide content
pd.set_option('display.max_colwidth', None)


df

Unnamed: 0,teacher,learner,learner_pred,is_correct,is_exact,edit_distance,char_accuracy,bleu
0,помочь,помогать,помогать,True,True,0,1.0,1.0
1,злоупотреблять,злоупотребить,злоупотребить,True,True,0,1.0,1.0
2,пойти,идти,идти,True,True,0,1.0,1.0
3,проконтролировать,контролировать,контролировать,True,True,0,1.0,1.0
4,покупать,купить,купить,True,True,0,1.0,1.0
5,начинать,начать,начать,True,True,0,1.0,1.0
6,передавать,передать,передать,True,True,0,1.0,1.0
7,возвращаться,вернуться,вернуться,True,True,0,1.0,1.0
8,приходить,прийти,прийти,True,True,0,1.0,1.0
9,выражать,выразить,выразить,True,True,0,1.0,1.0
