In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random

data = pd.read_csv("prefixed_verbs.csv")
df = pd.DataFrame(data)
teacher = df["prefix"]
learner = df["base"]

pairs = list(zip(teacher, learner))

In [39]:


# Character processing remains the same
all_text = [char for src, tgt in pairs for char in src + tgt]
chars = sorted(set(all_text))
char2idx = {c: i+4 for i, c in enumerate(chars)}
char2idx["<PAD>"] = 0
char2idx["<SOS>"] = 1
char2idx["<EOS>"] = 2
char2idx["<UNK>"] = 3
idx2char = {i: c for c, i in char2idx.items()}
vocab_size = len(char2idx)

class VerbPairDataset(Dataset):
    def __init__(self, pairs, char2idx, max_len=20):
        self.pairs = pairs
        self.char2idx = char2idx
        self.max_len = max_len

    def encode(self, word, add_sos=False):
        if not word:  # Handle empty input
            word = "<UNK>"
        seq = [char2idx.get(c, char2idx["<UNK>"]) for c in word]
        if add_sos:
            seq = [char2idx["<SOS>"]] + seq
        seq = seq + [char2idx["<EOS>"]]
        seq += [char2idx["<PAD>"]] * (self.max_len - len(seq))
        return seq[:self.max_len]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_encoded = self.encode(src)
        tgt_encoded = self.encode(tgt, add_sos=True)
        return torch.tensor(src_encoded), torch.tensor(tgt_encoded)
    
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: (1, batch_size, hidden_dim)
        # encoder_outputs: (batch_size, seq_len, hidden_dim)
        hidden = hidden.squeeze(0)  # (batch_size, hidden_dim)
        
        # Repeat hidden for each encoder output
        hidden_expanded = hidden.unsqueeze(1).expand(-1, encoder_outputs.shape[1], -1)  # (batch_size, seq_len, hidden_dim)
        
        # Concatenate and score
        energy = torch.tanh(self.attn(torch.cat((hidden_expanded, encoder_outputs), dim=2)))  # (batch_size, seq_len, hidden_dim)
        attention_scores = self.v(energy).squeeze(2)  # (batch_size, seq_len)
        
        return torch.softmax(attention_scores, dim=1)
        
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.ln = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)

        # Merge bidirectional outputs
        outputs = self.fc(outputs)  # (batch_size, seq_len, hidden_dim)
        outputs = self.ln(outputs)

        # Adjust hidden state (sum bidirectional states)
        hidden = hidden.view(2, -1, hidden.shape[2])  # (2, batch_size, hidden_dim)
        hidden = torch.sum(hidden, dim=0).unsqueeze(0)  # (1, batch_size, hidden_dim)
        hidden = torch.tanh(hidden)

        # Ensure cell state is also 3D
        cell = cell.view(2, -1, cell.shape[2])
        cell = torch.sum(cell, dim=0).unsqueeze(0)

        return outputs, hidden, cell
        
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)
        self.ln = nn.LayerNorm(hidden_dim)

    def forward(self, x, hidden, cell, encoder_outputs):
        x = x.unsqueeze(1)  # (batch_size, 1)
        embedded = self.embedding(x)  # (batch_size, 1, emb_dim)

        # Ensure hidden/cell are 3D: (num_layers, batch_size, hidden_dim)
        if hidden.dim() == 2:
            hidden = hidden.unsqueeze(0)  # (1, batch_size, hidden_dim)
        if cell.dim() == 2:
            cell = cell.unsqueeze(0)

        # Calculate attention
        attn_weights = self.attention(hidden, encoder_outputs)  # (batch_size, seq_len)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # (batch_size, 1, hidden_dim)

        # Combine embedding and context
        lstm_input = torch.cat((embedded, context), dim=2)  # (batch_size, 1, emb_dim + hidden_dim)

        # LSTM step
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        output = self.ln(output)

        # Final prediction
        output = torch.cat((output.squeeze(1), context.squeeze(1)), dim=1)
        prediction = self.fc(output)

        return prediction, hidden, cell, attn_weights
def train_seq2seq(encoder, decoder, dataloader, epochs=20):
    criterion = nn.CrossEntropyLoss(ignore_index=char2idx["<PAD>"])
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.0001)
    
    for epoch in range(epochs):
        total_loss = 0
        teacher_forcing_ratio = 0.8 * (0.9 ** epoch)
        
        for src, tgt in dataloader:
            optimizer.zero_grad()
            
            if src.nelement() == 0 or tgt.nelement() == 0:
                continue
                
            # Encoder forward (returns all outputs)
            encoder_outputs, hidden, cell = encoder(src)
            input_token = tgt[:, 0]
            loss = 0
            
            for t in range(1, tgt.shape[1]):
                # Decoder with attention
                output, hidden, cell, _ = decoder(
                    input_token, hidden, cell, encoder_outputs
                )
                step_loss = criterion(output, tgt[:, t])
                
                if torch.isnan(step_loss).any():
                    continue
                    
                loss += step_loss
                teacher_force = random.random() < teacher_forcing_ratio
                input_token = tgt[:, t] if teacher_force else output.argmax(1)
            
            if torch.isnan(loss).any() or loss == 0:
                continue
                
            loss.backward()
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1.0)
            optimizer.step()
            
            total_loss += loss.item() / tgt.shape[1]
        
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")


In [40]:
# Initialize with larger batch size
dataset = VerbPairDataset(pairs, char2idx)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)  # Increased from 2 to 16

encoder = Encoder(vocab_size, emb_dim=64, hidden_dim=128)
decoder = Decoder(vocab_size, emb_dim=64, hidden_dim=128)

train_seq2seq(encoder, decoder, dataloader, epochs=100)



Epoch 1, Loss: 1.5313
Epoch 2, Loss: 1.1657
Epoch 3, Loss: 1.0374
Epoch 4, Loss: 0.9037
Epoch 5, Loss: 0.8455
Epoch 6, Loss: 0.8113
Epoch 7, Loss: 0.8047
Epoch 8, Loss: 0.7560
Epoch 9, Loss: 0.7633
Epoch 10, Loss: 0.7479
Epoch 11, Loss: 0.6790
Epoch 12, Loss: 0.6821
Epoch 13, Loss: 0.6882
Epoch 14, Loss: 0.6089
Epoch 15, Loss: 0.5771
Epoch 16, Loss: 0.5588
Epoch 17, Loss: 0.5002
Epoch 18, Loss: 0.4657
Epoch 19, Loss: 0.4298
Epoch 20, Loss: 0.3815
Epoch 21, Loss: 0.3289
Epoch 22, Loss: 0.2624
Epoch 23, Loss: 0.2328
Epoch 24, Loss: 0.1974
Epoch 25, Loss: 0.1491
Epoch 26, Loss: 0.1334
Epoch 27, Loss: 0.1046
Epoch 28, Loss: 0.0916
Epoch 29, Loss: 0.0770
Epoch 30, Loss: 0.0593
Epoch 31, Loss: 0.0526
Epoch 32, Loss: 0.0499
Epoch 33, Loss: 0.0403
Epoch 34, Loss: 0.0234
Epoch 35, Loss: 0.0290
Epoch 36, Loss: 0.0150
Epoch 37, Loss: 0.0120
Epoch 38, Loss: 0.0101
Epoch 39, Loss: 0.0076
Epoch 40, Loss: 0.0069
Epoch 41, Loss: 0.0058
Epoch 42, Loss: 0.0048
Epoch 43, Loss: 0.0044
Epoch 44, Loss: 0.00

In [46]:
def predict(encoder, decoder, word, char2idx, idx2char, max_len=20, device='cpu'):
    encoder.eval()
    decoder.eval()

    # Prepare input tensor
    seq = [char2idx.get(c, char2idx["<UNK>"]) for c in word] + [char2idx["<EOS>"]]
    seq += [char2idx["<PAD>"]] * (max_len - len(seq))
    src_tensor = torch.tensor([seq[:max_len]], dtype=torch.long).to(device)

    # Encoder forward
    with torch.no_grad():
        encoder_outputs, hidden, cell = encoder(src_tensor)

    # Start decoding
    input_token = torch.tensor([char2idx["<SOS>"]], dtype=torch.long).to(device)
    decoded_indices = []

    for _ in range(max_len):
        with torch.no_grad():
            output, hidden, cell, attn_weights = decoder(
                input_token, hidden, cell, encoder_outputs
            )
        prediction = output.argmax(1).item()

        if prediction == char2idx["<EOS>"]:
            break

        decoded_indices.append(prediction)
        input_token = torch.tensor([prediction], dtype=torch.long).to(device)

    return ''.join(idx2char[idx] for idx in decoded_indices)


In [48]:
df["base_pred"] = df["prefix"].apply(lambda word: predict(encoder, decoder, word, char2idx, idx2char, max_len=20, device="cpu"))



In [49]:
df["is_correct"] = df["base_pred"] == df["base"]
accuracy = df["is_correct"].mean()
print(f"Exact match accuracy: {accuracy:.2f}")


Exact match accuracy: 1.00


In [50]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import editdistance


In [51]:
smoothie = SmoothingFunction().method4

def compute_metrics(row):
    pred = row["base_pred"]
    truth = row["base"]

    # Exact match
    is_exact = pred == truth

    # Edit distance
    edit_dist = editdistance.eval(pred, truth)

    # Character accuracy
    max_len = max(len(pred), len(truth))
    char_acc = 1 - (edit_dist / max_len) if max_len > 0 else 1.0

    # BLEU score (character-level)
    bleu = sentence_bleu(
        [list(truth)], list(pred), weights=(1, 0, 0, 0), smoothing_function=smoothie
    )

    return pd.Series([is_exact, edit_dist, char_acc, bleu],
                     index=["is_exact", "edit_distance", "char_accuracy", "bleu"])


In [52]:
df[["is_exact", "edit_distance", "char_accuracy", "bleu"]] = df.apply(compute_metrics, axis=1)


In [53]:


print("Exact Match Accuracy:", df["is_exact"].mean())
print("Average Edit Distance:", df["edit_distance"].mean())
print("Average Character Accuracy:", df["char_accuracy"].mean())
print("Average BLEU Score:", df["bleu"].mean())


Exact Match Accuracy: 1.0
Average Edit Distance: 0.0
Average Character Accuracy: 1.0
Average BLEU Score: 1.0


In [10]:
# Save model and optimizer states
torch.save({
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
}, 'l2_aspect_teacher_learner_seq2seq_model.pth')

In [54]:
import pandas as pd

# Show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# To avoid cutting off wide content
pd.set_option('display.max_colwidth', None)


df

Unnamed: 0,prefix,base,base_pred,is_correct,is_exact,edit_distance,char_accuracy,bleu
0,внести,нести,нести,True,True,0,1.0,1.0
1,вынести,нести,нести,True,True,0,1.0,1.0
2,занести,нести,нести,True,True,0,1.0,1.0
3,нанести,нести,нести,True,True,0,1.0,1.0
4,отнести,нести,нести,True,True,0,1.0,1.0
5,поднести,нести,нести,True,True,0,1.0,1.0
6,принести,нести,нести,True,True,0,1.0,1.0
7,принести,нести,нести,True,True,0,1.0,1.0
8,перенести,нести,нести,True,True,0,1.0,1.0
9,разнести,нести,нести,True,True,0,1.0,1.0
