In [None]:
import random, pandas as pd, numpy as np, torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader

# --- Runtime setup ---------------------------------------------------------
# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG the notebook touches (Python, NumPy, PyTorch).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Hyperparameters -------------------------------------------------------
BATCH_SIZE = 64     # samples per mini-batch
EMB_SIZE = 256      # character embedding width
HIDDEN_SIZE = 512   # GRU hidden width per encoder direction
NUM_LAYERS = 4      # stacked GRU layers in encoder and decoder
DROPOUT = 0.4       # inter-layer GRU dropout
NUM_EPOCHS = 50     # passes over the training split
LR = 1e-3           # Adam learning rate
TF_RATIO = 0.4      # per-step probability of teacher forcing


# Load the competition data; both files are expected to have a `data` column,
# and the training file additionally a `label` column (see their use below).
train_df = pd.read_csv(filepath_or_buffer="/kaggle/input/meznarikk/train.csv")
test_df = pd.read_csv(filepath_or_buffer="/kaggle/input/meznarikk/test.csv")

def build_vocab(strings, extra):
    """Build a character-level vocabulary.

    Args:
        strings: iterable of ``str`` whose characters make up the vocabulary.
        extra: list of special tokens placed before the characters, so they
            receive the lowest indices.

    Returns:
        ``(stoi, itos)`` where ``itos`` is ``extra`` followed by the distinct
        characters in sorted order, and ``stoi`` maps each entry to its index.
    """
    unique_chars = set("".join(strings))
    itos = extra + sorted(unique_chars)
    stoi = dict(zip(itos, range(len(itos))))
    return stoi, itos

# Input vocabulary: characters from both the train and test inputs, so every
# test character has an index; <unk> remains as a fallback.
inp_stoi, inp_itos = build_vocab(
    train_df.data.tolist() + test_df.data.tolist(),
    ["<pad>", "<unk>"],
)
# Target vocabulary: characters from the training labels only.
tgt_stoi, tgt_itos = build_vocab(
    train_df.label.tolist(),
    ["<pad>", "<sos>", "<eos>"],
)

# Integer ids of the special tokens, looked up once for reuse below.
PAD_INP = inp_stoi["<pad>"]
UNK = inp_stoi["<unk>"]
PAD_TGT = tgt_stoi["<pad>"]
SOS = tgt_stoi["<sos>"]
EOS = tgt_stoi["<eos>"]

class DateDataset(Dataset):
    """Character-indexed view over a dataframe with a ``data`` column.

    With ``train=True`` the frame must also have a ``label`` column and items
    are ``(source_ids, target_ids)`` pairs; with ``train=False`` items are the
    source ids alone.
    """

    def __init__(self, df, train=True):
        self.x = df.data.values
        # Only touch `label` when it is needed: unlabeled frames lack it.
        if train:
            self.y = df.label.values
        else:
            self.y = None

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        # Characters missing from the input vocabulary map to <unk>.
        src_ids = [inp_stoi.get(ch, UNK) for ch in self.x[i]]
        src = torch.tensor(src_ids, dtype=torch.long)
        if self.y is None:
            return src
        # Targets are wrapped as <sos> ... <eos>.
        tgt_ids = [SOS, *(tgt_stoi[ch] for ch in self.y[i]), EOS]
        return src, torch.tensor(tgt_ids, dtype=torch.long)

def collate(batch):
    """Pad a list of dataset items into batch-first tensors.

    Args:
        batch: list of items, either ``(source, target)`` tuples or bare
            source tensors.

    Returns:
        ``(padded_sources, padded_targets)`` for tuple items, otherwise the
        padded sources only. Sources are padded with ``PAD_INP`` and targets
        with ``PAD_TGT``.
    """
    pad = nn.utils.rnn.pad_sequence
    if not isinstance(batch[0], tuple):
        # Unlabeled batch: items are the source tensors themselves.
        return pad(batch, batch_first=True, padding_value=PAD_INP)
    sources, targets = zip(*batch)
    padded_sources = pad(sources, batch_first=True, padding_value=PAD_INP)
    padded_targets = pad(targets, batch_first=True, padding_value=PAD_TGT)
    return padded_sources, padded_targets


# Hold out 1% of the labeled data for validation.
tr_df, vl_df = train_test_split(train_df, test_size=0.01, random_state=SEED)

# Settings shared by all three loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate)
train_loader = DataLoader(DateDataset(tr_df), shuffle=True, **_loader_kwargs)
val_loader = DataLoader(DateDataset(vl_df), shuffle=False, **_loader_kwargs)
test_loader = DataLoader(DateDataset(test_df, train=False), shuffle=False, **_loader_kwargs)


class Attention(nn.Module):
    """Parameter-free dot-product attention over encoder states."""

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """Attend over ``encoder_outputs`` using ``decoder_hidden`` as query.

        Args:
            decoder_hidden: ``(batch, dim)`` query vectors.
            encoder_outputs: ``(batch, src_len, dim)`` keys/values.
            mask: optional ``(batch, src_len)`` tensor; positions equal to 0
                are excluded from the softmax.

        Returns:
            ``(context, attn_weights)`` with shapes ``(batch, dim)`` and
            ``(batch, src_len)``.
        """
        query = decoder_hidden.unsqueeze(2)
        scores = torch.bmm(encoder_outputs, query).squeeze(2)
        if mask is not None:
            # A large negative score drives the softmax weight to zero.
            scores = scores.masked_fill(mask == 0, -1e9)
        attn_weights = scores.softmax(dim=1)
        weighted = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        return weighted.squeeze(1), attn_weights

class Encoder(nn.Module):
    """Bidirectional multi-layer GRU encoder over character ids.

    Args:
        vocab_size: number of input symbols (including ``<pad>``).
        emb_size: embedding width.
        hid_size: GRU hidden width per direction.
        n_layers: number of stacked GRU layers.
        dropout: dropout applied between GRU layers.
    """

    def __init__(self, vocab_size, emb_size, hid_size, n_layers, dropout):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_INP)
        self.rnn = nn.GRU(emb_size, hid_size, num_layers=n_layers, batch_first=True,
                          bidirectional=True, dropout=dropout)

    def forward(self, src):
        """Encode a right-padded batch of character ids.

        Args:
            src: ``(batch, src_len)`` long tensor padded with ``PAD_INP``.

        Returns:
            ``(outputs, hidden)``: ``outputs`` is ``(batch, src_len,
            2 * hid_size)`` with zeros at padded positions; ``hidden`` is
            ``(n_layers, batch, 2 * hid_size)``, the top layer's final
            forward/backward states repeated for every decoder layer.
        """
        embedded = self.emb(src)

        # Pack with the true lengths so padding never passes through the GRU.
        # Otherwise the forward direction's final state depends on how much
        # padding the batch adds, and predictions vary with batch composition.
        # Assumes padding is trailing only (as produced by pad_sequence).
        # Lengths must live on the CPU; clamp guards against empty inputs.
        lengths = (src != PAD_INP).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, hidden = self.rnn(packed)
        # Restore the original width so outputs line up with the source mask.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=src.size(1)
        )

        # Last two entries of h_n: top layer, forward then backward direction.
        last = torch.cat([hidden[-2], hidden[-1]], dim=1)
        hidden = last.unsqueeze(0).repeat(self.rnn.num_layers, 1, 1)
        return outputs, hidden


class Decoder(nn.Module):
    """One-step GRU decoder with dot-product attention over encoder states.

    The GRU hidden width is ``2 * hid_size`` so it matches the bidirectional
    encoder's output width, which the attention dot product requires.
    """

    def __init__(self, vocab_size, emb_size, hid_size, n_layers, dropout):
        super().__init__()
        enc_dim = hid_size * 2
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_TGT)
        self.attn = Attention()
        self.rnn = nn.GRU(
            emb_size + enc_dim,
            enc_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
        )
        # The output layer sees the GRU output concatenated with the context.
        self.fc = nn.Linear(enc_dim * 2, vocab_size)

    def forward(self, token, hidden, encoder_outputs, mask=None):
        """Run a single decoding step.

        Args:
            token: ``(batch,)`` ids of the previous output symbols.
            hidden: ``(n_layers, batch, 2 * hid_size)`` decoder state.
            encoder_outputs: ``(batch, src_len, 2 * hid_size)`` encoder states.
            mask: optional ``(batch, src_len)`` source mask passed to attention.

        Returns:
            ``(logits, hidden)``: ``(batch, vocab_size)`` scores and the
            updated decoder state.
        """
        step_emb = self.emb(token.unsqueeze(1))
        # The top-layer state is the attention query.
        context, _ = self.attn(hidden[-1], encoder_outputs, mask)
        rnn_input = torch.cat((step_emb, context.unsqueeze(1)), dim=2)
        output, hidden = self.rnn(rnn_input, hidden)
        features = torch.cat((output.squeeze(1), context), dim=1)
        return self.fc(features), hidden


class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes greedily, step by step.

    Args:
        enc: encoder returning ``(encoder_outputs, initial_hidden)``.
        dec: decoder called as ``dec(token, hidden, encoder_outputs, mask)``
            and returning ``(logits, hidden)``.
    """

    def __init__(self, enc, dec):
        super().__init__()
        self.enc, self.dec = enc, dec

    def forward(self, src, tgt=None, max_len=10):
        """Decode ``src`` into per-step logits.

        Args:
            src: ``(batch, src_len)`` source ids padded with ``PAD_INP``.
            tgt: optional ``(batch, tgt_len)`` targets starting with ``<sos>``.
                When given, ``tgt_len - 1`` steps are decoded so the logits
                align with ``tgt[:, 1:]``.
            max_len: number of steps to decode when ``tgt`` is ``None``.

        Returns:
            ``(batch, steps, vocab)`` tensor of logits.

        Teacher forcing (probability ``TF_RATIO`` per step) is applied only
        in training mode. In eval mode decoding is always greedy, so passing
        ``tgt`` for validation neither leaks gold tokens into the predictions
        nor makes the metrics random.
        """
        enc_out, hidden = self.enc(src)
        if tgt is not None:
            max_len = tgt.size(1) - 1
        batch = src.size(0)
        # Follow the input's device instead of relying on a module global.
        device = src.device
        outputs = torch.zeros(batch, max_len, len(tgt_itos), device=device)
        token = torch.full((batch,), SOS, dtype=torch.long, device=device)
        mask = (src != PAD_INP)
        can_teacher_force = self.training and tgt is not None

        for t in range(max_len):
            logits, hidden = self.dec(token, hidden, enc_out, mask)
            outputs[:, t] = logits
            if can_teacher_force and random.random() < TF_RATIO:
                token = tgt[:, t + 1]
            else:
                token = logits.argmax(1)
        return outputs

# Assemble the network; the encoder is built first, then the decoder, so
# seeded parameter initialisation happens in the same order as before.
encoder = Encoder(len(inp_itos), EMB_SIZE, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
decoder = Decoder(len(tgt_itos), EMB_SIZE, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)
model = Seq2Seq(encoder, decoder).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Padded target positions contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TGT)


def _ids_to_text(ids):
    """Convert target ids to text, stopping at the first ``<eos>``."""
    chars = []
    for i in ids:
        if i == EOS:
            # Positions after <eos> are ignored by the loss, so whatever the
            # model emits there is untrained and must not be decoded.
            break
        if i != PAD_TGT:
            chars.append(tgt_itos[i])
    return "".join(chars)


def evaluate(loader):
    """Compute mean batch loss and exact-match accuracy on ``loader``.

    Args:
        loader: iterable of ``(x, y)`` batches as produced by ``collate``.

    Returns:
        ``(mean_loss, accuracy)``: loss averaged over batches, and the
        fraction of sequences whose decoded text equals the decoded target.
        Both are NaN when the loader yields no data.
    """
    model.eval()
    loss_sum, correct, total = 0.0, 0, 0
    n_batches = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            out = model(x, y)
            gold = y[:, 1:]  # drop <sos> so targets align with the logits
            loss = criterion(out.view(-1, out.size(-1)), gold.reshape(-1))
            loss_sum += loss.item()
            n_batches += 1
            for p, t in zip(out.argmax(2).tolist(), gold.tolist()):
                if _ids_to_text(p) == _ids_to_text(t):
                    correct += 1
                total += 1
    if n_batches == 0 or total == 0:
        return float("nan"), float("nan")
    return loss_sum / n_batches, correct / total


# Train for NUM_EPOCHS, reporting mean train loss and validation metrics.
for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    train_loss = 0
    for x, y in train_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        optimizer.zero_grad()
        out = model(x, y)
        # Flatten (batch, steps, vocab) logits against targets without <sos>.
        flat_logits = out.view(-1, out.size(-1))
        flat_gold = y[:, 1:].reshape(-1)
        loss = criterion(flat_logits, flat_gold)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    val_loss, val_acc = evaluate(val_loader)
    print(f"Epoch {epoch}: Train Loss={train_loss/len(train_loader):.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

# Greedy-decode the test set and write the submission file.
model.eval()
preds = []
with torch.no_grad():
    for x in test_loader:
        x = x.to(DEVICE)
        out = model(x)
        for seq in out.argmax(2).tolist():
            chars = []
            for i in seq:
                if i == EOS:
                    # Stop at the first <eos>: later positions were never
                    # trained and would append junk to the label.
                    break
                if i != PAD_TGT:
                    chars.append(tgt_itos[i])
            txt = "".join(chars)
            preds.append(txt)

pd.DataFrame({"id": test_df.id, "label": preds}).to_csv("submission.csv", index=False)


Epoch 1: Train Loss=2.1267, Val Loss=2.5274, Val Acc=0.0000
Epoch 2: Train Loss=1.7450, Val Loss=1.2810, Val Acc=0.0000
Epoch 3: Train Loss=0.8682, Val Loss=0.6611, Val Acc=0.0000
Epoch 4: Train Loss=0.5605, Val Loss=0.5357, Val Acc=0.0000
Epoch 5: Train Loss=0.4654, Val Loss=0.4906, Val Acc=0.0909
Epoch 6: Train Loss=0.3563, Val Loss=0.4080, Val Acc=0.0909
