In [127]:
import numpy as np
import random
import torch
import torch.nn as nn
from torch.optim import Adam, AdamW

In [128]:
# Seed the PyTorch and Python `random` generators so repeated runs start from the same state.
# NOTE(review): NumPy is imported in this notebook but its RNG is not seeded here.
torch.manual_seed(0)
random.seed(0)

In [129]:
import pandas as pd


# Training data is kept as a NumPy object array (one row per example: raw date text, target string);
# the test set stays a DataFrame because a prediction column is attached to it later.
train_dataset = pd.read_csv('train (1).csv').values
test_dataset = pd.read_csv('test (1).csv')

In [130]:
train_dataset

array([['den tjugofjärde 05 2049', '24-05-2049'],
       ['15/11/77', '15-11-2077'],
       ["sipsa'e 02 2049", '14-02-2049'],
       ...,
       ['le neuf mars 2007', '09-03-2007'],
       ['am vier und zwanzigsten juni 2007', '24-06-2007'],
       ['sechster juni 2007', '06-06-2007']], dtype=object)

In [5]:
# Length of the longest source string in the training data, plus one
# (presumably to leave room for the trailing EOS token — TODO confirm).
MAX_LENGTH = 1 + max(len(row[0]) for row in train_dataset)

MAX_LENGTH

41

In [7]:
# Reserved vocabulary indices; they match the 'SOS'/'EOS' entries that Lang pre-populates.
SOS_token = 0  # fed to the decoder as its very first input
EOS_token = 1  # appended to every encoded sentence; decoding stops when it is predicted


class Lang:
    """Symbol vocabulary with bidirectional symbol <-> index lookup.

    Indices 0 and 1 are reserved for the 'SOS' and 'EOS' markers; every new
    symbol receives the next free index in order of first appearance.
    """

    def __init__(self, name):
        """Create a vocabulary labelled ``name`` that holds only the two markers."""
        self.name = name
        self.word2index = {'SOS': 0, 'EOS': 1}
        self.index2word = {0: 'SOS', 1: 'EOS'}

    @property
    def n_words(self) -> int:
        """Number of entries currently registered, the two markers included."""
        return len(self.index2word)

    def add_sentence(self, sentence):
        """Register every element obtained by iterating over ``sentence``."""
        for symbol in sentence:
            self.add_word(symbol)

    def add_word(self, word):
        """Register ``word`` under the next free index unless it is already known."""
        if word in self.word2index:
            return
        next_index = self.n_words
        self.word2index[word] = next_index
        self.index2word[next_index] = word

In [8]:
# Build one character vocabulary per side of the training pairs:
# column 0 is the free-form date text, column 1 is the normalized target string.
input_lang, output_lang = Lang('human'), Lang('iso')

for row in train_dataset:
    input_lang.add_sentence(row[0])
    output_lang.add_sentence(row[1])

# Report the vocabulary sizes (the SOS/EOS markers are included in the counts).
for lang in (input_lang, output_lang):
    print(lang.name, lang.n_words)

human 82
iso 13


In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [222]:
class Encoder(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call.

    NOTE: a later cell in this notebook defines another ``Encoder`` class; once
    that cell has run, it shadows this definition.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Width of both the embedding vectors and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, hidden):
        """Advance the GRU by a single time step.

        Args:
            x: Tensor holding exactly one token index.
            hidden: Previous hidden state, shaped like the result of ``init_hidden``.

        Returns:
            The ``(output, hidden)`` pair produced by ``nn.GRU``.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size), the default nn.GRU input layout.
        embedded = self.embedding(x).view(1, 1, -1)
        return self.gru(embedded, hidden)

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's parameters
        instead of relying on a module-level ``device`` variable, so it always
        matches the model regardless of how the notebook globals are set.
        """
        return torch.zeros(
            1, 1, self.hidden_size, device=self.embedding.weight.device
        )

In [221]:
class Decoder(nn.Module):
    """Single-layer GRU decoder that emits log-probabilities for one output token per call.

    NOTE: a later cell in this notebook defines another ``Decoder`` class; once
    that cell has run, it shadows this definition.

    Args:
        hidden_size: Width of the embedding vectors and of the GRU hidden state.
        output_size: Number of output classes (target vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.layer_norm2 = nn.LayerNorm(hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Run one decoding step.

        Args:
            x: Tensor holding exactly one token index (the previous output token).
            hidden: Previous hidden state, shaped like the result of ``init_hidden``.

        Returns:
            Tuple ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` is the updated GRU state.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size), the default nn.GRU input layout.
        output = self.embedding(x).view(1, 1, -1)
        # LayerNorm(hidden_size) only normalizes the last dimension, so the 3-D
        # tensor can be passed directly without a squeeze/unsqueeze round trip.
        output = self.relu(self.layer_norm1(output))

        output, hidden = self.gru(output, hidden)
        output = self.layer_norm2(output)

        # output[0] drops the sequence dimension -> (1, hidden_size) for the classifier.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's parameters
        instead of relying on a module-level ``device`` variable.
        """
        return torch.zeros(
            1, 1, self.hidden_size, device=self.embedding.weight.device
        )

In [256]:
import torch
import torch.nn as nn

# Same device selection as the earlier cell, repeated so this cell can run on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Encoder(nn.Module):
    """Stacked GRU encoder that consumes one token index per call.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Width of the embedding vectors and of each GRU layer.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between GRU layers; it is forced to 0
            when ``num_layers`` is 1 because nn.GRU only applies dropout between layers.
    """

    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size,
                          num_layers=num_layers,
                          dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, hidden):
        """Advance the GRU stack by a single time step.

        Args:
            x: Tensor holding exactly one token index.
            hidden: Previous hidden state, shaped like the result of ``init_hidden``.

        Returns:
            The ``(output, hidden)`` pair produced by ``nn.GRU``.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size), the default nn.GRU input layout.
        embedded = self.embedding(x).view(1, 1, -1)
        return self.gru(embedded, hidden)

    def init_hidden(self):
        """Return an all-zero hidden state of shape (num_layers, 1, hidden_size).

        The state is allocated on the device that holds this module's parameters
        instead of relying on a module-level ``device`` variable, so it always
        matches the model regardless of how the notebook globals are set.
        """
        return torch.zeros(
            self.num_layers, 1, self.hidden_size,
            device=self.embedding.weight.device,
        )


class Decoder(nn.Module):
    """Stacked GRU decoder that emits log-probabilities for one output token per call.

    Pipeline per step: embedding -> LayerNorm -> Dropout -> GRU -> LayerNorm ->
    Dropout -> Linear -> LogSoftmax.

    Args:
        hidden_size: Width of the embedding vectors and of each GRU layer.
        output_size: Number of output classes (target vocabulary size).
        num_layers: Number of stacked GRU layers.
        dropout: Probability used by both Dropout modules and between GRU layers
            (the GRU's own dropout is forced to 0 when ``num_layers`` is 1).
    """

    def __init__(self, hidden_size, output_size, num_layers=2, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.dropout1 = nn.Dropout(dropout)

        self.gru = nn.GRU(hidden_size, hidden_size,
                          num_layers=num_layers,
                          dropout=dropout if num_layers > 1 else 0)

        self.layer_norm2 = nn.LayerNorm(hidden_size)
        self.dropout2 = nn.Dropout(dropout)

        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Run one decoding step.

        Args:
            x: Tensor holding exactly one token index (the previous output token).
            hidden: Previous hidden state, shaped like the result of ``init_hidden``.

        Returns:
            Tuple ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` is the updated GRU state.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size), the default nn.GRU input layout.
        output = self.embedding(x).view(1, 1, -1)
        # LayerNorm(hidden_size) only normalizes the last dimension, so the 3-D
        # tensor can be passed directly without a squeeze/unsqueeze round trip.
        output = self.dropout1(self.layer_norm1(output))

        output, hidden = self.gru(output, hidden)

        output = self.dropout2(self.layer_norm2(output))

        # output[0] drops the sequence dimension -> (1, hidden_size) for the classifier.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape (num_layers, 1, hidden_size).

        The state is allocated on the device that holds this module's parameters
        instead of relying on a module-level ``device`` variable.
        """
        return torch.zeros(
            self.num_layers, 1, self.hidden_size,
            device=self.embedding.weight.device,
        )

Я также пробовала LSTM, но архитектура с LSTM давала меньшую точность (хотя мне казалось, что LSTM должна подходить для этой задачи лучше).

Я по-разному изменяла архитектуру: увеличивала количество слоёв GRU/LSTM, добавляла линейные слои, Dropout и LayerNorm.

In [250]:
def sentence2idx(lang, sentence):
    """Translate each symbol of ``sentence`` into its index in ``lang.word2index``.

    Raises:
        KeyError: if a symbol is missing from the vocabulary.
    """
    indexes = []
    for symbol in sentence:
        indexes.append(lang.word2index[symbol])
    return indexes


def sentence2tensor(lang, sentence):
    """Encode ``sentence`` as an (N + 1, 1) long tensor of indices ending with EOS_token.

    The tensor is created on the notebook-level ``device``.
    """
    token_ids = [*sentence2idx(lang, sentence), EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def pair2tensor(x):
    """Convert an (input text, target text) pair into its two index tensors.

    ``x[0]`` is encoded with ``input_lang`` and ``x[1]`` with ``output_lang``.
    """
    source_text, target_text = x[0], x[1]
    return (
        sentence2tensor(input_lang, source_text),
        sentence2tensor(output_lang, target_text),
    )

In [251]:
def train_single(
        input_tensor, target_tensor,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer,
        criterion,
        teacher_forcing_ratio=0.0):
    """Run one optimization step on a single (input, target) example.

    Args:
        input_tensor: Source indices; iterated one element at a time through the encoder.
        target_tensor: Target indices; one decoder step is run per element.
        encoder: Module exposing ``init_hidden()`` and ``forward(x, hidden)``.
        decoder: Module exposing ``forward(x, hidden)`` and returning
            ``(log_probs, hidden)``.
        encoder_optimizer: Optimizer over the encoder parameters.
        decoder_optimizer: Optimizer over the decoder parameters.
        criterion: Loss applied to each decoder output and its target element.
        teacher_forcing_ratio: Probability of feeding the ground-truth token as the
            next decoder input for this example. The default 0.0 always feeds the
            model's own prediction, which is the previous hard-coded behavior.

    Returns:
        The summed step losses divided by ``len(target_tensor)``. The divisor is the
        full target length even when free-running decoding stops early on EOS.

    Raises:
        ValueError: if ``target_tensor`` is empty.
    """
    if len(target_tensor) == 0:
        # Without any decoder step there is no loss tensor to back-propagate.
        raise ValueError('target_tensor must contain at least one element')

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_hidden = encoder.init_hidden()
    for elem in input_tensor:
        _, encoder_hidden = encoder(elem, encoder_hidden)

    # Create the start token on the same device as the data it accompanies.
    decoder_input = torch.tensor([[SOS_token]], device=input_tensor.device)
    # The final encoder state seeds the decoder.
    decoder_hidden = encoder_hidden

    # The RNG is only consulted when teacher forcing is enabled, so the default
    # ratio leaves the global random state untouched.
    use_teacher_forcing = (
        teacher_forcing_ratio > 0 and random.random() < teacher_forcing_ratio
    )

    loss = 0
    for elem in target_tensor:
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, elem)

        if use_teacher_forcing:
            decoder_input = elem
            continue

        # Free running: feed back the most likely token, detached so that no
        # gradient flows through the discrete choice.
        _, topi = decoder_output.detach().topk(1)
        decoder_input = topi.squeeze()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / len(target_tensor)

In [252]:
def train(encoder, decoder, n_epochs=5, print_every=100):
    """Train the encoder/decoder pair on ``train_dataset``, one example per step.

    Args:
        encoder: Encoder module; put into training mode and optimized with AdamW.
        decoder: Decoder module; put into training mode and optimized with AdamW.
        n_epochs: Number of full passes over the training pairs.
        print_every: Number of examples between progress reports; each report shows
            the mean per-example loss since the previous report.
    """
    encoder.train()
    decoder.train()

    encoder_optimizer = AdamW(encoder.parameters(), lr=1e-3)
    decoder_optimizer = AdamW(decoder.parameters(), lr=1e-3)

    # The decoders end with LogSoftmax, so NLLLoss is the matching criterion.
    criterion = nn.NLLLoss()

    # The tensors do not change between epochs, so convert the dataset once
    # instead of rebuilding the list at the start of every epoch.
    training_pairs = [pair2tensor(x) for x in train_dataset]
    n_pairs = len(training_pairs)

    for epoch in range(n_epochs):
        print_loss_total = 0

        print(f'Epoch [{epoch + 1:02d}/{n_epochs:02d}]')

        # Count from 1 so the reported percentage reflects the examples already processed.
        for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
            loss = train_single(
                input_tensor, target_tensor,
                encoder, decoder,
                encoder_optimizer, decoder_optimizer,
                criterion
            )
            print_loss_total += loss

            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print(f'Training ({step / n_pairs * 100:.1f}%) loss: {print_loss_avg:.4f}')

In [257]:
# Instantiate both networks with a hidden width of 256 and move them to the selected device.
encoder_model = Encoder(input_size=input_lang.n_words, hidden_size=256).to(device)
decoder_model = Decoder(hidden_size=256, output_size=output_lang.n_words).to(device)

# Ten passes over the training pairs; progress is printed by train().
train(encoder=encoder_model, decoder=decoder_model, n_epochs=10)

Epoch [01/10]
Training (9.0%) loss: 1.7170
Training (18.2%) loss: 1.1099
Training (27.3%) loss: 0.7107
Training (36.4%) loss: 0.6431
Training (45.6%) loss: 0.6660
Training (54.7%) loss: 0.5854
Training (63.8%) loss: 0.5720
Training (73.0%) loss: 0.5478
Training (82.1%) loss: 0.5102
Training (91.2%) loss: 0.4655
Epoch [02/10]
Training (9.0%) loss: 0.5612
Training (18.2%) loss: 0.4719
Training (27.3%) loss: 0.3896
Training (36.4%) loss: 0.3719
Training (45.6%) loss: 0.3208
Training (54.7%) loss: 0.2842
Training (63.8%) loss: 0.2775
Training (73.0%) loss: 0.2354
Training (82.1%) loss: 0.2188
Training (91.2%) loss: 0.2045
Epoch [03/10]
Training (9.0%) loss: 0.2527
Training (18.2%) loss: 0.1622
Training (27.3%) loss: 0.1818
Training (36.4%) loss: 0.1230
Training (45.6%) loss: 0.1149
Training (54.7%) loss: 0.1061
Training (63.8%) loss: 0.0900
Training (73.0%) loss: 0.0621
Training (82.1%) loss: 0.0941
Training (91.2%) loss: 0.0666
Epoch [04/10]
Training (9.0%) loss: 0.0671
Training (18.2%) l

In [258]:
@torch.no_grad()
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` and return the predicted output symbols.

    Both modules are switched to eval mode. Decoding stops after ``max_length``
    steps or as soon as EOS is predicted; in the latter case the 'EOS' entry is
    the last element of the returned list.
    """
    encoder.eval()
    decoder.eval()

    input_tensor = sentence2tensor(input_lang, sentence)

    # Feed the source through the encoder one index at a time.
    hidden = encoder.init_hidden()
    for token in input_tensor:
        _, hidden = encoder(token, hidden)

    # The decoder starts from SOS and from the final encoder state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoded_words = []

    for _ in range(max_length):
        decoder_output, hidden = decoder(decoder_input, hidden)
        best = decoder_output.data.topk(1)[1]
        token_id = best.item()

        decoded_words.append(output_lang.index2word[token_id])
        if token_id == EOS_token:
            break

        # Greedy choice becomes the next decoder input.
        decoder_input = best.squeeze().detach()

    return decoded_words


def predict_(encoder, decoder, dataset, max_chars=10):
    """Decode every sentence in ``dataset`` and return the predicted symbol lists.

    Args:
        encoder: Trained encoder passed through to ``evaluate``.
        decoder: Trained decoder passed through to ``evaluate``.
        dataset: Iterable of input strings.
        max_chars: Maximum number of symbols kept per prediction. The default 10
            is the length of a ``DD-MM-YYYY`` target string.

    Returns:
        A list with one list of symbols per input. The 'EOS' marker produced by
        ``evaluate`` is removed, so a prediction that stops early no longer
        carries the literal 'EOS' text into the joined output.
    """
    result = []

    for sentence in dataset:
        decoded = evaluate(encoder, decoder, sentence)
        # 'EOS' is the vocabulary entry evaluate() appends when decoding stops.
        symbols = [symbol for symbol in decoded if symbol != 'EOS']
        result.append(symbols[:max_chars])

    return result

In [259]:
# Reload the test set (same file as the earlier load) before attaching predictions to it.
test_dataset = pd.read_csv('test (1).csv')

In [260]:
# Decode every string in the 'data' column; each prediction is a list of output symbols.
test_prediction = predict_(encoder_model, decoder_model, test_dataset['data'])

In [261]:
# Collapse each list of predicted symbols into a single string.
test_prediction = list(map(''.join, test_prediction))

In [262]:
# Attach the predicted strings as the 'label' column (modifies test_dataset in place).
test_dataset['label'] = test_prediction

In [263]:
# Write only the id/label columns; index=False is the documented boolean way to omit the row index.
test_dataset[['id', 'label']].to_csv('submission.csv', index=False)

In [264]:
test_dataset

Unnamed: 0,id,data,label
0,0,24 января 2007,24-01-2007
1,1,le six mars 2049,06-03-2049
2,2,le dix 05 2077,10-05-2077
3,3,27 июня 2049,27-06-2049
4,4,08 гыйнварда 2077,08-01-2077
...,...,...,...
4671,4671,am fünfzehnten januar 2049,15-01-2049
4672,4672,тугызынчы 05 2049,09-05-2049
4673,4673,der achzehnte 02 2007,18-02-2007
4674,4674,vierzehnter 12 2049,14-12-2049
