# Architecture of seq2seq model

In [None]:
class EncoderRNN(nn.Module):
    """
        Encoder half of the seq2seq model.

        Token ids are embedded (index ``PAD_token`` is the padding index),
        passed through dropout and then through a 2-layer, batch-first GRU.
    """
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """
            input_size:  number of rows in the embedding table
            hidden_size: embedding width and GRU hidden width
            dropout_p:   probability used for the embedding dropout and for
                         the dropout between the two GRU layers
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Sub-modules are created in the same order as before so parameter
        # initialisation and the state_dict layout are unchanged.
        self.embedding = nn.Embedding(
            input_size,
            hidden_size,
            padding_idx=PAD_token,
        )
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            num_layers=2,
            dropout=dropout_p,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """
            Encode a batch of token ids.

            Returns the ``(output, hidden)`` pair produced by the GRU.
        """
        token_vectors = self.embedding(input)
        return self.gru(self.dropout(token_vectors))

class DecoderRNN(nn.Module):
    """
        Decoder part

        Embedding -> 2-layer batch-first GRU -> linear projection onto the
        output vocabulary. ``forward`` unrolls the decoder for
        ``MAX_LENGTH + 1`` steps starting from ``SOS_token`` and returns the
        raw (un-normalised) scores of every step.
    """
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """
            hidden_size: embedding width and GRU hidden width
            output_size: number of rows in the embedding table and number of
                         scores produced per step
            dropout_p:   dropout probability between the two GRU layers
        """
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=PAD_token)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2, dropout=dropout_p, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """
            Unroll the decoder for ``MAX_LENGTH + 1`` steps.

            encoder_outputs: only its first dimension (batch size) and its
                             device are read here
            encoder_hidden:  used as the initial GRU hidden state
            target_tensor:   when given, column ``i`` is fed as the input of
                             step ``i + 1`` (teacher forcing), so it must
                             have at least ``MAX_LENGTH + 1`` columns;
                             when ``None`` the top-scoring token of each step
                             is fed back instead

            Returns ``(decoder_outputs, decoder_hidden)`` where
            ``decoder_outputs`` is the per-step scores concatenated along
            dimension 1.
        """
        batch_size = encoder_outputs.size(0)
        # Build the start tokens on the same device as the incoming tensors
        # instead of relying on a module-level ``device`` global, so the
        # decoder keeps working wherever the model and batch are placed.
        decoder_input = torch.full(
            (batch_size, 1),
            SOS_token,
            dtype=torch.long,
            device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH + 1):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)  # Teacher forcing
            else:
                # Greedy decoding: feed back the arg-max token of this step.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        return decoder_outputs, decoder_hidden

    def forward_step(self, input, hidden):
        """
            Run a single decoding step: embed ``input``, advance the GRU from
            ``hidden`` and project the GRU output with ``self.out``.

            Returns ``(output, hidden)``.
        """
        output = self.embedding(input)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)

        return output, hidden

# Train model

In [None]:
def train_epoch(epoch, dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """
        Run one training pass over ``dataloader`` with teacher forcing.

        For every ``(input, target)`` batch the gradients of both optimizers
        are cleared, the batch is encoded and decoded (the target is passed
        to the decoder), ``criterion`` is evaluated on the flattened decoder
        scores against the flattened target, and both optimizers take a step.

        ``epoch`` is only used in the progress-bar label.

        Returns the mean of the per-batch losses weighted by batch size.
    """
    running_loss = 0
    seen = 0
    progress = tqdm(
        enumerate(dataloader, 1),
        total=len(dataloader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )

    encoder.train()
    decoder.train()
    for _step, (source, target) in progress:
        for optimizer in (encoder_optimizer, decoder_optimizer):
            optimizer.zero_grad()

        enc_out, enc_hidden = encoder(source)
        scores, _ = decoder(enc_out, enc_hidden, target)

        # Flatten to (tokens, vocab) vs (tokens,) for the criterion.
        flat_scores = scores.view(-1, scores.size(-1))
        loss = criterion(flat_scores, target.view(-1))
        loss.backward()

        for optimizer in (encoder_optimizer, decoder_optimizer):
            optimizer.step()

        n_samples = source.shape[0]
        running_loss += loss.item() * n_samples
        seen += n_samples
        progress.set_postfix({"loss": running_loss/seen})

    return running_loss / seen

def val_epoch(epoch, dataloader, encoder, decoder,
          criterion, best_so_far=0.0, encoder_path='best_encoder.pt', decoder_path='best_decoder.pt'):
    """
        Evaluate the model on ``dataloader`` without gradient tracking and
        checkpoint it when the loss improves.

        Both networks are switched to eval mode. The target is still passed
        to the decoder, so the reported loss is a teacher-forced loss. When
        the batch-size-weighted mean loss is strictly below ``best_so_far``,
        the encoder and decoder state dicts are written to ``encoder_path``
        and ``decoder_path``.

        NOTE: with the default ``best_so_far=0.0`` a non-negative loss never
        satisfies the comparison, so nothing is saved unless the caller
        passes a larger starting value (e.g. ``float('inf')``).

        Returns ``(best, mean_loss)`` where ``best`` is ``mean_loss`` if it
        improved on ``best_so_far`` and ``best_so_far`` otherwise.
    """
    running_loss = 0
    seen = 0
    progress = tqdm(
        enumerate(dataloader, 1),
        total=len(dataloader),
        desc=f"Epoch {epoch}: val",
        leave=True,
    )

    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        for _step, (source, target) in progress:
            enc_out, enc_hidden = encoder(source)
            scores, _ = decoder(enc_out, enc_hidden, target)

            flat_scores = scores.view(-1, scores.size(-1))
            loss = criterion(flat_scores, target.view(-1))

            n_samples = source.shape[0]
            running_loss += loss.item() * n_samples
            seen += n_samples
            progress.set_postfix({"loss": running_loss/seen})

        mean_loss = running_loss / seen
        improved = mean_loss < best_so_far
        if improved:
            torch.save(encoder.state_dict(), encoder_path)
            torch.save(decoder.state_dict(), decoder_path)

    return (mean_loss if improved else best_so_far), mean_loss

In [None]:
def train(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, epochs):
    """
        Train for ``epochs`` epochs, validating after each one.

        Each epoch calls ``train_epoch`` on ``train_dataloader`` and then
        ``val_epoch``, which receives the best validation loss seen so far
        (starting at ``inf``) and hands back the updated best.

        NOTE: the validation data is read from the module-level
        ``val_dataloader`` name; it is not a parameter of this function.

        Returns ``(loss_train_list, loss_val_list)``: the per-epoch training
        and validation losses, in epoch order. Both lists are empty when
        ``epochs`` is 0.
    """
    best = float('inf')
    loss_train_list = []
    loss_val_list = []
    for epoch in range(1, epochs + 1):
        loss_train = train_epoch(epoch, train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        best, loss_val = val_epoch(epoch, val_dataloader, encoder, decoder, criterion, best_so_far=best)
        loss_train_list.append(loss_train)
        loss_val_list.append(loss_val)

    # The original ended on a bare expression, so the function returned None
    # and callers unpacking two values failed.
    return loss_train_list, loss_val_list

In [None]:
# Hyper-parameters used by the cells below.
hidden_size, batch_size, learning_rate = 128, 32, 1e-3

# prepareData / get_dataloader / Data are defined elsewhere in the notebook.
vocab_tox, vocab_detox, pairs = prepareData(Data)
train_dataloader, val_dataloader = get_dataloader(
    batch_size,
    vocab_tox,
    vocab_detox,
    pairs,
)

In [None]:
# Number of training epochs, bound to a name instead of a bare literal so
# that later cells can refer to it.
epochs = 15

# One embedding table per vocabulary; both networks share hidden_size.
encoder = EncoderRNN(vocab_tox.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, vocab_detox.n_words).to(device)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
# Positions whose target is PAD_token do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_token)

loss_train, loss_val = train(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, epochs)

# Plot loss

In [None]:
# Plot the per-epoch losses bound by the training cell (loss_train / loss_val).
# The x-axis is derived from the recorded history, numbered from 1 like the
# training loop, so it always matches the number of points.
epoch_axis = range(1, len(loss_train) + 1)
plt.plot(epoch_axis, loss_train, label='Training loss')
plt.plot(epoch_axis, loss_val, label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()  # without this call the labels above are never displayed

plt.show()