In [1]:
from __future__ import unicode_literals, print_function, division

import numpy as np
import pandas as pd
from tqdm import tqdm

from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Later cells read this module-level name when creating tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
#!unzip filtered_paranmt.zip

In [2]:
def read_corpus(filename):
  """Read a UTF-8, tab-separated text file into a list of rows.

  Args:
      filename: Path of the file to read.

  Returns:
      A list with one entry per line of the file; each entry is the list of
      tab-separated fields of that line, with the line terminator removed
      from the last field.
  """
  data = []
  # The context manager guarantees the handle is closed, even if parsing fails.
  with open(filename, encoding='utf-8') as corpus_file:
      for line in corpus_file:
          fields = line.split('\t')
          # Remove only an actual newline: a final line without a terminator
          # must keep its last character intact.
          fields[-1] = fields[-1].rstrip('\n')
          data.append(fields)
  return data

In [3]:
# Column groups of the TSV: free-text columns and numeric columns.
# ('lenght_diff' is spelled exactly like the column in the data file.)
str_col = ['reference', 'translation']
num_col = ['ref_tox', 'trn_tox', 'similarity', 'lenght_diff']

# Load the raw rows; the first row is used as the header below.
data = read_corpus("filtered.tsv")
# Name the first header cell 'id' (presumably it is blank in the file - TODO confirm).
data[0][0] = 'id'
Data = pd.DataFrame(data[1:], columns=data[0])
# Every field was read as a string, so convert the numeric columns explicitly.
for num in num_col:
  Data[num] = pd.to_numeric(Data[num])

# Use the numeric value of 'id' as the index; the 'id' column itself keeps its string values.
Data.index = pd.to_numeric(Data['id']).values

# Print a summary of columns, non-null counts and dtypes.
Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 577777 entries, 0 to 577776
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           577777 non-null  object 
 1   reference    577777 non-null  object 
 2   translation  577777 non-null  object 
 3   similarity   577777 non-null  float64
 4   lenght_diff  577777 non-null  float64
 5   ref_tox      577777 non-null  float64
 6   trn_tox      577777 non-null  float64
dtypes: float64(4), object(3)
memory usage: 35.3+ MB


In [4]:
# Reserved ids for the start-of-sequence and end-of-sequence markers.
SOS_token = 0
EOS_token = 1


class Vocabulary:
    """Bidirectional word <-> id mapping with per-word occurrence counts.

    Ids 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the first
    real word receives id 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = dict(enumerate(("SOS", "EOS")))
        self.n_words = len(self.index2word)  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of `word`, assigning the next free id if it is new."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1

In [24]:
def indexesFromSentence(vocab, sentence):
    """Map each single-space-separated token of `sentence` to its id in `vocab`.

    A token that is missing from `vocab.word2index` raises KeyError.
    """
    ids = []
    for token in sentence.split(' '):
        ids.append(vocab.word2index[token])
    return ids

def tensorFromSentence(vocab, sentence):
    """Encode `sentence` as a 1 x (n_tokens + 1) long tensor on `device`.

    The token ids come from `indexesFromSentence`; the EOS id is appended last.
    """
    token_ids = indexesFromSentence(vocab, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    # Add a leading dimension of size 1 so the result is a one-row batch.
    return flat.view(1, -1)

def tensorsFromPair(pair):
    """Encode a (source, target) sentence pair as a tuple of two tensors.

    `pair[0]` is encoded with the module-level `vocab_tox` and `pair[1]` with
    the module-level `vocab_detox`.
    """
    return (
        tensorFromSentence(vocab_tox, pair[0]),
        tensorFromSentence(vocab_detox, pair[1]),
    )

def get_dataloader(batch_size):
    """Build the vocabularies and a shuffled DataLoader of padded id matrices.

    Args:
        batch_size: Number of sentence pairs per mini-batch.

    Returns:
        Tuple `(vocab_tox, vocab_detox, train_dataloader)` where each batch of
        the loader is an `(input_ids, target_ids)` pair of long tensors that
        already live on `device`.
    """
    # NOTE(review): prepareData and MAX_LENGTH are not defined in the visible
    # part of this notebook - they must exist in the kernel before this runs.
    vocab_tox, vocab_detox, pairs = prepareData(Data)

    # Every row is MAX_LENGTH + 1 wide; unused positions keep the fill value 0,
    # which is the same integer as SOS_token.
    matrix_shape = (len(pairs), MAX_LENGTH + 1)
    input_ids = np.zeros(matrix_shape, dtype=np.int32)
    target_ids = np.zeros(matrix_shape, dtype=np.int32)

    for row, (source, target) in enumerate(pairs):
        source_row = indexesFromSentence(vocab_tox, source) + [EOS_token]
        target_row = indexesFromSentence(vocab_detox, target) + [EOS_token]
        input_ids[row, :len(source_row)] = source_row
        target_ids[row, :len(target_row)] = target_row

    # Both full matrices are moved to `device` up front.
    train_data = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    return vocab_tox, vocab_detox, train_dataloader

In [25]:
# Mini-batch size passed to get_dataloader in a later cell.
batch_size = 32
# NOTE(review): prepareData is not defined anywhere in the visible notebook;
# this cell only works if it already exists in the kernel - confirm before Run All.
# `output_lang` and `pairs` are read as globals by the evaluation cells.
input_lang, output_lang, pairs = prepareData(Data)

Counted words:
tox-vocab 42265
detox-vocab 32051


In [26]:
class EncoderRNN(nn.Module):
    """Encoder: token embedding, dropout, then a single batch-first GRU.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Width of both the embeddings and the GRU state.
        dropout_p: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a tensor of token ids.

        Returns:
            `(output, hidden)` exactly as produced by the GRU: the per-step
            outputs and the final hidden state.
        """
        token_vectors = self.embedding(input)
        regularised = self.dropout(token_vectors)
        output, hidden = self.gru(regularised)
        return output, hidden

In [27]:
class DecoderRNN(nn.Module):
    """GRU decoder that unrolls a fixed MAX_LENGTH + 1 steps, starting from SOS.

    Args:
        hidden_size: Width of the embeddings and of the GRU state.
        output_size: Number of target-vocabulary entries (size of the output layer).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a batch, with teacher forcing when `target_tensor` is given.

        Args:
            encoder_outputs: Encoder outputs; only their first dimension (the
                batch size) is read here.
            encoder_hidden: Initial hidden state for the GRU.
            target_tensor: Optional target ids; column `i` is fed as the input
                of step `i + 1`. When omitted, the decoder feeds back its own
                top-1 prediction.

        Returns:
            `(log_probs, hidden, None)`: log-softmax scores for all steps
            concatenated along dim 1, the last hidden state, and a `None`
            placeholder kept for consistency with the training loop.
        """
        n_rows = encoder_outputs.size(0)
        # Every sequence starts from a single SOS token.
        step_input = torch.full((n_rows, 1), SOS_token, dtype=torch.long, device=device)
        state = encoder_hidden
        use_teacher_forcing = target_tensor is not None
        per_step_scores = []

        for step in range(MAX_LENGTH + 1):
            scores, state = self.forward_step(step_input, state)
            per_step_scores.append(scores)

            if use_teacher_forcing:
                # Next input is the ground-truth token of the current step.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Next input is the model's own best guess, cut off from the graph.
                best_ids = scores.topk(1)[1]
                step_input = best_ids.squeeze(-1).detach()

        all_scores = torch.cat(per_step_scores, dim=1)
        log_probs = F.log_softmax(all_scores, dim=-1)
        return log_probs, state, None

    def forward_step(self, input, hidden):
        """Run one decoding step: embed, ReLU, GRU, then project to vocabulary scores."""
        activated = F.relu(self.embedding(input))
        gru_out, hidden = self.gru(activated, hidden)
        return self.out(gru_out), hidden

In [28]:
def train_epoch(epoch, dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one teacher-forced pass over `dataloader` and update both models.

    Args:
        epoch: Epoch number, used only in the progress-bar label.
        dataloader: Iterable of `(input, target)` batches.
        encoder, decoder: Models to optimise.
        encoder_optimizer, decoder_optimizer: One optimizer per model.
        criterion: Loss applied to the flattened decoder outputs and targets.

    Returns:
        The batch losses averaged with each batch weighted by its size.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    weighted_loss_sum = 0
    samples_seen = 0
    progress = tqdm(
        enumerate(dataloader, 1),
        total=len(dataloader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )

    for _, (source_ids, target_ids) in progress:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(source_ids)
        log_probs, _, _ = decoder(encoder_outputs, encoder_hidden, target_ids)

        # Flatten to (rows * steps, vocab) scores against (rows * steps,) labels.
        vocab_size = log_probs.size(-1)
        loss = criterion(log_probs.view(-1, vocab_size), target_ids.view(-1))
        loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        rows_in_batch = source_ids.shape[0]
        weighted_loss_sum += loss.item() * rows_in_batch
        samples_seen += rows_in_batch
        progress.set_postfix({"loss": weighted_loss_sum / samples_seen})

    return weighted_loss_sum / samples_seen

In [32]:
def train(train_dataloader, encoder, decoder, epochs, learning_rate=0.001):
    """Train the encoder/decoder pair for a number of epochs with Adam + NLLLoss.

    Args:
        train_dataloader: Loader yielding `(input, target)` batches.
        encoder: Encoder module to optimise.
        decoder: Decoder module to optimise.
        epochs: Number of full passes over `train_dataloader`.
        learning_rate: Learning rate used for both Adam optimizers.

    Returns:
        None. The average loss of every epoch is printed.
    """
    # Put both modules back into training mode. The evaluation cell switches
    # them to eval(), so re-running training afterwards would otherwise train
    # with dropout silently disabled.
    encoder.train()
    decoder.train()

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, epochs + 1):
        loss = train_epoch(epoch, train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

        print(f"{epoch}/{epochs}, loss:{loss}")

In [30]:
# Build both vocabularies and the training loader; `vocab_tox` and `vocab_detox`
# become module-level names that later cells (model construction, evaluation) read.
vocab_tox, vocab_detox, train_dataloader = get_dataloader(batch_size)

Counted words:
tox-vocab 42265
detox-vocab 32051


In [33]:
# Width shared by the embeddings and the GRU state of both models.
hidden_size = 128
# NOTE(review): has no effect on this run - `train_dataloader` was already built
# in an earlier cell and `train` takes no batch size.
batch_size = 32

# Size each model by its vocabulary and place it on `device`.
encoder = EncoderRNN(vocab_tox.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, vocab_detox.n_words).to(device)

# Train for 80 epochs (the recorded output shows the run was interrupted during epoch 2).
train(train_dataloader, encoder, decoder, 80)

Epoch 1: train: 100%|██████████| 8716/8716 [02:00<00:00, 72.17it/s, loss=2.23]


1/80, loss:2.233358381440713


Epoch 2: train:  10%|▉         | 857/8716 [00:12<01:50, 71.18it/s, loss=1.78]


KeyboardInterrupt: 

In [None]:
def evaluate(encoder, decoder, sentence, vocab_tox, vocab_detox):
    with torch.no_grad():
        input_tensor = tensorFromSentence(vocab_tox, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], vocab_tox, vocab_tox)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
# Switch both modules to evaluation mode (this disables the encoder's dropout)
# before sampling predictions.
encoder.eval()
decoder.eval()
# Print 10 random source / reference / prediction triples.
evaluateRandomly(encoder, decoder)

> hit that if you want her to live
= leave her if you want to live
< if you want her to live <EOS>

> snyder did i order you to shoot ?
= you gave orders to the shooting ?
< a orders from you order ? <EOS>

> my friend killed him
= my friends killed him
< my friend killed him <EOS>

> and by devil i mean robot devil
= and by devil i mean the robodevil
< and i mean it s satan <EOS>

> will destroy this place
= they re gonna destroy it !
< this place will destroy it <EOS>

> i could drown here
= i ll drown
< i could drown <EOS>

> you re a shoe in
= are you butch ?
< you re the biggest loser <EOS>

> what the hell is wrong with you ?
= what s wrong with you ?
< what s wrong with you ? <EOS>

> you re such a jinx
= you re a curse
< you re such a softie <EOS>

> thing s going to be the death of me
= i think this is gonna kill me
< it s gonna be dead <EOS>

