# Import necessary libraries

In [99]:
import re
import nltk
import torch
import string
import numpy as np
import pandas as pd
import contractions
from tqdm import tqdm
from spellchecker import SpellChecker
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Read Data

In [116]:
str_col = ['reference', 'translation']
num_col = ['ref_tox', 'trn_tox', 'similarity', 'lenght_diff']

data = pd.read_csv("../data/raw/filtered.tsv", sep='\t', index_col=0)

In [117]:
data

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,You didn't know that Estelle had stolen some f...,you didn't know that Estelle stole your fish f...,0.870322,0.030769,0.000121,0.949143
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


# Preprocess Data
    * Put more toxicity sentence into reference column
    * Convert abbreviations into original form (I'm -> I am)
    * Split sentece using word_tokenize
    * Correct spelling of word

In [118]:
data.columns = ['1', '2', '3', '4', '5', '6']
data['reference'] = np.where(data['5'] < data['6'], data['1'], data['2'])
data['translation'] = np.where(data['5'] < data['6'], data['2'], data['1'])
data['ref_tox'] = np.where(data['5'] < data['6'], data['5'], data['6'])
data['trn_tox'] = np.where(data['5'] < data['6'], data['6'], data['5'])
data['similarity'] = data['3']
data['length_diff'] = data['4']
data = data.drop(['1', '2', '3', '4', '5', '6'], axis=1)

In [119]:
spell = SpellChecker(distance=1)
columns = data.columns

Data = []
for i in tqdm(range(len(data))):
    row = data.loc[i].values
    for it in range(0, 2):
        row[it] = contractions.fix(row[it].lower())
        row[it] = nltk.word_tokenize(row[it])
        words = []
        for word in row[it]:
            for sub_word in word.split('-'):
                words.append(sub_word)
        new_row = []
        for word in words:
            res = spell.correction(word)
            if res is not None:
                new_row.append(res)
            else:
                if len([ch for ch in string.punctuation if ch in word]) == 0:
                    new_row.append(word)
        row[it] = ' '.join(new_row)
    Data.append(row)
data = pd.DataFrame(Data, columns=columns)

100%|██████████| 577777/577777 [04:42<00:00, 2047.28it/s]


In [121]:
data.to_csv('../data/interim/preprocessed_filtered_1.tsv', sep='\t')

In [120]:
data

Unnamed: 0,reference,translation,ref_tox,trn_tox,similarity,length_diff
0,"if altar is flooding her with psychic waste , ...","if altar floods her with her mental waste , it...",0.014195,0.981983,0.785171,0.010309
1,now you are getting nasty .,you are becoming disgusting .,0.065473,0.999039,0.749687,0.071429
2,"well , we could spare your life , for one .","well , we can spare your life .",0.213313,0.985068,0.919051,0.268293
3,"ah ! monkey , you have got to snap out of it .","monkey , you have to wake up .",0.053362,0.994215,0.664333,0.309524
4,i have got orders to put her down .,i have orders to kill her .,0.009402,0.999348,0.726639,0.181818
...,...,...,...,...,...,...
577772,you did not know that estelle had stolen some ...,you did not know that estelle stole your fish ...,0.000121,0.949143,0.870322,0.030769
577773,you would be sucked out of your life !,it'il suck the life out of you !,0.215794,0.996124,0.722897,0.058824
577774,i really can not take this .,"i can not fuckin ' take that , bruv .",0.000049,0.984538,0.617511,0.212121
577775,"they said i was a hero , but i did not care .",they called me a fucking hero . the truth is i...,0.000124,0.991945,0.679613,0.358209


# Prepare Dataloader

In [122]:
SOS_token = 0
EOS_token = 1
PAD_token = 2

class Vocabulary:
    """
        Vocabulary of words:
            * Initially filled by 3 tokens
                * <sos> -> start of sequence
                * <eos> -> end of sequence
                * <pad> -> fill to MAX_length
    """
    def __init__(self, name):
        self.name = name
        self.word2index = {"<sos>": 0, "<eos>": 1, "<pad>": 2}
        self.word2count = {}
        self.index2word = {0: "<sos>", 1: "<eos>", 2 : "<pad>"}
        self.n_words = 3

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [123]:
MAX_LENGTH = 11 # Max length of sentences

# Split sentence
def getList(sentence):
    return sentence.split(' ')

# Filter pair(toxic text, or translated text) by number of words < MAX_LENGTH
def filterPair(p):
    return len(getList(p[0])) < MAX_LENGTH - 1 and \
        len(getList(p[1])) < MAX_LENGTH - 1 # EOS


# Filter every pair(toxic text, or translated text) by number of words < MAX_LENGTH
def filter(norm_ref, norm_trs):
    filter_ref = []
    filter_trs = []
    for pair in zip(norm_ref, norm_trs):
        if filterPair(pair):
            filter_ref.append(pair[0])
            filter_trs.append(pair[1])
    return filter_ref, filter_trs


# Create vocabulary for toxic text and translated, also pair of them
def prepareData(data):
    # Normalize every data and filter
    norm_ref = [row for row in data['reference']]
    norm_trs = [row for row in data['translation']]
    
    norm_ref, norm_trs = filter(norm_ref, norm_trs)
    # Make Vocabulary instances
    vocab_tox = Vocabulary('tox-vocab')
    vocab_detox = Vocabulary('detox-vocab')
    pairs = []
    for row in zip(norm_ref, norm_trs):
        pairs.append(row)

    for row in norm_ref:
        vocab_tox.addSentence(row)

    for row in norm_trs:
        vocab_detox.addSentence(row)

    print("Counted words:")
    print(vocab_tox.name, vocab_tox.n_words)
    print(vocab_detox.name, vocab_detox.n_words)

    return vocab_tox, vocab_detox, pairs

In [124]:
# Convert every word in sentence to indexes of vocabulary
def indexesFromSentence(vocab, sentence):
    return [vocab.word2index[word] for word in getList(sentence)]

# Convert every word in sentence to indexes of vocabulary in tensor format
def tensorFromSentence(vocab, sentence):
    indexes = indexesFromSentence(vocab, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

# Convert every word in pair sentences to indexes of vocabulary in tensor format
def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(vocab_tox, pair[0])
    target_tensor = tensorFromSentence(vocab_detox, pair[1])
    return (input_tensor, target_tensor)


def get_dataloader(batch_size, vocab_tox, vocab_detox, pairs, train_size=0.9):
    """
        Return dataloaders of data pairs by given parameters:
            :param batch_size: dataloader of batch_size
            :param vocab_tox: vocabulary for toxic text
            :param vocab_detox: vocabulary for translated text
            :param pairs: data to create dataloader
            :param train_size: proportion for train part
    """
    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(vocab_tox, inp)
        tgt_ids = indexesFromSentence(vocab_detox, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        while len(inp_ids) < MAX_LENGTH:
            inp_ids.append(PAD_token)
        
        while len(tgt_ids) < MAX_LENGTH:
            tgt_ids.append(PAD_token)
        
        input_ids[idx] = inp_ids
        target_ids[idx] = tgt_ids

    idx = [i for i in range(n)]
    train_idx, val_idx = train_test_split(idx, train_size=train_size, random_state=420)
    train_data = TensorDataset(torch.LongTensor(input_ids[train_idx]).to(device),
                               torch.LongTensor(target_ids[train_idx]).to(device))
    val_data = TensorDataset(torch.LongTensor(input_ids[val_idx]).to(device),
                               torch.LongTensor(target_ids[val_idx]).to(device))


    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    return train_dataloader, val_dataloader