In [1]:
import pandas as pd
import numpy as np

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.optim import Adam

import nltk
from nltk import word_tokenize
nltk.download('punkt')

import matplotlib.pyplot as plt

import os
# NOTE: relative and not idempotent - re-running this cell moves up one more directory level.
os.chdir("..") # go to the root dir

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the preprocessed sentence pairs: tab-separated file, first column used as the row index.
dataframe = pd.read_csv(
    'data/interim/preprocessed_paranmt.tsv',
    index_col=0,
    sep='\t',
)

In [3]:
dataframe

Unnamed: 0,similarity,lenght_diff,toxic_sent,neutral_sent,toxic_val,neutral_val
0,0.785171,0.010309,"if alkar floods her with her mental waste , it...","if alkar is flooding her with psychic waste , ...",0.981983,0.014195
1,0.749687,0.071429,you 're becoming disgusting .,now you 're getting nasty .,0.999039,0.065473
2,0.919051,0.268293,"well , we can spare your life .","well , we could spare your life , for one .",0.985068,0.213313
3,0.664333,0.309524,"monkey , you have to wake up .","ah ! monkey , you 've got to snap out of it .",0.994215,0.053362
4,0.726639,0.181818,i have orders to kill her .,i 've got orders to put her down .,0.999348,0.009402
...,...,...,...,...,...,...
577772,0.870322,0.030769,you did n't know that estelle stole your fish ...,you did n't know that estelle had stolen some ...,0.949143,0.000121
577773,0.722897,0.058824,it'il suck the life out of you !,you 'd be sucked out of your life !,0.996124,0.215794
577774,0.617511,0.212121,"i ca n't fuckin ' take that , bruv .",i really ca n't take this .,0.984538,0.000049
577775,0.679613,0.358209,they called me a fucking hero . the truth is i...,"they said i was a hero , but i did n't care .",0.991945,0.000124


In [4]:
# Turn both sentence columns from space-separated strings into lists of tokens.
# We work on a copy so the raw `dataframe` stays untouched.
df = dataframe.copy()
for column in ('toxic_sent', 'neutral_sent'):
    df[column] = df[column].map(lambda sentence: sentence.split(' '))

In [5]:
df.head()

Unnamed: 0,similarity,lenght_diff,toxic_sent,neutral_sent,toxic_val,neutral_val
0,0.785171,0.010309,"[if, alkar, floods, her, with, her, mental, wa...","[if, alkar, is, flooding, her, with, psychic, ...",0.981983,0.014195
1,0.749687,0.071429,"[you, 're, becoming, disgusting, .]","[now, you, 're, getting, nasty, .]",0.999039,0.065473
2,0.919051,0.268293,"[well, ,, we, can, spare, your, life, .]","[well, ,, we, could, spare, your, life, ,, for...",0.985068,0.213313
3,0.664333,0.309524,"[monkey, ,, you, have, to, wake, up, .]","[ah, !, monkey, ,, you, 've, got, to, snap, ou...",0.994215,0.053362
4,0.726639,0.181818,"[i, have, orders, to, kill, her, .]","[i, 've, got, orders, to, put, her, down, .]",0.999348,0.009402


In [6]:
# Drop the rows in which either sentence has too many tokens (for speed).
# The limit is MAX_SENT_SIZE - 2 tokens per sentence.

MAX_SENT_SIZE = 32

token_budget = MAX_SENT_SIZE - 2
toxic_fits = df['toxic_sent'].map(len) <= token_budget
neutral_fits = df['neutral_sent'].map(len) <= token_budget
df = df[toxic_fits & neutral_fits]

In [7]:
df

Unnamed: 0,similarity,lenght_diff,toxic_sent,neutral_sent,toxic_val,neutral_val
0,0.785171,0.010309,"[if, alkar, floods, her, with, her, mental, wa...","[if, alkar, is, flooding, her, with, psychic, ...",0.981983,0.014195
1,0.749687,0.071429,"[you, 're, becoming, disgusting, .]","[now, you, 're, getting, nasty, .]",0.999039,0.065473
2,0.919051,0.268293,"[well, ,, we, can, spare, your, life, .]","[well, ,, we, could, spare, your, life, ,, for...",0.985068,0.213313
3,0.664333,0.309524,"[monkey, ,, you, have, to, wake, up, .]","[ah, !, monkey, ,, you, 've, got, to, snap, ou...",0.994215,0.053362
4,0.726639,0.181818,"[i, have, orders, to, kill, her, .]","[i, 've, got, orders, to, put, her, down, .]",0.999348,0.009402
...,...,...,...,...,...,...
577772,0.870322,0.030769,"[you, did, n't, know, that, estelle, stole, yo...","[you, did, n't, know, that, estelle, had, stol...",0.949143,0.000121
577773,0.722897,0.058824,"[it'il, suck, the, life, out, of, you, !]","[you, 'd, be, sucked, out, of, your, life, !]",0.996124,0.215794
577774,0.617511,0.212121,"[i, ca, n't, fuckin, ', take, that, ,, bruv, .]","[i, really, ca, n't, take, this, .]",0.984538,0.000049
577775,0.679613,0.358209,"[they, called, me, a, fucking, hero, ., the, t...","[they, said, i, was, a, hero, ,, but, i, did, ...",0.991945,0.000124


In [8]:
# Keep only the first 150,000 remaining rows (positional slice, not label-based).
df = df.iloc[:150_000]

## Build the Dataloaders

In [9]:
# Number of rows per batch, used by both the training and the validation DataLoader.
batch_size = 16

In [10]:
# split into train and val
from sklearn.model_selection import train_test_split

# 90% of the row labels go to training, the rest to validation (fixed seed for reproducibility).
all_idx = list(df.index)
train_idx, val_idx = train_test_split(all_idx, train_size=0.9, random_state=42)

In [11]:
len(train_idx), len(val_idx)

(135000, 15000)

## Let's build the vocab

In [12]:
# Upper bound on the vocabulary size; passed as `max_tokens` when the vocab is built.
MAX_TOKENS = 25_000 # TODO: change this

In [13]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(df):
    """Yield the token sequences of every sentence pair, toxic sentence first.

    Args:
        df: frame with `toxic_sent` and `neutral_sent` columns whose cells are
            token sequences.

    Yields:
        For each row, in row order: the toxic token sequence exactly as stored
        in the frame, then a list copy of the neutral token sequence.
    """
    # Walk both columns positionally instead of doing a label lookup per row:
    # it is much cheaper, and it still yields exactly one cell per row when
    # index labels repeat (a label lookup would return a whole Series then).
    for toxic_sent, neutral_sent in zip(df['toxic_sent'], df['neutral_sent']):
        yield toxic_sent
        yield list(neutral_sent)


# '<unk>' is listed first in `specials` below; UNK_IDX is used as the default index.
UNK_IDX = 0
# Only the training rows feed the vocabulary, to avoid data leakage from validation.
train_token_stream = yield_tokens(df.loc[train_idx])
vocab = build_vocab_from_iterator(
    train_token_stream,
    specials=['<unk>', '<pad>', '<sos>', '<eos>'],
    min_freq=2,
    max_tokens=MAX_TOKENS,
)
# Tokens missing from the vocabulary are looked up as UNK_IDX.
vocab.set_default_index(UNK_IDX)

In [14]:
len(vocab)

25000

In [15]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

In [16]:
def _encode_sentence(tokens):
    """Turn one token list into a 1-D tensor of MAX_SENT_SIZE vocabulary indices."""
    # wrap in <sos>/<eos>, truncating so the framed sentence never exceeds MAX_SENT_SIZE
    framed = ['<sos>'] + tokens[:MAX_SENT_SIZE-2] + ['<eos>']
    indices = vocab.lookup_indices(framed)
    # right-pad with <pad> up to the fixed length
    indices.extend([vocab['<pad>']] * (MAX_SENT_SIZE - len(indices)))
    return torch.tensor(indices)


def collate_batch(batch):
    """Collate dataset rows into batch tensors placed on `device`.

    Each row must unpack into
    (similarity, len_diff, toxic_sent, neutral_sent, toxic_val, neutral_val).

    Returns:
        (similarity, len_diff, toxic_sent, neutral_sent, toxic_val, neutral_val):
        the two sentence tensors have shape (MAX_SENT_SIZE, batch), the other
        four are 1-D tensors with one entry per row.
    """
    sims, diffs, tox_scores, neu_scores = [], [], [], []
    tox_columns, neu_columns = [], []

    for sim, diff, tox_tokens, neu_tokens, tox_score, neu_score in batch:
        sims.append(sim)
        diffs.append(diff)
        tox_scores.append(tox_score)
        neu_scores.append(neu_score)
        tox_columns.append(_encode_sentence(tox_tokens))
        neu_columns.append(_encode_sentence(neu_tokens))

    # stacking along dim=1 puts one sentence per column: (MAX_SENT_SIZE, batch)
    toxic_sent = torch.stack(tox_columns, dim=1).to(device)
    neutral_sent = torch.stack(neu_columns, dim=1).to(device)

    return (
        torch.tensor(sims, device=device),
        torch.tensor(diffs, device=device),
        toxic_sent,
        neutral_sent,
        torch.tensor(tox_scores, device=device),
        torch.tensor(neu_scores, device=device),
    )

# Both loaders share the batch size and collate function; only the training one is shuffled.
loader_options = dict(batch_size=batch_size, collate_fn=collate_batch)

train_rows = df.loc[train_idx].to_numpy()
train_dataloader = DataLoader(train_rows, shuffle=True, **loader_options)

val_rows = df.loc[val_idx].to_numpy()
val_dataloader = DataLoader(val_rows, shuffle=False, **loader_options)

In [17]:
# Sanity check: take the first training batch and print the shape of each tensor in it.
for batch in train_dataloader:
    similarity, len_diff, toxic_sent, neutral_sent, toxic_val, neutral_val = batch
    named_tensors = {
        "similarity": similarity,
        "len_diff": len_diff,
        "toxic_sent": toxic_sent,
        "neutral_sent": neutral_sent,
        "toxic_val": toxic_val,
        "neutral_val": neutral_val,
    }
    for label, tensor in named_tensors.items():
        print(f"{label}.shape:", tensor.shape)
    break

similarity.shape: torch.Size([16])
len_diff.shape: torch.Size([16])
toxic_sent.shape: torch.Size([32, 16])
neutral_sent.shape: torch.Size([32, 16])
toxic_val.shape: torch.Size([16])
neutral_val.shape: torch.Size([16])


# Load the Model

- Simple Encoder-Decoder (Seq2Seq) architecture

In [18]:
from src.models.attention.encoder import Encoder
from src.models.attention.decoder import Decoder
from src.models.attention.attention import Attention
from src.models.attention import Seq2SeqAttention

In [19]:
# Hyper-parameters for the model; encoder input and decoder output both span the full vocabulary.
padding_idx = vocab['<pad>']
vocab_size = len(vocab)

## Encoder
enc_input_dim = vocab_size
enc_embed_dim, enc_hidden_dim = 128, 256
enc_dropout = 0.5

## Decoder
dec_output_dim = vocab_size
dec_embed_dim, dec_hidden_dim = 128, 256
dec_dropout = 0.5

In [20]:
# Build the encoder, the attention module and the decoder from the settings above.
# The attention module is handed to the decoder rather than moved to `device` on its own.
encoder = Encoder(
    input_dim=enc_input_dim,
    embed_dim=enc_embed_dim,
    hidden_dim=enc_hidden_dim,
    dec_hidden_dim=dec_hidden_dim,
    padding_idx=padding_idx,
    dropout=enc_dropout,
)

attention = Attention(enc_hidden_dim, dec_hidden_dim)

decoder = Decoder(
    output_dim=dec_output_dim,
    embed_dim=dec_embed_dim,
    hidden_dim=dec_hidden_dim,
    enc_hidden_dim=enc_hidden_dim,
    attention=attention,
    padding_idx=padding_idx,
    dropout=dec_dropout,
)

# move both halves of the model to the selected device
encoder = encoder.to(device)
decoder = decoder.to(device)

In [21]:
# Starting value handed to train(); presumably compared against the validation
# loss there - confirm in src.models.train_model.
best_loss = float('inf')

# Assemble the full seq2seq model from the encoder and decoder built above.
model = Seq2SeqAttention(
    encoder=encoder,
    decoder=decoder,
    vocab=vocab,
    max_sent_size=MAX_SENT_SIZE,
    device=device,
)
model = model.to(device)

In [22]:
# The loss skips <pad> targets; Adam optimises every model parameter.
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = Adam(params=model.parameters(), lr=3e-4)

In [23]:
from src.models.train_model import train

# Run training: 10 epochs, clip_grad=1, checkpoint path 'models/attention2.pt'.
# The value returned by train() replaces best_loss.
best_loss = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=(train_dataloader, val_dataloader),
    device=device,
    epochs=10,
    clip_grad=1,
    best_loss=best_loss,
    ckpt_path='models/attention2.pt',
)

Training 1: 100%|██████████| 8438/8438 [12:26<00:00, 11.31it/s, loss=4.58]
Evaluating 1: 100%|██████████| 938/938 [00:15<00:00, 60.17it/s, loss=4.43]
Training 2: 100%|██████████| 8438/8438 [12:26<00:00, 11.31it/s, loss=3.94]
Evaluating 2: 100%|██████████| 938/938 [00:15<00:00, 59.78it/s, loss=4.32]
Training 3: 100%|██████████| 8438/8438 [12:25<00:00, 11.31it/s, loss=3.69]
Evaluating 3: 100%|██████████| 938/938 [00:15<00:00, 60.11it/s, loss=4.31]
Training 4: 100%|██████████| 8438/8438 [12:26<00:00, 11.30it/s, loss=3.53]
Evaluating 4: 100%|██████████| 938/938 [00:15<00:00, 59.87it/s, loss=4.32]
Training 5: 100%|██████████| 8438/8438 [12:27<00:00, 11.29it/s, loss=3.4] 
Evaluating 5: 100%|██████████| 938/938 [00:15<00:00, 59.73it/s, loss=4.31]
Training 6: 100%|██████████| 8438/8438 [12:26<00:00, 11.30it/s, loss=3.31]
Evaluating 6: 100%|██████████| 938/938 [00:15<00:00, 59.42it/s, loss=4.29]
Training 7: 100%|██████████| 8438/8438 [12:26<00:00, 11.30it/s, loss=3.23]
Evaluating 7: 100%|██████

In [24]:
# let's load the model and predict
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a GPU machine can still be loaded on a CPU-only one.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code -
# only load checkpoints you produced yourself.
model = torch.load('models/attention2.pt', map_location=device)
model.to(device)
model.eval()

Seq2SeqAttention(
  (encoder): Encoder(
    (embedding): Embedding(25000, 128, padding_idx=1)
    (rnn): GRU(128, 256, bidirectional=True)
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(25000, 128, padding_idx=1)
    (rnn): GRU(640, 256)
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
      (v): Linear(in_features=256, out_features=1, bias=False)
    )
    (dropout): Dropout(p=0.5, inplace=False)
    (fc_out): Linear(in_features=896, out_features=25000, bias=True)
  )
  (vocab): Vocab()
)

In [34]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

# let's see how our model works on randomly drawn validation pairs
num_examples = 10
for _ in range(num_examples):
    idx = val_idx[np.random.randint(0, len(val_idx))]
    # Look the row up directly by its label. Selecting all validation rows
    # first on every iteration returns the same cell but is far more expensive.
    toxic_sent = detokenizer.detokenize(df.loc[idx, 'toxic_sent'])
    neutral_sent = detokenizer.detokenize(df.loc[idx, 'neutral_sent'])

    print('toxic_sent:', toxic_sent)
    print('neutral_sent:', neutral_sent)
    print('prediction:', model.predict(toxic_sent))
    print("\n")

toxic_sent: "i will prove him disgraceful in his own body."
neutral_sent: "i will prove his villainy upon his body."
prediction: "i'll prove him in his body body."


toxic_sent: fuck man, i'm so sorry.
neutral_sent: man, i'm sorry.
prediction: man, i'm sorry.


toxic_sent: carl, i want you to jump out of there.
neutral_sent: carl, i want you to hop on out of there.
prediction: carl, i want you to jump out of there.


toxic_sent: tara, let's go . fuck off.
neutral_sent: tara, let's go.
prediction: tara, let's go.


toxic_sent: jantar's too fat, he'd probably push the camera inside.
neutral_sent: the amber's too thick . it'll probably push the camera further in.
prediction: <unk>'s too thick thick, he probably probably probably probably the the inside.


toxic_sent: he was a drug dealer.
neutral_sent: he was a drug dealer
prediction: he was dealer drug dealer.


toxic_sent: damn the french and their comfortable beds!
neutral_sent: bloody comfortable french beds!
prediction: goddam french