In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from tqdm import tqdm

from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
from torch.optim import Adam

import nltk
from nltk import word_tokenize
nltk.download('punkt')

import matplotlib.pyplot as plt

import os

# Remember the directory the kernel started in, so that re-running this cell
# always ends up in the same place. A bare os.chdir("..") climbs one more
# level on every execution and breaks the relative paths used further down.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))  # go to the root dir

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; `!pip` runs whichever pip happens to be first on the shell PATH.
%pip install torchtext

[0m

In [3]:
# Read the preprocessed, tab-separated dataset; the first column of the file
# becomes the DataFrame index. The relative path relies on the working
# directory having been moved to the project root in the first cell.
df = pd.read_csv('data/interim/preprocessed_paranmt.tsv', sep='\t', index_col=0)

In [4]:
# Display the loaded frame (the rendered output shows only the first and last rows).
df

Unnamed: 0,similarity,lenght_diff,toxic_sent,neutral_sent,toxic_val,neutral_val
0,0.785171,0.010309,"if alkar floods her with her mental waste , it...","if alkar is flooding her with psychic waste , ...",0.981983,0.014195
1,0.749687,0.071429,you 're becoming disgusting .,now you 're getting nasty .,0.999039,0.065473
2,0.919051,0.268293,"well , we can spare your life .","well , we could spare your life , for one .",0.985068,0.213313
3,0.664333,0.309524,"monkey , you have to wake up .","ah ! monkey , you 've got to snap out of it .",0.994215,0.053362
4,0.726639,0.181818,i have orders to kill her .,i 've got orders to put her down .,0.999348,0.009402
...,...,...,...,...,...,...
577772,0.870322,0.030769,you did n't know that estelle stole your fish ...,you did n't know that estelle had stolen some ...,0.949143,0.000121
577773,0.722897,0.058824,it'il suck the life out of you !,you 'd be sucked out of your life !,0.996124,0.215794
577774,0.617511,0.212121,"i ca n't fuckin ' take that , bruv .",i really ca n't take this .,0.984538,0.000049
577775,0.679613,0.358209,they called me a fucking hero . the truth is i...,"they said i was a hero , but i did n't care .",0.991945,0.000124


In [5]:
# Convert the toxic_sent and neutral_sent columns from space-separated strings
# into lists of tokens.
def split_on_spaces(sentence):
    """Split `sentence` on single spaces; return already-split lists unchanged.

    The pass-through makes this cell safe to re-run: without it, a second
    execution would call `.split` on the lists produced by the first one and
    raise AttributeError.
    """
    if isinstance(sentence, list):
        return sentence
    return sentence.split(' ')


df.loc[:, 'toxic_sent'] = df['toxic_sent'].apply(split_on_spaces)
df.loc[:, 'neutral_sent'] = df['neutral_sent'].apply(split_on_spaces)

In [6]:
# Preview the first rows to confirm both sentence columns now hold token lists.
df.head()

Unnamed: 0,similarity,lenght_diff,toxic_sent,neutral_sent,toxic_val,neutral_val
0,0.785171,0.010309,"[if, alkar, floods, her, with, her, mental, wa...","[if, alkar, is, flooding, her, with, psychic, ...",0.981983,0.014195
1,0.749687,0.071429,"[you, 're, becoming, disgusting, .]","[now, you, 're, getting, nasty, .]",0.999039,0.065473
2,0.919051,0.268293,"[well, ,, we, can, spare, your, life, .]","[well, ,, we, could, spare, your, life, ,, for...",0.985068,0.213313
3,0.664333,0.309524,"[monkey, ,, you, have, to, wake, up, .]","[ah, !, monkey, ,, you, 've, got, to, snap, ou...",0.994215,0.053362
4,0.726639,0.181818,"[i, have, orders, to, kill, her, .]","[i, 've, got, orders, to, put, her, down, .]",0.999348,0.009402


In [7]:
# Work on the first 100,000 rows only, to keep the notebook fast.
df = df.iloc[:100_000]

## Build the Dataloaders

In [8]:
# Number of sentence pairs per batch; used by both the train and validation DataLoader.
batch_size = 16

In [9]:
# Split row positions (0 .. len(df)-1, not the rows themselves) into train and
# validation sets, 80% / 20%. The fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

train_idx, val_idx = train_test_split(range(len(df)), train_size=0.8, random_state=42)

## Let's build the vocab

In [10]:
# Upper bound on the vocabulary size; passed as `max_tokens` when the vocab is built.
MAX_TOKENS = 20_000 # TODO: change this
# Fixed sequence length (in tokens, <sos>/<eos> included) that collate_batch
# truncates or pads every sentence to.
MAX_SENT_SIZE = 128

In [11]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(df):
    """Yield the token sequences of `df`, one sentence at a time.

    For every row, the `toxic_sent` tokens are yielded first, followed by a
    list copy of the `neutral_sent` tokens. Rows are visited in frame order.

    The two columns are iterated directly rather than looked up label by
    label. That avoids a per-row index lookup, and still works when index
    labels repeat (a label lookup would then return a Series, not one row's
    token list).
    """
    for toxic_sent, neutral_sent in zip(df['toxic_sent'], df['neutral_sent']):
        yield toxic_sent
        yield list(neutral_sent)


# Build the vocabulary from the training rows only, to avoid data leakage.
# train_idx holds row positions (it was produced from range(len(df))), so the
# rows are selected positionally with .iloc rather than by index label.
vocab = build_vocab_from_iterator(
    yield_tokens(df.iloc[train_idx]),
    min_freq=2,
    specials=['<unk>', '<pad>', '<sos>', '<eos>'],
    max_tokens=MAX_TOKENS,
)

# Take the <unk> index from the vocab itself instead of hard-coding 0, so it
# stays correct if the order of `specials` ever changes. Out-of-vocabulary
# tokens are mapped to it.
UNK_IDX = vocab['<unk>']
vocab.set_default_index(UNK_IDX)

In [12]:
# Vocabulary size; the recorded run shows 20000, i.e. the MAX_TOKENS cap was reached.
len(vocab)

20000

In [13]:
def collate_batch(batch):
    """Turn a list of dataset rows into padded, batched tensors.

    Each row is unpacked as (similarity, len_diff, toxic_sent, neutral_sent,
    toxic_val, neutral_val). Both token lists are clipped to
    MAX_SENT_SIZE - 2 tokens, wrapped in <sos>/<eos>, mapped to vocabulary
    indices and right-padded with <pad> up to MAX_SENT_SIZE.

    Returns the six fields in the same order as the row layout. The two
    sentence tensors have shape (MAX_SENT_SIZE, len(batch)); the other four
    are 1-D tensors with one entry per row.
    """
    pad_idx = vocab['<pad>']

    def encode(tokens):
        # Build one (MAX_SENT_SIZE, 1) column of token ids for a single sentence.
        ids = vocab.lookup_indices(['<sos>'] + tokens[:MAX_SENT_SIZE - 2] + ['<eos>'])
        ids.extend([pad_idx] * (MAX_SENT_SIZE - len(ids)))
        return torch.tensor(ids).reshape(MAX_SENT_SIZE, 1)

    sims, diffs = [], []
    toxic_cols, neutral_cols = [], []
    toxic_scores, neutral_scores = [], []

    for sim, diff, toxic_tokens, neutral_tokens, toxic_score, neutral_score in batch:
        sims.append(sim)
        diffs.append(diff)
        toxic_scores.append(toxic_score)
        neutral_scores.append(neutral_score)
        toxic_cols.append(encode(toxic_tokens))
        neutral_cols.append(encode(neutral_tokens))

    # Concatenate the per-sentence columns side by side: one column per row of the batch.
    toxic_sent = torch.cat(toxic_cols, dim=1)
    neutral_sent = torch.cat(neutral_cols, dim=1)

    return (
        torch.tensor(sims),
        torch.tensor(diffs),
        toxic_sent,
        neutral_sent,
        torch.tensor(toxic_scores),
        torch.tensor(neutral_scores),
    )

# train_idx / val_idx are row positions produced by
# train_test_split(range(len(df))), so the rows are selected positionally with
# .iloc. Label-based .loc only gives the same rows while the index happens to
# be exactly 0..len(df)-1.
# Each dataset is a plain object array of rows; collate_batch turns a list of
# rows into tensors.
train_dataloader = DataLoader(
    df.iloc[train_idx].to_numpy(),
    batch_size=batch_size,
    shuffle=True,  # reshuffle the training rows every epoch
    num_workers=2,
    collate_fn=collate_batch,
)

val_dataloader = DataLoader(
    df.iloc[val_idx].to_numpy(),
    batch_size=batch_size,
    shuffle=False,  # keep validation order fixed
    num_workers=2,
    collate_fn=collate_batch,
)

In [14]:
# Sanity check: pull a single batch from the training loader and print the
# shape of every field. The sentence tensors are expected to be
# (MAX_SENT_SIZE, batch_size); the four scalar fields (batch_size,).
for batch in train_dataloader:
    similarity, len_diff, toxic_sent, neutral_sent, toxic_val, neutral_val = batch
    print("similarity.shape:", similarity.shape)
    print("len_diff.shape:", len_diff.shape)
    print("toxic_sent.shape:", toxic_sent.shape)
    print("neutral_sent.shape:", neutral_sent.shape)
    print("toxic_val.shape:", toxic_val.shape)
    print("neutral_val.shape:", neutral_val.shape)
    # Only the first batch is needed for the check.
    break

similarity.shape: torch.Size([16])
len_diff.shape: torch.Size([16])
toxic_sent.shape: torch.Size([128, 16])
neutral_sent.shape: torch.Size([128, 16])
toxic_val.shape: torch.Size([16])
neutral_val.shape: torch.Size([16])


In [15]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the Model

- Simple Encoder-Decoder (Seq2Seq) architecture

In [16]:
from src.models.models.encoder import Encoder
from src.models.models.decoder import Decoder
from src.models.models import Seq2Seq

In [23]:
# Source and target sentences share one vocabulary, so the encoder input size
# and the decoder output size are both the vocabulary size.
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
# Forwarded to both Encoder and Decoder as embed_dim / hidden_dim / num_layers / dropout.
EMBED_DIM = 128
NUM_HIDDEN = 256
N_LAYERS = 2
DROPOUT = 0.5
# Index of the <pad> token; forwarded to both Encoder and Decoder as padding_idx.
PADDING_IDX = vocab['<pad>']

In [24]:
# Instantiate the two halves of the model from the hyperparameters defined
# above, then move each of them to the selected device.
encoder = Encoder(
    input_dim=INPUT_DIM,
    embed_dim=EMBED_DIM,
    hidden_dim=NUM_HIDDEN,
    num_layers=N_LAYERS,
    dropout=DROPOUT,
    padding_idx=PADDING_IDX,
)
encoder = encoder.to(device)

decoder = Decoder(
    output_dim=OUTPUT_DIM,
    embed_dim=EMBED_DIM,
    hidden_dim=NUM_HIDDEN,
    num_layers=N_LAYERS,
    dropout=DROPOUT,
    padding_idx=PADDING_IDX,
)
decoder = decoder.to(device)

In [25]:
# Combine the encoder and decoder into one Seq2Seq module and move it to `device`.
model = Seq2Seq(encoder=encoder, decoder=decoder, device=device)
model = model.to(device)

# Best loss seen so far: starts at +inf, is passed into `train` and replaced
# by the value `train` returns.
best_loss = float('inf')

In [26]:
# Cross-entropy over the vocabulary; targets equal to the <pad> index do not
# contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PADDING_IDX)
# Adam over all model parameters with a fixed learning rate.
optimizer = Adam(params=model.parameters(), lr=3e-4)

In [28]:
# NOTE(review): the recorded run completed the first training epoch but failed
# at the very start of evaluation with "NameError: name 'neutral' is not defined".
# No cell in this notebook uses a name `neutral`, so the error presumably
# originates inside src.models.train_model. Fix it there before relying on the
# returned `best_loss`.
from src.models.train_model import train

# Train for 3 epochs on the (train, validation) loader pair; the returned value
# replaces the running best loss.
best_loss = train(
    model=model,
    loaders=(train_dataloader, val_dataloader),
    optimizer=optimizer,
    criterion=criterion,
    epochs=3,
    device=device,
    best_loss=best_loss,
)

Training 1: 100%|██████████| 5000/5000 [23:39<00:00,  3.52it/s, loss=4.94]
Evaluating 1:   0%|          | 0/1250 [00:00<?, ?it/s]


NameError: name 'neutral' is not defined