In [1]:
# Artefact locations (relative paths; the notebook later changes into the project root).
DATASET_PATH = 'data/interim/preprocessed_paranmt.tsv'
MODEL_CHECKPOINT = 'models/transformer2.01.pt'
LOAD_MODEL = MODEL_CHECKPOINT  # same file as the checkpoint that training writes to

In [2]:
import numpy as np

from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.optim import Adam

import os

# Move to the project root so that the `src.*` imports and the relative
# data/model paths resolve. The guard keeps the cell idempotent: re-running it
# must not climb one more directory level each time.
if not os.path.isdir("src"):
    os.chdir("..") # go to the root dir

## Get the Dataset

In [3]:
# Vocabulary cap handed to `build_vocab(max_tokens=...)`.
MAX_TOKENS = 10_000
# Sentence-size limit handed to the datasets and the model as `max_sent_size` / `max_size`.
MAX_SENT_SIZE = 32

In [4]:
from src.data.make_dataset import ParanmtDataset

# Training split of the preprocessed ParaNMT file; the seed is fixed so the
# split is the same on every run.
train_dataset = ParanmtDataset(
    train=True,
    seed=42,
    path=DATASET_PATH,
    max_sent_size=MAX_SENT_SIZE,
)

In [5]:
# Build the vocabularies from the training split, capped at MAX_TOKENS entries
# and reserving the four special tokens.
train_dataset.build_vocab(
    specials=['<unk>', '<pad>', '<sos>', '<eos>'],
    max_tokens=MAX_TOKENS,
    min_freq=2,
)

In [6]:
# The toxic-side vocabulary feeds the encoder, the neutral-side one the decoder.
enc_vocab, dec_vocab = train_dataset.toxic_vocab, train_dataset.neutral_vocab

In [7]:
# Report how many entries each vocabulary ended up with.
print(f"size of encoder vocab: {len(enc_vocab)}")
print(f"size of decoder vocab: {len(dec_vocab)}")

size of encoder vocab: 10000
size of decoder vocab: 10000


In [8]:
# Validation split: reuses the vocabularies built on the training split and
# keeps only the first 10,000 examples.
val_dataset = ParanmtDataset(
    train=False,
    seed=42,
    take_first=10_000,
    path=DATASET_PATH,
    max_sent_size=MAX_SENT_SIZE,
    vocabs=(enc_vocab, dec_vocab), # avoid data leakage
)

In [9]:
# (number of training examples, number of validation examples)
tuple(len(split) for split in (train_dataset, val_dataset))

(470052, 10000)

## Let's create the DataLoaders

In [10]:
# Number of sentence pairs per batch, used by both the train and validation DataLoaders.
batch_size = 128

In [11]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=batch_size,
    num_workers=4,
)

# Validation batches keep the dataset order.
val_dataloader = DataLoader(
    dataset=val_dataset,
    shuffle=False,
    batch_size=batch_size,
    num_workers=4,
)

In [12]:
# Sanity check: look at the first training batch only and print the shape of
# both tensors in the (toxic, neutral) pair.
for batch in train_dataloader:
    toxic_sent, neutral_sent = batch
    print(f"toxic_sent.shape: {toxic_sent.shape}")
    print(f"neutral_sent.shape: {neutral_sent.shape}")
    break

toxic_sent.shape: torch.Size([128, 32])
neutral_sent.shape: torch.Size([128, 32])


In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

# Load the Model

- Transformer architecture

In [14]:
from src.models.transformer.encoder import Encoder
from src.models.transformer.decoder import Decoder
from src.models.transformer import Transformer

In [15]:
# configure some parameters for the model

## Encoder: input size and padding index are taken from the encoder vocabulary
enc_padding_idx = enc_vocab['<pad>']
enc_input_dim = len(enc_vocab)
enc_num_layers = 3
enc_dropout = 0.1

## Decoder: output size and padding index are taken from the decoder vocabulary
dec_padding_idx = dec_vocab['<pad>']
dec_output_dim = len(dec_vocab)
dec_num_layers = 3
dec_dropout = 0.1

## Values passed unchanged to both the encoder and the decoder
max_size = MAX_SENT_SIZE
hidden_dim = 256
ff_expantion = 4
heads = 4

In [16]:
# Instantiate the two halves of the model and move each onto `device`.
# Both share hidden_dim, heads, ff_expantion and max_size; the vocabulary,
# depth and dropout are configured per side.
encoder = Encoder(
    vocab=enc_vocab,
    input_dim=enc_input_dim,
    num_layers=enc_num_layers,
    dropout=enc_dropout,
    hidden_dim=hidden_dim,
    heads=heads,
    ff_expantion=ff_expantion,
    max_size=max_size,
    device=device,
).to(device)

decoder = Decoder(
    vocab=dec_vocab,
    output_dim=dec_output_dim,
    num_layers=dec_num_layers,
    dropout=dec_dropout,
    hidden_dim=hidden_dim,
    heads=heads,
    ff_expantion=ff_expantion,
    max_size=max_size,
    device=device,
).to(device)



In [17]:
# Wrap the encoder/decoder pair in the full sequence-to-sequence model.
model = Transformer(
    device=device,
    max_sent_size=MAX_SENT_SIZE,
    encoder=encoder,
    decoder=decoder,
).to(device)

# Starting value for the best loss handed to `train`; any finite loss is below it.
best_loss = float('inf')

In [18]:
# Loss: cross-entropy over decoder tokens, skipping positions that hold the padding index.
criterion = nn.CrossEntropyLoss(ignore_index=decoder.padding_idx)
# Optimiser: AdamW over every model parameter.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [19]:
from src.models.train_model import train

# Run training with the (train, validation) loaders for 20 epochs. The current
# best loss and the checkpoint path are passed in, and the call returns the
# updated best loss.
best_loss = train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=(train_dataloader, val_dataloader),
    device=device,
    epochs=20,
    ckpt_path=MODEL_CHECKPOINT,
    best_loss=best_loss,
)

Training 1: 100%|██████████| 3673/3673 [03:17<00:00, 18.63it/s, loss=4.28]
Evaluating 1: 100%|██████████| 79/79 [00:01<00:00, 55.53it/s, loss=3.62]
Training 2: 100%|██████████| 3673/3673 [03:17<00:00, 18.59it/s, loss=3.53]
Evaluating 2: 100%|██████████| 79/79 [00:01<00:00, 56.64it/s, loss=3.28]
Training 3: 100%|██████████| 3673/3673 [03:16<00:00, 18.67it/s, loss=3.27]
Evaluating 3: 100%|██████████| 79/79 [00:01<00:00, 55.58it/s, loss=3.07]
Training 4: 100%|██████████| 3673/3673 [03:16<00:00, 18.71it/s, loss=3.09]
Evaluating 4: 100%|██████████| 79/79 [00:01<00:00, 55.78it/s, loss=2.91]
Training 5: 100%|██████████| 3673/3673 [03:16<00:00, 18.74it/s, loss=2.96]
Evaluating 5: 100%|██████████| 79/79 [00:01<00:00, 55.41it/s, loss=2.8] 
Training 6: 100%|██████████| 3673/3673 [03:16<00:00, 18.67it/s, loss=2.85]
Evaluating 6: 100%|██████████| 79/79 [00:01<00:00, 56.60it/s, loss=2.7] 
Training 7: 100%|██████████| 3673/3673 [03:17<00:00, 18.64it/s, loss=2.76]
Evaluating 7: 100%|██████████| 79/79 

In [20]:
# let's load the model and predict
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be opened on a CPU-only machine.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load
# checkpoints you produced yourself. Recent PyTorch releases may also require
# `weights_only=False` for a pickled model object - confirm for your version.
model = torch.load(MODEL_CHECKPOINT, map_location=device)
model.to(device)
model.eval()  # switch to inference mode before predicting
None  # keeps the cell from echoing the model repr

In [21]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

# let's see how our model works
num_examples = 10
num_sentence = 3
dataset = val_dataset

# A seeded generator makes the sampled examples identical on every
# "Restart & Run All", so the printed predictions can be compared across runs.
rng = np.random.default_rng(42)

for _ in range(num_examples):
    # draw a random row of the validation frame (upper bound is exclusive)
    idx = int(rng.integers(0, len(dataset)))
    toxic_sent = detokenizer.detokenize(dataset.df.loc[idx, 'toxic_sent'])
    neutral_sent = detokenizer.detokenize(dataset.df.loc[idx, 'neutral_sent'])
    
    print('toxic_sent:', toxic_sent)
    print('neutral_sent:', neutral_sent)
    
    # let's use beam search
    # i turned off postprocess_text on purpose 
    # to see everything (postprocess_text removes some tokens and detokenize the sentence)
    preds = model.predict(
        toxic_sent,
        use_beam_search=True,
        num_candidates=num_sentence,
        post_process_text=False
    )
    print("predictions:")
    # Slice instead of indexing 0..num_sentence-1 so that a beam search that
    # returns fewer candidates than requested does not raise IndexError.
    for rank, candidate in enumerate(preds[:num_sentence], start=1):
        print(f"\t{rank})", candidate)
    print("\n")

toxic_sent: you earn a living by torturing people.
neutral_sent: you torture people for a living.
predictions:
	1) ['<sos>', 'you', 'are', '<unk>', 'by', 'torturing', 'people', '.', '<eos>']
	2) ['<sos>', 'you', 'are', 'afraid', 'of', 'being', 'tortured', 'people', '.', '<eos>']
	3) ['<sos>', 'you', 'earn', 'a', 'living', 'person', '.', '<eos>']


toxic_sent: i do not give a damn about his education.
neutral_sent: i do not care about his education.
predictions:
	1) ['<sos>', 'i', 'do', 'not', 'care', 'about', 'his', 'education', '.', '<eos>']
	2) ['<sos>', 'i', 'do', 'not', 'care', 'about', 'his', 'education', 'education', '.', '<eos>']
	3) ['<sos>', 'i', 'do', 'not', 'care', 'about', 'the', 'education', '.', '<eos>']


toxic_sent: why did not you have sex with her?
neutral_sent: why did not you give her one?
predictions:
	1) ['<sos>', 'why', 'did', 'not', 'you', 'sleep', 'with', 'her', '?', '<eos>']
	2) ['<sos>', 'why', 'did', 'not', 'you', 'sleep', 'with', 'it', '?', '<eos>']
	3) ['<

In [22]:
from torchtext.data.metrics import bleu_score
from tqdm import tqdm

def _strip_special_tokens(tokens, sos='<sos>', eos='<eos>'):
    """Drop a leading `sos` token and everything from the first `eos` onwards."""
    tokens = list(tokens)
    if tokens and tokens[0] == sos:
        tokens = tokens[1:]
    if eos in tokens:
        tokens = tokens[:tokens.index(eos)]
    return tokens


def calculate_bleu(dataset, model):
    """Compute the corpus BLEU score of `model` on `dataset`.

    Every item of `dataset` is a `(toxic_sent, neutral_sent)` pair of tensors.
    The toxic tensor is moved to `model.device`, given a leading batch
    dimension and passed to `model.predict(..., post_process_text=False)`.
    The neutral tensor is mapped back to tokens with the decoder vocabulary
    and used as the single reference for that prediction.

    `<sos>` is removed only when it is actually the first token, and a
    sequence is cut at `<eos>` only when `<eos>` is present. A prediction or
    reference that was truncated before `<eos>` therefore keeps all of its
    tokens instead of losing the last one or raising `ValueError`.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards.

    Args:
        dataset: indexable collection of `(toxic_sent, neutral_sent)` tensor pairs.
        model: model exposing `device`, `predict` and `decoder.vocab`.

    Returns:
        The value returned by `bleu_score` for all predictions vs. references.
    """
    preds = []
    trgs = []

    was_training = model.training
    model.eval()  # make sure dropout is off while scoring
    try:
        with torch.no_grad():
            for i in tqdm(range(len(dataset))):
                toxic_sent, neutral_sent = dataset[i]
                toxic_sent = toxic_sent.to(model.device).unsqueeze(0)
                pred = model.predict(toxic_sent, post_process_text=False)

                # lookup_tokens expects a list of ints; tolist() also yields plain Python ints
                reference = model.decoder.vocab.lookup_tokens(neutral_sent.tolist())

                preds.append(_strip_special_tokens(pred))
                trgs.append([_strip_special_tokens(reference)])
    finally:
        model.train(was_training)

    return bleu_score(preds, trgs)

In [23]:
# BLEU of the loaded model over the whole validation split (one predict call per example).
calculate_bleu(val_dataset, model)

100%|██████████| 10000/10000 [04:18<00:00, 38.70it/s]


0.2538011075535255