In [1]:
import os
# NOTE(review): the path is relative to the current working directory, so
# re-running this cell without restarting the kernel moves up one more level.
os.chdir("..") # go to root dir

import torch
from src.models.utils import calculate_bleu

from src.data.make_dataset import ParanmtDataset
# Path (relative to the root dir) of the TSV file passed to every ParanmtDataset below
DATASET_PATH = 'data/interim/preprocessed_paranmt.tsv'

In [2]:
# Collected BLEU scores, one entry per evaluated model
res = {}

# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

# Get the dataset to test

In [3]:
def get_val_dataset(max_sent_size, max_tokens, take_first=None, min_freq=2, seed=42):
    """Build the validation split, encoded with vocabularies fitted on the training split.

    A training ``ParanmtDataset`` is created first and its vocabularies are
    built; the validation dataset then receives those vocabularies instead of
    building its own, to avoid data leakage.

    Args:
        max_sent_size: forwarded to both ``ParanmtDataset`` constructors.
        max_tokens: forwarded to ``build_vocab`` on the training split.
        take_first: forwarded to the training ``ParanmtDataset`` only.
        min_freq: forwarded to ``build_vocab`` on the training split.
        seed: forwarded to both ``ParanmtDataset`` constructors; presumably
            controls the train/validation split, so it should match the value
            used at training time -- TODO confirm.

    Returns:
        The ``ParanmtDataset`` constructed with ``train=False``.
    """
    special_tokens = ['<unk>', '<pad>', '<sos>', '<eos>']
    shared_kwargs = dict(path=DATASET_PATH, max_sent_size=max_sent_size, seed=seed)

    # Training split: only needed here as the source of the vocabularies
    train_split = ParanmtDataset(train=True, take_first=take_first, **shared_kwargs)
    train_split.build_vocab(
        min_freq=min_freq,
        specials=special_tokens,
        max_tokens=max_tokens,
    )

    # Validation split reuses the (toxic, neutral) vocabularies from training
    train_vocabs = (train_split.toxic_vocab, train_split.neutral_vocab)
    return ParanmtDataset(vocabs=train_vocabs, train=False, **shared_kwargs)

# Evaluate Seq2Seq

### Model_0
- path: models/seq2seq.01.pt
- Dataset Building
    - seed: 42
    - max_sent_size: 10
    - max_tokens: 8_000
    - min_freq: 2
    
### Model_1
- same as above, but decoder differs
- path: models/seq2seq_2.01.pt


### Evaluate Model_0

In [18]:
# Load Seq2Seq Model_0 onto `device` and call .eval() on it before inference.
# NOTE(review): the loaded object is used directly as a model, so the file holds a
# fully pickled module -- only load trusted checkpoints. Recent PyTorch releases
# default torch.load to weights_only=True, which may reject such files; confirm.
model = torch.load('models/seq2seq.01.pt', map_location=device).eval()

In [20]:
# Quick sanity check on a single sentence before the full evaluation
model.device = device  # presumably read inside predict() to place tensors -- TODO confirm
model.predict('hey, how are you?')

'hey, where are you?'

In [24]:
# Validation split built with the dataset settings listed for Seq2Seq Model_0
val_dataset = get_val_dataset(
    max_sent_size=10,
    max_tokens=8_000,
    min_freq=2,
    seed=42,
)

# Score the loaded model on that split
bleu_score = calculate_bleu(val_dataset, model)

100%|██████████| 17527/17527 [01:00<00:00, 292.08it/s]


In [25]:
# Store the score under this model's key in `res` and report it
res['seq2seq_model_0'] = bleu_score
print("Seq2Seq Bleu Score:", bleu_score)

Seq2Seq Bleu Score: 0.0531146153814399


### Evaluate Model_1

In [28]:
# Load Seq2Seq Model_1 (replaces the previous `model`) and call .eval() on it.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = torch.load('models/seq2seq_2.01.pt', map_location=device).eval()

In [29]:
# Quick sanity check on a single sentence before the full evaluation
model.device = device  # presumably read inside predict() to place tensors -- TODO confirm
model.predict('hey, how are you?')

'hey, how are you doing?'

In [30]:
# Seq2Seq Model_1 is described as "same as above", so the Model_0 dataset
# settings are used again here
val_dataset = get_val_dataset(
    max_sent_size=10,
    max_tokens=8_000,
    min_freq=2,
    seed=42,
)

# Score the loaded model on that split
bleu_score = calculate_bleu(val_dataset, model)

100%|██████████| 17527/17527 [00:44<00:00, 397.11it/s]


In [31]:
# Store the score under this model's key in `res` and report it
res['seq2seq_model_1'] = bleu_score
print("Seq2Seq_2 Bleu Score:", bleu_score)

Seq2Seq_2 Bleu Score: 0.12177348992948088


# Evaluate Attention
### Model_0
- path: models/attention.01.pt
- Dataset Building
    - seed: 42
    - max_sent_size: 10
    - max_tokens: 8_000
    - min_freq: 2

### Model_1
- path: models/attention2.01.pt
- Dataset Building
    - seed: 42
    - max_sent_size: 32
    - max_tokens: 10_000
    - min_freq: 2
    
### Model_2
- path: models/attention2.02.pt
- Dataset: same as Model_1 (this checkpoint is overfitted)

### Evaluate Model_0

In [32]:
# Load Attention Model_0 (replaces the previous `model`) and call .eval() on it.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = torch.load('models/attention.01.pt', map_location=device).eval()

In [33]:
# Quick sanity check on a single sentence before the full evaluation
model.device = device  # presumably read inside predict() to place tensors -- TODO confirm
model.predict('hey, how are you?')

'hey, how are you?'

In [35]:
# Validation split built with the dataset settings listed for Attention Model_0
val_dataset = get_val_dataset(
    max_sent_size=10,
    max_tokens=8_000,
    min_freq=2,
    seed=42
)

# Score the loaded model on that split
bleu_score = calculate_bleu(val_dataset, model)

100%|██████████| 17527/17527 [00:59<00:00, 292.77it/s]


In [36]:
# Store the score under this model's key in `res` and report it
res['attention_model_0'] = bleu_score
print("Attention Bleu Score:", bleu_score)

Attention Bleu Score: 0.1278586578824364


### Evaluate Model_1

In [37]:
# Load Attention Model_1 (replaces the previous `model`) and call .eval() on it.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = torch.load('models/attention2.01.pt', map_location=device).eval()

In [38]:
# Quick sanity check on a single sentence before the full evaluation
model.device = device  # presumably read inside predict() to place tensors -- TODO confirm
model.predict('hey, how are you?')

'hey, how are you?'

In [39]:
# Validation split built with the dataset settings listed for Attention Model_1
# (longer sentences and a larger vocabulary than Model_0)
val_dataset = get_val_dataset(
    max_sent_size=32,
    max_tokens=10_000,
    min_freq=2,
    seed=42
)

# Score the loaded model on that split
bleu_score = calculate_bleu(val_dataset, model)

100%|██████████| 52229/52229 [05:24<00:00, 160.89it/s]


In [40]:
# Store the score under this model's key in `res` and report it
res['attention_model_1'] = bleu_score
print("Attention2 Bleu Score:", bleu_score)

Attention2 Bleu Score: 0.17888638316354452


### Evaluate Model_2

In [41]:
# Load Attention Model_2 (replaces the previous `model`) and call .eval() on it.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = torch.load('models/attention2.02.pt', map_location=device).eval()

In [42]:
# Quick sanity check on a single sentence before the full evaluation
model.device = device  # presumably read inside predict() to place tensors -- TODO confirm
model.predict('hey, how are you?')

'hey, how are you?'

In [43]:
# Attention Model_2 is described as using the same dataset as Model_1,
# so the same settings are passed here
val_dataset = get_val_dataset(
    max_sent_size=32,
    max_tokens=10_000,
    min_freq=2,
    seed=42
)

# Score the loaded model on that split
bleu_score = calculate_bleu(val_dataset, model)

100%|██████████| 52229/52229 [05:29<00:00, 158.52it/s]


In [44]:
# Store the score under this model's key in `res` and report it.
# NOTE(review): the printed label is the same as the one used for Attention
# Model_1; only the `res` key tells the two results apart.
res['attention_model_2'] = bleu_score
print("Attention2 Bleu Score:", bleu_score)

Attention2 Bleu Score: 0.20631449393285442


# Evaluate Transformer
### Model_0
- path: models/transformer.01.pt
- Dataset Building
    - seed: 42
    - max_sent_size: 10
    - max_tokens: 10_000
    - min_freq: 2
    
### Model_1
- path: models/transformer2.01.pt
- Dataset Building
    - seed: 42
    - max_sent_size: 32
    - max_tokens: 10_000
    - min_freq: 2

### Evaluate Model_0

In [4]:
# Load Transformer Model_0 onto `device` and call .eval() on it before inference.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = torch.load('models/transformer.01.pt', map_location=device).eval()

In [5]:
# Quick sanity check on a single sentence before the full evaluation
model.device = device  # presumably read inside predict() to place tensors -- TODO confirm
model.predict('hey, how are you?')

'hey, how are you doing?'

In [6]:
# Validation split for Transformer Model_0.
# NOTE(review): the Model_0 description in the markdown above lists
# max_tokens: 10_000, but this call passes 8_000 -- confirm which value the
# checkpoint was trained with, since the vocabularies are rebuilt from it.
val_dataset = get_val_dataset(
    max_sent_size=10,
    max_tokens=8_000,
    min_freq=2,
    seed=42
)

# Score the loaded model on that split
bleu_score = calculate_bleu(val_dataset, model)

100%|██████████| 17527/17527 [04:15<00:00, 68.58it/s]


In [7]:
# Store the score under this model's key in `res` and report it
res['transformer_model_0'] = bleu_score
print("Transformer Bleu Score:", bleu_score)

Transformer Bleu Score: 0.2858192747677208


### Evaluate Model_1

In [8]:
# Load Transformer Model_1 (replaces the previous `model`) and call .eval() on it.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = torch.load('models/transformer2.01.pt', map_location=device).eval()

In [9]:
# Quick sanity check on a single sentence before the full evaluation
model.device = device  # presumably read inside predict() to place tensors -- TODO confirm
model.predict('hey, how are you?')

'hey, how are you?'

In [10]:
# Validation split built with the dataset settings listed for Transformer Model_1
val_dataset = get_val_dataset(
    max_sent_size=32,
    max_tokens=10_000,
    min_freq=2,
    seed=42
)

# Score the loaded model on that split
bleu_score = calculate_bleu(val_dataset, model)

100%|██████████| 52229/52229 [21:36<00:00, 40.27it/s]


In [11]:
# Store the score under this model's key in `res` and report it
res['transformer_model_1'] = bleu_score
print("Transformer2 Bleu Score:", bleu_score)

Transformer2 Bleu Score: 0.25850203231637403
