Feb 24

The training in notebook _03_overfit_and_test seems to work. The classifier
overfits on the training data and predicts correctly when it is tested on the
training data.

Next, the classifier code should be turned into a PyTorch module that is
trained in a training + validation loop, as is the case in the code base. The
loss curve should be printed as well.

The expected result is a loss curve similar to the one resulting from the code
base, i.e. the validation loss should drop until around the 5th epoch, after
which it should rise again.

Hopefully, this notebook provides the means to debug and understand why the
validation loss curve does not drop further and, thus, why the classifier does
not yield better test results.

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

from collections import Counter

import torch
from IPython.lib.pretty import pretty
from torch import tensor
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam
from torchtext.vocab import Vocab

from notebooks._04_classifier import util
from notebooks._04_classifier.classifier import Classifier
from notebooks._04_classifier.util import log_tensor, get_ent_lbls, get_sent_lbls, get_tok_lbls, get_emb_lbls, \
    get_class_lbls, get_mix_emb_lbls, get_word_lbls

# 1 Train data

In [None]:
# Toy training set: two entities, each with three class labels and three
# sentences. The first entity is positive for every class, the second is
# negative for every class.
positive_ent = {
    'classes': [1, 1, 1],
    'sents': [
        'married married married',
        'male male male',
        'American American American',
    ],
}
negative_ent = {
    'classes': [0, 0, 0],
    'sents': [
        'single single single',
        'female female female',
        'German German German',
    ],
}
data = [positive_ent, negative_ent]

# Dimensions of the toy problem.
batch_size, class_count, emb_size = 2, 3, 4
sent_count, sent_len = 3, 3

# Mirror the dimensions onto the util module
# (presumably read by its label/logging helpers -- confirm in util).
util.batch_size = batch_size
util.class_count = class_count
util.emb_size = emb_size
util.sent_count = sent_count
util.sent_len = sent_len

# 2 Pre-processing

## 2.1 Build vocabulary

In [None]:
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Leading and trailing whitespace is ignored; an empty or all-whitespace
    string yields an empty list.
    """
    tokens = text.split()
    return tokens

# Collect every token of every training sentence, in order of appearance.
words = [
    word
    for ent in data
    for sent in ent['sents']
    for word in tokenize(sent)
]

# Build the vocabulary from the token frequencies.
word_counts = Counter(words)
vocab = Vocab(word_counts)

# Show the string-to-index mapping of the vocabulary.
print(pretty(vocab.stoi))

# Publish the vocabulary size locally and on the util module.
vocab_size = len(vocab)
util.vocab_size = vocab_size

## 2.2 Transform data

Map words to tokens and create tensors.

In [None]:
# Look up the vocabulary index of every token, keeping the nesting
# entity -> sentence -> token.
token_ids = [
    [
        [vocab[word] for word in tokenize(sent)]
        for sent in ent['sents']
    ]
    for ent in data
]
sents_batch = torch.tensor(token_ids)

# One row of class labels per entity.
class_labels = [ent['classes'] for ent in data]
classes_batch = torch.tensor(class_labels)

# Both tensors must describe the same number of entities.
assert len(sents_batch) == len(classes_batch)

log_tensor(sents_batch, 'sents_batch', [get_ent_lbls(), get_sent_lbls(), get_tok_lbls()])

# 3 Create classifier

In [None]:
# Instantiate the project's Classifier for the current vocabulary size,
# embedding size and class count.
classifier = Classifier(vocab_size, emb_size, class_count)

# 4 Forward & Backward

In [None]:
# Binary cross-entropy on raw logits; positive targets get a weight of 80 for
# every class.
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor([80] * class_count))
# criterion = MSELoss()

optimizer = Adam(classifier.parameters(), lr=0.1)

epoch_count = 1001

# Epochs whose number is printed as a marker between the per-epoch losses.
marked_epochs = (0, 10, 100, 1000)

# The targets do not change during training, so convert them to float once.
targets_batch = classes_batch.float()

for epoch in range(epoch_count):

    if epoch in marked_epochs:
        print(epoch)

        # Debug toggles: uncomment to inspect the classifier's parameters.
        # log_tensor(classifier.embedding_bag.weight.detach(), 'classifier.embedding_bag.weight', [get_word_lbls(), get_emb_lbls()])
        # log_tensor(classifier.class_embs.detach(), 'classifier.class_embs', [get_class_lbls(), get_emb_lbls()])
        # log_tensor(classifier.linear.weight.data, 'classifier.linear.weight.data', [get_class_lbls(), get_mix_emb_lbls()])
        # log_tensor(classifier.linear.bias.data, 'classifier.linear.bias.data', [get_class_lbls()])

    # Forward pass over the whole training batch.
    logits_batch = classifier(sents_batch)

    # Loss of this epoch; printed every epoch.
    loss = criterion(logits_batch, targets_batch)
    print(loss.item())

    # Debug toggles: uncomment to inspect predictions, targets and loss.
    # log_tensor(logits_batch, 'logits_batch', [get_ent_lbls(), get_class_lbls()])
    # log_tensor(classes_batch, 'classes_batch', [get_ent_lbls(), get_class_lbls()])
    # log_tensor(loss, 'loss', [])

    # Backward pass: clear old gradients, backpropagate, update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# 5 Test

## 5.1 Define test data

In [None]:
# test_data = [
#     {
#         'classes': [1, 1, 1],
#         'sents': [
#             'married married married',
#             'male male male',
#             'American American American'
#         ]
#     },
#     {
#         'classes': [0, 0, 0],
#         'sents': [
#             'single single single',
#             'female female female',
#             'German German German'
#         ]
#     },
# ]
#
# batch_size = 2
# class_count = 3
# emb_size = 4
# sent_count = 3
# sent_len = 3
#
# util.batch_size = batch_size
# util.class_count = class_count
# util.emb_size = emb_size
# util.sent_count = sent_count
# util.sent_len = sent_len

# Test set of three entities. The labels follow the class order
# (married, male, American). Each sentence has four words.
barack_ent = {
    'classes': [1, 1, 1],  # married, male, American
    'sents': [
        'Barack Obama is married',
        'Barack Obama is male',
        'Barack Obama is American',
    ],
}
michelle_ent = {
    'classes': [1, 0, 1],  # married, male, American
    'sents': [
        'Michelle Obama is married',
        'Michelle Obama is female',
        'Michelle Obama is American',
    ],
}
angela_ent = {
    'classes': [1, 0, 0],  # married, male, American
    'sents': [
        'Angela Merkel is married',
        'Angela Merkel is female',
        'Angela Merkel is German',
    ],
}
test_data = [barack_ent, michelle_ent, angela_ent]

# Dimensions of the test batch (three entities, four tokens per sentence).
batch_size, class_count, emb_size = 3, 3, 4
sent_count, sent_len = 3, 4

# Mirror the dimensions onto the util module
# (presumably read by its label/logging helpers -- confirm in util).
util.batch_size = batch_size
util.class_count = class_count
util.emb_size = emb_size
util.sent_count = sent_count
util.sent_len = sent_len

## 5.2 Pre-process test data

In [None]:
# Map every test token to its vocabulary index, keeping the nesting
# entity -> sentence -> token.
# NOTE(review): test words that are missing from the training vocabulary are
# presumably mapped to the vocabulary's unknown-token index -- confirm against
# the torchtext Vocab in use.
test_sents_batch = tensor([[[vocab[word] for word in tokenize(sent)] for sent in ent['sents']] for ent in test_data])

# One row of class labels per test entity.
test_classes_batch = torch.tensor([ent['classes'] for ent in test_data])

# Sanity check on the test tensors: one label row per test entity.
assert len(test_sents_batch) == len(test_classes_batch)

log_tensor(test_sents_batch, 'test_sents_batch', [get_ent_lbls(), get_sent_lbls(), get_tok_lbls()])
log_tensor(test_classes_batch, 'test_classes_batch', [get_ent_lbls(), get_class_lbls()])

## 5.3 Forward test batch

In [None]:
# Forward pass of the trained classifier over the test batch.
test_logits_batch = classifier(test_sents_batch)

# Test loss, computed with the same criterion as during training.
test_targets_batch = test_classes_batch.float()
test_loss = criterion(test_logits_batch, test_targets_batch)

# Show predictions next to the expected labels, followed by the loss.
log_tensor(test_logits_batch, 'test_logits_batch', [get_ent_lbls(), get_class_lbls()])
log_tensor(test_classes_batch, 'test_classes_batch', [get_ent_lbls(), get_class_lbls()])
log_tensor(test_loss, 'test_loss', [])