Feb 23

Notebook 1, "Simple Positive Case", does not deliver very helpful results.
It is not clear whether the prepared classifier weights represent the
state the classifier would actually reach after training.

A better test might be to start out with a classifier with random weights
and try to overfit it on a single sample. The expected result would be
learned weights that are similar to the prepared weights in notebook 1.

# Imports

In [None]:
from collections import Counter
from typing import List

import torch
from IPython.lib.pretty import pretty
from torch import Tensor
from torch import tensor
from torch.nn import BCEWithLogitsLoss
from torch.nn import EmbeddingBag, Linear
from torch.optim import Adam
from torchtext.vocab import Vocab

# Import util

In [None]:
def log_tensor(tensor_: Tensor, title: str, labels: List[List[str]]):
    """Placeholder declaring the ``log_tensor`` signature used in this notebook.

    This stub does nothing and returns ``None``. It is presumably shadowed
    by the definition loaded through ``%run util.ipynb`` right after it
    (confirm against util.ipynb).

    :param tensor_: tensor to be logged
    :param title: caption for the logged tensor
    :param labels: one list of labels per tensor dimension
    """

%run util.ipynb

# Import classifier

In [None]:
def forward(
        embedding_bag: EmbeddingBag,
        class_embs: Tensor,
        linear: Linear,
        sents_batch: Tensor
) -> Tensor:
    """Placeholder declaring the ``forward`` signature used in this notebook.

    This stub does nothing and returns ``None``. It is presumably shadowed
    by the definition loaded through ``%run classifier.ipynb`` right after
    it (confirm against classifier.ipynb).

    :param embedding_bag: token embedding module
    :param class_embs: class embedding tensor
    :param linear: linear output layer
    :param sents_batch: batch of tokenized sentences
    """

%run classifier.ipynb

# 1 Train data

In [None]:
# Toy training set: one dict per entity, holding one 0/1 value per class
# ('classes') and the entity's sentences ('sents').
data = [
    {
        'classes': [1, 1, 1],
        'sents': ['married married married', 'male male male', 'American American American'],
    },
    {
        'classes': [0, 0, 0],
        'sents': ['single single single', 'female female female', 'German German German'],
    },
]

# Hard-coded dimensions; they have to be kept in sync with `data` by hand.
batch_size, class_count, emb_size = 2, 3, 4
sent_count = 3
sent_len = 3

# Labels for single dimensions.
class_labels = [f'class {c}' for c in range(class_count)]
emb_labels = [f'emb {e}' for e in range(emb_size)]
ent_labels = [f'ent {n}' for n in range(batch_size)]
sent_labels = [f'sent {s}' for s in range(sent_count)]
tok_labels = [f'tok {t}' for t in range(sent_len)]

# Labels for flattened pairs of dimensions.
ent_class_labels = [f'ent {n} / class {c}' for n in range(batch_size) for c in range(class_count)]
ent_sent_labels = [f'ent {n} / sent {s}' for n in range(batch_size) for s in range(sent_count)]
# NOTE(review): the inner index ranges over emb_size although the label text
# says 'class' - confirm the intended wording.
mix_emb_labels = [f'mix {m} / class {e}' for m in range(class_count) for e in range(emb_size)]

# 2 Pre-processing

## 2.1 Build vocabulary

In [None]:
def tokenize(text: str) -> List[str]:
    """Split ``text`` on runs of whitespace and return the resulting words.

    Leading and trailing whitespace is ignored; an empty or all-whitespace
    string yields an empty list.
    """
    tokens = text.split()
    return tokens

# Flatten all sentences of all entities into one list of word occurrences.
words = [
    token
    for entity in data
    for sentence in entity['sents']
    for token in tokenize(sentence)
]

# The vocabulary is built from the word frequencies.
vocab = Vocab(Counter(words))

# Show the word -> index mapping.
print(pretty(vocab.stoi))

vocab_size = len(vocab)
word_labels = [f'word {idx}' for idx in range(vocab_size)]

## 2.2 Transform data

Map words to token ids (vocabulary indices) and create tensors.

In [None]:
# Vocabulary indices nested as entity -> sentence -> token position.
sents_batch = torch.tensor([
    [[vocab[token] for token in tokenize(sentence)] for sentence in entity['sents']]
    for entity in data
])
# One row of per-class values for each entity.
classes_batch = torch.tensor([entity['classes'] for entity in data])
assert len(sents_batch) == len(classes_batch)

log_tensor(
    sents_batch,
    'sents_batch',
    [ent_labels, sent_labels, tok_labels],
)

# 3 Prepare classifier

## 3.1 Create EmbeddingBag

Create an `EmbeddingBag` with randomly initialized token embeddings.

In [None]:
# One embedding row of size emb_size per vocabulary entry.
embedding_bag = EmbeddingBag(vocab_size, emb_size)

# Log a detached view so the logging call does not touch the autograd graph.
log_tensor(
    embedding_bag.weight.detach(),
    'embedding_bag.weight',
    [word_labels, emb_labels],
)

## 3.2 Create class embeddings

Create randomly initialized class embeddings.

In [None]:
# Trainable class embeddings, one row of size emb_size per class.
class_embs = torch.rand(class_count, emb_size, requires_grad=True)
# Independent copy of the starting values, detached from autograd.
init_class_embs = class_embs.detach().clone()

log_tensor(
    class_embs.detach(),
    'class_embs',
    [class_labels, emb_labels],
)

## 3.3 Create linear layer

Create a randomly initialized linear layer.

In [None]:
# Maps class_count * emb_size input features to one output per class.
linear = Linear(in_features=class_count * emb_size, out_features=class_count)

log_tensor(
    linear.weight.data.detach(),
    'linear.weight.data',
    [class_labels, mix_emb_labels],
)
log_tensor(
    linear.bias.data.detach(),
    'linear.bias.data',
    [class_labels],
)

# 4 Forward & Backward

In [None]:
# Binary cross-entropy on raw logits, with a positive-class weight of 80
# for each of the class_count outputs.
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor([80] * class_count))
# criterion = torch.nn.MSELoss()

# Adam updates the token embeddings, the class embeddings and both
# parameters of the linear layer.
optimizer = Adam(
    [embedding_bag.weight, class_embs, linear.weight, linear.bias],
    lr=0.1,
)

epoch_count = 101
for epoch in range(epoch_count):

    # Log the trainable embeddings at selected epochs only
    # (epoch 1000 is reached only if epoch_count is raised).
    if epoch in (0, 10, 100, 1000):
        print(epoch)

        log_tensor(
            embedding_bag.weight.detach(),
            'embedding_bag.weight',
            [word_labels, emb_labels],
        )
        log_tensor(
            class_embs.detach(),
            'class_embs',
            [class_labels, emb_labels],
        )
        # log_tensor(linear.weight.data, 'linear.weight.data', [class_labels, mix_emb_labels])
        # log_tensor(linear.bias.data, 'linear.bias.data', [class_labels])

    # Forward pass
    logits_batch = forward(embedding_bag, class_embs, linear, sents_batch)

    # Loss: the integer class values are cast to float for the criterion
    loss = criterion(logits_batch, classes_batch.float())
    print(loss.item())

    # log_tensor(logits_batch, 'logits_batch', [ent_labels, class_labels], vmin=-1, vmax=1)
    # log_tensor(classes_batch, 'classes_batch', [ent_labels, class_labels], vmin=-1, vmax=1)
    # log_tensor(loss, 'loss', [])

    # Backward pass: clear old gradients, backpropagate, update parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# 5 Test

## 5.1 Define test data

In [None]:
# Test set: one dict per entity, holding one 0/1 value per class ('classes')
# and the entity's sentences ('sents').
test_data = [
    {
        'classes': [1, 1, 1],
        'sents': ['married married married', 'male male male', 'American American American'],
    },
    {
        'classes': [0, 0, 0],
        'sents': ['single single single', 'female female female', 'German German German'],
    },
]

# Hard-coded dimensions; they have to be kept in sync with `test_data` by hand.
batch_size, class_count, emb_size = 2, 3, 4
sent_count = 3
sent_len = 3

# Labels for single dimensions.
class_labels = [f'class {c}' for c in range(class_count)]
emb_labels = [f'emb {e}' for e in range(emb_size)]
ent_labels = [f'ent {n}' for n in range(batch_size)]
sent_labels = [f'sent {s}' for s in range(sent_count)]
tok_labels = [f'tok {t}' for t in range(sent_len)]

# Labels for flattened pairs of dimensions.
ent_class_labels = [f'ent {n} / class {c}' for n in range(batch_size) for c in range(class_count)]
ent_sent_labels = [f'ent {n} / sent {s}' for n in range(batch_size) for s in range(sent_count)]
# NOTE(review): the inner index ranges over emb_size although the label text
# says 'class' - confirm the intended wording.
mix_emb_labels = [f'mix {m} / class {e}' for m in range(class_count) for e in range(emb_size)]

# test_data = [
#     {
#         'classes': [1, 1, 1],  # married, male, American
#         'sents': [
#             'Barack Obama is married',
#             'Barack Obama is male',
#             'Barack Obama is American'
#         ]
#     },
#     {
#         'classes': [1, 0, 1],  # married, male, American
#         'sents': [
#             'Michelle Obama is married',
#             'Michelle Obama is female',
#             'Michelle Obama is American'
#         ]
#     },
#     {
#         'classes': [1, 0, 0],  # married, male, American
#         'sents': [
#             'Angela Merkel is married',
#             'Angela Merkel is female',
#             'Angela Merkel is German'
#         ]
#     }
# ]
#
# batch_size = 3
# # class_count = 3
# # emb_size = 4
# # sent_count = 3
# sent_len = 4
#
# # class_labels = [f'class {i}' for i in range(class_count)]
# # emb_labels = [f'emb {i}' for i in range(emb_size)]
# ent_class_labels = [f'ent {i} / class {j}' for i in range(batch_size) for j in range(class_count)]
# ent_labels = [f'ent {i}' for i in range(batch_size)]
# ent_sent_labels = [f'ent {i} / sent {j}' for i in range(batch_size) for j in range(sent_count)]
# # mix_emb_labels = [f'mix {i} / class {j}' for i in range(class_count) for j in range(emb_size)]
# # sent_labels = [f'sent {i}' for i in range(sent_count)]
# tok_labels = [f'tok {i}' for i in range(sent_len)]
# # word_labels = [f'word {i}' for i in range(vocab_size)]

## 5.2 Pre-process test data

In [None]:
# Encode the test sentences with the vocabulary built in section 2.1:
# vocabulary indices nested as entity -> sentence -> token position.
test_sents_batch = tensor([[[vocab[word] for word in tokenize(sent)] for sent in ent['sents']] for ent in test_data])
# One row of per-class values for each test entity.
test_classes_batch = torch.tensor([ent['classes'] for ent in test_data])
# Sanity check on the TEST tensors: every entity needs a row of classes.
assert len(test_sents_batch) == len(test_classes_batch)

log_tensor(test_sents_batch, 'test_sents_batch', [ent_labels, sent_labels, tok_labels])
log_tensor(test_classes_batch, 'test_classes_batch', [ent_labels, class_labels])

## 5.3 Forward test batch

In [None]:
# Run the trained classifier on the test batch.
test_logits_batch = forward(embedding_bag, class_embs, linear, test_sents_batch)

#
# Loss
#

# Same criterion as in training; the integer class values are cast to float.
test_loss = criterion(test_logits_batch, test_classes_batch.float())

# Logits and loss are autograd-tracked, so they are detached before logging,
# like the other log_tensor calls on trainable tensors in this notebook.
log_tensor(test_logits_batch.detach(), 'test_logits_batch', [ent_labels, class_labels])
log_tensor(test_classes_batch, 'test_classes_batch', [ent_labels, class_labels])
log_tensor(test_loss.detach(), 'test_loss', [])