Feb 23

Notebook 1, "Simple Positive Case", does not deliver very helpful results.
It is not clear whether the prepared classifier weights represent the
state the classifier would actually reach through training.

A better test might be to start out with a classifier with random weights
and try to overfit it on a single sample. The expected result would be
learned weights that are similar to those prepared in notebook 1.

# Set up helpers

In [None]:
from IPython.lib.pretty import pretty

def log_tensor(tensor, title, labels, vmin=None, vmax=None):
    """Placeholder for the tensor-logging helper; does nothing.

    The notebook runs ``util.ipynb`` right after this definition, which
    presumably rebinds ``log_tensor`` to the real implementation -- TODO
    confirm. Until then every call is a no-op.

    Args:
        tensor: The tensor to log.
        title: Name under which the tensor is logged.
        labels: One list of labels per tensor dimension, judging by the
            call sites in this notebook.
        vmin: Unused by this stub; presumably a lower bound for the
            display scale -- verify against util.ipynb.
        vmax: Unused by this stub; presumably an upper bound for the
            display scale -- verify against util.ipynb.

    Returns:
        None.
    """
    return None

# IPython magic: execute util.ipynb from within this notebook. Presumably it
# defines the real log_tensor that replaces the stub above -- TODO confirm.
%run util.ipynb

# 1 Input data

In [None]:
# Toy data set: two identical entities, each with three sentences and three
# class flags. Presumably 'classes' holds one flag per class, in the same
# order as the sentences -- verify against the training cells.
data = [
    {
        'ent': 123,
        'classes': [1, 1, 1],
        'sents': [
            'married married married',
            'male male male',
            'American American American',
        ],
    },
    {
        'ent': 123,
        'classes': [1, 1, 1],
        'sents': [
            'married married married',
            'male male male',
            'American American American',
        ],
    },
]

# 2 Pre-processing

## 2.1 Build vocabulary

In [None]:
from collections import Counter
from torchtext.vocab import Vocab

def tokenize(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

# Flatten every sentence of every entity into one list of word tokens.
words = [
    token
    for entity in data
    for sentence in entity['sents']
    for token in tokenize(sentence)
]

# Build the vocabulary from the word frequencies.
word_counts = Counter(words)
vocab = Vocab(word_counts)

# Show the word -> index mapping.
print(pretty(vocab.stoi))

## 2.2 Transform data

Map words to tokens and create tensors.

In [None]:
import torch
from torch import tensor

# Map every word to its vocabulary index. The nesting gives the tensor the
# axes (entity, sentence, token).
sents_batch = torch.tensor([
    [[vocab[word] for word in tokenize(sent)] for sent in ent['sents']]
    for ent in data
])
# One row of class flags per entity.
classes_batch = torch.tensor([ent['classes'] for ent in data])

assert len(sents_batch) == len(classes_batch)
assert sents_batch.dim() == 3, 'expected (entity, sentence, token) axes'

# Read the dimensions off the tensor instead of hard-coding them, so the axis
# labels below always match the data that is actually logged.
batch_size, sent_count, sent_len = sents_batch.shape

ent_labels = [f'ent {i}' for i in range(batch_size)]
sent_labels = [f'sent {i}' for i in range(sent_count)]
tok_labels = [f'tok {i}' for i in range(sent_len)]

log_tensor(sents_batch, 'sents_batch', [ent_labels, sent_labels, tok_labels])

# 3 Prepare classifier

## 3.1 Prepare EmbeddingBag

Create and prepare an `EmbeddingBag` with randomly initialized token embeddings.

In [None]:
from torch.nn import EmbeddingBag
from torch import tensor

vocab_size = len(vocab)
# Sanity check for the toy data; presumably two special tokens plus the three
# distinct words -- confirm against the printed vocab.stoi.
assert vocab_size == 5

emb_size = 4

# One embedding row per vocabulary entry, left at its random initialization.
embedding_bag = EmbeddingBag(num_embeddings=vocab_size, embedding_dim=emb_size)

log_output = 1

# Row i of the weight matrix belongs to the token with index i, so the row
# labels are taken from the vocabulary's own index -> token list rather than
# from a hand-written list whose order may not match the vocabulary.
# NOTE(review): legacy torchtext orders tokens by frequency, then
# alphabetically, which would put 'American' before 'male' and 'married' --
# confirm against the printed vocab.stoi.
word_labels = list(vocab.itos)
emb_labels = [f'emb {i}' for i in range(emb_size)]

if log_output:
    log_tensor(embedding_bag.weight, 'embedding_bag.weight', [word_labels, emb_labels])

## 3.2 Prepare class embeddings

Create randomly initialized class embeddings.

In [None]:
# One label per class, used to annotate the rows of the class embeddings.
class_labels = ['married', 'male', 'American']
class_count = 3

# One random embedding vector per class; gradients are tracked so the
# vectors can be updated during training.
class_embs = torch.rand(class_count, emb_size, requires_grad=True)

# Set to 0 to skip logging in this cell.
log_output = 1

if log_output:
    log_tensor(class_embs, 'class_embs', [class_labels, emb_labels])

## 3.3 Prepare linear layer

Create a randomly initialized linear layer.

In [None]:
from torch.nn import Linear

# Linear layer with one input per (class, embedding dimension) pair and one
# output per class; weights and bias keep their random initialization.
linear = Linear(in_features=class_count * emb_size, out_features=class_count)

# Set to 0 to skip logging in this cell.
log_output = 1

# Column labels for the weight matrix, one per input feature. Presumably the
# input is the flattened per-class mix embeddings -- TODO confirm against the
# forward pass.
mix_emb_labels = [
    f'mix {mix} / emb {dim}'
    for mix in range(class_count)
    for dim in range(emb_size)
]

if log_output:
    log_tensor(linear.weight.data, 'linear.weight.data', [class_labels, mix_emb_labels])
    log_tensor(linear.bias.data, 'linear.bias.data', [class_labels])
