Feb 25

For some reason the valid loss does not decrease significantly
on the attention model. Compare with a baseline that does not
include the attention mechanism.

# Imports

In [2]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
from dataclasses import dataclass
from typing import List, Tuple

import torch
from IPython.lib.pretty import pretty
from torch import tensor, Tensor
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, TabularDataset

from classifier import Classifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1 Build train/valid DataLoaders

## 1.1 Read Sample TSVs into TabularDatasets

In [3]:
class_count = 4
sent_count = 3

#
# Define columns for subsequent read into TabularDatasets
#

def tokenize(text: str) -> List[str]:
    return text.split()

ent_field = ('ent', Field(sequential=False, use_vocab=False))

class_fields = [(f'class_{i}', Field(sequential=False, use_vocab=False))
                for i in range(class_count)]

sent_fields = [(f'sent_{i}', Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True))
               for i in range(sent_count)]

fields = [ent_field] + class_fields + sent_fields

#
# Read Train Samples TSV into TabularDataset
#

train_samples_tsv = 'data/ower-v3-fb-3/train.tsv'
valid_samples_tsv = 'data/ower-v3-fb-3/valid.tsv'
# test_samples_tsv = 'data/ower-v3-fb-3/test.tsv'

raw_train_set = TabularDataset(train_samples_tsv, 'tsv', fields, skip_header=True)
raw_valid_set = TabularDataset(valid_samples_tsv, 'tsv', fields, skip_header=True)
# raw_test_set = TabularDataset(test_samples_tsv, 'tsv', fields, skip_header=True)

for i in range(3):
    row = raw_train_set[i]

    print('entity', row.ent)
    print('classes: married = {}, male = {}, American = {}, actor = {}'.format(
        row.class_0, row.class_1, row.class_2, row.class_3))
    print(row.sent_0)
    print(row.sent_1)
    print(row.sent_2)
    print()



entity 1
classes: married = 0, male = 0, American = 0, actor = 0
['the', 'belarusian', "people's", 'republic', 'was', 'declared', 'on', 'the', 'german-occupied', 'territory', 'of', 'modern-day', 'belarus', 'three', 'weeks', 'after', 'the', 'treaty', 'of', 'brest-litovsk', 'was', 'signed', 'on', 'march', '3,', '1918', 'between', 'the', 'new', 'bolshevik', 'government', 'of', 'soviet', 'russia', 'and', 'the', 'central', 'powers', 'in', 'the', 'border', 'city', 'of', 'brest-litovsk.']
['some', 'of', 'the', 'action', 'in', 'joe', "haldeman's", '1989', 'science', 'fiction', 'novel', 'buying', 'time', '(published', 'in', 'uk', 'as', 'the', 'long', 'habit', 'of', 'living)', 'takes', 'place', 'in', 'the', 'conch', 'republic,', 'a', 'lawless', 'place', 'where', 'assassination', 'and', 'other', 'activities', 'are', 'perfectly', 'legal.']
['the', 'republic', 'of', 'minerva', 'was', 'a', 'micronation', 'consisting', 'of', 'the', 'minerva', 'reefs.']

entity 2
classes: married = 0, male = 0, Americ

## 1.2 Build vocab on train data

The docs only show how to build a vocab over a single column. Therefore, the vocab
is built over the first sentence column only atm.

In [4]:
first_sent_field = sent_fields[0][1]
first_sent_field.build_vocab(raw_train_set)
vocab = first_sent_field.vocab

vocab_size = len(vocab)

print(vocab_size)
print(vocab.itos[:10])
print(vocab.itos[vocab_size//2:vocab_size//2+10])
print(vocab.itos[-10:])

55674
['<unk>', '<pad>', 'the', 'and', 'in', 'of', 'a', 'to', 'for', 'was']
['cfas', 'cfc', 'cfr', 'cgbd', 'cgi', 'cgi-animated', 'chace', 'chacha', 'chachi', 'chad,']
['€6', '明', '曹轩领4内援2外援加盟', '曾三度临危受命', '艳光四射)', '艷光四射,', '관광체험),', '및', '배밭', '조형익;']


## 1.3 Transfor each TabularDataset -> List[Sample]

Parse texts from datasets, map words -> tokens (IDs) using vocab

In [5]:
@dataclass
class Sample:
    ent: int
    classes: List[int]
    sents: List[List[int]]

    def __iter__(self):
        return iter((self.ent, self.classes, self.sents))


def transform(raw_set: TabularDataset) -> List[Sample]:
    return [Sample(
        int(getattr(row, 'ent')),
        [int(getattr(row, f'class_{i}')) for i in range(class_count)],
        [[vocab[token] for token in getattr(row, f'sent_{i}')] for i in range(sent_count)]
    ) for row in raw_set]


train_set = transform(raw_train_set)
valid_set = transform(raw_valid_set)
# test_set = transform(raw_test_set)

print('First training sample:')
print(pretty(train_set[0]))

First training sample:
Sample(ent=1, classes=[0, 0, 0, 0], sents=[[2, 7023, 1753, 958, 9, 1911, 12, 2, 34261, 2789, 5, 16205, 8971, 81, 1434, 32, 2, 2796, 5, 12945, 9, 218, 12, 143, 494, 12042, 112, 2, 27, 9024, 499, 5, 1665, 2086, 3, 2, 321, 3252, 4, 2, 1121, 88, 5, 26693], [150, 5, 2, 423, 4, 293, 0, 1601, 290, 538, 299, 13031, 106, 0, 4, 825, 10, 2, 485, 35190, 5, 0, 1109, 244, 4, 2, 0, 6509, 6, 39251, 244, 60, 8877, 3, 57, 3732, 44, 10729, 0], [2, 958, 5, 16163, 9, 6, 0, 2013, 5, 2, 16163, 0]])


## 1.4 Build DataLoaders

In [6]:
def generate_batch(batch: List[Sample]) -> Tuple[Tensor, Tensor]:

    _ent, classes_batch, sents_batch = zip(*batch)

    cropped_sents_batch = [[sent[:sent_len]
                            for sent in sents] for sents in sents_batch]

    padded_sents_batch = [[sent + [0] * (sent_len - len(sent))
                           for sent in sents] for sents in cropped_sents_batch]

    return tensor(padded_sents_batch), tensor(classes_batch)

batch_size = 1024
sent_len = 64

train_loader = DataLoader(train_set, batch_size=batch_size, collate_fn=generate_batch, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, collate_fn=generate_batch)
# test_loader = DataLoader(test_set, batch_size=batch_size, collate_fn=generate_batch)

# 2 Create classifier

In [7]:
emb_size = 256

classifier = Classifier(vocab_size, emb_size, class_count)

classifier

Classifier(
  (embedding_bag): EmbeddingBag(55674, 256, mode=mean)
  (linear): Linear(in_features=256, out_features=4, bias=True)
)

# 3 Training

In [8]:
# %reload_ext tensorboard
# %tensorboard --logdir runs

In [9]:
# criterion = MSELoss()
# criterion = BCEWithLogitsLoss()
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor([80] * class_count))

# optimizer = SGD(classifier.parameters(), lr=0.1)
optimizer = Adam(classifier.parameters(), lr=0.1)

writer = SummaryWriter()


for epoch in range(20):

    #
    # Train
    #

    train_loss = 0.0
    for sents_batch, classes_batch in tqdm(train_loader, leave=False):
        logits_batch = classifier(sents_batch)

        loss = criterion(logits_batch, classes_batch.float())
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= len(train_loader)

    #
    # Validate
    #

    valid_loss = 0.0
    with torch.no_grad():
        for sents_batch, classes_batch in tqdm(valid_loader, leave=False):
            logits_batch = classifier(sents_batch)

            loss = criterion(logits_batch, classes_batch.float())
            valid_loss += loss.item()

    valid_loss /= len(valid_loader)

    #
    # Log
    #

    print(f'Epoch {epoch}: Train loss = {train_loss}, valid loss = {valid_loss}')
    
    writer.add_scalars('loss', {'train': train_loss, 'valid': valid_loss}, epoch)

                                               

Epoch 0: Train loss = 6.0900243918101, valid loss = 4.787463188171387
Epoch 1: Train loss = 2.0882804095745087, valid loss = 2.864480972290039
Epoch 2: Train loss = 1.096674417455991, valid loss = 2.0682241916656494


KeyboardInterrupt: 

# 4 Test

## 4.1 Define test data

In [None]:
test_data = [
    {
        'ent': 1000,
        'classes': [1, 0, 1, 0],  # married, male, American, actor
        'sents': [
            'Michelle is married',
            'Michelle is female',
            'Michelle is American'
        ]
    },
    {
        'ent': 2000,
        'classes': [1, 0, 0, 0],  # married, male, American, actor
        'sents': [
            'Angela is married',
            'Angela is female',
            'Angela is German'
        ]
    }
]

test_set = [Sample(
    item['ent'],
    item['classes'],
    [[vocab[word] for word in tokenize(sent)] for sent in item['sents']]
) for item in test_data]

test_loader = DataLoader(test_set, batch_size=len(test_set), collate_fn=generate_batch)

## 4.2 Forward test batch

In [None]:
test_loss = 0.0
with torch.no_grad():
    for sents_batch, classes_batch in test_loader:
        logits_batch = classifier(sents_batch)
        
        print(logits_batch)

        loss = criterion(logits_batch, classes_batch.float())
        test_loss += loss.item()

test_loss /= len(test_loader)

print(f'Test loss = {test_loss}')