Feb 28

The loss curve indicates that training works, but it does not show what the
model actually predicts. Print some concrete examples in the training and
validation loops to see how well training performs.

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

from dataclasses import dataclass
from datetime import datetime
from typing import List, Tuple

import pandas as pd
import torch
from IPython.core.display import display
from torch import tensor, Tensor
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, TabularDataset
from tqdm import tqdm

from classifier import Classifier

# Notebook display settings: do not truncate cell text or the number of
# rows, and render floats with two decimal places.
for _option, _value in (
    ('display.max_colwidth', None),
    ('display.precision', 2),
    ('display.max_rows', None),
):
    pd.set_option(_option, _value)

# Config

In [None]:
# Dataset directory under data/ and the number of class / sentence columns
# in each sample row.
ower_dataset = 'ower-v3-fb-irt-3'
class_count, sent_count = 4, 3

# Pre-trained word vectors handed to build_vocab. Other values that can be
# swapped in here:
#   None,
#   'charngram.100d',
#   'fasttext.en.300d', 'fasttext.simple.300d',
#   'glove.42B.300d', 'glove.840B.300d',
#   'glove.twitter.27B.25d', 'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
#   'glove.6B.50d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.300d'
vectors = 'glove.twitter.27B.200d'

# Embedding size passed to Classifier.from_random, which is only used when
# the vocab carries no pre-trained vectors (e.g. 200).
emb_size = None

# Samples per batch, and the length every sentence is cropped/padded to.
batch_size = 1024
sent_len = 64

# pos_weight per class for the loss, learning rate, number of epochs.
class_weight = 80
lr = 0.01
epoch_count = 20

# TensorBoard run directory: runs/<timestamp>_<dataset>_emb-<emb_size>
log_dir = f'runs/{datetime.now():%Y-%m-%d_%H-%M-%S}_{ower_dataset}_emb-{emb_size}'

# 1 Build train/valid DataLoaders

## 1.1 Read Sample TSVs into TabularDatasets

In [None]:
#
# Define columns for subsequent read into TabularDatasets
#

def tokenize(text: str) -> List[str]:
    """Split ``text`` into tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so an empty or blank string
    yields an empty list.
    """
    tokens = text.split()
    return tokens

# Entity and class columns are single values that bypass the vocab;
# sentence columns are whitespace-tokenized, lower-cased and use the vocab.
raw_ent_field = Field(use_vocab=False, sequential=False)
raw_class_field = Field(use_vocab=False, sequential=False)
raw_sent_field = Field(lower=True, tokenize=tokenize, use_vocab=True, sequential=True)

# (column name, Field) pairs. All class columns share one Field object,
# and so do all sentence columns.
ent_field = ('ent', raw_ent_field)
class_fields = [(f'class_{i}', raw_class_field) for i in range(class_count)]
sent_fields = [(f'sent_{i}', raw_sent_field) for i in range(sent_count)]

# Column order: ent, class_0..class_{n-1}, sent_0..sent_{m-1}
fields = [ent_field, *class_fields, *sent_fields]

#
# Read train and valid sample TSVs into TabularDatasets
#

train_samples_tsv = f'data/{ower_dataset}/train.tsv'
valid_samples_tsv = f'data/{ower_dataset}/valid.tsv'

# Both files are parsed with the same column layout; their header row is skipped.
raw_train_set = TabularDataset(path=train_samples_tsv, format='tsv', fields=fields, skip_header=True)
raw_valid_set = TabularDataset(path=valid_samples_tsv, format='tsv', fields=fields, skip_header=True)

#
# Show the first 20 rows of each raw dataset
#

tab_cols = [
    'ent',
    *(f'class_{i}' for i in range(class_count)),
    *(f'sent_{i}' for i in range(sent_count)),
]

train_tab_data = [[getattr(example, name) for name in tab_cols]
                  for example in raw_train_set[:20]]
display(pd.DataFrame(data=train_tab_data, columns=tab_cols))

valid_tab_data = [[getattr(example, name) for name in tab_cols]
                  for example in raw_valid_set[:20]]
display(pd.DataFrame(data=valid_tab_data, columns=tab_cols))

## 1.2 Build vocab on train data

In [None]:
# The vocab is built from the training sentences only, with the configured
# pre-trained vectors attached.
raw_sent_field.build_vocab(raw_train_set, vectors=vectors)
vocab = raw_sent_field.vocab

#
# Print the vocab size and ten entries from its start, middle and end
#

vocab_size = len(vocab)
print(vocab_size)

half = vocab_size // 2
for window in (vocab.itos[:10], vocab.itos[half:half + 10], vocab.itos[-10:]):
    print(window)

## 1.3 Transform each TabularDataset -> List[Sample]

Parse texts from datasets, map words -> tokens (IDs) using vocab

In [None]:
@dataclass
class Sample:
    """One entity together with its class labels and tokenized sentences."""

    ent: int  # entity ID
    classes: List[int]  # one label per class
    sents: List[List[int]]  # token IDs, one inner list per sentence

    def __iter__(self):
        """Yield ``ent``, ``classes`` and ``sents`` in that order.

        This lets a sample be unpacked like a 3-tuple, e.g. in ``zip(*batch)``.
        """
        yield self.ent
        yield self.classes
        yield self.sents


def transform(raw_set: TabularDataset) -> List[Sample]:
    """Convert every row of ``raw_set`` into a ``Sample``.

    The ``ent`` and ``class_i`` columns are parsed with ``int``; the tokens of
    each ``sent_i`` column are looked up in the module-level ``vocab``.
    """
    samples = []
    for row in raw_set:
        ent = int(row.ent)
        classes = [int(getattr(row, f'class_{i}')) for i in range(class_count)]
        sents = [[vocab[token] for token in getattr(row, f'sent_{i}')]
                 for i in range(sent_count)]
        samples.append(Sample(ent, classes, sents))
    return samples


train_set, valid_set = transform(raw_train_set), transform(raw_valid_set)

#
# Show the first three transformed train samples
#

tab_cols = ['ent', 'classes', 'sents']
tab_data = [[sample.ent, sample.classes, sample.sents] for sample in train_set[:3]]

# Last expression of the cell, so the notebook renders the table
pd.DataFrame(tab_data, columns=tab_cols)

## 1.4 Build DataLoaders

In [None]:
def generate_batch(batch: List[Sample]) -> Tuple[Tensor, Tensor]:
    """Collate samples into a ``(sentences, classes)`` tensor pair.

    Every sentence is cut to at most ``sent_len`` token IDs and then
    right-padded with 0 up to exactly ``sent_len``. Entity IDs are dropped.

    NOTE(review): the pad ID 0 is hard-coded. With torchtext's default
    specials, index 0 is presumably ``<unk>`` and index 1 ``<pad>`` - confirm
    that 0 is what the classifier expects for padding.
    """
    _ents, classes_batch, sents_batch = zip(*batch)

    def fit(sent: List[int]) -> List[int]:
        # Crop first, then pad by whatever is still missing
        head = sent[:sent_len]
        return head + [0] * (sent_len - len(head))

    fitted_sents_batch = [[fit(sent) for sent in sents] for sents in sents_batch]

    return tensor(fitted_sents_batch), tensor(classes_batch)


# Both loaders collate with generate_batch; only the training data is shuffled.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
valid_loader = DataLoader(dataset=valid_set, batch_size=batch_size, collate_fn=generate_batch)

# 2 Create classifier

In [None]:
# Build the classifier via from_pre_trained when the vocab carries vectors,
# otherwise via from_random with the configured emb_size.
if vocab.vectors is not None:
    classifier = Classifier.from_pre_trained(vocab, class_count)
else:
    classifier = Classifier.from_random(vocab, emb_size, class_count)

print(classifier)

# 3 Training

In [None]:
# Loss: BCE-with-logits with the same positive weight (class_weight) for
# every class. Commented-out alternatives: MSELoss(), BCEWithLogitsLoss().
pos_weight = torch.tensor([class_weight] * class_count)
criterion = BCEWithLogitsLoss(pos_weight=pos_weight)

# Optimizer. Commented-out alternative: SGD(classifier.parameters(), lr=0.1)
optimizer = Adam(classifier.parameters(), lr=lr)

# TensorBoard writer for this run's log directory
writer = SummaryWriter(log_dir=log_dir)


def _sample_row(logits, classes, sents) -> list:
    """Format one sample as a display row: [pred, gt, sent0, sent1, ...].

    :param logits: 1-D tensor of logits for the sample
    :param classes: tensor of ground-truth classes for the sample
    :param sents: 2-D tensor of token IDs, one row per sentence
    :return: logits as 2-decimal strings, classes as a numpy array, followed
             by one list of vocab words per sentence
    """
    pred = [f'{x:.2f}' for x in logits.detach().tolist()]
    gt = classes.detach().numpy()
    words = [[vocab.itos[token] for token in sent] for sent in sents.tolist()]
    return [pred, gt] + words


sample_cols = ['pred', 'gt'] + [f'sent{i}' for i in range(sent_count)]

for epoch in range(epoch_count):

    #
    # Train
    #

    # Training mode: enables dropout / batch-norm updates if the model has any
    classifier.train()

    train_samples = []

    train_loss = 0.0
    for sents_batch, classes_batch in tqdm(train_loader):
        logits_batch = classifier(sents_batch)

        loss = criterion(logits_batch, classes_batch.float())
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Keep the first sample of each batch for the per-epoch table
        train_samples.append(_sample_row(logits_batch[0], classes_batch[0], sents_batch[0]))

    # Mean loss per batch
    train_loss /= len(train_loader)

    #
    # Validate
    #

    # Evaluation mode: without it, validation would run with training-time
    # behaviour (e.g. active dropout) and report a distorted loss
    classifier.eval()

    valid_samples = []

    valid_loss = 0.0
    with torch.no_grad():
        for sents_batch, classes_batch in tqdm(valid_loader):
            logits_batch = classifier(sents_batch)

            loss = criterion(logits_batch, classes_batch.float())
            valid_loss += loss.item()

            # Keep the first sample of each batch for the per-epoch table
            valid_samples.append(_sample_row(logits_batch[0], classes_batch[0], sents_batch[0]))

    valid_loss /= len(valid_loader)

    #
    # Show the collected samples (one row per batch)
    #

    train_df = pd.DataFrame(train_samples, columns=sample_cols)
    display(train_df)

    valid_df = pd.DataFrame(valid_samples, columns=sample_cols)
    display(valid_df)

    #
    # Log
    #

    print(f'Epoch {epoch}: Train loss = {train_loss}, valid loss = {valid_loss}')

    writer.add_scalars('loss', {'train': train_loss, 'valid': valid_loss}, epoch)

# Flush pending events to disk and release the event file
writer.close()

# 4 Test

## 4.1 Define test data

In [None]:
# Hand-written test entities.
# Order of the class flags: married, male, American, actor.
test_data = [
    dict(
        ent=1000,
        classes=[1, 0, 1, 0],
        sents=[
            'Michelle is married',
            'Michelle is female',
            'Michelle is American',
        ],
    ),
    dict(
        ent=2000,
        classes=[1, 0, 0, 0],
        sents=[
            'Angela is married',
            'Angela is female',
            'Angela is German',
        ],
    ),
]

# Map the hand-written sentences to token IDs.
# The vocab was built from a Field with lower=True, so its tokens are
# lower-cased. Lower-case the test words as well: otherwise capitalised words
# such as 'Michelle' or 'American' cannot match any vocab entry (and would
# presumably all collapse to the unknown token).
test_set = [Sample(
    item['ent'],
    item['classes'],
    [[vocab[word.lower()] for word in tokenize(sent)] for sent in item['sents']]
) for item in test_data]

# One batch holding the whole test set
test_loader = DataLoader(test_set, batch_size=len(test_set), collate_fn=generate_batch)

## 4.2 Forward test batch

In [None]:
# Evaluation mode, so layers such as dropout (if the model has any) do not
# behave as during training while the test batch is forwarded
classifier.eval()

test_loss = 0.0
with torch.no_grad():
    for sents_batch, classes_batch in test_loader:
        logits_batch = classifier(sents_batch)

        # Raw logits, one row per test entity
        print(logits_batch)

        loss = criterion(logits_batch, classes_batch.float())
        test_loss += loss.item()

# Mean loss per batch
test_loss /= len(test_loader)

print(f'Test loss = {test_loss}')