# Relation Classifier RNN Model

Our goal here is to implement a relation model. Our goal is to do basic relation classification.

First things first, set up the initial configuration.

In [1]:
import sys
sys.path.insert(0, '../../')

In [2]:
from tqdm import tqdm_notebook
from functools import partial
tqdm_notebook = partial(tqdm_notebook, leave=False)
tqdm_notebook().pandas()




In [3]:
import logging

from lib.utils import config_logging
from lib.utils import new_experiment_folder
from lib.utils import seed
from lib.utils import save_standard_streams
from lib.utils import device_default

experiment_folder = new_experiment_folder(label='relation_classifier', parent_directory='../../experiments/')
print('Experiment Folder: %s' % experiment_folder)
save_standard_streams(experiment_folder) # Copy Stdout and Stderr to experiments folder

config_logging()
logger = logging.getLogger(__name__)

device = device_default()
logger.info('Device: %d', device)
is_cuda = device >= 0
cuda = lambda t: t.cuda(device=device) if is_cuda else t

random_seed = 1212212
seed(random_seed, is_cuda=is_cuda)

# Async minibatch allocation for speed
# Reference: http://timdettmers.com/2015/03/09/deep-learning-hardware-guide/
cuda_async = lambda t: t.cuda(device=device, async=True) if is_cuda else t

Experiment Folder: ../../experiments/relation_classifier.02_02_13:31:11
[2018-02-02 13:31:11,040][MainProcess][__main__][INFO] Device: 0
[2018-02-02 13:31:11,041][MainProcess][lib.utils][INFO] Seed: 1212212


## Dataset

Load our dataset. Log a couple rows.

In [4]:
import os
import pandas as pd
from tqdm import tqdm_notebook
from torchnlp.datasets import Dataset

def simple_qa_predicate(directory='../../data/relation_ranking/',
               train=False,
               dev=False,
               test=False,
               train_filename='train.txt',
               dev_filename='dev.txt',
               test_filename=''):
    """
    Example line example: 
        film/film/country	film/film/country film/film/genre film/film/language	what country is <e> from ?
    Vocab example: 
        /film/film/genre
    
    Sample Data:
        Question: 'which genre of album is #head_entity# ?'
        True Relation: '/music/album/genre'
        False Relation Pool: ['/music/album/release_type', '/music/album/genre', '/music/album/artist']
    """
    ret = []
    datasets = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    for is_requested, filename in datasets:
        if not is_requested:
            continue
            
        file_path = os.path.join(directory, filename)
        data = pd.read_table(file_path, header=None, names=['True Relation', 'Relation Pool', 'Question', 'Entity'])
        rows = []
        for i, row in tqdm_notebook(data.iterrows(), total=data.shape[0]):
            relation_pool = list(set(row['Relation Pool'].split()))
            rows.append({
                'text': row['Question'].strip(),
                'relation': row['True Relation'],
                'pool': relation_pool,
                'entity': row['Entity'].strip()})
            
        ret.append(Dataset(rows))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)

In [5]:
train_dataset, dev_dataset = simple_qa_predicate(train=True, dev=True, test=False)

print('Num Training Data: %d' % len(train_dataset))
print('Train Sample:')
display(pd.DataFrame(train_dataset[:5]))
print('Num Development Data: %d' % len(dev_dataset))
print('Development Sample:')
display(pd.DataFrame(dev_dataset[:5]))





Num Training Data: 75729
Train Sample:


Unnamed: 0,entity,pool,relation,text
0,e,"[music/album/genre, language/human_language/la...",book/written_work/subjects,what is the book <e> about
1,cardiac arrest,"[music/artist/track, music/album/genre, music/...",music/release_track/release,to what release does the release track <e> com...
2,the debt,"[film/film/production_companies, music/album/a...",film/film/country,what country was the film <e> from
3,nobuo uematsu,"[music/artist/track, organization/organization...",music/producer/tracks_produced,what songs have <e> produced ?
4,eve - olution,"[music/album/genre, music/release/region, musi...",music/release/producers,who produced <e> ?


Num Development Data: 10816
Development Sample:


Unnamed: 0,entity,pool,relation,text
0,american,"[people/ethnicity/geographic_distribution, peo...",biology/organism_classification/organisms_of_t...,name an <e> thoroughbread racehorse
1,vision racing driving simulator,"[cvg/computer_videogame/cvg_genre, cvg/compute...",cvg/computer_videogame/cvg_genre,what kind of game is <e> ?
2,romance film,[base/skosbase/vocabulary_equivalent_topic/bro...,tv/tv_genre/programs,what tv program is <e>
3,polaski,[location/location/containedby],location/location/containedby,what state is <e> located in
4,fern emmett,"[people/person/gender, people/person/nationali...",people/deceased_person/cause_of_death,what disease claimed the life of <e>


## Dataset Iterators

Load our dataset. Log a couple rows.

In [6]:
import torch
from torch.autograd import Variable
from torchnlp.utils import pad_batch

# Defines how to combine a batch of rows into a tensor
def collate_fn(batch, train=True):
    """ list of tensors to a batch variable """
    # PyTorch RNN requires sorting decreasing size
    batch = sorted(batch, key=lambda row: len(row['text']), reverse=True)
    input_batch, _ = pad_batch([row['text'] for row in batch])
    relations = [row['relation'] for row in batch]
    mask = Variable(torch.stack([row['mask'] for row in batch]), volatile=not train)

    # PyTorch RNN requires batches to be transposed for speed and integration with CUDA
    to_variable = (
        lambda b: Variable(torch.stack(b).t_().squeeze(0).contiguous(), volatile=not train))

    return (to_variable(input_batch), to_variable(relations), mask)

In [7]:
from functools import partial
from torchnlp.samplers import NoisySortedBatchSampler
from torchnlp.samplers import SortedSampler


sort_key = lambda r: r['text'].size()[0]

def get_iterator(dataset, batch_size, train=False):
    # Use bucket sampling to group similar sized text but with noise + random
    batch_sampler = NoisySortedBatchSampler(dataset, batch_size, sort_key, sort_key_noise=0.5)
    return DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=partial(collate_fn, train=train),
        pin_memory=is_cuda,
        num_workers=0)

## Encode Text

Here we encode our data into a numerical format. 

In [8]:
import torch
from torchnlp.text_encoders import IdentityEncoder
from torchnlp.text_encoders import WhitespaceEncoder

# NOTE: Given the development vocab might be cheating if we use any statistics from that...
text_data = [t.lower() for t in train_dataset['text'] + dev_dataset['text']]
text_encoder = WhitespaceEncoder(text_data, append_eos=False)
print('Text encoder vocab size: %d' % text_encoder.vocab_size)
relation_encoder = IdentityEncoder(train_dataset['relation'] + dev_dataset['relation'])
print('Relation encoder vocab size: %d' % relation_encoder.vocab_size)

for dataset in [train_dataset, dev_dataset]:
    for row in tqdm_notebook(dataset):
        row['text'] = text_encoder.encode(row['text'])
        row['relation'] = relation_encoder.encode(row['relation'])
        row['pool'] = [relation_encoder.encode(r)[0] for r in row['pool']]
        pool = set(row['pool'])
        row['mask'] = torch.FloatTensor([1 if i in pool else 0 for i in range(relation_encoder.vocab_size)])

Text encoder vocab size: 6693
Relation encoder vocab size: 1702








# Model

Instantiate the model.

In [9]:
import torch
from torchnlp.word_to_vector import FastText

# Load embeddings
unk_init = lambda t: torch.FloatTensor(t).uniform_(-0.1, 0.1)
vocab = set(text_encoder.vocab)
pretrained_embedding = FastText(language='en',
                                cache='./../../.pretrained_embeddings_cache',
                                is_include=lambda w: w in vocab)
embedding_weights = torch.Tensor(text_encoder.vocab_size, pretrained_embedding.dim)
for i, token in enumerate(text_encoder.vocab):
    embedding_weights[i] = pretrained_embedding[token]

[2018-02-02 13:31:34,462][MainProcess][lib.pretrained_embeddings][INFO] Loading vectors from ./../../.pretrained_embeddings_cache/wiki.en.vec.pt


In [10]:
from lib.nn import SeqToLabel

def make_model():
    # `embedding_size=300` is required by FastText
    model = SeqToLabel(text_encoder.vocab_size, relation_encoder.vocab_size, embedding_size=300)
    for param in model.parameters():
        param.data.uniform_(-0.1, 0.1)

    model.encoder.embedding.weight.data.copy_(embedding_weights)

    cuda(model)
    return model

## Evaluate

Instantiate a function that evaluates a model. Our evaluation is the same as "Step 3 - Predict Relation and Finish" to ensure consistency.

In [11]:
from tqdm import tqdm_notebook
from torch.autograd import Variable
import random
import math
from lib.utils import batch
from torchnlp.utils import pad_batch

# NOTE: Relation Ranking dataset has a pool of possible relations similar to `Step 3 - Predict Relation and Finish`

def get_softmax_relation_score(model, question_batch, mask_batch, relation_pool_batch):
    question_batch, _ = pad_batch(question_batch)
    question_batch = cuda(Variable(torch.stack(question_batch).t_().squeeze(0).contiguous(), volatile=True))
    mask_batch = cuda(torch.stack(mask_batch))
    output = model(question_batch, Variable(mask_batch, volatile=True)).exp_().data
    
    batch_size = output.size()[0]
    ret = []
    for i in range(batch_size):
        ret.append([output[i][j] for j in relation_pool_batch[i]])
    return ret

def evaluate_pool(dataset, model, batch_size=1):
    logger.info('Evaluating...')
    
    model.train(False)
    scores = []
    for batched in tqdm_notebook(batch(dataset, batch_size), total=math.ceil(len(dataset) / batch_size)):
        question_batch = [row['text'] for row in batched]
        mask_batch = [row['mask'] for row in batched]
        relation_pool_batch = [row['pool'] for row in batched]
        scores.extend(get_softmax_relation_score(model, question_batch, mask_batch, relation_pool_batch))
    model.train(True) # No side affects
    
    correct = 0
    for i, row in tqdm_notebook(enumerate(dataset)):
        max_score = max(scores[i])
        # NOTE: Pool may have multiple <unk> tags
        top_relations = set([r for j, r in enumerate(row['pool']) if scores[i][j] == max_score])
        predicted_relation = random.choice(list(top_relations))
        if predicted_relation == row['relation'][0]:
            correct += 1

    accuracy = correct / len(dataset)
    logger.info('Relation Accuracy (SOTA 88.4%%): %f [%d of %d]', accuracy, correct, len(dataset))
    return accuracy

Similar evaluation metric to our training regime. Training the loss should directly affect this metric.

In [12]:
from lib.metrics import print_random_sample
from lib.metrics import get_accuracy
from torch.nn.modules.loss import NLLLoss

def evaluate_softmax(dataset, model, batch_size=1):
    # Evaluate
    model.train(mode=False)
    criterion = cuda(NLLLoss())
    texts, relations, outputs = [], [], []
    total_loss = 0
    dev_iterator = get_iterator(dataset, batch_size)
    for text, relation, mask in tqdm_notebook(dev_iterator):
        output = model(cuda_async(text), cuda_async(mask))
        # Compute metrics
        total_loss += criterion(output, cuda_async(relation)).data[0] * relation.size()[0]
        # Prevent memory leak by moving output from variable to tensor
        texts.extend(text.data.cpu().transpose(0, 1).split(split_size=1, dim=0))
        relations.extend(relation.data.cpu().split(split_size=1, dim=0))
        outputs.extend(output.data.cpu().split(split_size=1, dim=0))
    model.train(True) # No side affects
    # Print metrics
    # print_random_sample(texts, relations, outputs, text_encoder, relation_encoder, n_samples=5)
    logger.info('NLLLoss: %.03f', (total_loss / len(dataset)))
    return get_accuracy(relations, outputs, print_=True)[0]

## Hyperparameters

Set up the hyperparameters for our model. We use `lib.configurable` to setup arguments.

In [13]:
from lib.optim import Adam
from lib.configurable import configurable
from lib.configurable import add_config
from lib.configurable import log_config

Adam.__init__ = configurable(Adam.__init__)

# Hyperparameters achieve 0.881564349112426%
# Following the evaluation of 150 random models

add_config({
    'lib': {
        'nn.seq_to_label.SeqToLabel.__init__': {
            'bidirectional': True,
            'embedding_size': 300,
            'rnn_size': 450,
            'freeze_embeddings': True,
            'rnn_cell': 'gru',
            'decode_dropout': 0.6,  # dropout before fully connected layer in RNN
            'rnn_layers': 1,
            'rnn_variational_dropout': 0.0,
            'rnn_dropout': 0.8,
            'embedding_dropout': 0.3
        },
        'optim.Optimizer.__init__.max_grad_norm': None,
    },
    'lib.optim.adam.Adam.__init__': {
        'amsgrad': True
    }
})

log_config()

[2018-02-02 13:31:38,414][MainProcess][lib.configurable][INFO] Checking configuration...
NOTE: Due to Python remaining the __main__ module, this check can be ignored here.
NOTE: _check_configuration can be ignored for external libraries.
[2018-02-02 13:31:38,416][MainProcess][lib.configurable][INFO] Configuration checked.
[2018-02-02 13:31:38,417][MainProcess][lib.configurable][INFO] Global configuration:
[2018-02-02 13:31:38,419][MainProcess][root][INFO] {   'lib': {   'nn': {   'seq_to_label': {   'SeqToLabel': {   '__init__': {   'bidirectional': True,
                                                                               'decode_dropout': 0.6,
                                                                               'embedding_dropout': 0.3,
                                                                               'embedding_size': 300,
                                                                               'freeze_embeddings': True,
                       

## Training Loop

Below here, we do a training loop over a number of epochs.

In [14]:
from tqdm import tqdm_notebook
import random

from torch.utils.data import DataLoader
from torch.nn.modules.loss import NLLLoss
from lib.optim import Adam
from functools import partial

from lib.checkpoint import Checkpoint
from lib.utils import get_total_parameters
from lib.utils import resplit_datasets
from lib.optimizer import Optimizer

# TODO: Try to concat multiple templated questions together and do a multi label classifier.

def train(resources=30, checkpoint=None, **kwargs):
    
    if isinstance(checkpoint, str):
        checkpoint = Checkpoint(checkpoint)
        model = checkpoint.model
        train_batch_size = checkpoint.train_batch_size
        optimizer = checkpoint.optimizer
        n_bad_epochs = checkpoint.n_bad_epochs
        max_score = checkpoint.max_score
    else:
        model = make_model()
        train_batch_size = 32
        # NOTE: https://github.com/pytorch/pytorch/issues/679
        params = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = Optimizer(Adam(params=params))
        n_bad_epochs = 0
        max_score = 0

    # NOTE: Because the training dataset was used to train the subject recongition, its better; therefore,
    # we cannot mix them
    epochs = max(round(resources), 1)
    train_max_batch_size = 1024
    patience = 3
    criterion = cuda(NLLLoss())
    logger.info('Epochs: %d', epochs)
    logger.info('Train Dataset Size: %d', len(train_dataset))
    logger.info('Dev Dataset Size: %d', len(dev_dataset))
    logger.info('Train Batch Size: %d', train_batch_size)
    logger.info('Train Max Batch Size: %d', train_max_batch_size)
    logger.info('Total Parameters: %d', get_total_parameters(model))
    logger.info('Model:\n%s' % model)

    # Train!
    for epoch in range(epochs):
        logger.info('Epoch: %d', epoch)

        # Iterate over the training data
        logger.info('Training...')
        model.train(mode=True)
        train_iterator = get_iterator(train_dataset, train_batch_size, train=True)
        for text, relation, mask in tqdm_notebook(train_iterator):
            optimizer.zero_grad()
            output = model(cuda_async(text), cuda_async(mask))
            loss = criterion(output, cuda_async(relation))

            # Backward propagation
            loss.backward()
            optimizer.step()

        # Save checkpoint
        checkpoint_path = Checkpoint.save(
            experiment_folder,
            {
                'model': model,
                'optimizer': optimizer,
                'relation_encoder': relation_encoder,
                'text_encoder': text_encoder,
                'train_batch_size': train_batch_size,
                'n_bad_epochs': n_bad_epochs,
                'max_score': max_score
            },
            device=device)

        # Evaluate
        score = evaluate_softmax(dev_dataset, model, 4096)

        # Scheduler for increasing batch_size inspired by this paper:
        # https://openreview.net/forum?id=B1Yy1BxCZ
        if max_score > score:
            n_bad_epochs += 1
        else:
            n_bad_epochs = 0
            max_score = score

        if n_bad_epochs >= patience:
            train_batch_size = min(train_max_batch_size, train_batch_size * 2)
            logger.info('Ran out of patience, increasing train batch size to: %d', train_batch_size)

        print('–' * 100)
        
    return -max_score, checkpoint_path

In [15]:
train()

[2018-02-02 13:31:38,577][MainProcess][lib.configurable][INFO] seq_to_label.SeqToLabel.__init__ was configured with:
{   'bidirectional': True,
    'decode_dropout': 0.6,
    'embedding_dropout': 0.3,
    'embedding_size': 300,
    'freeze_embeddings': True,
    'input_vocab_size': 6693,
    'output_vocab_size': 1702,
    'rnn_cell': 'gru',
    'rnn_dropout': 0.8,
    'rnn_layers': 1,
    'rnn_size': 450,
    'rnn_variational_dropout': 0.0}
[2018-02-02 13:31:38,579][MainProcess][lib.configurable][INFO] seq_encoder.SeqEncoder.__init__ no config for: lib.nn.seq_encoder.SeqEncoder.__init__
[2018-02-02 13:31:38,579][MainProcess][lib.configurable][INFO] seq_encoder.SeqEncoder.__init__ was configured with:
{   'bidirectional': True,
    'embedding_dropout': 0.3,
    'embedding_size': 300,
    'freeze_embeddings': True,
    'n_layers': 1,
    'rnn_cell': 'gru',
    'rnn_dropout': 0.8,
    'rnn_size': 450,
    'rnn_variational_dropout': 0.0,
    'vocab_size': 6693}
[2018-02-02 13:31:41,332][Ma


[2018-02-02 13:31:52,224][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/13.pt



[2018-02-02 13:31:52,784][MainProcess][__main__][INFO] NLLLoss: 0.942
[2018-02-02 13:31:52,845][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8382951183431953 [9067 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:31:52,853][MainProcess][__main__][INFO] Epoch: 1
[2018-02-02 13:31:52,854][MainProcess][__main__][INFO] Training...



[2018-02-02 13:32:03,128][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/24.pt



[2018-02-02 13:32:03,513][MainProcess][__main__][INFO] NLLLoss: 0.785
[2018-02-02 13:32:03,575][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8517011834319527 [9212 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:32:03,580][MainProcess][__main__][INFO] Epoch: 2
[2018-02-02 13:32:03,581][MainProcess][__main__][INFO] Training...



[2018-02-02 13:32:13,659][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/35.pt



[2018-02-02 13:32:14,026][MainProcess][__main__][INFO] NLLLoss: 0.725
[2018-02-02 13:32:14,087][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8638128698224852 [9343 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:32:14,092][MainProcess][__main__][INFO] Epoch: 3
[2018-02-02 13:32:14,093][MainProcess][__main__][INFO] Training...



[2018-02-02 13:32:24,340][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/45.pt



[2018-02-02 13:32:24,851][MainProcess][__main__][INFO] NLLLoss: 0.694
[2018-02-02 13:32:24,910][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8674186390532544 [9382 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:32:24,916][MainProcess][__main__][INFO] Epoch: 4
[2018-02-02 13:32:24,917][MainProcess][__main__][INFO] Training...



[2018-02-02 13:32:35,463][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/56.pt



[2018-02-02 13:32:35,824][MainProcess][__main__][INFO] NLLLoss: 0.679
[2018-02-02 13:32:35,886][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8750924556213018 [9465 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:32:35,891][MainProcess][__main__][INFO] Epoch: 5
[2018-02-02 13:32:35,892][MainProcess][__main__][INFO] Training...



[2018-02-02 13:32:46,827][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/68.pt



[2018-02-02 13:32:47,198][MainProcess][__main__][INFO] NLLLoss: 0.671
[2018-02-02 13:32:47,259][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8761094674556213 [9476 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:32:47,264][MainProcess][__main__][INFO] Epoch: 6
[2018-02-02 13:32:47,265][MainProcess][__main__][INFO] Training...



[2018-02-02 13:32:57,486][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/78.pt



[2018-02-02 13:32:57,986][MainProcess][__main__][INFO] NLLLoss: 0.665
[2018-02-02 13:32:58,044][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8776812130177515 [9493 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:32:58,050][MainProcess][__main__][INFO] Epoch: 7
[2018-02-02 13:32:58,051][MainProcess][__main__][INFO] Training...



[2018-02-02 13:33:08,258][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/89.pt



[2018-02-02 13:33:08,630][MainProcess][__main__][INFO] NLLLoss: 0.667
[2018-02-02 13:33:08,691][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8783284023668639 [9500 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:33:08,696][MainProcess][__main__][INFO] Epoch: 8
[2018-02-02 13:33:08,697][MainProcess][__main__][INFO] Training...



[2018-02-02 13:33:18,715][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/100.pt



[2018-02-02 13:33:19,080][MainProcess][__main__][INFO] NLLLoss: 0.662
[2018-02-02 13:33:19,140][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8750924556213018 [9465 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:33:19,146][MainProcess][__main__][INFO] Epoch: 9
[2018-02-02 13:33:19,146][MainProcess][__main__][INFO] Training...



[2018-02-02 13:33:29,173][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/110.pt



[2018-02-02 13:33:29,548][MainProcess][__main__][INFO] NLLLoss: 0.658
[2018-02-02 13:33:29,608][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8777736686390533 [9494 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:33:29,613][MainProcess][__main__][INFO] Epoch: 10
[2018-02-02 13:33:29,614][MainProcess][__main__][INFO] Training...



[2018-02-02 13:33:39,922][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/121.pt



[2018-02-02 13:33:40,313][MainProcess][__main__][INFO] NLLLoss: 0.667
[2018-02-02 13:33:40,377][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8772189349112426 [9488 of 10816]
[2018-02-02 13:33:40,382][MainProcess][__main__][INFO] Ran out of patience, increasing train batch size to: 64
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:33:40,383][MainProcess][__main__][INFO] Epoch: 11
[2018-02-02 13:33:40,384][MainProcess][__main__][INFO] Training...



[2018-02-02 13:33:46,118][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/127.pt



[2018-02-02 13:33:46,485][MainProcess][__main__][INFO] NLLLoss: 0.659
[2018-02-02 13:33:46,550][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8806397928994083 [9525 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:33:46,556][MainProcess][__main__][INFO] Epoch: 12
[2018-02-02 13:33:46,558][MainProcess][__main__][INFO] Training...



[2018-02-02 13:33:52,484][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/133.pt



[2018-02-02 13:33:53,082][MainProcess][__main__][INFO] NLLLoss: 0.657
[2018-02-02 13:33:53,147][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8820266272189349 [9540 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:33:53,153][MainProcess][__main__][INFO] Epoch: 13
[2018-02-02 13:33:53,154][MainProcess][__main__][INFO] Training...



[2018-02-02 13:33:59,211][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/140.pt



[2018-02-02 13:33:59,614][MainProcess][__main__][INFO] NLLLoss: 0.664
[2018-02-02 13:33:59,680][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8819341715976331 [9539 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:33:59,688][MainProcess][__main__][INFO] Epoch: 14
[2018-02-02 13:33:59,688][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:06,086][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/147.pt



[2018-02-02 13:34:06,481][MainProcess][__main__][INFO] NLLLoss: 0.668
[2018-02-02 13:34:06,545][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8814718934911243 [9534 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:06,551][MainProcess][__main__][INFO] Epoch: 15
[2018-02-02 13:34:06,552][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:12,921][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/154.pt



[2018-02-02 13:34:13,449][MainProcess][__main__][INFO] NLLLoss: 0.669
[2018-02-02 13:34:13,511][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8812869822485208 [9532 of 10816]
[2018-02-02 13:34:13,518][MainProcess][__main__][INFO] Ran out of patience, increasing train batch size to: 128
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:13,519][MainProcess][__main__][INFO] Epoch: 16
[2018-02-02 13:34:13,519][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:16,947][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/158.pt



[2018-02-02 13:34:17,307][MainProcess][__main__][INFO] NLLLoss: 0.670
[2018-02-02 13:34:17,368][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8812869822485208 [9532 of 10816]
[2018-02-02 13:34:17,373][MainProcess][__main__][INFO] Ran out of patience, increasing train batch size to: 256
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:17,374][MainProcess][__main__][INFO] Epoch: 17
[2018-02-02 13:34:17,376][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:19,610][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/161.pt



[2018-02-02 13:34:20,007][MainProcess][__main__][INFO] NLLLoss: 0.671
[2018-02-02 13:34:20,073][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8830436390532544 [9551 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:20,078][MainProcess][__main__][INFO] Epoch: 18
[2018-02-02 13:34:20,079][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:22,299][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/163.pt



[2018-02-02 13:34:22,842][MainProcess][__main__][INFO] NLLLoss: 0.671
[2018-02-02 13:34:22,903][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.882396449704142 [9544 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:22,909][MainProcess][__main__][INFO] Epoch: 19
[2018-02-02 13:34:22,910][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:25,123][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/166.pt



[2018-02-02 13:34:25,511][MainProcess][__main__][INFO] NLLLoss: 0.673
[2018-02-02 13:34:25,574][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8813794378698225 [9533 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:25,579][MainProcess][__main__][INFO] Epoch: 20
[2018-02-02 13:34:25,580][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:27,820][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/169.pt



[2018-02-02 13:34:28,207][MainProcess][__main__][INFO] NLLLoss: 0.673
[2018-02-02 13:34:28,276][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8833210059171598 [9554 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:28,283][MainProcess][__main__][INFO] Epoch: 21
[2018-02-02 13:34:28,284][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:30,500][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/171.pt



[2018-02-02 13:34:30,868][MainProcess][__main__][INFO] NLLLoss: 0.679
[2018-02-02 13:34:30,929][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8828587278106509 [9549 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:30,934][MainProcess][__main__][INFO] Epoch: 22
[2018-02-02 13:34:30,935][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:33,170][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/174.pt



[2018-02-02 13:34:33,718][MainProcess][__main__][INFO] NLLLoss: 0.680
[2018-02-02 13:34:33,779][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8835983727810651 [9557 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:33,784][MainProcess][__main__][INFO] Epoch: 23
[2018-02-02 13:34:33,785][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:36,010][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/177.pt



[2018-02-02 13:34:36,408][MainProcess][__main__][INFO] NLLLoss: 0.684
[2018-02-02 13:34:36,469][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8825813609467456 [9546 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:36,475][MainProcess][__main__][INFO] Epoch: 24
[2018-02-02 13:34:36,476][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:38,830][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/180.pt



[2018-02-02 13:34:39,250][MainProcess][__main__][INFO] NLLLoss: 0.684
[2018-02-02 13:34:39,311][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8828587278106509 [9549 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:39,317][MainProcess][__main__][INFO] Epoch: 25
[2018-02-02 13:34:39,318][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:41,796][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/183.pt



[2018-02-02 13:34:42,309][MainProcess][__main__][INFO] NLLLoss: 0.683
[2018-02-02 13:34:42,375][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8833210059171598 [9554 of 10816]
[2018-02-02 13:34:42,381][MainProcess][__main__][INFO] Ran out of patience, increasing train batch size to: 512
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:42,382][MainProcess][__main__][INFO] Epoch: 26
[2018-02-02 13:34:42,383][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:44,110][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/185.pt



[2018-02-02 13:34:44,481][MainProcess][__main__][INFO] NLLLoss: 0.684
[2018-02-02 13:34:44,542][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8835059171597633 [9556 of 10816]
[2018-02-02 13:34:44,548][MainProcess][__main__][INFO] Ran out of patience, increasing train batch size to: 1024
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:44,549][MainProcess][__main__][INFO] Epoch: 27
[2018-02-02 13:34:44,550][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:46,199][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/187.pt



[2018-02-02 13:34:46,568][MainProcess][__main__][INFO] NLLLoss: 0.685
[2018-02-02 13:34:46,634][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8833210059171598 [9554 of 10816]
[2018-02-02 13:34:46,639][MainProcess][__main__][INFO] Ran out of patience, increasing train batch size to: 1024
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:46,642][MainProcess][__main__][INFO] Epoch: 28
[2018-02-02 13:34:46,643][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:48,117][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/189.pt



[2018-02-02 13:34:48,483][MainProcess][__main__][INFO] NLLLoss: 0.684
[2018-02-02 13:34:48,545][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8837832840236687 [9559 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
[2018-02-02 13:34:48,551][MainProcess][__main__][INFO] Epoch: 29
[2018-02-02 13:34:48,552][MainProcess][__main__][INFO] Training...



[2018-02-02 13:34:50,262][MainProcess][lib.checkpoint][INFO] Saving checkpoint: ../../experiments/relation_classifier.02_02_13:31:11/191.pt



[2018-02-02 13:34:50,631][MainProcess][__main__][INFO] NLLLoss: 0.686
[2018-02-02 13:34:50,692][MainProcess][lib.metrics.accuracy][INFO] Accuracy: 0.8830436390532544 [9551 of 10816]
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––


(-0.8837832840236687,
 '../../experiments/relation_classifier.02_02_13:31:11/191.pt')

## Hyperparameter Optimization

In [None]:
import torch
import random

from skopt.space import Real, Integer, Categorical

from lib.hyperparameter_optimization import hyperband
from lib.configurable import add_config
from lib.configurable import log_config

space  = [Real(0, 0.9, name='embedding_dropout'),
          Real(0, 0.9, name='rnn_variational_dropout'),
          Real(0, 0.9, name='rnn_dropout'),
          Real(0, 0.9, name='decode_dropout'),
          Integer(50, 250, name='rnn_size'), # We multiply this hyperparameter by two to allow for bidirectional
          Integer(1, 3, name='rnn_layers'),
          Categorical(['lstm', 'gru'], name='rnn_cell'),
          Categorical([True, False], name='freeze_embeddings'),
          Categorical([True, False], name='bidirectional'),
          Real(0, 5, name='max_grad_norm')]

def objective(embedding_dropout, rnn_variational_dropout, rnn_dropout, decode_dropout,
              rnn_size, rnn_layers, rnn_cell, freeze_embeddings, bidirectional, max_grad_norm,
              *args, **kwargs):

    add_config({
        'lib': {
            'nn.seq_to_label.SeqToLabel.__init__': {
                'bidirectional': bidirectional,
                'embedding_size': 300,
                'rnn_size': int(rnn_size) * 2,
                'freeze_embeddings': freeze_embeddings,
                'rnn_cell': rnn_cell,
                'decode_dropout': decode_dropout,  # dropout before fully connected layer in RNN
                'rnn_layers': int(rnn_layers),
                'rnn_variational_dropout': rnn_variational_dropout,
                'embedding_dropout': embedding_dropout,
                'rnn_dropout': rnn_dropout,
            },
            'optimizer.Optimizer.__init__.max_grad_norm': max_grad_norm,
        },
        'lib.optim.adam.Adam.__init__': {
            'amsgrad': True
        }
    })

    ret = train(*args, **kwargs)
    print('=' * 100)
    torch.cuda.empty_cache()
    return ret

scores, hyperparameters = hyperband(objective, space, max_resources_per_model=30, total_resources=1000)
print('Best Accuracy: %.4f' % min(scores))