In [1]:
# Parts of this code are adapted from the following sources:
# https://github.com/pytorch/examples/tree/master/word_language_model
# https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/language_model
import logging
import json
import time
import torch
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_
from data_utils import Dictionary, Corpus, create_parameter_grid
from flatten_dict import flatten, unflatten
import os

# Configure the root logger at INFO level so LOGGER.info(...) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger; named after this module (shows up as "__main__" in a notebook).
LOGGER = logging.getLogger(__name__)

In [None]:
# Hyper-parameters
# Stand-alone module-level defaults. The tuning code further down builds its own
# `parameters` dict with the same kind of settings instead of reading these names.
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 2
batch_size = 20
seq_length = 30
learning_rate = 0.002
log_interval = 100
clip_norm = 0.5

# Load "Penn Treebank" dataset
# (disabled: the data is loaded inside hyperparameter_tune_lstm instead)
#corpus = Corpus()
#ids = corpus.get_data('data/penn/train.txt', batch_size)
#vocab_size = len(corpus.dictionary)
#num_batches = ids.size(1) // seq_length

In [2]:
# RNN based language model
class RNNLM(nn.Module):
    """LSTM language model: embedding -> dropout -> LSTM -> linear decoder.

    Args:
        vocab_size: Number of distinct token ids; sizes both the embedding
            table and the output layer.
        embed_size: Dimension of the token embeddings fed to the LSTM.
        hidden_size: Dimension of the LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between LSTM layers.
        bidirectional: Accepted for backward compatibility but ignored; the
            LSTM is always unidirectional.
    """

    def __init__(
            self,
            vocab_size,
            embed_size,
            hidden_size,
            num_layers=1,
            dropout=0,
            bidirectional=False,
        ):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(
            embed_size,
            hidden_size,
            num_layers=num_layers,
            # nn.LSTM applies dropout only *between* stacked layers; with a
            # single layer a non-zero value has no effect and only triggers a
            # UserWarning, so pass 0 in that case.
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True,
        )
        # Unidirectional LSTM, so the decoder input width equals hidden_size.
        lstm_output_size = hidden_size
        self.linear = nn.Linear(lstm_output_size, vocab_size)

        # Zero the LSTM biases and Xavier-initialise its weight matrices.
        # https://discuss.pytorch.org/t/initializing-parameters-of-a-multi-layer-lstm/5791
        for name, param in self.lstm.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)

    def forward(self, x, h):
        """Run the model on a batch of token ids.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len)
                (the LSTM is built with ``batch_first=True``).
            h: Initial ``(hidden, cell)`` state pair passed straight to the LSTM.

        Returns:
            ``(out, (h, c))`` where ``out`` holds the vocabulary logits with
            shape (batch * seq_len, vocab_size) and ``(h, c)`` is the final
            LSTM state.
        """
        # Embed word ids to vectors
        x = self.embed(x)

        # Dropout vectors
        x = self.dropout(x)

        # Forward propagate LSTM
        out, (h, c) = self.lstm(x, h)

        # Reshape output to (batch_size*sequence_length, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))

        # Decode hidden states of all time steps
        out = self.linear(out)
        return out, (h, c)


In [36]:
# Truncated backpropagation
def detach(states):
    """Return a new list holding a graph-detached copy of every tensor in ``states``.

    Used for truncated back-propagation: the returned tensors share values
    with the inputs but carry no autograd history.
    """
    detached = []
    for tensor in states:
        detached.append(tensor.detach())
    return detached

####################################################################################
# TRAIN
####################################################################################
def train_lstm_model(train_data, valid_data, test_data, params, max_epochs=50, verbose_logging=True):
    """Build an RNNLM from ``params`` and train/evaluate it.

    Args:
        train_data, valid_data, test_data: 2-D tensors of token ids; columns
            are sliced into windows of ``params['seq_length']`` and the
            targets are the same window shifted by one column.
        params: Settings dict. Keys read here: ``model`` (kwargs for RNNLM),
            ``num_epochs``, ``seq_length``, ``log_interval``, ``clip_norm``,
            ``lr``, ``cuda``, ``optimizer`` (``'sgd'`` or ``'adam'``), and
            optionally ``weight_decay`` (default 0), ``seed``, ``lr_decay``
            and ``lr_decay_start`` (required when ``lr_decay`` is set and != 1).
        max_epochs: Currently unused; the epoch count comes from
            ``params['num_epochs']``. Kept for interface compatibility.
        verbose_logging: When true, print per-interval and per-epoch losses.

    Returns:
        ``(model, True)`` - the trained model and the completion flag
        returned by the inner training loop.

    Raises:
        ValueError: If ``params['optimizer']`` is not supported, or a data
            split is too short to yield a single window.
    """

    def train_model():
        """Run all epochs, then evaluate once on the test split."""
        for epoch in range(params['num_epochs']):
            print('#'*10, f'Epoch [{epoch+1}/{params["num_epochs"]}]', '#'*10)

            # learning rate decay
            lr_decay = params.get('lr_decay')
            if lr_decay and lr_decay != 1:
                # No decay until epoch number reaches lr_decay_start; afterwards
                # the base rate is multiplied by lr_decay once per extra epoch.
                new_lr = params['lr'] * (lr_decay ** max(epoch + 1 - params['lr_decay_start'], 0.0))
                print('Learning rate: {:.4f}'.format(new_lr))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = new_lr

            train_epoch_loss = predict(train_data, train=True)
            valid_epoch_loss = predict(valid_data, train=False)

            if verbose_logging:
                print('-'*10, f'End of Epoch {epoch+1}', '-'*10)
                print('Train Loss: {:.4f}, Train Perplexity: {:5.2f}'
                    .format(train_epoch_loss, np.exp(train_epoch_loss)))
                print('Valid Loss: {:.4f}, Valid Perplexity: {:5.2f}'
                    .format(valid_epoch_loss, np.exp(valid_epoch_loss)))
                print('-'*40)

        test_epoch_loss = predict(test_data, train=False)
        print('-'*10, 'Test set results', '-'*10)
        print('Test Loss: {:.4f}, Test Perplexity: {:5.2f}'
                .format(test_epoch_loss, np.exp(test_epoch_loss)))

        return True

    def predict(data, train=False):
        """Run one pass over ``data`` and return the mean per-window loss.

        With ``train=True`` the model is put in training mode and the
        optimizer is stepped after every window; otherwise the model is in
        eval mode and gradients are disabled.
        """
        if train:
            model.train()
        else:
            model.eval()

        seq_length = params['seq_length']
        log_interval = params['log_interval']

        # Set initial hidden and cell states. The batch dimension is taken from
        # the data itself so it always matches the inputs fed to the LSTM.
        state_shape = (
            params['model']['num_layers'],
            data.size(0),
            params['model']['hidden_size'],
        )
        states = (
            torch.zeros(*state_shape).to(device),
            torch.zeros(*state_shape).to(device),
        )

        losses = []
        # Gradients are only needed while training; skipping graph
        # construction during evaluation saves memory and time.
        with torch.set_grad_enabled(train):
            for i in range(0, data.size(1) - seq_length, seq_length):
                # Get mini-batch inputs and targets
                inputs = data[:, i:i+seq_length].to(device)
                targets = data[:, (i+1):(i+1)+seq_length].to(device)

                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                # https://discuss.pytorch.org/t/solved-why-we-need-to-detach-variable-which-contains-hidden-representation/1426/4
                states = detach(states)

                # Forward pass
                outputs, states = model(inputs, states)
                # Flatten targets to (batch*seq_len,) to line up with the flattened logits.
                loss = criterion(outputs, targets.reshape(-1))
                losses.append(loss.item())

                if train:
                    # Backward and optimize
                    optimizer.zero_grad()
                    loss.backward()
                    clip_grad_norm_(model.parameters(), params['clip_norm'])
                    optimizer.step()

                step = (i+1) // seq_length
                if step % log_interval == 0 and i != 0 and verbose_logging:
                    # Running mean over the most recent log_interval windows.
                    loss_mean = sum(losses[-log_interval:]) / log_interval
                    print('Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                        .format(step, data.size(1) // seq_length, loss_mean, np.exp(loss_mean)))

        if not losses:
            raise ValueError(
                'data has {} columns, too few for a single window of seq_length={}'
                .format(data.size(1), seq_length)
            )
        return sum(losses) / len(losses)


    device = torch.device('cuda' if params['cuda'] and torch.cuda.is_available() else 'cpu')

    print(params["model"])

    # `is not None` so that a seed of 0 is honoured too.
    if params.get('seed') is not None:
        torch.manual_seed(params['seed'])
    model = RNNLM(**params["model"]).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer_name = params['optimizer']
    weight_decay = params.get('weight_decay', 0)
    if optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=params['lr'], weight_decay=weight_decay)
    elif optimizer_name == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=weight_decay)
    else:
        raise ValueError(
            "unsupported optimizer {!r}; expected 'sgd' or 'adam'".format(optimizer_name)
        )

    return model, train_model()

In [39]:
def hyperparameter_tune_lstm(data_path):
    """Train one LSTM language model per point of a fixed hyper-parameter grid.

    Reads ``train.txt``, ``valid.txt`` and ``test.txt`` from ``data_path``,
    expands the search space below with ``create_parameter_grid`` and calls
    ``train_lstm_model`` for every resulting configuration.

    Returns:
        A list with one ``{"parameters": ..., "results": ...}`` dict per run.
    """
    # Settings for the RNNLM constructor.
    model_space = {
        'embed_size': 128,
        'hidden_size': [1024, 300],
        'num_layers': 1,
        'dropout': 0.5,
    }
    search_space = {
        'model': model_space,
        'num_epochs': 2,
        'batch_size': 20,
        'seq_length': 30,
        'log_interval': 300,
        'clip_norm': 0.5,
        'lr': 0.002,
        'cuda': True,
        'seed': 313,
        'lr_decay_start': 6,
        'lr_decay': 0.8,
        'optimizer': 'sgd',
        # weight decay applied to all weights (0 = no decay)
        'weight_decay': 0,
    }

    # Load "Penn Treebank" dataset; the splits are read in train/valid/test
    # order through one shared Corpus instance.
    corpus = Corpus()
    splits = [
        corpus.get_data(os.path.join(data_path, file_name), search_space['batch_size'])
        for file_name in ('train.txt', 'valid.txt', 'test.txt')
    ]
    train_data, valid_data, test_data = splits

    # The vocabulary size is only known once every split has been read.
    search_space['model']['vocab_size'] = len(corpus.dictionary)
    print('vocab_size: ', search_space['model']['vocab_size'])

    grid = create_parameter_grid(search_space)

    runs = []
    for run_number, params in enumerate(grid, start=1):
        LOGGER.info("\nTuning %s/%s", run_number, len(grid))
        LOGGER.info("Parameters: %s", json.dumps(params, indent=4, default=str))

        started_at = time.time()
        _, results = train_lstm_model(
            train_data,
            valid_data,
            test_data,
            params=params,
            max_epochs=50,
            verbose_logging=True,
        )
        elapsed = time.time() - started_at

        LOGGER.info("Training took: %ss", elapsed)
        runs.append({"parameters": params, "results": results})

    return runs

In [40]:
# Run the grid search on the train/valid/test text files under data/penn/.
# As the last expression of the cell, the returned list of
# {"parameters", "results"} dicts is displayed as the cell output.
hyperparameter_tune_lstm('data/penn/')

INFO:__main__:
Tuning 1/4
INFO:__main__:Parameters: {
    "model": {
        "embed_size": 128,
        "hidden_size": 1024,
        "num_layers": 1,
        "bidirectional": false,
        "vocab_size": 10000
    },
    "num_epochs": 2,
    "batch_size": 20,
    "seq_length": 30,
    "log_interval": 300,
    "clip_norm": 0.5,
    "lr": 0.002,
    "num_samples": 1000,
    "cuda": true
}


vocab_size:  10000
{'embed_size': 128, 'hidden_size': 1024, 'num_layers': 1, 'bidirectional': False, 'vocab_size': 10000}
########## Epoch [1/2] ##########




Step[300/1549], Loss: 6.0453, Perplexity: 422.14
Step[600/1549], Loss: 5.4975, Perplexity: 244.08
Step[900/1549], Loss: 5.2349, Perplexity: 187.71
Step[1200/1549], Loss: 5.1129, Perplexity: 166.15
Step[1500/1549], Loss: 4.8727, Perplexity: 130.67
---------- End of Epoch 1 ----------
Train Loss: 5.3476, Train Perplexity: 210.10
Valid Loss: 4.9520, Valid Perplexity: 141.46
----------------------------------------
########## Epoch [2/2] ##########
Step[300/1549], Loss: 4.6833, Perplexity: 108.13
Step[600/1549], Loss: 4.4808, Perplexity: 88.31
Step[900/1549], Loss: 4.3361, Perplexity: 76.41
Step[1200/1549], Loss: 4.3098, Perplexity: 74.42
Step[1500/1549], Loss: 4.0955, Perplexity: 60.07


INFO:__main__:Training took: 65.67201852798462s
INFO:__main__:
Tuning 2/4
INFO:__main__:Parameters: {
    "model": {
        "embed_size": 128,
        "hidden_size": 1024,
        "num_layers": 1,
        "bidirectional": true,
        "vocab_size": 10000
    },
    "num_epochs": 2,
    "batch_size": 20,
    "seq_length": 30,
    "log_interval": 300,
    "clip_norm": 0.5,
    "lr": 0.002,
    "num_samples": 1000,
    "cuda": true
}


---------- End of Epoch 2 ----------
Train Loss: 4.3807, Train Perplexity: 79.90
Valid Loss: 4.9013, Valid Perplexity: 134.46
----------------------------------------
{'embed_size': 128, 'hidden_size': 1024, 'num_layers': 1, 'bidirectional': True, 'vocab_size': 10000}
########## Epoch [1/2] ##########
Step[300/1549], Loss: 2.6058, Perplexity: 13.54
Step[600/1549], Loss: 0.7607, Perplexity:  2.14
Step[900/1549], Loss: 0.5299, Perplexity:  1.70
Step[1200/1549], Loss: 0.4674, Perplexity:  1.60
Step[1500/1549], Loss: 0.4321, Perplexity:  1.54
---------- End of Epoch 1 ----------
Train Loss: 0.9485, Train Perplexity:  2.58
Valid Loss: 0.4525, Valid Perplexity:  1.57
----------------------------------------
########## Epoch [2/2] ##########
Step[300/1549], Loss: 0.3567, Perplexity:  1.43
Step[600/1549], Loss: 0.3086, Perplexity:  1.36
Step[900/1549], Loss: 0.2917, Perplexity:  1.34
Step[1200/1549], Loss: 0.2768, Perplexity:  1.32
Step[1500/1549], Loss: 0.2623, Perplexity:  1.30


INFO:__main__:Training took: 117.92710971832275s
INFO:__main__:
Tuning 3/4
INFO:__main__:Parameters: {
    "model": {
        "embed_size": 128,
        "hidden_size": 300,
        "num_layers": 1,
        "bidirectional": false,
        "vocab_size": 10000
    },
    "num_epochs": 2,
    "batch_size": 20,
    "seq_length": 30,
    "log_interval": 300,
    "clip_norm": 0.5,
    "lr": 0.002,
    "num_samples": 1000,
    "cuda": true
}


---------- End of Epoch 2 ----------
Train Loss: 0.2986, Train Perplexity:  1.35
Valid Loss: 0.4588, Valid Perplexity:  1.58
----------------------------------------
{'embed_size': 128, 'hidden_size': 300, 'num_layers': 1, 'bidirectional': False, 'vocab_size': 10000}
########## Epoch [1/2] ##########
Step[300/1549], Loss: 6.2727, Perplexity: 529.88
Step[600/1549], Loss: 5.6858, Perplexity: 294.66
Step[900/1549], Loss: 5.4095, Perplexity: 223.51
Step[1200/1549], Loss: 5.2757, Perplexity: 195.52
Step[1500/1549], Loss: 5.0567, Perplexity: 157.07
---------- End of Epoch 1 ----------
Train Loss: 5.5339, Train Perplexity: 253.12
Valid Loss: 5.1215, Valid Perplexity: 167.59
----------------------------------------
########## Epoch [2/2] ##########
Step[300/1549], Loss: 4.9450, Perplexity: 140.47
Step[600/1549], Loss: 4.8075, Perplexity: 122.42
Step[900/1549], Loss: 4.7044, Perplexity: 110.43
Step[1200/1549], Loss: 4.6851, Perplexity: 108.32
Step[1500/1549], Loss: 4.5033, Perplexity: 90.31


INFO:__main__:Training took: 24.296768188476562s
INFO:__main__:
Tuning 4/4
INFO:__main__:Parameters: {
    "model": {
        "embed_size": 128,
        "hidden_size": 300,
        "num_layers": 1,
        "bidirectional": true,
        "vocab_size": 10000
    },
    "num_epochs": 2,
    "batch_size": 20,
    "seq_length": 30,
    "log_interval": 300,
    "clip_norm": 0.5,
    "lr": 0.002,
    "num_samples": 1000,
    "cuda": true
}


---------- End of Epoch 2 ----------
Train Loss: 4.7304, Train Perplexity: 113.35
Valid Loss: 4.9490, Valid Perplexity: 141.04
----------------------------------------
{'embed_size': 128, 'hidden_size': 300, 'num_layers': 1, 'bidirectional': True, 'vocab_size': 10000}
########## Epoch [1/2] ##########
Step[300/1549], Loss: 3.7231, Perplexity: 41.39
Step[600/1549], Loss: 1.4057, Perplexity:  4.08
Step[900/1549], Loss: 0.8308, Perplexity:  2.30
Step[1200/1549], Loss: 0.6251, Perplexity:  1.87
Step[1500/1549], Loss: 0.5080, Perplexity:  1.66
---------- End of Epoch 1 ----------
Train Loss: 1.3950, Train Perplexity:  4.04
Valid Loss: 0.4965, Valid Perplexity:  1.64
----------------------------------------
########## Epoch [2/2] ##########
Step[300/1549], Loss: 0.3788, Perplexity:  1.46
Step[600/1549], Loss: 0.3277, Perplexity:  1.39
Step[900/1549], Loss: 0.3091, Perplexity:  1.36
Step[1200/1549], Loss: 0.2984, Perplexity:  1.35
Step[1500/1549], Loss: 0.2805, Perplexity:  1.32


INFO:__main__:Training took: 37.24978232383728s


---------- End of Epoch 2 ----------
Train Loss: 0.3183, Train Perplexity:  1.37
Valid Loss: 0.4446, Valid Perplexity:  1.56
----------------------------------------


[{'parameters': {'model': {'embed_size': 128,
    'hidden_size': 1024,
    'num_layers': 1,
    'bidirectional': False,
    'vocab_size': 10000},
   'num_epochs': 2,
   'batch_size': 20,
   'seq_length': 30,
   'log_interval': 300,
   'clip_norm': 0.5,
   'lr': 0.002,
   'num_samples': 1000,
   'cuda': True},
  'results': True},
 {'parameters': {'model': {'embed_size': 128,
    'hidden_size': 1024,
    'num_layers': 1,
    'bidirectional': True,
    'vocab_size': 10000},
   'num_epochs': 2,
   'batch_size': 20,
   'seq_length': 30,
   'log_interval': 300,
   'clip_norm': 0.5,
   'lr': 0.002,
   'num_samples': 1000,
   'cuda': True},
  'results': True},
 {'parameters': {'model': {'embed_size': 128,
    'hidden_size': 300,
    'num_layers': 1,
    'bidirectional': False,
    'vocab_size': 10000},
   'num_epochs': 2,
   'batch_size': 20,
   'seq_length': 30,
   'log_interval': 300,
   'clip_norm': 0.5,
   'lr': 0.002,
   'num_samples': 1000,
   'cuda': True},
  'results': True},
 {'parame