Define model.

In [1]:
import torch.nn as nn
from torch.autograd import Variable

In [2]:
class LanguageModel(nn.Module):
    """
    Recurrent language model: embedding -> dropout -> GRU/LSTM -> dropout -> linear decoder.

    Args:
        rnn_type: 'GRU' or 'LSTM'; anything else raises ValueError.
        vocab_size: number of rows in the embedding and number of decoder outputs.
        embedding_dim: size of each embedding vector (input size of the RNN).
        hidden_dim: hidden size of the RNN (input size of the decoder).
        n_layers: number of stacked recurrent layers.
        tie_weights: if True, the decoder shares its weight with the embedding;
            this requires embedding_dim == hidden_dim.
        dropout: probability used both for the standalone dropout layer and
            for the dropout inside the RNN.
    """

    def __init__(self, rnn_type, vocab_size, embedding_dim, hidden_dim, n_layers, tie_weights=False, dropout=0.5):
        super(LanguageModel, self).__init__()

        # keep the hyper-parameters around; init_hidden reads some of them
        self.rnn_type = rnn_type
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size

        # sub-modules are created in a fixed order: embedding, dropout, rnn, fc
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)

        if rnn_type == 'LSTM':
            rnn_class = nn.LSTM
        elif rnn_type == 'GRU':
            rnn_class = nn.GRU
        else:
            raise ValueError("rnn_type must be GRU or LSTM!")
        self.rnn = rnn_class(embedding_dim, hidden_dim, n_layers, dropout=dropout)

        self.fc = nn.Linear(hidden_dim, vocab_size)

        if tie_weights:
            # sharing one matrix only works when both layers have the same shape
            assert embedding_dim == hidden_dim, "If tying weights, embedding_dim must be equal to hidden_dim"
            self.fc.weight = self.embedding.weight

        self.init_weights()

    def forward(self, input, hidden):
        """
        Run one chunk of token indices through the model.

        Returns (decoded, hidden) where decoded is [bptt, batch_size, vocab_size]
        and hidden is the recurrent state returned by the RNN.
        """
        embedded = self.dropout(self.embedding(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        # rnn_out is [bptt, batch_size, hidden_dim], hidden is [n_layers, batch_size, hidden_dim]
        rnn_out = self.dropout(rnn_out)
        steps, batch, features = rnn_out.size(0), rnn_out.size(1), rnn_out.size(2)
        # the decoder is applied to a flat [bptt*batch_size, hidden_dim] matrix ...
        flat_scores = self.fc(rnn_out.view(steps * batch, features))
        # ... and the scores are folded back to [bptt, batch_size, vocab_size]
        return flat_scores.view(steps, batch, flat_scores.size(1)), hidden

    def init_weights(self):
        """Uniformly re-initialise embedding and decoder weights in [-1, 1]; zero the decoder bias."""
        bound = 1.0
        self.embedding.weight.data.uniform_(-bound, bound)
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-bound, bound)

    def init_hidden(self, batch_size):
        """
        Build an all-zero initial hidden state of shape [n_layers, batch_size, hidden_dim].

        The state is allocated from the first model parameter via `.new`.
        For an LSTM a (hidden, cell) tuple of two such states is returned.
        """
        reference = next(self.parameters()).data

        def zero_state():
            return Variable(reference.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        if self.rnn_type == 'LSTM':
            return (zero_state(), zero_state())
        return zero_state()

Data preparation.

In [3]:
import os
import torch

In [4]:
class Dictionary(object):
    """Two-way vocabulary: `word2idx` maps word -> index, `idx2word` maps index -> word."""

    def __init__(self):
        self.idx2word = []
        self.word2idx = {}

    def add_word(self, word):
        """Register `word` if it is new and return its index (existing words keep their index)."""
        try:
            return self.word2idx[word]
        except KeyError:
            # new word: it gets the next free position in idx2word
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
            return index

    def __len__(self):
        """Number of distinct words added so far."""
        return len(self.idx2word)

In [5]:
class Corpus(object):
    """
    Loads the train/valid/test splits found under `path` and encodes them as word indices.

    One `Dictionary` is shared by all three splits. Because train.txt is
    tokenized first, its words receive the lowest indices.

    Attributes:
        dictionary: the shared word <-> index vocabulary.
        train, valid, test: flat 1-D LongTensors holding the index of every
            token of the corresponding file, in reading order.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        #train, test and valid are just one giant flat tensor with the indexes of each word
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """
        Tokenizes a text file.

        Each line is split on whitespace and terminated with an '<eos>' token.
        Unseen words are added to `self.dictionary` as they are encountered.

        Args:
            path: path of the text file to read.

        Returns:
            1-D LongTensor with one word index per token (empty for an empty file).

        Raises:
            AssertionError: if `path` does not exist.
        """
        assert os.path.exists(path), "file not found: {}".format(path)

        # Single pass over the file: add_word registers unseen words and returns
        # the index either way, so the file does not have to be read a second
        # time and no uninitialised tensor has to be filled element by element.
        ids = []
        with open(path, 'r') as f:
            for line in f:
                for word in line.split() + ['<eos>']: #adds end of sequence token at end of each line
                    ids.append(self.dictionary.add_word(word))

        return torch.LongTensor(ids)

In [6]:
# Build the vocabulary and index tensors from train.txt, valid.txt and test.txt
# located in the relative directory 'data/'.
corpus = Corpus('data/')

# Sanity check: total number of training tokens (including the per-line <eos> markers).
print(corpus.train.size())

torch.Size([2088628])


In [7]:
def batchify(data, batch_size):
    """
    Arrange a flat token tensor into `batch_size` parallel columns.

    Trailing tokens that do not fill a complete row are dropped. The result
    has shape [len(data) // batch_size, batch_size]; each column is one
    contiguous slice of the original sequence.
    """
    rows = data.size(0) // batch_size

    # keep only the prefix that divides evenly into batch_size streams
    usable = data.narrow(0, 0, rows * batch_size)

    # one stream per row first, then transpose so that streams become columns
    streams = usable.view(batch_size, -1)
    return streams.t().contiguous()

In [8]:
BATCH_SIZE = 32

# every split is laid out with the same number of parallel streams
train_data, val_data, test_data = (
    batchify(split, BATCH_SIZE)
    for split in (corpus.train, corpus.valid, corpus.test)
)

print(train_data.size())

torch.Size([65269, 32])


In [9]:
# Vocabulary size: number of distinct words (including '<eos>') collected across all splits.
n_tokens = len(corpus.dictionary)

Prepare for training.

In [10]:
# 2-layer GRU language model: 64-dim embeddings, 100 hidden units, dropout 0.5.
# Weight tying stays off; it would require embedding_dim == hidden_dim.
model = LanguageModel('GRU', n_tokens, 64, 100, 2, tie_weights=False, dropout=0.5)

In [11]:
# Loss applied to the flattened [bptt*batch_size, vocab_size] scores and the flattened targets.
criterion = nn.CrossEntropyLoss()

In [12]:
def repackage_hidden(h):
    """
    Wraps hidden states in new Variables, to detach them from their history.

    Args:
        h: a single hidden state, or an iterable of hidden states
           (an LSTM state is a (hidden, cell) tuple).

    Returns:
        A hidden state with the same values but no autograd history; an
        iterable input is returned as a tuple of such states.
    """
    # isinstance instead of an exact type comparison: when tensors and
    # Variables are the same class, type(h) is torch.Tensor rather than
    # Variable, and the exact comparison would send a plain hidden state into
    # the tuple branch, which then recurses into its individual elements.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h) #if using LSTM, hidden state is tuple

In [13]:
BPTT = 35

def get_batch(source, i, evaluation=False):
    """
    Cut one training chunk out of a batchified tensor.

    The full [big_number, batch_size] sequence is too long to backpropagate
    through (vanishing/exploding gradients), so it is consumed in chunks of
    at most BPTT rows.

    Returns (data, target): data holds rows i .. i+seq_len-1, and target holds
    the rows shifted one step ahead, flattened to [seq_len*batch_size].
    """
    # either a whole BPTT chunk or whatever is left before the final row
    rows_left = len(source) - 1 - i
    seq_len = BPTT if BPTT < rows_left else rows_left
    stop = i + seq_len

    inputs = source[i:stop]
    next_tokens = source[i + 1:stop + 1]

    # if evaluating, want volatile so we don't learn/take grads/optimize
    data = Variable(inputs, volatile=evaluation)
    target = Variable(next_tokens.view(-1))
    return data, target

In [14]:
import math  # train() calls math.exp; imported here so this cell does not depend on a later one
import time

CLIP = 0.25
LOG_INTERVAL = 1
LR = 20

def train():
    """
    Run one pass over `train_data`, updating `model` with plain SGD.

    Relies on notebook globals: `model`, `corpus`, `criterion`, `train_data`,
    `BATCH_SIZE`, `BPTT`, `CLIP`, `LR`, `LOG_INTERVAL`, and `epoch`. The
    global `epoch` is used only in the log line and is set by the loop that
    calls this function. Returns nothing; progress is printed whenever the
    batch index is a positive multiple of LOG_INTERVAL.
    """
    model.train() #enables dropout
    total_loss = 0
    start_time = time.time()
    n_tokens = len(corpus.dictionary)

    hidden = model.init_hidden(BATCH_SIZE) #gets initial hidden states

    for batch, i in enumerate(range(0, train_data.size(0) - 1, BPTT)):
        data, targets = get_batch(train_data, i) #data is [bptt, batch_size], target is [bptt*batch_size]
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, n_tokens), targets) #view flattens all bptt*batch_size predictions into one
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), CLIP)
        # manual SGD step: p <- p - LR * grad
        for p in model.parameters():
            p.data.add_(-LR, p.grad.data)

        total_loss += loss.data

        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss[0] / LOG_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // BPTT, LR,
                elapsed * 1000 / LOG_INTERVAL, cur_loss, math.exp(cur_loss)))
            # restart the running loss and the timer for the next logging window
            total_loss = 0
            start_time = time.time()

In [15]:
def evaluate(data_source):
    """
    Compute the average per-row loss of `model` over a batchified data set.

    Each chunk's loss is weighted by its number of rows, and the sum is
    divided by the number of rows in `data_source`. Uses the notebook globals
    `model`, `corpus`, `criterion`, `BATCH_SIZE` and `BPTT`.
    """
    model.eval() #disables dropout
    vocab_size = len(corpus.dictionary)
    hidden = model.init_hidden(BATCH_SIZE)
    total_loss = 0
    for start in range(0, data_source.size(0) - 1, BPTT):
        #evaluation = True makes volatile
        data, targets = get_batch(data_source, start, evaluation=True)
        output, hidden = model(data, hidden)
        chunk_loss = criterion(output.view(-1, vocab_size), targets).data
        # weight by chunk length so the shorter final chunk counts proportionally
        total_loss += len(data) * chunk_loss
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)

In [20]:
import math

EPOCHS = 0  # with 0 the loop body below never runs; raise it to actually train
best_val_loss = None
SAVE = 'model.pt'

# Train for EPOCHS epochs, checkpointing on every validation improvement and
# shrinking the learning rate otherwise.
for epoch in range(1, EPOCHS+1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
    print('-' * 89)
    # Save the model if the validation loss is the best we've seen so far.
    # Compare against None explicitly so that a best loss of 0.0 is not treated as "unset".
    if best_val_loss is None or val_loss < best_val_loss:
        torch.save(model.state_dict(), SAVE)
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        # The global read by train() is LR; the lowercase `lr` was never defined.
        LR /= 4.0

In [21]:
# Restore the weights stored at SAVE. The training cell only writes that file when an
# epoch improves the validation loss, so with EPOCHS = 0 this relies on a checkpoint
# left behind by an earlier run.
model.load_state_dict(torch.load(SAVE))

In [22]:
# Final score of the restored model on the held-out test split.
test_loss = evaluate(test_data)
test_ppl = math.exp(test_loss)
rule = '=' * 89
print(rule)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(test_loss, test_ppl))
print(rule)

KeyboardInterrupt: 