In [1]:
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model

# Add ckp
parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
parser.add_argument('--data', type=str, default='/home/jamesjunior2/wikitext-2', # /input
                    help='location of the data corpus')
parser.add_argument('--checkpoint', type=str, default='',
                    help='model checkpoint to use')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
parser.add_argument('--lr', type=float, default=20,
                    help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
                    help='batch size')
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str,  default='/home/jamesjunior2/quantized_distillation/saved_MODELS/lang_model.pt', # /output
                    help='path to save the final model')
args = parser.parse_args(args=[])

In [2]:
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)



In [3]:
###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data

In [4]:
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [5]:
train_data

tensor([[    0,   284, 15178,  ...,  1352,  1335,    16],
        [    1,   357,    43,  ...,    46,    43,  2015],
        [    2,  1496,  7369,  ...,   380,    27, 33001],
        ...,
        [  357,   415,   173,  ...,   212,    78,  1575],
        [ 2520,     9,  3890,  ...,   208,    27,   808],
        [   33,    35,    19,  ...,  8832,  6091,   209]])

In [6]:
###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)

In [7]:
ntokens

33278

In [8]:
# Load checkpoint
if args.checkpoint != '':
    if args.cuda:
        model = torch.load(args.checkpoint)
    else:
        # Load GPU model on CPU
        model = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)

In [9]:
if args.cuda:
    model.cuda()
else:
    model.cpu()
print (model)

criterion = nn.CrossEntropyLoss()
if args.cuda:
    criterion.cuda()

RNNModel(
  (drop): Dropout(p=0.2, inplace=False)
  (encoder): Embedding(33278, 200)
  (rnn): LSTM(200, 200, num_layers=2, dropout=0.2)
  (decoder): Linear(in_features=200, out_features=33278, bias=True)
)


In [10]:
###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

def get_batch(source, i, evaluation=False):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target


def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [11]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data
        #total_loss += loss_val.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [None]:
# Loop over epochs.
lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, args.epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)


| epoch   1 |   200/ 2983 batches | lr 20.00 | ms/batch 141.06 | loss  7.62 | ppl  2038.69
| epoch   1 |   400/ 2983 batches | lr 20.00 | ms/batch 142.99 | loss  6.85 | ppl   943.33
| epoch   1 |   600/ 2983 batches | lr 20.00 | ms/batch 139.65 | loss  6.47 | ppl   647.76
| epoch   1 |   800/ 2983 batches | lr 20.00 | ms/batch 145.64 | loss  6.29 | ppl   541.33
| epoch   1 |  1000/ 2983 batches | lr 20.00 | ms/batch 148.82 | loss  6.14 | ppl   465.67
| epoch   1 |  1200/ 2983 batches | lr 20.00 | ms/batch 146.42 | loss  6.06 | ppl   428.69
| epoch   1 |  1400/ 2983 batches | lr 20.00 | ms/batch 145.18 | loss  5.95 | ppl   382.33
| epoch   1 |  1600/ 2983 batches | lr 20.00 | ms/batch 148.35 | loss  5.96 | ppl   385.84
| epoch   1 |  1800/ 2983 batches | lr 20.00 | ms/batch 151.80 | loss  5.80 | ppl   330.54
| epoch   1 |  2000/ 2983 batches | lr 20.00 | ms/batch 151.06 | loss  5.78 | ppl   322.88
| epoch   1 |  2200/ 2983 batches | lr 20.00 | ms/batch 153.42 | loss  5.66 | ppl   287.40

  


-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 467.80s | valid loss  5.55 | valid ppl   257.29
-----------------------------------------------------------------------------------------
| epoch   2 |   200/ 2983 batches | lr 20.00 | ms/batch 153.09 | loss  5.55 | ppl   256.59
| epoch   2 |   400/ 2983 batches | lr 20.00 | ms/batch 153.34 | loss  5.53 | ppl   253.40
| epoch   2 |   600/ 2983 batches | lr 20.00 | ms/batch 149.13 | loss  5.36 | ppl   213.14
| epoch   2 |   800/ 2983 batches | lr 20.00 | ms/batch 149.74 | loss  5.38 | ppl   216.79
| epoch   2 |  1000/ 2983 batches | lr 20.00 | ms/batch 149.24 | loss  5.36 | ppl   212.78
| epoch   2 |  1200/ 2983 batches | lr 20.00 | ms/batch 153.33 | loss  5.34 | ppl   207.84
| epoch   2 |  1400/ 2983 batches | lr 20.00 | ms/batch 157.24 | loss  5.33 | ppl   205.93
| epoch   2 |  1600/ 2983 batches | lr 20.00 | ms/batch 152.45 | loss  5.39 | ppl   219.56
| epoch   2 |  18

| epoch   7 |  1000/ 2983 batches | lr 20.00 | ms/batch 156.81 | loss  4.63 | ppl   102.04
| epoch   7 |  1200/ 2983 batches | lr 20.00 | ms/batch 158.00 | loss  4.63 | ppl   102.88
| epoch   7 |  1400/ 2983 batches | lr 20.00 | ms/batch 149.20 | loss  4.67 | ppl   107.10
| epoch   7 |  1600/ 2983 batches | lr 20.00 | ms/batch 155.46 | loss  4.75 | ppl   115.55
| epoch   7 |  1800/ 2983 batches | lr 20.00 | ms/batch 152.71 | loss  4.64 | ppl   103.38
| epoch   7 |  2000/ 2983 batches | lr 20.00 | ms/batch 153.38 | loss  4.67 | ppl   106.57
| epoch   7 |  2200/ 2983 batches | lr 20.00 | ms/batch 147.32 | loss  4.57 | ppl    96.31
| epoch   7 |  2400/ 2983 batches | lr 20.00 | ms/batch 154.46 | loss  4.61 | ppl   100.64
| epoch   7 |  2600/ 2983 batches | lr 20.00 | ms/batch 154.69 | loss  4.64 | ppl   103.51
| epoch   7 |  2800/ 2983 batches | lr 20.00 | ms/batch 151.12 | loss  4.57 | ppl    96.84
-----------------------------------------------------------------------------------------


| epoch  12 |  2400/ 2983 batches | lr 5.00 | ms/batch 156.09 | loss  4.25 | ppl    69.88
| epoch  12 |  2600/ 2983 batches | lr 5.00 | ms/batch 152.27 | loss  4.26 | ppl    71.16
| epoch  12 |  2800/ 2983 batches | lr 5.00 | ms/batch 153.65 | loss  4.19 | ppl    65.78
-----------------------------------------------------------------------------------------
| end of epoch  12 | time: 485.81s | valid loss  4.84 | valid ppl   126.45
-----------------------------------------------------------------------------------------
| epoch  13 |   200/ 2983 batches | lr 5.00 | ms/batch 155.64 | loss  4.38 | ppl    79.81
| epoch  13 |   400/ 2983 batches | lr 5.00 | ms/batch 153.25 | loss  4.38 | ppl    80.23
| epoch  13 |   600/ 2983 batches | lr 5.00 | ms/batch 153.97 | loss  4.22 | ppl    67.70
| epoch  13 |   800/ 2983 batches | lr 5.00 | ms/batch 154.47 | loss  4.27 | ppl    71.53
| epoch  13 |  1000/ 2983 batches | lr 5.00 | ms/batch 152.26 | loss  4.28 | ppl    72.50
| epoch  13 |  1200/ 2983

| epoch  18 |   600/ 2983 batches | lr 1.25 | ms/batch 151.81 | loss  4.09 | ppl    59.65
| epoch  18 |   800/ 2983 batches | lr 1.25 | ms/batch 147.24 | loss  4.16 | ppl    64.21
| epoch  18 |  1000/ 2983 batches | lr 1.25 | ms/batch 184.38 | loss  4.16 | ppl    64.32
| epoch  18 |  1200/ 2983 batches | lr 1.25 | ms/batch 155.65 | loss  4.18 | ppl    65.15
| epoch  18 |  1400/ 2983 batches | lr 1.25 | ms/batch 152.81 | loss  4.20 | ppl    66.59
| epoch  18 |  1600/ 2983 batches | lr 1.25 | ms/batch 150.10 | loss  4.25 | ppl    70.37
| epoch  18 |  1800/ 2983 batches | lr 1.25 | ms/batch 152.86 | loss  4.16 | ppl    63.83
| epoch  18 |  2000/ 2983 batches | lr 1.25 | ms/batch 149.66 | loss  4.20 | ppl    66.44
| epoch  18 |  2200/ 2983 batches | lr 1.25 | ms/batch 161.96 | loss  4.06 | ppl    58.15
| epoch  18 |  2400/ 2983 batches | lr 1.25 | ms/batch 148.27 | loss  4.10 | ppl    60.25
| epoch  18 |  2600/ 2983 batches | lr 1.25 | ms/batch 148.99 | loss  4.13 | ppl    61.89
| epoch  1

| epoch  23 |  2000/ 2983 batches | lr 0.31 | ms/batch 150.37 | loss  4.17 | ppl    64.93
| epoch  23 |  2200/ 2983 batches | lr 0.31 | ms/batch 146.41 | loss  4.04 | ppl    56.66
| epoch  23 |  2400/ 2983 batches | lr 0.31 | ms/batch 151.59 | loss  4.07 | ppl    58.32
| epoch  23 |  2600/ 2983 batches | lr 0.31 | ms/batch 151.93 | loss  4.11 | ppl    60.80
| epoch  23 |  2800/ 2983 batches | lr 0.31 | ms/batch 156.39 | loss  4.03 | ppl    56.28
-----------------------------------------------------------------------------------------
| end of epoch  23 | time: 481.70s | valid loss  4.76 | valid ppl   116.31
-----------------------------------------------------------------------------------------
| epoch  24 |   200/ 2983 batches | lr 0.31 | ms/batch 150.30 | loss  4.21 | ppl    67.26
| epoch  24 |   400/ 2983 batches | lr 0.31 | ms/batch 150.67 | loss  4.24 | ppl    69.45
| epoch  24 |   600/ 2983 batches | lr 0.31 | ms/batch 154.50 | loss  4.06 | ppl    57.69
| epoch  24 |   800/ 2983