In [1]:
import argparse
import pickle
import os, sys
import time
import math
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from embed_regularize import embedded_dropout
from locked_dropout import LockedDropout
from weight_drop import WeightDrop

import gc

import data

from utils import batchify, get_batch, repackage_hidden, create_exp_dir, save_checkpoint

Authors' implementation uses exactly the same regularization and optimizing techniques as were presented in [Merity 2017]. The only differences are: smaller lr(20.0 vs 30.0), 

In [2]:
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nhidlast, nlayers, 
                 dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, 
                 tie_weights=False, ldropout=0.5, n_experts=10, moc=False, mos=False):
        super(RNNModel, self).__init__()
        #lock dropout uses same dropout mask for all repeated connections in one forward
        self.lockdrop = LockedDropout()
        self.encoder = nn.Embedding(ntoken, ninp)
        self.decoder = nn.Linear(ninp, ntoken)
        
        self.rnns = [torch.nn.LSTM(ninp if l == 0 else nhid, nhid if l != nlayers - 1 else nhidlast, 1, dropout=0) for l in range(nlayers)]
        if wdrop:
            self.rnns = [WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        
        #MOS
        if moc or mos:
            self.prior = nn.Linear(nhidlast, n_experts, bias=False)
            self.latent = nn.Sequential(nn.Linear(nhidlast, n_experts*ninp), nn.Tanh())
        
        if tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nhidlast = nhidlast
        self.nlayers = nlayers
        
        #output dropout
        self.dropout = dropout
        #input dropout
        self.dropouti = dropouti
        #hidden dropout
        self.dropouth = dropouth
        #embedded dropout
        self.dropoute = dropoute 
        #latent dropout
        self.ldropout = ldropout 
        self.dropoutl = ldropout 
        
        self.n_experts = n_experts 
        self.ntoken = ntoken

        self.wdrop = wdrop
        self.moc = moc
        self.mos = mos

        size = 0
        for p in self.parameters():
            size += p.nelement()
        print('Model param size: {}'.format(size))

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, return_h=False, return_prob=False):
        batch_size = input.size(1)
        emb = embedded_dropout(self.encoder, input, dropout=self.dropoute if self.training else 0)
        
        
        emb = self.lockdrop(emb, self.dropouti)

        raw_output = emb
        new_hidden = []
        raw_outputs = []
        outputs = []
        for l, rnn in enumerate(self.rnns):
            raw_output, new_h = rnn(raw_output, hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.nlayers - 1:
                raw_output = self.lockdrop(raw_output, self.dropouth)
                outputs.append(raw_output)
        hidden = new_hidden

        output = self.lockdrop(raw_output, self.dropout)# size: seq_len x batch_size x nhidlast
        outputs.append(output)
        """ MoS / MoC / SoftMax"""
        if self.mos:
            latent = self.latent(output) #size: seq_len x batch_size x n_experts*ninp
            latent = self.lockdrop(latent, self.dropoutl) 
            logit = self.decoder(latent.view(-1, self.ninp)) # h^t_c dot w_c^T

            prior_logit = self.prior(output).contiguous().view(-1, self.n_experts)
            prior = nn.functional.softmax(prior_logit)
            prob = nn.functional.softmax(logit.view(-1, self.ntoken)).view(-1, self.n_experts, self.ntoken)
            prob = (prob * prior.unsqueeze(2).expand_as(prob)).sum(1)
        elif self.moc:
            latent = self.latent(output) #size: seq_len x batch_size x n_experts*ninp
            latent = self.lockdrop(latent, self.dropoutl) 
            logit = self.decoder(latent.view(-1, self.ninp)) # h^t_c dot w_c^T, size: seq_len*batch_size*n_experts x ntokens

            prior_logit = self.prior(output).contiguous().view(-1, self.n_experts)
            prior = nn.functional.softmax(prior_logit)
            logit = logit.view(-1, self.n_experts, self.ntoken)
            logit = (logit * prior.unsqueeze(2).expand_as(logit)).sum(1)
            prob = nn.functional.softmax(logit)
            
        else:
            logit = self.decoder(output.view(-1, self.ninp))
            prob = nn.functional.softmax(logit.view(-1, self.ntoken))

            
        if return_prob:
            model_output = prob
        else:
            log_prob = torch.log(prob + 1e-8)
            model_output = log_prob

        model_output = model_output.view(-1, batch_size, self.ntoken)

        if return_h:
            return model_output, hidden, raw_outputs, outputs
        return model_output, hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return [(Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else self.nhidlast).zero_()),
                 Variable(weight.new(1, bsz, self.nhid if l != self.nlayers - 1 else self.nhidlast).zero_()))
                for l in range(self.nlayers)]

In [3]:
###############################################################################
# Arguments
###############################################################################
import arguments
# Get arguments - defaultPennParameters(), defaultWT2Parameters(), or custom
#args = arguments.defaultPennParameters()



def new_params(args_str):
    a = arguments.parser.parse_args(args_str.split())
    
    if a.nhidlast < 0 or not (a.mos or a.moc):
        a.nhidlast = a.emsize
    if a.dropoutl < 0:
        a.dropoutl = a.dropouth
    if a.small_batch_size < 0:
        a.small_batch_size = a.batch_size
        
    # Set the random seed manually for reproducibility.
    np.random.seed(a.seed)
    torch.manual_seed(a.seed)
    if torch.cuda.is_available():
        if not a.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed_all(a.seed)
            
    if not a.continue_train:
        a.save = '{}-{}'.format(a.save, time.strftime("%Y%m%d-%H%M%S"))
        create_exp_dir(a.save, scripts_to_save=['main.py', 'model.py'])
    
    return a
    
def logging(s, print_=True, log_=True):
    if print_:
        print(s)
    if log_:
        with open(os.path.join(args.save, 'log.txt'), 'a+') as f_log:
            f_log.write(s + '\n')
            
def serialize(folder, var, var_name):
    with open(os.path.join(folder, var_name + '.pkl'), 'wb') as f:
        pickle.dump(var, f, protocol=2)
        
def unserialize(folder, var_name):
    with open(os.path.join(folder, var_name + '.pkl'), 'rb') as f:
        v = pickle.load(f)
        return v

In [23]:
###############################################################################
# Load data
###############################################################################
#args = new_params("--data data/penn")
args = new_params("--data data/wikitext-2")
corpus = data.Corpus(args.data)

eval_batch_size = 10
test_batch_size = 1
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

Experiment dir : EXP-20180205-002751
torch.Size([21764, 10])
torch.Size([245569, 1])


In [5]:
###############################################################################
# Build the model
###############################################################################
def new_model(args, corpus):
    ntokens = len(corpus.dictionary)
    if args.continue_train:
        model = torch.load(os.path.join(args.save, 'model.pt'))
    else:
        model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nhidlast, args.nlayers, 
                           args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, 
                           args.tied, args.dropoutl, args.n_experts, args.moc, args.mos)

    if args.cuda:
        if args.single_gpu:
            parallel_model = model.cuda()
        else:
            parallel_model = nn.DataParallel(model, dim=1).cuda()
    else:
        parallel_model = model

    total_params = sum(x.data.nelement() for x in model.parameters())
    logging('Args: {}'.format(args))
    logging('Model total parameters: {}'.format(total_params))
    
    return model, parallel_model

criterion = nn.CrossEntropyLoss()

In [16]:
###############################################################################
# Training code
###############################################################################

def evaluate(model, parallel_model, corpus, args, data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)

        log_prob, hidden = parallel_model(data, hidden)

        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data

        total_loss += loss * len(data)

        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)


def train(model, parallel_model, optimizer, args, corpus, history, train_state):
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'
    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    training_loss, log_iter = 0, 1
    training_epoch_loss, losses_iter = 0, 0
    
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)
            training_epoch_loss += raw_loss.data[0]
            losses_iter += 1
            
            loss = raw_loss
            # Activiation Regularization
            loss +=  sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss += sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
            
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            #history["train_errs"].append((train_state.iteration, cur_loss))
            training_loss += cur_loss
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                train_state.epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
        train_state.iteration += 1
    history["train_errs"].append(float(training_epoch_loss)/losses_iter)

class TrainState:
    def __init__(self):
        self.iteration = 1
        self.epoch = 1
        
def SGD(model, parallel_model, args, corpus):
    # Loop over epochs.
    train_state = TrainState()
    saved_iteration = 0
    lr = args.lr
    best_val_loss = []
    stored_loss = 100000000
    history = {"train_errs":[], "val_errs":[], "val_errs2":[]}

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        if args.continue_train:
            optimizer_state = torch.load(os.path.join(args.save, 'optimizer.pt'))
            train_state = unserialize(args.save, 'train_state')
            history = unserialize(args.save, 'history')
            if 't0' in optimizer_state['param_groups'][0]:
                optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
            else:
                optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
            optimizer.load_state_dict(optimizer_state)
        else:
            optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)

        for epoch in range(train_state.epoch, args.epochs+1):
            epoch_start_time = time.time()
            train_state.epoch = epoch
            train(model, parallel_model, optimizer, args, corpus, history, train_state)
            if 't0' in optimizer.param_groups[0]:
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(model, parallel_model, corpus, args, val_data, eval_batch_size)
                history["val_errs2"].append((train_state.iteration, val_loss2))
                logging('-' * 89)
                logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                   val_loss2, math.exp(val_loss2)))
                logging('-' * 89)

                if val_loss2 < stored_loss:
                    save_checkpoint(model, optimizer, args.save)
                    logging('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()

            else:
                val_loss = evaluate(model, parallel_model, corpus, args, val_data, eval_batch_size)
                history["val_errs"].append((train_state.iteration, val_loss))
                logging('-' * 89)
                logging('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                   val_loss, math.exp(val_loss)))
                logging('-' * 89)

                if val_loss < stored_loss:
                    save_checkpoint(model, optimizer, args.save)
                    logging('Saving Normal!')
                    stored_loss = val_loss

                if 't0' not in optimizer.param_groups[0] and (len(best_val_loss)>args.nonmono and val_loss > min(best_val_loss[:-args.nonmono])):
                    logging('Switching!')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
                    #optimizer.param_groups[0]['lr'] /= 2.
                best_val_loss.append(val_loss)
            
        saved_iteration = train_state.iteration
        serialize(args.save, history, 'history')
        serialize(args.save, train_state, 'train_state')
        eval_test(corpus, args)

    except KeyboardInterrupt:
        logging('-' * 89)
        logging('Exiting from training early')
        train_state.iteration = saved_iteration
        serialize(args.save, history, 'history')
        serialize(args.save, train_state, 'train_state')
        eval_test(corpus, args)

def eval_test(corpus, args):
    # Load the best saved model.
    model = torch.load(os.path.join(args.save, 'model.pt'))
    parallel_model = nn.DataParallel(model, dim=1).cuda()

    # Run on test data.
    test_loss = evaluate(model, parallel_model, corpus, args, test_data, test_batch_size)
    logging('=' * 89)
    logging('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    logging('=' * 89)


In [7]:
history = unserialize("PTB-20180119-065855", "history")
hist = np.array(history["train_errs"])
hist2 = np.array(history["val_errs"])
hist3 = np.array(history["val_errs2"])

plt.figure(figsize=(20,20))
plt.plot(hist[:,0], hist[:,1], label='batch train loss')
plt.plot(hist2[:,0], hist2[:,1], label='val train loss')
plt.plot(hist3[:,0], hist3[:,1], label='val2 train loss')
plt.legend()
plt.show()

#history = unserialize("Experiments/PTBRepro", 'history')
#state = unserialize("Experiments/PTBRepro", 'train_state')

#print(history['train_errs'][-1])
#print(state.epoch)

#--data data/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 28 --batch_size 12 --lr 20.0 --epoch 1000 --nhid 960 --nhidlast 620 --emsize 280 --n_experts 15 --save PTB/PTB --single_gpu

FileNotFoundError: [Errno 2] No such file or directory: 'PTB-20180119-065855/history.pkl'

In [None]:
# MoS
args = new_params("--data data/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 20 --batch_size 12\
    --max_seq_len_delta 15 --lr 20.0 --epoch 150 --nhid 960 --nhidlast 620 --mos --emsize 280 --n_experts 15 --save Experiments/MoS2")
train_data = batchify(corpus.train, args.batch_size, args)

model, parallel_model = new_model(args, corpus)
SGD(model, parallel_model, args, corpus)

## MoC

args = new_params("--data data/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 20 --batch_size 12\
    --max_seq_len_delta 40 --lr 20.0 --epoch 150 --nhid 960 --nhidlast 620 --moc --emsize 280 --n_experts 15 --save Experiments/MoC2")
train_data = batchify(corpus.train, args.batch_size, args)

model, parallel_model = new_model(args, corpus)
SGD(model, parallel_model, args, corpus)

Experiment dir : Experiments/MoS2-20180120-215158
torch.Size([77465, 12])
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Model param size: 21500620
Args: Namespace(alpha=2, batch_size=12, beta=1, bptt=70, clip=0.25, continue_train=False, cuda=True, data='data/penn', dropout=0.4, dropoute=0.1, dropouth=0.225, dropouti=0.4, dropoutl=0.29, emsize=280, epochs=150, log_interval=200, lr=20.0, max_seq_len_delta=15, moc=False, model='LSTM', mos=True, n_experts=15, nhid=960, nhidlast=620, nlayers=3, nonmono=5, save='Experiments/MoS2-20180120-215158', seed=20, single_gpu=False, small_batch_size=12, tied=True, wdecay=1.2e-06, wdrop=0.5)
Model total parameters: 21500620


  result = self.forward(*input, **kwargs)


| epoch   1 |   200/ 1106 batches | lr 20.00 | ms/batch 160.52 | loss  6.96 | ppl  1050.87
| epoch   1 |   400/ 1106 batches | lr 20.00 | ms/batch 160.19 | loss  6.62 | ppl   750.65
| epoch   1 |   600/ 1106 batches | lr 20.00 | ms/batch 161.74 | loss  6.40 | ppl   599.30
| epoch   1 |   800/ 1106 batches | lr 20.00 | ms/batch 160.81 | loss  6.21 | ppl   496.24
| epoch   1 |  1000/ 1106 batches | lr 20.00 | ms/batch 165.30 | loss  6.01 | ppl   405.48
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 189.79s | valid loss  5.80 | valid ppl   330.08
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


Saving Normal!
| epoch   2 |   200/ 1106 batches | lr 20.00 | ms/batch 163.22 | loss  5.78 | ppl   323.54
| epoch   2 |   400/ 1106 batches | lr 20.00 | ms/batch 163.97 | loss  5.63 | ppl   278.61
| epoch   2 |   600/ 1106 batches | lr 20.00 | ms/batch 163.92 | loss  5.53 | ppl   251.76
| epoch   2 |   800/ 1106 batches | lr 20.00 | ms/batch 160.29 | loss  5.51 | ppl   246.62
| epoch   2 |  1000/ 1106 batches | lr 20.00 | ms/batch 160.58 | loss  5.46 | ppl   235.11
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 191.83s | valid loss  5.26 | valid ppl   193.30
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch   3 |   200/ 1106 batches | lr 20.00 | ms/batch 162.11 | loss  5.39 | ppl   220.29
| epoch   3 |   400/ 1106 batches | lr 20.00 | ms/batch 164.03 | loss  5.28 | ppl   197.11
| epoch   3 |   600/ 1106 batches | lr 20.00 | ms/batch 164.64 | loss  5.24 | 

| epoch  13 |   800/ 1106 batches | lr 20.00 | ms/batch 163.76 | loss  4.43 | ppl    83.70
| epoch  13 |  1000/ 1106 batches | lr 20.00 | ms/batch 163.81 | loss  4.45 | ppl    85.70
-----------------------------------------------------------------------------------------
| end of epoch  13 | time: 192.09s | valid loss  4.49 | valid ppl    88.75
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  14 |   200/ 1106 batches | lr 20.00 | ms/batch 162.43 | loss  4.47 | ppl    87.14
| epoch  14 |   400/ 1106 batches | lr 20.00 | ms/batch 162.96 | loss  4.37 | ppl    78.82
| epoch  14 |   600/ 1106 batches | lr 20.00 | ms/batch 160.76 | loss  4.37 | ppl    79.22
| epoch  14 |   800/ 1106 batches | lr 20.00 | ms/batch 162.68 | loss  4.40 | ppl    81.16
| epoch  14 |  1000/ 1106 batches | lr 20.00 | ms/batch 163.26 | loss  4.43 | ppl    83.77
-----------------------------------------------------------------------------------------
| e

Saving Normal!
| epoch  25 |   200/ 1106 batches | lr 20.00 | ms/batch 164.45 | loss  4.25 | ppl    69.96
| epoch  25 |   400/ 1106 batches | lr 20.00 | ms/batch 163.35 | loss  4.12 | ppl    61.80
| epoch  25 |   600/ 1106 batches | lr 20.00 | ms/batch 163.82 | loss  4.14 | ppl    62.98
| epoch  25 |   800/ 1106 batches | lr 20.00 | ms/batch 163.32 | loss  4.18 | ppl    65.05
| epoch  25 |  1000/ 1106 batches | lr 20.00 | ms/batch 162.04 | loss  4.21 | ppl    67.09
-----------------------------------------------------------------------------------------
| end of epoch  25 | time: 191.94s | valid loss  4.35 | valid ppl    77.74
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  26 |   200/ 1106 batches | lr 20.00 | ms/batch 163.54 | loss  4.22 | ppl    68.10
| epoch  26 |   400/ 1106 batches | lr 20.00 | ms/batch 164.86 | loss  4.12 | ppl    61.55
| epoch  26 |   600/ 1106 batches | lr 20.00 | ms/batch 163.44 | loss  4.14 | 

| epoch  36 |   800/ 1106 batches | lr 20.00 | ms/batch 161.76 | loss  4.05 | ppl    57.60
| epoch  36 |  1000/ 1106 batches | lr 20.00 | ms/batch 164.32 | loss  4.09 | ppl    59.46
-----------------------------------------------------------------------------------------
| end of epoch  36 | time: 192.16s | valid loss  4.30 | valid ppl    73.69
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  37 |   200/ 1106 batches | lr 20.00 | ms/batch 163.24 | loss  4.10 | ppl    60.57
| epoch  37 |   400/ 1106 batches | lr 20.00 | ms/batch 164.80 | loss  4.00 | ppl    54.79
| epoch  37 |   600/ 1106 batches | lr 20.00 | ms/batch 163.61 | loss  4.02 | ppl    55.66
| epoch  37 |   800/ 1106 batches | lr 20.00 | ms/batch 164.90 | loss  4.06 | ppl    57.82
| epoch  37 |  1000/ 1106 batches | lr 20.00 | ms/batch 162.97 | loss  4.08 | ppl    59.42
-----------------------------------------------------------------------------------------
| e

Saving Normal!
| epoch  48 |   200/ 1106 batches | lr 20.00 | ms/batch 161.61 | loss  4.04 | ppl    56.72
| epoch  48 |   400/ 1106 batches | lr 20.00 | ms/batch 163.23 | loss  3.93 | ppl    50.85
| epoch  48 |   600/ 1106 batches | lr 20.00 | ms/batch 163.03 | loss  3.94 | ppl    51.60
| epoch  48 |   800/ 1106 batches | lr 20.00 | ms/batch 162.39 | loss  3.97 | ppl    52.83
| epoch  48 |  1000/ 1106 batches | lr 20.00 | ms/batch 164.38 | loss  4.00 | ppl    54.74
-----------------------------------------------------------------------------------------
| end of epoch  48 | time: 192.20s | valid loss  4.26 | valid ppl    70.84
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  49 |   200/ 1106 batches | lr 20.00 | ms/batch 164.92 | loss  4.03 | ppl    56.07
| epoch  49 |   400/ 1106 batches | lr 20.00 | ms/batch 164.27 | loss  3.93 | ppl    50.71
| epoch  49 |   600/ 1106 batches | lr 20.00 | ms/batch 162.43 | loss  3.93 | 

| epoch  59 |   800/ 1106 batches | lr 20.00 | ms/batch 168.52 | loss  3.92 | ppl    50.45
| epoch  59 |  1000/ 1106 batches | lr 20.00 | ms/batch 165.30 | loss  3.96 | ppl    52.32
-----------------------------------------------------------------------------------------
| end of epoch  59 | time: 196.44s | valid loss  4.14 | valid ppl    62.65
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  60 |   200/ 1106 batches | lr 20.00 | ms/batch 166.16 | loss  3.99 | ppl    53.79
| epoch  60 |   400/ 1106 batches | lr 20.00 | ms/batch 163.90 | loss  3.88 | ppl    48.23
| epoch  60 |   600/ 1106 batches | lr 20.00 | ms/batch 166.64 | loss  3.89 | ppl    48.91
| epoch  60 |   800/ 1106 batches | lr 20.00 | ms/batch 165.61 | loss  3.92 | ppl    50.48
| epoch  60 |  1000/ 1106 batches | lr 20.00 | ms/batch 163.40 | loss  3.97 | ppl    52.98
-----------------------------------------------------------------------------------------
|

Saving Averaged!
| epoch  71 |   200/ 1106 batches | lr 20.00 | ms/batch 167.02 | loss  3.93 | ppl    51.04
| epoch  71 |   400/ 1106 batches | lr 20.00 | ms/batch 167.21 | loss  3.84 | ppl    46.40
| epoch  71 |   600/ 1106 batches | lr 20.00 | ms/batch 165.83 | loss  3.85 | ppl    47.10
| epoch  71 |   800/ 1106 batches | lr 20.00 | ms/batch 167.95 | loss  3.89 | ppl    48.72
| epoch  71 |  1000/ 1106 batches | lr 20.00 | ms/batch 165.92 | loss  3.90 | ppl    49.32
-----------------------------------------------------------------------------------------
| end of epoch  71 | time: 196.23s | valid loss  4.12 | valid ppl    61.74
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  72 |   200/ 1106 batches | lr 20.00 | ms/batch 168.79 | loss  3.93 | ppl    51.09
| epoch  72 |   400/ 1106 batches | lr 20.00 | ms/batch 165.84 | loss  3.82 | ppl    45.72
| epoch  72 |   600/ 1106 batches | lr 20.00 | ms/batch 168.32 | loss  3.8

In [12]:
args = new_params("--data data/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 20 --batch_size 12\
    --max_seq_len_delta 15 --lr 20.0 --epoch 150 --nhid 1010 --nhidlast 650 --emsize 280 --n_experts 5 --mos --save Experiments/PTB-MoC-Experts5")
train_data = batchify(corpus.train, args.batch_size, args)

model, parallel_model = new_model(args, corpus)
SGD(model, parallel_model, args, corpus)

Experiment dir : Experiments/PTB-MoC-Experts5-20180124-164354
torch.Size([77465, 12])
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Model param size: 21434410
Args: Namespace(alpha=2, batch_size=12, beta=1, bptt=70, clip=0.25, continue_train=False, cuda=True, data='data/penn', dropout=0.4, dropoute=0.1, dropouth=0.225, dropouti=0.4, dropoutl=0.29, emsize=280, epochs=150, log_interval=200, lr=20.0, max_seq_len_delta=15, moc=False, model='LSTM', mos=True, n_experts=5, nhid=1010, nhidlast=650, nlayers=3, nonmono=5, save='Experiments/PTB-MoC-Experts5-20180124-164354', seed=20, single_gpu=False, small_batch_size=12, tied=True, wdecay=1.2e-06, wdrop=0.5)
Model total parameters: 21434410


  result = self.forward(*input, **kwargs)


| epoch   1 |   200/ 1106 batches | lr 20.00 | ms/batch 100.77 | loss  7.42 | ppl  1666.25
| epoch   1 |   400/ 1106 batches | lr 20.00 | ms/batch 100.18 | loss  6.81 | ppl   907.43
| epoch   1 |   600/ 1106 batches | lr 20.00 | ms/batch 100.32 | loss  6.53 | ppl   684.23
| epoch   1 |   800/ 1106 batches | lr 20.00 | ms/batch 100.08 | loss  6.51 | ppl   670.00
| epoch   1 |  1000/ 1106 batches | lr 20.00 | ms/batch 102.05 | loss  6.41 | ppl   606.00
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 118.68s | valid loss  6.38 | valid ppl   589.99
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


Saving Normal!
| epoch   2 |   200/ 1106 batches | lr 20.00 | ms/batch 113.34 | loss  6.29 | ppl   541.21
| epoch   2 |   400/ 1106 batches | lr 20.00 | ms/batch 101.80 | loss  6.11 | ppl   449.05
| epoch   2 |   600/ 1106 batches | lr 20.00 | ms/batch 101.47 | loss  5.96 | ppl   387.49
| epoch   2 |   800/ 1106 batches | lr 20.00 | ms/batch 99.62 | loss  5.90 | ppl   363.52
| epoch   2 |  1000/ 1106 batches | lr 20.00 | ms/batch 99.64 | loss  5.82 | ppl   338.08
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 122.10s | valid loss  5.61 | valid ppl   272.43
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch   3 |   200/ 1106 batches | lr 20.00 | ms/batch 100.41 | loss  5.72 | ppl   304.55
| epoch   3 |   400/ 1106 batches | lr 20.00 | ms/batch 101.98 | loss  5.59 | ppl   268.05
| epoch   3 |   600/ 1106 batches | lr 20.00 | ms/batch 102.37 | loss  5.50 | pp

| epoch  13 |   800/ 1106 batches | lr 20.00 | ms/batch 102.82 | loss  4.54 | ppl    93.31
| epoch  13 |  1000/ 1106 batches | lr 20.00 | ms/batch 102.32 | loss  4.55 | ppl    95.10
-----------------------------------------------------------------------------------------
| end of epoch  13 | time: 121.09s | valid loss  4.61 | valid ppl   100.41
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  14 |   200/ 1106 batches | lr 20.00 | ms/batch 101.36 | loss  4.57 | ppl    96.22
| epoch  14 |   400/ 1106 batches | lr 20.00 | ms/batch 102.26 | loss  4.46 | ppl    86.62
| epoch  14 |   600/ 1106 batches | lr 20.00 | ms/batch 100.59 | loss  4.46 | ppl    86.52
| epoch  14 |   800/ 1106 batches | lr 20.00 | ms/batch 102.32 | loss  4.50 | ppl    89.60
| epoch  14 |  1000/ 1106 batches | lr 20.00 | ms/batch 101.75 | loss  4.53 | ppl    93.06
-----------------------------------------------------------------------------------------
| e

Saving Normal!
| epoch  25 |   200/ 1106 batches | lr 20.00 | ms/batch 102.47 | loss  4.32 | ppl    74.82
| epoch  25 |   400/ 1106 batches | lr 20.00 | ms/batch 102.57 | loss  4.20 | ppl    66.67
| epoch  25 |   600/ 1106 batches | lr 20.00 | ms/batch 102.21 | loss  4.21 | ppl    67.49
| epoch  25 |   800/ 1106 batches | lr 20.00 | ms/batch 103.36 | loss  4.25 | ppl    69.96
| epoch  25 |  1000/ 1106 batches | lr 20.00 | ms/batch 102.42 | loss  4.26 | ppl    71.09
-----------------------------------------------------------------------------------------
| end of epoch  25 | time: 121.05s | valid loss  4.42 | valid ppl    82.78
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  26 |   200/ 1106 batches | lr 20.00 | ms/batch 102.61 | loss  4.30 | ppl    73.43
| epoch  26 |   400/ 1106 batches | lr 20.00 | ms/batch 103.41 | loss  4.18 | ppl    65.16
| epoch  26 |   600/ 1106 batches | lr 20.00 | ms/batch 102.53 | loss  4.20 | 

| epoch  36 |   800/ 1106 batches | lr 20.00 | ms/batch 101.76 | loss  4.11 | ppl    60.71
| epoch  36 |  1000/ 1106 batches | lr 20.00 | ms/batch 105.33 | loss  4.14 | ppl    63.04
-----------------------------------------------------------------------------------------
| end of epoch  36 | time: 121.45s | valid loss  4.35 | valid ppl    77.71
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  37 |   200/ 1106 batches | lr 20.00 | ms/batch 102.03 | loss  4.16 | ppl    64.33
| epoch  37 |   400/ 1106 batches | lr 20.00 | ms/batch 102.87 | loss  4.06 | ppl    58.16
| epoch  37 |   600/ 1106 batches | lr 20.00 | ms/batch 102.06 | loss  4.07 | ppl    58.30
| epoch  37 |   800/ 1106 batches | lr 20.00 | ms/batch 103.63 | loss  4.10 | ppl    60.59
| epoch  37 |  1000/ 1106 batches | lr 20.00 | ms/batch 101.71 | loss  4.13 | ppl    62.24
-----------------------------------------------------------------------------------------
| e

| epoch  48 |   200/ 1106 batches | lr 20.00 | ms/batch 101.98 | loss  4.09 | ppl    59.68
| epoch  48 |   400/ 1106 batches | lr 20.00 | ms/batch 102.01 | loss  3.97 | ppl    52.75
| epoch  48 |   600/ 1106 batches | lr 20.00 | ms/batch 101.96 | loss  3.99 | ppl    54.05
| epoch  48 |   800/ 1106 batches | lr 20.00 | ms/batch 101.95 | loss  4.02 | ppl    55.47
| epoch  48 |  1000/ 1106 batches | lr 20.00 | ms/batch 103.08 | loss  4.04 | ppl    56.97
-----------------------------------------------------------------------------------------
| end of epoch  48 | time: 121.14s | valid loss  4.32 | valid ppl    74.91
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  49 |   200/ 1106 batches | lr 20.00 | ms/batch 102.73 | loss  4.08 | ppl    58.88
| epoch  49 |   400/ 1106 batches | lr 20.00 | ms/batch 102.71 | loss  3.98 | ppl    53.26
| epoch  49 |   600/ 1106 batches | lr 20.00 | ms/batch 103.45 | loss  3.99 | ppl    54.02
| 

| epoch  59 |   800/ 1106 batches | lr 20.00 | ms/batch 106.91 | loss  3.96 | ppl    52.24
| epoch  59 |  1000/ 1106 batches | lr 20.00 | ms/batch 105.39 | loss  3.99 | ppl    54.22
-----------------------------------------------------------------------------------------
| end of epoch  59 | time: 125.44s | valid loss  4.15 | valid ppl    63.69
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  60 |   200/ 1106 batches | lr 20.00 | ms/batch 105.30 | loss  4.03 | ppl    56.33
| epoch  60 |   400/ 1106 batches | lr 20.00 | ms/batch 104.68 | loss  3.93 | ppl    50.76
| epoch  60 |   600/ 1106 batches | lr 20.00 | ms/batch 105.56 | loss  3.95 | ppl    51.89
| epoch  60 |   800/ 1106 batches | lr 20.00 | ms/batch 104.76 | loss  3.96 | ppl    52.30
| epoch  60 |  1000/ 1106 batches | lr 20.00 | ms/batch 103.58 | loss  4.01 | ppl    54.95
-----------------------------------------------------------------------------------------
|

Saving Averaged!
| epoch  71 |   200/ 1106 batches | lr 20.00 | ms/batch 106.14 | loss  3.98 | ppl    53.61
| epoch  71 |   400/ 1106 batches | lr 20.00 | ms/batch 106.19 | loss  3.88 | ppl    48.40
| epoch  71 |   600/ 1106 batches | lr 20.00 | ms/batch 105.50 | loss  3.89 | ppl    49.00
| epoch  71 |   800/ 1106 batches | lr 20.00 | ms/batch 106.81 | loss  3.93 | ppl    50.77
| epoch  71 |  1000/ 1106 batches | lr 20.00 | ms/batch 106.52 | loss  3.96 | ppl    52.35
-----------------------------------------------------------------------------------------
| end of epoch  71 | time: 125.54s | valid loss  4.14 | valid ppl    62.89
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  72 |   200/ 1106 batches | lr 20.00 | ms/batch 107.55 | loss  3.99 | ppl    53.81
| epoch  72 |   400/ 1106 batches | lr 20.00 | ms/batch 106.23 | loss  3.88 | ppl    48.38
| epoch  72 |   600/ 1106 batches | lr 20.00 | ms/batch 107.37 | loss  3.8

| epoch  82 |   600/ 1106 batches | lr 20.00 | ms/batch 107.11 | loss  3.86 | ppl    47.58
| epoch  82 |   800/ 1106 batches | lr 20.00 | ms/batch 105.65 | loss  3.89 | ppl    48.73
| epoch  82 |  1000/ 1106 batches | lr 20.00 | ms/batch 107.02 | loss  3.92 | ppl    50.57
-----------------------------------------------------------------------------------------
| end of epoch  82 | time: 125.26s | valid loss  4.13 | valid ppl    62.39
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  83 |   200/ 1106 batches | lr 20.00 | ms/batch 107.06 | loss  3.95 | ppl    52.02
| epoch  83 |   400/ 1106 batches | lr 20.00 | ms/batch 106.22 | loss  3.86 | ppl    47.23
| epoch  83 |   600/ 1106 batches | lr 20.00 | ms/batch 104.75 | loss  3.86 | ppl    47.64
| epoch  83 |   800/ 1106 batches | lr 20.00 | ms/batch 106.09 | loss  3.88 | ppl    48.60
| epoch  83 |  1000/ 1106 batches | lr 20.00 | ms/batch 104.76 | loss  3.92 | ppl    50.19


-----------------------------------------------------------------------------------------
| end of epoch  93 | time: 125.13s | valid loss  4.13 | valid ppl    62.00
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  94 |   200/ 1106 batches | lr 20.00 | ms/batch 105.61 | loss  3.92 | ppl    50.40
| epoch  94 |   400/ 1106 batches | lr 20.00 | ms/batch 104.65 | loss  3.82 | ppl    45.57
| epoch  94 |   600/ 1106 batches | lr 20.00 | ms/batch 106.46 | loss  3.84 | ppl    46.61
| epoch  94 |   800/ 1106 batches | lr 20.00 | ms/batch 105.45 | loss  3.85 | ppl    47.05
| epoch  94 |  1000/ 1106 batches | lr 20.00 | ms/batch 107.43 | loss  3.88 | ppl    48.66
-----------------------------------------------------------------------------------------
| end of epoch  94 | time: 125.43s | valid loss  4.13 | valid ppl    61.97
-----------------------------------------------------------------------------------------
Saving Averaged!
|

Saving Averaged!
| epoch 105 |   200/ 1106 batches | lr 20.00 | ms/batch 105.75 | loss  3.90 | ppl    49.52
| epoch 105 |   400/ 1106 batches | lr 20.00 | ms/batch 106.15 | loss  3.78 | ppl    44.03
| epoch 105 |   600/ 1106 batches | lr 20.00 | ms/batch 106.12 | loss  3.80 | ppl    44.73
| epoch 105 |   800/ 1106 batches | lr 20.00 | ms/batch 105.82 | loss  3.83 | ppl    46.14
| epoch 105 |  1000/ 1106 batches | lr 20.00 | ms/batch 104.33 | loss  3.87 | ppl    48.05
-----------------------------------------------------------------------------------------
| end of epoch 105 | time: 125.25s | valid loss  4.12 | valid ppl    61.68
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch 106 |   200/ 1106 batches | lr 20.00 | ms/batch 105.09 | loss  3.90 | ppl    49.39
| epoch 106 |   400/ 1106 batches | lr 20.00 | ms/batch 105.53 | loss  3.79 | ppl    44.46
| epoch 106 |   600/ 1106 batches | lr 20.00 | ms/batch 105.41 | loss  3.8

| epoch 116 |   600/ 1106 batches | lr 20.00 | ms/batch 105.03 | loss  3.79 | ppl    44.07
| epoch 116 |   800/ 1106 batches | lr 20.00 | ms/batch 105.58 | loss  3.81 | ppl    45.19
| epoch 116 |  1000/ 1106 batches | lr 20.00 | ms/batch 104.64 | loss  3.84 | ppl    46.48
-----------------------------------------------------------------------------------------
| end of epoch 116 | time: 125.02s | valid loss  4.12 | valid ppl    61.43
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch 117 |   200/ 1106 batches | lr 20.00 | ms/batch 105.98 | loss  3.87 | ppl    47.95
| epoch 117 |   400/ 1106 batches | lr 20.00 | ms/batch 105.61 | loss  3.78 | ppl    43.69
| epoch 117 |   600/ 1106 batches | lr 20.00 | ms/batch 104.24 | loss  3.79 | ppl    44.12
| epoch 117 |   800/ 1106 batches | lr 20.00 | ms/batch 104.89 | loss  3.80 | ppl    44.55
| epoch 117 |  1000/ 1106 batches | lr 20.00 | ms/batch 105.95 | loss  3.85 | ppl    47.08


-----------------------------------------------------------------------------------------
| end of epoch 127 | time: 124.93s | valid loss  4.11 | valid ppl    61.19
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch 128 |   200/ 1106 batches | lr 20.00 | ms/batch 105.20 | loss  3.85 | ppl    47.09
| epoch 128 |   400/ 1106 batches | lr 20.00 | ms/batch 105.03 | loss  3.75 | ppl    42.42
| epoch 128 |   600/ 1106 batches | lr 20.00 | ms/batch 107.38 | loss  3.76 | ppl    42.77
| epoch 128 |   800/ 1106 batches | lr 20.00 | ms/batch 106.81 | loss  3.79 | ppl    44.47
| epoch 128 |  1000/ 1106 batches | lr 20.00 | ms/batch 107.96 | loss  3.83 | ppl    46.23
-----------------------------------------------------------------------------------------
| end of epoch 128 | time: 125.79s | valid loss  4.11 | valid ppl    61.17
-----------------------------------------------------------------------------------------
Saving Averaged!
|

Saving Averaged!
| epoch 139 |   200/ 1106 batches | lr 20.00 | ms/batch 107.14 | loss  3.84 | ppl    46.72
| epoch 139 |   400/ 1106 batches | lr 20.00 | ms/batch 105.64 | loss  3.72 | ppl    41.38
| epoch 139 |   600/ 1106 batches | lr 20.00 | ms/batch 104.93 | loss  3.75 | ppl    42.70
| epoch 139 |   800/ 1106 batches | lr 20.00 | ms/batch 106.74 | loss  3.78 | ppl    43.64
| epoch 139 |  1000/ 1106 batches | lr 20.00 | ms/batch 104.82 | loss  3.81 | ppl    44.97
-----------------------------------------------------------------------------------------
| end of epoch 139 | time: 126.24s | valid loss  4.11 | valid ppl    60.96
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch 140 |   200/ 1106 batches | lr 20.00 | ms/batch 120.82 | loss  3.85 | ppl    47.20
| epoch 140 |   400/ 1106 batches | lr 20.00 | ms/batch 105.72 | loss  3.72 | ppl    41.46
| epoch 140 |   600/ 1106 batches | lr 20.00 | ms/batch 105.47 | loss  3.7

| epoch 150 |   600/ 1106 batches | lr 20.00 | ms/batch 105.83 | loss  3.74 | ppl    42.08
| epoch 150 |   800/ 1106 batches | lr 20.00 | ms/batch 105.91 | loss  3.77 | ppl    43.30
| epoch 150 |  1000/ 1106 batches | lr 20.00 | ms/batch 104.83 | loss  3.81 | ppl    45.04
-----------------------------------------------------------------------------------------
| end of epoch 150 | time: 127.73s | valid loss  4.11 | valid ppl    60.78
-----------------------------------------------------------------------------------------
Saving Averaged!
| End of training | test loss  4.07 | test ppl    58.77


In [12]:
args = new_params("--data data/penn --dropouti 0.4 --dropoutl 0.29 --dropouth 0.225 --seed 20 --batch_size 12\
    --max_seq_len_delta 40 --lr 20.0 --epoch 150 --nhid 350 --nhidlast 100 --emsize 1200 --n_experts 15 --save PTB/3000Emb")
train_data = batchify(corpus.train, args.batch_size, args)

model, parallel_model = new_model(args, corpus)
SGD(model, parallel_model, args, corpus)

Experiment dir : PTB/3000Emb-20180130-000512
torch.Size([77465, 12])
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Model param size: 22615200
Args: Namespace(alpha=2, batch_size=12, beta=1, bptt=70, clip=0.25, continue_train=False, cuda=True, data='data/penn', dropout=0.4, dropoute=0.1, dropouth=0.225, dropouti=0.4, dropoutl=0.29, emsize=1200, epochs=150, log_interval=200, lr=20.0, max_seq_len_delta=40, moc=False, model='LSTM', mos=False, n_experts=15, nhid=350, nhidlast=1200, nlayers=3, nonmono=5, save='PTB/3000Emb-20180130-000512', seed=20, single_gpu=False, small_batch_size=12, tied=True, wdecay=1.2e-06, wdrop=0.5)
Model total parameters: 22615200


  result = self.forward(*input, **kwargs)


| epoch   1 |   200/ 1106 batches | lr 20.00 | ms/batch 88.04 | loss  7.27 | ppl  1443.30
| epoch   1 |   400/ 1106 batches | lr 20.00 | ms/batch 73.98 | loss  6.54 | ppl   695.53
| epoch   1 |   600/ 1106 batches | lr 20.00 | ms/batch 73.55 | loss  6.21 | ppl   499.68
| epoch   1 |   800/ 1106 batches | lr 20.00 | ms/batch 74.98 | loss  6.05 | ppl   423.71
| epoch   1 |  1000/ 1106 batches | lr 20.00 | ms/batch 73.92 | loss  5.90 | ppl   366.38
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 89.65s | valid loss  5.62 | valid ppl   274.69
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


Saving Normal!
| epoch   2 |   200/ 1106 batches | lr 20.00 | ms/batch 72.37 | loss  5.73 | ppl   307.26
| epoch   2 |   400/ 1106 batches | lr 20.00 | ms/batch 73.28 | loss  5.59 | ppl   268.74
| epoch   2 |   600/ 1106 batches | lr 20.00 | ms/batch 76.52 | loss  5.49 | ppl   241.94
| epoch   2 |   800/ 1106 batches | lr 20.00 | ms/batch 73.07 | loss  5.48 | ppl   240.26
| epoch   2 |  1000/ 1106 batches | lr 20.00 | ms/batch 73.32 | loss  5.44 | ppl   229.89
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 86.94s | valid loss  5.20 | valid ppl   180.94
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch   3 |   200/ 1106 batches | lr 20.00 | ms/batch 75.91 | loss  5.39 | ppl   219.24
| epoch   3 |   400/ 1106 batches | lr 20.00 | ms/batch 76.25 | loss  5.28 | ppl   195.61
| epoch   3 |   600/ 1106 batches | lr 20.00 | ms/batch 75.21 | loss  5.23 | ppl   187

| epoch  13 |   800/ 1106 batches | lr 20.00 | ms/batch 71.78 | loss  4.46 | ppl    86.61
| epoch  13 |  1000/ 1106 batches | lr 20.00 | ms/batch 75.88 | loss  4.47 | ppl    87.70
-----------------------------------------------------------------------------------------
| end of epoch  13 | time: 89.80s | valid loss  4.46 | valid ppl    86.33
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  14 |   200/ 1106 batches | lr 20.00 | ms/batch 76.48 | loss  4.50 | ppl    90.36
| epoch  14 |   400/ 1106 batches | lr 20.00 | ms/batch 73.45 | loss  4.38 | ppl    80.01
| epoch  14 |   600/ 1106 batches | lr 20.00 | ms/batch 71.53 | loss  4.37 | ppl    79.17
| epoch  14 |   800/ 1106 batches | lr 20.00 | ms/batch 73.72 | loss  4.42 | ppl    83.28
| epoch  14 |  1000/ 1106 batches | lr 20.00 | ms/batch 76.19 | loss  4.44 | ppl    85.00
-----------------------------------------------------------------------------------------
| end of ep

Saving Normal!
| epoch  25 |   200/ 1106 batches | lr 20.00 | ms/batch 76.00 | loss  4.23 | ppl    68.61
| epoch  25 |   400/ 1106 batches | lr 20.00 | ms/batch 75.49 | loss  4.10 | ppl    60.55
| epoch  25 |   600/ 1106 batches | lr 20.00 | ms/batch 76.28 | loss  4.11 | ppl    61.19
| epoch  25 |   800/ 1106 batches | lr 20.00 | ms/batch 75.90 | loss  4.16 | ppl    63.75
| epoch  25 |  1000/ 1106 batches | lr 20.00 | ms/batch 74.38 | loss  4.18 | ppl    65.27
-----------------------------------------------------------------------------------------
| end of epoch  25 | time: 88.77s | valid loss  4.36 | valid ppl    77.88
-----------------------------------------------------------------------------------------
Saving Normal!
| epoch  26 |   200/ 1106 batches | lr 20.00 | ms/batch 73.23 | loss  4.21 | ppl    67.34
| epoch  26 |   400/ 1106 batches | lr 20.00 | ms/batch 74.45 | loss  4.10 | ppl    60.56
| epoch  26 |   600/ 1106 batches | lr 20.00 | ms/batch 75.46 | loss  4.11 | ppl    60

| epoch  36 |  1000/ 1106 batches | lr 20.00 | ms/batch 74.59 | loss  4.04 | ppl    56.93
-----------------------------------------------------------------------------------------
| end of epoch  36 | time: 88.70s | valid loss  4.33 | valid ppl    76.27
-----------------------------------------------------------------------------------------
| epoch  37 |   200/ 1106 batches | lr 20.00 | ms/batch 76.30 | loss  4.06 | ppl    58.05
| epoch  37 |   400/ 1106 batches | lr 20.00 | ms/batch 76.84 | loss  3.97 | ppl    52.76
| epoch  37 |   600/ 1106 batches | lr 20.00 | ms/batch 77.17 | loss  3.97 | ppl    52.80
| epoch  37 |   800/ 1106 batches | lr 20.00 | ms/batch 72.93 | loss  4.01 | ppl    54.97
| epoch  37 |  1000/ 1106 batches | lr 20.00 | ms/batch 75.50 | loss  4.04 | ppl    56.64
-----------------------------------------------------------------------------------------
| end of epoch  37 | time: 88.69s | valid loss  4.34 | valid ppl    76.46
------------------------------------------

Saving Averaged!
| epoch  48 |   200/ 1106 batches | lr 20.00 | ms/batch 76.51 | loss  3.97 | ppl    53.20
| epoch  48 |   400/ 1106 batches | lr 20.00 | ms/batch 76.25 | loss  3.86 | ppl    47.57
| epoch  48 |   600/ 1106 batches | lr 20.00 | ms/batch 77.15 | loss  3.87 | ppl    47.77
| epoch  48 |   800/ 1106 batches | lr 20.00 | ms/batch 77.06 | loss  3.92 | ppl    50.26
| epoch  48 |  1000/ 1106 batches | lr 20.00 | ms/batch 76.48 | loss  3.94 | ppl    51.47
-----------------------------------------------------------------------------------------
| end of epoch  48 | time: 90.10s | valid loss  4.24 | valid ppl    69.39
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  49 |   200/ 1106 batches | lr 20.00 | ms/batch 76.91 | loss  3.96 | ppl    52.59
| epoch  49 |   400/ 1106 batches | lr 20.00 | ms/batch 78.69 | loss  3.86 | ppl    47.48
| epoch  49 |   600/ 1106 batches | lr 20.00 | ms/batch 77.28 | loss  3.87 | ppl  

| epoch  59 |   800/ 1106 batches | lr 20.00 | ms/batch 79.66 | loss  3.84 | ppl    46.59
| epoch  59 |  1000/ 1106 batches | lr 20.00 | ms/batch 75.88 | loss  3.87 | ppl    47.96
-----------------------------------------------------------------------------------------
| end of epoch  59 | time: 91.49s | valid loss  4.23 | valid ppl    68.75
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  60 |   200/ 1106 batches | lr 20.00 | ms/batch 78.85 | loss  3.89 | ppl    48.91
| epoch  60 |   400/ 1106 batches | lr 20.00 | ms/batch 79.50 | loss  3.81 | ppl    44.94
| epoch  60 |   600/ 1106 batches | lr 20.00 | ms/batch 78.84 | loss  3.81 | ppl    45.35
| epoch  60 |   800/ 1106 batches | lr 20.00 | ms/batch 76.97 | loss  3.82 | ppl    45.82
| epoch  60 |  1000/ 1106 batches | lr 20.00 | ms/batch 74.52 | loss  3.88 | ppl    48.46
-----------------------------------------------------------------------------------------
| end of 

Saving Averaged!
| epoch  71 |   200/ 1106 batches | lr 20.00 | ms/batch 77.63 | loss  3.85 | ppl    47.13
| epoch  71 |   400/ 1106 batches | lr 20.00 | ms/batch 77.34 | loss  3.74 | ppl    42.24
| epoch  71 |   600/ 1106 batches | lr 20.00 | ms/batch 77.91 | loss  3.77 | ppl    43.29
| epoch  71 |   800/ 1106 batches | lr 20.00 | ms/batch 78.07 | loss  3.79 | ppl    44.08
| epoch  71 |  1000/ 1106 batches | lr 20.00 | ms/batch 74.99 | loss  3.82 | ppl    45.74
-----------------------------------------------------------------------------------------
| end of epoch  71 | time: 91.00s | valid loss  4.22 | valid ppl    68.34
-----------------------------------------------------------------------------------------
Saving Averaged!
| epoch  72 |   200/ 1106 batches | lr 20.00 | ms/batch 78.92 | loss  3.85 | ppl    46.88
| epoch  72 |   400/ 1106 batches | lr 20.00 | ms/batch 78.15 | loss  3.75 | ppl    42.45
| epoch  72 |   600/ 1106 batches | lr 20.00 | ms/batch 77.87 | loss  3.76 | ppl  

In [None]:
# MoS
#args = new_params("--data data/wikitext-2 --save Experiments/WT2MoS --dropouth 0.2 --seed 1882 --mos --n_experts 15 --nhid 900 --nhidlast 550 --emsize 300 --batch_size 15 --lr 15.0 --dropoutl 0.29 --small_batch_size 3 --max_seq_len_delta 5 --dropouti 0.55 --single_gpu")
#train_data = batchify(corpus.train, args.batch_size, args)

#model, parallel_model = new_model(args, corpus)
#SGD(model, parallel_model, args, corpus)

# MoC
args = new_params("--data data/wikitext-2 --save Experiments/WT2Softmax --dropouth 0.2 --seed 1882 --nhid 870 --nhidlast 800 --emsize 400 --batch_size 15 --lr 15.0 --dropoutl 0.29 --small_batch_size 3 --max_seq_len_delta 5 --dropouti 0.55 --single_gpu")
train_data = batchify(corpus.train, args.batch_size, args)

model, parallel_model = new_model(args, corpus)
SGD(model, parallel_model, args, corpus)

Experiment dir : Experiments/WT2Softmax-20180205-002859
torch.Size([139241, 15])
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Model param size: 25868398
Args: Namespace(alpha=2, batch_size=15, beta=1, bptt=70, clip=0.25, continue_train=False, cuda=True, data='data/wikitext-2', dropout=0.4, dropoute=0.1, dropouth=0.2, dropouti=0.55, dropoutl=0.29, emsize=400, epochs=8000, log_interval=200, lr=15.0, max_seq_len_delta=5, moc=False, model='LSTM', mos=False, n_experts=10, nhid=870, nhidlast=400, nlayers=3, nonmono=5, save='Experiments/WT2Softmax-20180205-002859', seed=1882, single_gpu=True, small_batch_size=3, tied=True, wdecay=1.2e-06, wdrop=0.5)
Model total parameters: 25868398


  result = self.forward(*input, **kwargs)


| epoch   1 |   200/ 1989 batches | lr 15.00 | ms/batch 271.06 | loss  1.56 | ppl     4.74
| epoch   1 |   400/ 1989 batches | lr 15.00 | ms/batch 263.16 | loss  1.42 | ppl     4.15
| epoch   1 |   600/ 1989 batches | lr 15.00 | ms/batch 267.65 | loss  1.37 | ppl     3.94
| epoch   1 |   800/ 1989 batches | lr 15.00 | ms/batch 297.09 | loss  1.33 | ppl     3.78
| epoch   1 |  1000/ 1989 batches | lr 15.00 | ms/batch 305.71 | loss  1.30 | ppl     3.68
| epoch   1 |  1200/ 1989 batches | lr 15.00 | ms/batch 304.38 | loss  1.29 | ppl     3.63
| epoch   1 |  1400/ 1989 batches | lr 15.00 | ms/batch 301.25 | loss  1.27 | ppl     3.54
| epoch   1 |  1600/ 1989 batches | lr 15.00 | ms/batch 304.00 | loss  1.26 | ppl     3.51
| epoch   1 |  1800/ 1989 batches | lr 15.00 | ms/batch 300.61 | loss  1.25 | ppl     3.47
| epoch   1 |  2000/ 1989 batches | lr 15.00 | ms/batch 305.30 | loss  1.23 | ppl     3.44
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


Saving Normal!
| epoch   2 |   200/ 1989 batches | lr 15.00 | ms/batch 309.07 | loss  1.22 | ppl     3.40
| epoch   2 |   400/ 1989 batches | lr 15.00 | ms/batch 309.65 | loss  1.21 | ppl     3.36
| epoch   2 |   600/ 1989 batches | lr 15.00 | ms/batch 303.11 | loss  1.19 | ppl     3.30
| epoch   2 |   800/ 1989 batches | lr 15.00 | ms/batch 301.44 | loss  1.19 | ppl     3.29
| epoch   2 |  1000/ 1989 batches | lr 15.00 | ms/batch 301.30 | loss  1.18 | ppl     3.24
| epoch   2 |  1200/ 1989 batches | lr 15.00 | ms/batch 301.12 | loss  1.17 | ppl     3.23
| epoch   2 |  1400/ 1989 batches | lr 15.00 | ms/batch 302.27 | loss  1.16 | ppl     3.18
| epoch   2 |  1600/ 1989 batches | lr 15.00 | ms/batch 301.67 | loss  1.16 | ppl     3.19
| epoch   2 |  1800/ 1989 batches | lr 15.00 | ms/batch 302.27 | loss  1.16 | ppl     3.18
| epoch   2 |  2000/ 1989 batches | lr 15.00 | ms/batch 302.51 | loss  1.15 | ppl     3.16
---------------------------------------------------------------------------

In [18]:
h = unserialize('Experiments/PTb-20180204-235512', 'history')
print(h['train_errs'])


[6.41199541676867]
