In [1]:
import torch
import math
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time

import data

In [2]:
def ngram_split(dataset, n):
    # This function breaks corpus into [context, target]
    # For e.g., in trigram, the tensor returned would be [C(n-2), C(n-1), T]
    ngram = []
    data_len = len(dataset)
    for i, tokenid in enumerate(dataset):
        if i+n<data_len:
            ngram.append(dataset[i:i+n+1].view(-1))
    fin_ngram=torch.stack(ngram)
    return fin_ngram

def get_accuracy_from_log_probs(log_probs, labels):
    probs = torch.exp(log_probs)
    predicted_label = torch.argmax(probs, dim=1)
    acc = (predicted_label == labels).float().mean()
    return acc

# helper function to evaluate model on dev data
def evaluate(model, criterion, dataloader, gpu):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(dataloader):
            context_tensor = data_tensor[:,0:CONTEXT_SIZE]
            target_tensor = data_tensor[:,CONTEXT_SIZE]
            context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0: 
                print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    return mean_acc / count, mean_loss / count

In [3]:

# N-gram Neural Network Model
class NgramNNmodel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, h):
        super(NgramNNmodel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        self.linear2 = nn.Linear(h, vocab_size, bias = False)

    def forward(self, inputs):
        # compute x': concatenation of x1 and x2 embeddings
        embeds = self.embeddings(inputs).view((-1,self.context_size * self.embedding_dim))
        # compute h: tanh(W_1.x' + b)
        out = torch.tanh(self.linear1(embeds))
        # compute W_2.h
        out = self.linear2(out)
        # compute y: log_softmax(W_2.h)
        log_probs = F.log_softmax(out, dim=1)
        # return log probabilities
        # BATCH_SIZE x len(vocab)
        return log_probs

In [4]:
# create parameters
gpu = 0 
# word vectors size
EMBEDDING_DIM = 200
CONTEXT_SIZE = 7
BATCH_SIZE = 512
# hidden units
H = 100
torch.manual_seed(42)
learn_rate = 1e-3

# check if gpu is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [5]:
data_source = './data/wikitext-2'
corpus = data.Corpus(data_source)

In [6]:
train_set = corpus.train
val_set = corpus.valid
test_set = corpus.test

In [7]:
train_ngram = ngram_split(train_set, CONTEXT_SIZE)
val_ngram = ngram_split(val_set, CONTEXT_SIZE)
test_ngram = ngram_split(test_set, CONTEXT_SIZE)

In [8]:
train_loader = DataLoader(train_ngram, batch_size = BATCH_SIZE)
dev_loader = DataLoader(val_ngram, batch_size = BATCH_SIZE)
test_loader = DataLoader(train_ngram, batch_size = BATCH_SIZE)

In [9]:
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

vocab_len = len(corpus.dictionary)
# vocab_len = len(vocab)

# create model
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

# create model
model = NgramNNmodel(vocab_len, EMBEDDING_DIM, CONTEXT_SIZE, H)

# load it to gpu
model.cuda(gpu)

# using ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr = learn_rate)

In [10]:
# ------------------------- TRAIN & SAVE MODEL ------------------------
best_acc = 0
best_ppl = 999
best_model_path = None
for epoch in range(5):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    for it, data_tensor in enumerate(train_loader):
        context_tensor = data_tensor[:,0:CONTEXT_SIZE]
        target_tensor = data_tensor[:,CONTEXT_SIZE]
#         print(context_tensor)
#         print(target_tensor)
        context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)

        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0: 
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(it, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print("\n--- Evaluating model on dev data ---")
    dev_acc, dev_loss = evaluate(model, loss_function, dev_loader, gpu)
    ppl = math.exp(dev_loss)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}; Perplexity: {}".format(epoch, dev_acc, dev_loss, ppl))
#    if dev_acc > best_acc:
    if ppl < best_ppl:
        print("Best development accuracy improved from {} to {}, saving model...".format(best_ppl, ppl))
        best_ppl = ppl
        # set best model path
        best_model_path = 'best_model_{}_gram_{}.dat'.format(CONTEXT_SIZE+1, epoch)
        # saving best model
        torch.save(model.state_dict(), best_model_path)


--- Training model Epoch: 1 ---
Training Iteration 0 of epoch 0 complete. Loss: 10.441363334655762; Acc:0.0; Time taken (s): 0.6484103202819824
Training Iteration 500 of epoch 0 complete. Loss: 6.789704322814941; Acc:0.115234375; Time taken (s): 14.784630060195923
Training Iteration 1000 of epoch 0 complete. Loss: 6.175427436828613; Acc:0.158203125; Time taken (s): 14.68571662902832
Training Iteration 1500 of epoch 0 complete. Loss: 6.106536865234375; Acc:0.18359375; Time taken (s): 14.696378707885742
Training Iteration 2000 of epoch 0 complete. Loss: 5.956329822540283; Acc:0.15625; Time taken (s): 14.672945022583008
Training Iteration 2500 of epoch 0 complete. Loss: 6.462583541870117; Acc:0.19140625; Time taken (s): 14.712609052658081
Training Iteration 3000 of epoch 0 complete. Loss: 5.834971904754639; Acc:0.23046875; Time taken (s): 14.720713138580322
Training Iteration 3500 of epoch 0 complete. Loss: 6.225164413452148; Acc:0.193359375; Time taken (s): 14.688605546951294
Training I

### Test

In [11]:
model.load_state_dict(torch.load(best_model_path))

<All keys matched successfully>

In [12]:
test_acc, test_loss = evaluate(model, loss_function, test_loader, gpu)

Dev Iteration 0 complete. Mean Loss: 5.522627830505371; Mean Acc:0.1953125; Time taken (s): 0.01100301742553711
Dev Iteration 500 complete. Mean Loss: 5.399713248787764; Mean Acc:0.2041035145521164; Time taken (s): 5.135958909988403
Dev Iteration 1000 complete. Mean Loss: 5.437153098347423; Mean Acc:0.20221379399299622; Time taken (s): 4.940404176712036
Dev Iteration 1500 complete. Mean Loss: 5.417639294280599; Mean Acc:0.20305734872817993; Time taken (s): 4.879214763641357
Dev Iteration 2000 complete. Mean Loss: 5.396489654523858; Mean Acc:0.2040727734565735; Time taken (s): 4.904557466506958
Dev Iteration 2500 complete. Mean Loss: 5.367625521927154; Mean Acc:0.20471186935901642; Time taken (s): 4.849256277084351
Dev Iteration 3000 complete. Mean Loss: 5.342331157927115; Mean Acc:0.20545300841331482; Time taken (s): 4.96332311630249
Dev Iteration 3500 complete. Mean Loss: 5.310218870418476; Mean Acc:0.20649290084838867; Time taken (s): 5.685816526412964
Dev Iteration 4000 complete. Me

In [13]:
print("Test Perplexity:", math.exp(test_loss))

Test Perplexity: 190.15509964467603
