In [None]:
import torch
import math
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time

import data_fnn as data
import model

In [None]:
def ngram_split(orig_corpus, dataset, n):
    """Slice a token-id tensor into overlapping ``[context..., target]`` rows.

    Every row holds ``n + 1`` consecutive token ids: the first ``n`` are the
    context and the last one is the target. For a trigram model (``n == 2``)
    a row is ``[C(n-2), C(n-1), T]``. Windows whose *context* contains the
    ``<eos>`` id are dropped; the target itself may be ``<eos>``.

    Args:
        orig_corpus: object exposing ``dictionary.word2idx`` with an
            ``'<eos>'`` entry.
        dataset: tensor of token ids; it is flattened with ``view(-1)``.
        n: number of context tokens per row.

    Returns:
        Tensor of shape ``(num_windows, n + 1)`` with the dtype of
        ``dataset``. ``num_windows`` is 0 when no window survives.
    """
    # Look the id up on the corpus that was passed in, not on a notebook global.
    eos_id = orig_corpus.dictionary.word2idx['<eos>']

    tokens = dataset.view(-1)
    if tokens.numel() <= n:
        # Not enough tokens for even one window of length n + 1.
        return tokens.new_empty((0, n + 1))

    # All sliding windows of length n + 1 with stride 1, built without a Python loop.
    windows = tokens.unfold(0, n + 1, 1)
    # Only the context columns are inspected for <eos>.
    has_eos_in_context = (windows[:, :n] == eos_id).any(dim=1)
    # Boolean indexing returns a fresh tensor, independent of `dataset`.
    return windows[~has_eos_in_context]

def get_accuracy_from_log_probs(log_probs, labels):
    """Return the fraction of rows whose highest-scoring class equals its label.

    Args:
        log_probs: tensor of per-class log-probabilities with classes along
            dim 1 (one row per example).
        labels: tensor of gold class indices, one per row.

    Returns:
        0-dim float tensor holding the mean accuracy over the batch.
    """
    # argmax is taken on the log-probabilities directly: exp() is monotonic, so
    # it cannot change the winner, but it can underflow to 0 and create ties.
    predicted_label = torch.argmax(log_probs, dim=1)
    return (predicted_label == labels).float().mean()

# helper function to evaluate model on dev data
def evaluate(model, criterion, dataloader, gpu):
    """Compute accuracy and mean loss of ``model`` over all rows of ``dataloader``.

    Each batch is a 2-D tensor laid out like the rows built by ``ngram_split``:
    every column but the last is context, the last column is the target. The
    context width is therefore read from the batch itself.

    Both metrics are weighted by batch size, so a shorter final batch does not
    count as much as a full one. This assumes ``criterion`` returns the mean
    loss of the batch (the default for ``nn.NLLLoss``).

    The model is switched to eval mode for the duration of the call and its
    previous training/eval mode is restored afterwards.

    Args:
        model: ``nn.Module`` mapping a context batch to log-probabilities
            with classes along dim 1.
        criterion: loss callable taking ``(log_probs, targets)``.
        dataloader: iterable yielding the batches described above.
        gpu: CUDA device index (or device spec). Ignored, and the CPU used
            instead, when it is ``None`` or CUDA is unavailable.

    Returns:
        ``(accuracy, loss)`` as Python floats, averaged per example.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    if gpu is None or not torch.cuda.is_available():
        device = torch.device('cpu')
    elif isinstance(gpu, int):
        device = torch.device('cuda', gpu)
    else:
        device = torch.device(gpu)

    # Remember the caller's mode so a training loop is not left in eval mode.
    was_training = model.training
    model.eval()

    total_loss, total_correct, total_seen = 0.0, 0, 0

    try:
        with torch.no_grad():
            dev_st = time.time()
            for it, data_tensor in enumerate(dataloader):
                context_tensor = data_tensor[:, :-1].to(device)
                target_tensor = data_tensor[:, -1].to(device)
                log_probs = model(context_tensor)

                batch_size = target_tensor.size(0)
                # Undo the per-batch mean so the final average is per example.
                total_loss += criterion(log_probs, target_tensor).item() * batch_size
                total_correct += (log_probs.argmax(dim=1) == target_tensor).sum().item()
                total_seen += batch_size

                if it % 500 == 0:
                    print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, total_loss / total_seen, total_correct / total_seen, (time.time()-dev_st)))
                    dev_st = time.time()
    finally:
        model.train(was_training)

    if total_seen == 0:
        raise ValueError("evaluate() received a dataloader that yielded no examples")

    return total_correct / total_seen, total_loss / total_seen

In [None]:
# ---- Hyper-parameters and run configuration ----
# Index of the CUDA device handed to the `.cuda(...)` calls.
gpu = 0

# Fix the torch RNG so runs are repeatable.
torch.manual_seed(42)

# Model / batching sizes.
EMBEDDING_DIM = 200   # word vectors size
CONTEXT_SIZE = 7      # number of context tokens per example
BATCH_SIZE = 512

# `shared` selects the number of hidden units: 200 when set, 100 otherwise.
shared = True
H = 200 if shared else 100

# Learning rate passed to the optimizer.
learn_rate = 1e-3

# Fall back to the CPU when CUDA is not available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [None]:
# Directory handed to the corpus loader.
data_source = './data/wikitext-2'
# Build the corpus; later cells read its .train/.valid/.test splits and its .dictionary.
corpus = data.Corpus(data_source)

In [None]:
# Pull the three splits off the corpus in one statement.
train_set, val_set, test_set = corpus.train, corpus.valid, corpus.test

In [None]:
# Turn every split into rows of CONTEXT_SIZE context tokens plus one target.
train_ngram, val_ngram, test_ngram = (
    ngram_split(corpus, split, CONTEXT_SIZE)
    for split in (train_set, val_set, test_set)
)

In [None]:
# Batch the n-gram rows for training and evaluation.
# Shuffling only matters for training; the dev/test loaders keep a fixed order
# so evaluation runs are repeatable.
train_loader = DataLoader(train_ngram, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(val_ngram, batch_size=BATCH_SIZE, shuffle=False)
# The test loader must wrap the held-out test rows, not the training rows;
# otherwise the reported test perplexity is measured on training data.
test_loader = DataLoader(test_ngram, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Number of entries in the corpus dictionary; passed to the model as its vocabulary size.
vocab_len = len(corpus.dictionary)

# Using negative log-likelihood loss (it is applied to the model's log-probabilities)
loss_function = nn.NLLLoss()

# create model
# The class is imported by name because the assignment below rebinds `model`
# from the imported module to the network instance; `model.FNNModel` would
# fail if this cell were run a second time.
from model import FNNModel
model = FNNModel(vocab_len, EMBEDDING_DIM, CONTEXT_SIZE, H)

# load it to gpu
model.cuda(gpu)

# using ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr=learn_rate)

In [None]:
# ------------------------- TRAIN & SAVE MODEL ------------------------
# Trains for 5 epochs; after each epoch the model is scored on the dev loader
# and a checkpoint is written whenever the dev perplexity improves.
best_acc = 0  # only referenced by the disabled accuracy-based selection
# Start from +inf so the first epoch always produces a checkpoint; a finite
# sentinel could leave best_model_path as None if perplexity never drops below it.
best_ppl = float('inf')
best_model_path = None
for epoch in range(5):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    # evaluate() puts the model in eval mode, so switch back to training mode
    # at the start of every epoch.
    model.train()
    for it, data_tensor in enumerate(train_loader):
        # each row is CONTEXT_SIZE context ids followed by the target id
        context_tensor = data_tensor[:,0:CONTEXT_SIZE]
        target_tensor = data_tensor[:,CONTEXT_SIZE]
        context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)
        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0:
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(it, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print("\n--- Evaluating model on dev data ---")
    dev_acc, dev_loss = evaluate(model, loss_function, dev_loader, gpu)
    # perplexity is the exponential of the mean dev loss
    ppl = math.exp(dev_loss)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}; Perplexity: {}".format(epoch, dev_acc, dev_loss, ppl))
    # model selection is driven by dev perplexity (lower is better)
    if ppl < best_ppl:
        print("Best development perplexity improved from {} to {}, saving model...".format(best_ppl, ppl))
        best_ppl = ppl
        # set best model path
        best_model_path = 'best_model_{}_gram_{}_{}H.dat'.format(CONTEXT_SIZE+1, epoch, H)
        # saving best model
        torch.save(model.state_dict(), best_model_path)

In [None]:
# Display the checkpoint path chosen by the training loop (None if nothing was saved).
best_model_path

### Test

In [None]:
# Restore the weights of the best checkpoint saved during training into `model`.
model.load_state_dict(torch.load(best_model_path))

In [None]:
# Score the restored model on the batches held in `test_loader`.
test_acc, test_loss = evaluate(
    model=model,
    criterion=loss_function,
    dataloader=test_loader,
    gpu=gpu,
)

In [None]:
# Perplexity is the exponential of the mean test loss.
test_ppl = math.exp(test_loss)
print("Test Perplexity:", test_ppl)

In [None]:
# Persist the whole model object (not just its state_dict); the Generate section loads this file.
# NOTE(review): this pickles the module, so loading it needs the defining `model` module importable.
torch.save(model, 'model_shared.pt')

### Generate

In [None]:
import data
import model
import random
import torch

In [None]:
import random

In [None]:
# Rebuild the corpus for generation.
# NOTE(review): `data` here is whatever the preceding `import data` bound, not the
# `data_fnn` alias used for training — confirm both build the same dictionary.
corpus = data.Corpus('./data/wikitext-2')
# Number of entries in the corpus dictionary.
ntokens = len(corpus.dictionary)

In [None]:
# Prefer CUDA when available, otherwise use the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
model_path = 'model_shared.pt'
# Honour the availability check instead of hard-coding CUDA, so the cell also
# runs on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The file holds a whole pickled module (written with torch.save(model, ...)),
# so it needs full unpickling rather than weights-only loading.
# SECURITY: unpickling can execute arbitrary code — only load files you created.
try:
    model = torch.load(model_path, map_location=device, weights_only=False)
except TypeError:
    # Older PyTorch releases do not accept the `weights_only` argument.
    model = torch.load(model_path, map_location=device)
model.to(device)

In [None]:
# Recover the model's dimensions from its own sub-modules rather than from constants.
l = {child_name: child for child_name, child in model.named_children()}
embedding_size = l['embeddings'].embedding_dim  # 200
input_layer_dim = l['linear1'].in_features  # 1400
# Presumably linear1 consumes the concatenated context embeddings, hence the ratio.
context_size = int(input_layer_dim / embedding_size)  # 7

In [None]:
# Join the three splits into one tensor from which a seed span is drawn.
full_corpus = torch.cat((corpus.train, corpus.valid, corpus.test))

In [None]:
# Pick random span from corpus.
# Any start up to len - context_size (inclusive) still leaves a full-length span.
last_start = len(full_corpus) - context_size
seed_pos = random.randint(0, last_start)
seed_span = full_corpus[seed_pos:seed_pos + context_size]
# Generation starts from the seed, placed on the model's device.
generated_text = seed_span.to(device)

In [None]:
# Greedy generation: repeatedly feed the last `context_size` tokens to the
# model and append its highest-scoring next token, 15 times.
model.eval()
with torch.no_grad():
    for step in range(15):
        # Window length comes from the loaded model instead of a hard-coded 7.
        output = model(generated_text[-context_size:])
        word_id = torch.argmax(output, dim=1)
        generated_text = torch.cat((generated_text, word_id))
        # show the context that was just used plus the token it produced
        print(generated_text[-(context_size + 1):])

In [None]:
# Bring the generated ids back to the CPU before looking them up in the dictionary.
generated_text = generated_text.cpu()

In [None]:
# Print the seed span, a " | " separator, then the generated continuation.
# Ids are converted to plain Python ints before indexing the dictionary.
for index, i in enumerate(generated_text.tolist()):
    # The seed occupies the first `context_size` positions (was hard-coded as 7).
    if index == context_size:
        print(" | ", end = "")
    print(corpus.dictionary.idx2word[i], end = " ")