In [1]:
import torch
import math
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time

import data_fnn as data
import model

In [2]:
def ngram_split(orig_corpus, dataset, n):
    # This function breaks corpus into [context, target]
    # For e.g., in trigram, the tensor returned would be [C(n-2), C(n-1), T]
    ngram = []
    data_len = len(dataset)
    eos_id = corpus.dictionary.word2idx['<eos>']       
    for i, tokenid in enumerate(dataset):
        if i+n<data_len:
            temp_gram = dataset[i:i+n+1].view(-1)
            if eos_id in temp_gram[0:n]:
                continue
            ngram.append(temp_gram)
    fin_ngram=torch.stack(ngram)
    return fin_ngram

def get_accuracy_from_log_probs(log_probs, labels):
    probs = torch.exp(log_probs)
    predicted_label = torch.argmax(probs, dim=1)
    acc = (predicted_label == labels).float().mean()
    return acc

# helper function to evaluate model on dev data
def evaluate(model, criterion, dataloader, gpu):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(dataloader):
            context_tensor = data_tensor[:,0:CONTEXT_SIZE]
            target_tensor = data_tensor[:,CONTEXT_SIZE]
            context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0: 
                print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    return mean_acc / count, mean_loss / count

In [3]:
# create parameters
gpu = 0 
# word vectors size
EMBEDDING_DIM = 200
CONTEXT_SIZE = 7
BATCH_SIZE = 512
tied = False
# hidden units
H = 100
torch.manual_seed(42)
learn_rate = 1e-3

# check if gpu is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [4]:
data_source = './data/wikitext-2'
corpus = data.Corpus(data_source)

In [5]:
train_set = corpus.train
val_set = corpus.valid
test_set = corpus.test

In [6]:
train_ngram = ngram_split(corpus, train_set, CONTEXT_SIZE)
val_ngram = ngram_split(corpus, val_set, CONTEXT_SIZE)
test_ngram = ngram_split(corpus, test_set, CONTEXT_SIZE)

In [7]:
train_loader = DataLoader(train_ngram, batch_size = BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(val_ngram, batch_size = BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ngram, batch_size = BATCH_SIZE, shuffle=True)

In [8]:
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

vocab_len = len(corpus.dictionary)

# create model
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

# create model
model = model.FNNModel(vocab_len, EMBEDDING_DIM, CONTEXT_SIZE, H, tied)

# load it to gpu
model.cuda(gpu)

# using ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr = learn_rate)

In [9]:
# ------------------------- TRAIN & SAVE MODEL ------------------------
best_acc = 0
best_ppl = 999
best_model_path = None
for epoch in range(20):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    for it, data_tensor in enumerate(train_loader):
        context_tensor = data_tensor[:,0:CONTEXT_SIZE]
        target_tensor = data_tensor[:,CONTEXT_SIZE]
#         print(context_tensor)
#         print(target_tensor)
        context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)
        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0: 
#             print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(it, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print("\n--- Evaluating model on dev data ---")
    dev_acc, dev_loss = evaluate(model, loss_function, dev_loader, gpu)
    ppl = math.exp(dev_loss)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}; Perplexity: {}".format(epoch+1, dev_acc, dev_loss, ppl))
    if ppl < best_ppl:
        print("Best development accuracy improved from {} to {}, saving model...".format(best_ppl, ppl))
        best_ppl = ppl
        # set best model path
        best_model_path = 'best_model_{}_gram_{}_{}H.dat'.format(CONTEXT_SIZE+1, epoch, H)
        # saving best model
        torch.save(model.state_dict(), best_model_path)


--- Training model Epoch: 1 ---

--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 5.859422206878662; Mean Acc:0.158203125; Time taken (s): 0.0431671142578125
Epoch 1 complete! Development Accuracy: 0.16658324003219604; Development Loss: 5.977240417491306; Perplexity: 394.35062214570576
Best development accuracy improved from 999 to 394.35062214570576, saving model...

--- Training model Epoch: 2 ---

--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 6.151486396789551; Mean Acc:0.171875; Time taken (s): 0.037268877029418945
Epoch 2 complete! Development Accuracy: 0.17587989568710327; Development Loss: 5.844574175097725; Perplexity: 345.35544951498883
Best development accuracy improved from 394.35062214570576 to 345.35544951498883, saving model...

--- Training model Epoch: 3 ---

--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 5.80184268951416; Mean Acc:0.1796875; Time taken (s): 0.04019021987915039
Epoch 3 comp

In [None]:
model

In [10]:
best_model_path

'best_model_8_gram_19_100H.dat'

### Test

In [None]:
best_model_path = 'Experiment3.dat'

In [11]:
model.load_state_dict(torch.load(best_model_path))

<All keys matched successfully>

In [12]:
test_acc, test_loss = evaluate(model, loss_function, test_loader, gpu)

Dev Iteration 0 complete. Mean Loss: 6.155126094818115; Mean Acc:0.154296875; Time taken (s): 0.023903608322143555


In [13]:
print("Test Perplexity:", math.exp(test_loss))

Test Perplexity: 506.15863358029986


In [14]:
torch.save(model, 'Experiment4.pt')

### Generate

In [None]:
import data_fnn as data
import model
import random
import torch

In [None]:
corpus = data.Corpus('./data/wikitext-2')
ntokens = len(corpus.dictionary)
vocab_len = len(corpus.dictionary)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
model_path = 'Experiment1.pt'
device = torch.device("cuda")
test_model = torch.load(model_path)
# test_model.load_state_dict(torch.load('best_model_8_gram_11_200H.dat'))
test_model.to(device)

In [None]:
l = dict(test_model.named_children())
embedding_size = l['embeddings'].embedding_dim # 200
input_layer_dim = l['linear1'].in_features # 1400
context_size = int(input_layer_dim/embedding_size) # 7

In [6]:
full_corpus = torch.cat((corpus.train, corpus.valid, corpus.test))

In [None]:
seed_pos = random.randint(0, len(full_corpus)-context_size)
seed_span = full_corpus[seed_pos:seed_pos+context_size] # Pick random span from corpus
generated_text=seed_span.to(device)

In [None]:
model.eval()
for i in range(15):
    with torch.no_grad():
        output = model(generated_text[-7:])
        word_id = torch.argmax(output, dim=1)
        generated_text = torch.cat((generated_text,word_id))
        print(generated_text[-8:])

In [None]:
generated_text = generated_text.cpu()

In [None]:
for index, i in enumerate(generated_text):
    if index == 7: 
        print(" | ", end = "")
    print(corpus.dictionary.idx2word[i], end = " ")

In [16]:
import random

In [17]:
model.eval()
for _ in range(5):
    seed_pos = random.randint(0, len(corpus.test)-7)
    seed_span = corpus.test[seed_pos:seed_pos+7] # Pick random span from corpus
    generated_text=seed_span.to(device)
    for i in range(15):
        with torch.no_grad():
            output = model(generated_text[-7:])
            word_id = torch.argmax(output, dim=1)
            generated_text = torch.cat((generated_text,word_id))
            #print(generated_text[-8:])
    sent = [corpus.dictionary.idx2word[i] for i in generated_text] 
    sent = sent[:7] + ['|']+sent[7:]
    print("###",' '.join(sent))
    orig = [corpus.dictionary.idx2word[i] for i in corpus.test[seed_pos:seed_pos+len(sent)-7]] 
    print("$$$",' '.join(orig))

### the words of , as often as | much as the of the decade of the united states . <eos> of the united
$$$ the words of , as often as not consisted of a stern lecture from on the
### ten years later , the census counted | the only rectangular - day run of the two companies of the city of the
$$$ ten years later , the census counted just 16 . 9 million , the remainder having
### . also sought to involve the senate | of the assembly , who had a large amount of energy of the . <eos>
$$$ . also sought to involve the senate in his government , but this was not entirely
### . <eos> persisted on cruising ironclads for | the championship . <eos> of the season of the german , austrian and ottoman force
$$$ . <eos> persisted on cruising ironclads for much longer . during the 1860s , the french
### upstairs window . in preparation for s | the ship , and the other candidate cities in the city of the church s
$$$ upstairs window . in preparation for s visit the free derry wall was painted white and


In [None]:
model

In [None]:
model.tied

In [None]:
model.linear2

In [None]:
model.linear2.weight==model.embeddings.weight

In [None]:
model.embeddings.weight

In [None]:
model.linear2.weight

In [None]:
a = model.embeddings.weight.detach().cpu()
b = model.linear2.weight.detach().cpu()

In [None]:
import numpy as np

In [None]:
np.linalg.norm(a-b)

In [None]:
model.linear2.weight.size()

In [None]:
sum(sum(np.abs(a-b).numpy()))/(28744*200)

In [None]:
np.mean(a.flatten().numpy())

In [None]:
np.var(a.flatten().numpy())