In [1]:
import nltk
import csv
# from nltk.corpus import brown
# from nltk.corpus import wordnet



In [2]:
# nltk.download("brown")
# nltk.download("wordnet")

# len(brown.paras())

In [1]:
import torch
import multiprocessing
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time

# Trigram Neural Network Model
class TrigramNNmodel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, h):
        super(TrigramNNmodel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        self.linear2 = nn.Linear(h, vocab_size, bias = False)

    def forward(self, inputs):
        # compute x': concatenation of x1 and x2 embeddings
        embeds = self.embeddings(inputs).view((-1,self.context_size * self.embedding_dim))
        # compute h: tanh(W_1.x' + b)
        out = torch.tanh(self.linear1(embeds))
        # compute W_2.h
        out = self.linear2(out)
        # compute y: log_softmax(W_2.h)
        log_probs = F.log_softmax(out, dim=1)
        # return log probabilities
        # BATCH_SIZE x len(vocab)
        return log_probs

In [2]:
# create parameters
gpu = 0 
# word vectors size
EMBEDDING_DIM = 200
CONTEXT_SIZE = 7
BATCH_SIZE = 256
# hidden units
H = 100
torch.manual_seed(13013)

# check if gpu is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
available_workers = multiprocessing.cpu_count()


In [5]:
# import numpy as np

# num_train = 12000
# UNK_symbol = "<UNK>"
# vocab = set([UNK_symbol])

# # create brown corpus again with all words
# # no preprocessing, only lowercase
# brown_corpus_train = []
# for idx,paragraph in enumerate(brown.paras()):
#     if idx == num_train:
#         break
#     words = []
#     for sentence in paragraph:
#         for word in sentence:
#             words.append(word.lower())
#     brown_corpus_train.append(words)

# # create term frequency of the words
# words_term_frequency_train = {}
# for doc in brown_corpus_train:
#     for word in doc:
#         # this will calculate term frequency
#         # since we are taking all words now
#         words_term_frequency_train[word] = words_term_frequency_train.get(word,0) + 1

# # create vocabulary
# for doc in brown_corpus_train:
#     for word in doc:
#         if words_term_frequency_train.get(word,0) >= 5:
#             vocab.add(word)

# print(len(vocab))

# # create required lists
# x_train = []
# y_train = []
# x_dev = []
# y_dev = []

# # create word to id mappings
# word_to_id_mappings = {}
# for idx,word in enumerate(vocab):
#     word_to_id_mappings[word] = idx

# # function to get id for a given word
# # return <UNK> id if not found
# def get_id_of_word(word):
#     unknown_word_id = word_to_id_mappings['<UNK>']
#     return word_to_id_mappings.get(word,unknown_word_id)

# # creating training and dev set
# for idx,paragraph in enumerate(brown.paras()):
#     for sentence in paragraph:
#         for i,word in enumerate(sentence):
#             if i+CONTEXT_SIZE >= len(sentence):
#                 # sentence boundary reached
#                 # ignoring sentence less than 3 words
#                 break
#             # convert word to id
#             x_extract = [get_id_of_word(word.lower()),get_id_of_word(sentence[i+1].lower())]
#             y_extract = [get_id_of_word(sentence[i+2].lower())]
#             if idx < num_train:
#                 x_train.append(x_extract)
#                 y_train.append(y_extract)
#             else:
#                 x_dev.append(x_extract)
#                 y_dev.append(y_extract)

# # making numpy arrays
# x_train = np.array(x_train)
# y_train = np.array(y_train)
# x_dev = np.array(x_dev)
# y_dev = np.array(y_dev)  
  
# print(x_train.shape)
# print(y_train.shape)
# print(x_dev.shape)
# print(y_dev.shape)

In [6]:
# # print("--- Creating training and dev dataloaders with {} batch size ---".format(BATCH_SIZE))
# train_set = np.concatenate((x_train, y_train), axis=1)
# dev_set = np.concatenate((x_dev, y_dev), axis=1)
# train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, num_workers = 4)
# dev_loader = DataLoader(dev_set, batch_size = BATCH_SIZE, num_workers = 4)

### Start of wikitext

In [3]:
import data

In [4]:
data_source = './data/wikitext-2'
corpus = data.Corpus(data_source)

In [5]:
train_set = corpus.train
val_set = corpus.valid

In [6]:
def ngram_split(dataset, n):
    # This function breaks corpus into [context, target]
    # For e.g., in trigram, the tensor returned would be [C(n-2), C(n-1), T]
    ngram = []
    data_len = len(dataset)
    for i, tokenid in enumerate(dataset):
        if i+n<data_len:
            ngram.append(dataset[i:i+n+1].view(-1))
    fin_ngram=torch.stack(ngram)
    return fin_ngram

In [7]:
train_ngram = ngram_split(train_set, CONTEXT_SIZE)
val_ngram = ngram_split(val_set, CONTEXT_SIZE)

In [10]:
train_loader = DataLoader(train_ngram, batch_size = BATCH_SIZE)
dev_loader = DataLoader(val_ngram, batch_size = BATCH_SIZE)

### End of wikitext

In [16]:
def get_accuracy_from_log_probs(log_probs, labels):
    probs = torch.exp(log_probs)
    predicted_label = torch.argmax(probs, dim=1)
    acc = (predicted_label == labels).float().mean()
    return acc

# helper function to evaluate model on dev data
def evaluate(model, criterion, dataloader, gpu):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(dataloader):
            context_tensor = data_tensor[:,0:CONTEXT_SIZE]
            target_tensor = data_tensor[:,CONTEXT_SIZE]
            context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0: 
                print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    return mean_acc / count, mean_loss / count

In [17]:
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

vocab_len = len(corpus.dictionary)
# vocab_len = len(vocab)

# create model
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

# create model
model = TrigramNNmodel(vocab_len, EMBEDDING_DIM, CONTEXT_SIZE, H)

# load it to gpu
model.cuda(gpu)

# using ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr = 2e-3)


# ------------------------- TRAIN & SAVE MODEL ------------------------
best_acc = 0
best_model_path = None
for epoch in range(5):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    for it, data_tensor in enumerate(train_loader):
        context_tensor = data_tensor[:,0:CONTEXT_SIZE]
        target_tensor = data_tensor[:,CONTEXT_SIZE]
#         print(context_tensor)
#         print(target_tensor)
        context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)

        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0: 
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(it, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print("\n--- Evaluating model on dev data ---")
    dev_acc, dev_loss = evaluate(model, loss_function, dev_loader, gpu)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(epoch, dev_acc, dev_loss))
    if dev_acc > best_acc:
        print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
        best_acc = dev_acc
        # set best model path
        best_model_path = 'best_model_{}.dat'.format(epoch)
        # saving best model
        torch.save(model.state_dict(), best_model_path)


--- Training model Epoch: 1 ---
Training Iteration 0 of epoch 0 complete. Loss: 10.454998016357422; Acc:0.00390625; Time taken (s): 0.9302129745483398
Training Iteration 500 of epoch 0 complete. Loss: 6.281522274017334; Acc:0.15625; Time taken (s): 10.584033727645874
Training Iteration 1000 of epoch 0 complete. Loss: 6.810752868652344; Acc:0.09375; Time taken (s): 10.5910062789917


KeyboardInterrupt: 