Implementation of the 2003 paper on Neural Probabilistic Language Models
by Yoshua Bengio, Réjean Ducharme, Pascal Vincent, and Christian Jauvin

In [51]:
import nltk
import numpy as np
import torch

In [52]:
# Set the seed for reproducibility
# import random

# seed = 42
# torch.manual_seed(seed)
# np.random.seed(seed)
# random.seed(seed)

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [53]:
# Load the brown corpus
from nltk.corpus import brown

nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [54]:
from torch import tensor
from torch.utils.data import DataLoader, TensorDataset
# prepare the data

from collections import Counter


# build vocabulary
def build_vocab(corpus):
    vocab = Counter(brown.words())

    # Consolidate words rare words into a single token
    for word in list(vocab.keys()):
        if vocab[word] <= 3:
            vocab['<unk>'] += vocab[word]
            del vocab[word]
    
    # Create indexes
    index_to_word = list(vocab.keys())
    word_to_index = {word: i for i, word in enumerate(index_to_word)}

    # Convert words to indexes
    vocab_indices = [word_to_index[word] for word in brown.words() if word in word_to_index]

    return vocab, vocab_indices


vocab, vocab_indices = build_vocab(brown.words())
print("Vocab size:", len(vocab))


# Chunk the data into sequences, X being the first n-1 words and y being the nth word
def chunk_data(data, window_size=6):
    X = [data[i:i + window_size] for i in range(len(data) - window_size)]
    y = [data[i] for i in range(window_size, len(data))]
    return tensor(X, device=device), tensor(y, device=device)


def get_data(indices, window_size=6, train_samples=800000, test_samples=200000, batch_size=32, shuffle=True):
    train_indices = indices[:train_samples]
    test_indices = indices[train_samples:train_samples + test_samples]
    val_indices = indices[train_samples + test_samples:]

    train_X, train_y = chunk_data(train_indices, window_size)
    test_X, test_y = chunk_data(test_indices, window_size)
    val_X, val_y = chunk_data(val_indices, window_size)

    train_dl = DataLoader(TensorDataset(train_X, train_y), batch_size, shuffle)
    test_dl = DataLoader(TensorDataset(test_X, test_y), batch_size, shuffle)
    val_dl = DataLoader(TensorDataset(val_X, val_y), batch_size, shuffle)

    return train_dl, test_dl, val_dl

Vocab size: 17905


In [55]:
# Neural Probabilistic Language Model

import torch.nn as nn
import torch.nn.functional as F


class NPLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, window_size):
        super(NPLM, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden = nn.Linear(embedding_dim * window_size, hidden_dim)
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embeddings(x)
        x = x.view(x.size(0), -1)
        x = F.tanh(self.hidden(x))  # TODO: implement direct connections (Wx from paper)
        x = F.log_softmax(self.output(x), dim=1)
        return x

In [59]:
# Set the hyperparameters
vocab_size = len(vocab)
embedding_dim = 60
hidden_dim = 100
window_size = 6
weight_decay = 1e-4

# Set the trainjng parameters
lr = 1e-3
epochs = 10
batch_size = 64

In [57]:
# Get the data
train_dl, test_dl, val_dl = get_data(vocab_indices, batch_size=batch_size)

In [60]:
# Initialize the model
model = NPLM(vocab_size, embedding_dim, hidden_dim, window_size).to(device)
print(model)

# Initialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Initialize the loss function
criterion = nn.NLLLoss()


# Fit the model
def train_loop(model, train_dl, optimizer, criterion):
    model.train()
    for X, y in train_dl:
        optimizer.zero_grad()
        y_pred = model(X)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()


def eval_loop(model, eval_dl, criterion):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        for X, y in eval_dl:
            y_pred = model(X)
            loss = criterion(y_pred, y)
            total_loss += loss.item()
    return total_loss / len(eval_dl)


def train_one_epoch(model, train_dl, eval_dl, optimizer, criterion):
    train_loop(model, train_dl, optimizer, criterion)
    loss = eval_loop(model, eval_dl, criterion)
    return loss


best_loss = np.inf
for epoch in range(epochs):
    loss = train_one_epoch(model, train_dl, test_dl, optimizer, criterion)
    print(f"Epoch {epoch + 1}/{epochs} Loss: {loss}")
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), "nplm_brown.pt")

NPLM(
  (embeddings): Embedding(17905, 60)
  (hidden): Linear(in_features=360, out_features=100, bias=True)
  (output): Linear(in_features=100, out_features=17905, bias=True)
)
Epoch 1/10 Loss: 5.942491071624755
Epoch 2/10 Loss: 5.83808203125
Epoch 3/10 Loss: 5.774402575378418
Epoch 4/10 Loss: 5.769352248840332
Epoch 5/10 Loss: 5.74943915802002
Epoch 6/10 Loss: 5.750322311706543
Epoch 7/10 Loss: 5.74098383392334
Epoch 8/10 Loss: 5.7399361894226075
Epoch 9/10 Loss: 5.728250364532471
Epoch 10/10 Loss: 5.731992821960449


In [61]:
# Calculate perplexity
def perplexity(model, dl, criterion):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        for X, y in dl:
            y_pred = model(X)
            loss = criterion(y_pred, y)
            total_loss += loss.item()
    return np.exp(total_loss / len(dl))

# Restore best model
model.load_state_dict(torch.load("nplm_brown.pt"))

test_perplexity = perplexity(model, test_dl, criterion)
val_perplexity = perplexity(model, val_dl, criterion)
print(f"Test Perplexity: {test_perplexity}")
print(f"Validation Perplexity: {val_perplexity}")

Test Perplexity: 307.4282649635557
Validation Perplexity: 287.0144941594557
