In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import time
from sklearn.metrics.pairwise import cosine_similarity

from utils import negative_sampling_loss, topic_embedding

# Load data

In [None]:
# Plain numeric arrays.
data = np.load('data.npy')
word_vectors = np.load('word_vectors.npy')

# These two files are unwrapped with `[()]`, i.e. they are expected to hold a
# Python object inside a 0-d array. NumPy >= 1.16.3 refuses to load pickled
# objects unless allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by your own preprocessing step.
unigram_distribution = np.load('unigram_distribution.npy', allow_pickle=True)[()]
decoder = np.load('decoder.npy', allow_pickle=True)[()]

In [None]:
# Exponent applied to the unigram distribution before it is handed to the
# negative sampling loss.
beta = 0.75

# Convert the NumPy inputs to float32 torch tensors.
# NOTE: this cell rebinds its own inputs, so run it exactly once after loading.
unigram_distribution = torch.FloatTensor(unigram_distribution**beta)
word_vectors = torch.FloatTensor(word_vectors)

# Create iterator

In [None]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class SimpleDataset(Dataset):
    """Minimal map-style dataset that serves the rows of a single tensor."""

    def __init__(self, data_tensor):
        """Keep a reference to `data_tensor`; no copy is made."""
        self.data_tensor = data_tensor

    def __len__(self):
        """Number of samples, i.e. the extent of the first dimension."""
        return self.data_tensor.shape[0]

    def __getitem__(self, index):
        """Return the sample stored at position `index`."""
        sample = self.data_tensor[index]
        return sample

In [None]:
# Number of rows per mini-batch; used by the DataLoader and to derive n_batches.
batch_size = 2048

In [None]:
# Wrap the integer array as a dataset of int64 rows.
dataset = SimpleDataset(torch.LongTensor(data))

# Shuffled mini-batches; the trailing incomplete batch is dropped so that every
# batch has exactly `batch_size` rows.
loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=4,
    pin_memory=True,
)
iterator = DataLoader(dataset, **loader_options)

# number of training samples
data_size = data.shape[0]
data_size

# Model

In [None]:
# params
n_topics = 20
embedding_dim = word_vectors.shape[1]
vocab_size = len(unigram_distribution)
window_size = 10  # not used in this notebook -- presumably matches preprocessing; TODO confirm
num_sampled = 10  # forwarded to negative_sampling_loss

# Column 0 of `data` holds the document index of every training row.
doc_ids = data[:, 0]
n_documents = len(np.unique(doc_ids))

# The document ids index the rows of an embedding with `n_documents` rows, so
# they must form the contiguous range 0..n_documents-1. Fail early with a clear
# message instead of an opaque out-of-range lookup during training.
if doc_ids.size and not (0 <= doc_ids.min() and doc_ids.max() < n_documents):
    raise ValueError(
        'document ids must lie in [0, {0}) but span [{1}, {2}]; '
        're-index the documents contiguously from 0'.format(
            n_documents, doc_ids.min(), doc_ids.max()
        )
    )

In [None]:
class loss(nn.Module):
    """Training objective: negative sampling loss plus a Dirichlet-style
    penalty on the per-document topic weights.

    Arguments:
        topics: a callable module mapping document weights of shape
            [batch_size, n_topics] to document vectors.
        word_vectors: forwarded to `negative_sampling_loss`.
        unigram_distribution: forwarded to `negative_sampling_loss`.
        n_documents: number of rows in the document embedding table.
        n_topics: number of topics (columns of the document embedding table).
        num_sampled: forwarded to `negative_sampling_loss`.
        alpha: concentration used in the penalty term; defaults to
            1.0/n_topics when None.
        lambda_const: multiplier of the penalty term.
    """

    def __init__(self, topics, word_vectors, unigram_distribution,
                 n_documents, n_topics, num_sampled,
                 alpha=None, lambda_const=200.0):
        super(loss, self).__init__()

        # Keep the hyperparameters on the instance so that forward() does not
        # silently depend on notebook-level globals.
        self.n_topics = n_topics
        self.alpha = 1.0/n_topics if alpha is None else alpha
        self.lambda_const = lambda_const

        # Unnormalized topic weights of every document, initialized uniformly
        # in [-1, 1).
        self.doc_embedding = nn.Embedding(n_documents, n_topics)
        self.doc_embedding.weight = nn.Parameter(2.0*torch.rand(n_documents, n_topics) - 1.0)

        self.neg = negative_sampling_loss(word_vectors, unigram_distribution, num_sampled)
        self.topics = topics

    def forward(self, doc_indices, pivot_words, target_words):
        """Compute both loss terms for one batch.

        Arguments:
            doc_indices: indices into the document embedding table.
            pivot_words, target_words: passed through unchanged to the
                negative sampling loss.

        Returns:
            A tuple (neg_loss, dirichlet_loss).
        """

        # shape: [batch_size, n_topics]
        doc_weights = self.doc_embedding(doc_indices)

        # shape: [batch_size, embedding_dim]
        doc_vectors = self.topics(doc_weights)

        neg_loss = self.neg(pivot_words, target_words, doc_vectors)

        # Normalize over the topic axis explicitly; relying on the implicit
        # dimension of log_softmax is deprecated.
        log_proportions = F.log_softmax(doc_weights, dim=1)
        dirichlet_loss = self.lambda_const*(1.0 - self.alpha)*log_proportions.sum(1).mean()

        return neg_loss, dirichlet_loss

In [None]:
# Module that turns per-document topic weights into document vectors.
topics = topic_embedding(n_topics, embedding_dim)

# Build the full objective and move all of its parameters to the GPU.
model = loss(
    topics=topics,
    word_vectors=word_vectors,
    unigram_distribution=unigram_distribution,
    n_documents=n_documents,
    n_topics=n_topics,
    num_sampled=num_sampled,
).cuda()

In [None]:
# Adam over every parameter of the model.
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# Number of passes over the training data.
n_epochs = 20

# Number of full batches in one epoch.
n_batches = data_size // batch_size
n_batches

In [None]:
%%time
model.train()

# Send every batch to the device that holds the model's parameters.
device = next(model.parameters()).device

for epoch in range(0, n_epochs):
    start = time.time()
    for batch in iterator:

        # The DataLoader uses pinned memory, so the host-to-device copy can be
        # asynchronous.
        batch = batch.to(device, non_blocking=True)
        doc_indices = batch[:, 0]
        pivot_words = batch[:, 1]
        target_words = batch[:, 2:]

        neg_loss, dirichlet_loss = model(doc_indices, pivot_words, target_words)
        total_loss = neg_loss + dirichlet_loss

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    # Report the two loss terms of the last batch of the epoch and the epoch's
    # wall-clock time. `.item()` replaces `.data[0]`, which raises IndexError
    # on 0-dim tensors in current PyTorch.
    print('{0} {1:.2f} {2:.2f} {3:.2f}'.format(
        epoch, neg_loss.item(), dirichlet_loss.item(), time.time() - start
    ))

# Results

In [None]:
def _as_numpy(parameter):
    """Return an independent NumPy copy of a parameter's values on the CPU."""
    return parameter.data.cpu().clone().numpy()


# Pull the learned weights out of the model for inspection.
doc_vectors = _as_numpy(model.doc_embedding.weight)
topic_vectors = _as_numpy(model.topics.topic_vectors)
resulted_word_vectors = _as_numpy(model.neg.embedding.weight)

In [None]:
# Pairwise cosine similarity: one row per topic vector, one column per word vector.
similarity = cosine_similarity(topic_vectors, resulted_word_vectors)

# For every topic keep the indices of the 6 most similar words
# (ordered by increasing similarity, so the closest word comes last).
most = np.argsort(similarity, axis=1)[:, -6:]

In [None]:
# Print one line per topic: its selected word indices looked up in `decoder`.
for topic_id in range(n_topics):
    top_words = [decoder[word_id] for word_id in most[topic_id]]
    print(top_words)