In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import torch

# Display floats with 4 decimals and without scientific notation for small values.
np.set_printoptions(suppress=True, precision=4)

In [None]:
def softmax(x):
    """Numerically stable row-wise softmax.

    Args:
        x: array-like of shape [batch_size, n_classes].

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Softmax is invariant to adding a constant to every entry of a row, so
    # shift each row by its maximum: the largest exponent becomes 0 and
    # np.exp can no longer overflow to inf (which would yield inf/inf = nan).
    shifted = x - np.max(x, axis=1, keepdims=True)
    e = np.exp(shifted)
    n = np.sum(e, axis=1, keepdims=True)
    return e / n

# Load data

In [None]:
# Fetch the full 20 Newsgroups corpus with headers, footers and quoted replies removed.
dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = dataset['data']

# Pair every document with its position in the corpus: [(0, doc0), (1, doc1), ...].
docs = list(enumerate(docs))

# `[()]` unwraps a 0-d array into the single object it holds (presumably a
# dict saved with np.save -- TODO confirm). Such object arrays are pickled, and
# np.load refuses them unless allow_pickle=True (default False since NumPy 1.16.3).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
decoder = np.load('decoder.npy', allow_pickle=True)[()]
doc_decoder = np.load('doc_decoder.npy', allow_pickle=True)[()]

# Load the trained model

In [None]:
# map_location='cpu' lets a checkpoint saved from CUDA tensors load on a
# machine without a GPU; only NumPy copies are needed below anyway.
state = torch.load('200_tmp_model_state.pytorch', map_location='cpu')
n_topics = 20

# Copy the learned parameters out of the state dict as NumPy arrays.
doc_vectors = state['doc_weights.weight'].cpu().clone().numpy()
topic_vectors = state['topics.topic_vectors'].cpu().clone().numpy()
resulted_word_vectors = state['neg.embedding.weight'].cpu().clone().numpy()

# Normalise each row of doc_vectors into a probability distribution
# (presumably one row per document, one column per topic -- TODO confirm).
topic_dist = softmax(doc_vectors)

# Show a document and its topics

In [None]:
i = 10  # document id

# Look up the raw text whose corpus index is doc_decoder[i] and print it.
matching_texts = [text for idx, text in docs if idx == doc_decoder[i]]
print(matching_texts[0], '\n')

# One "index:probability" entry per topic, with a line break after entry 9.
s = ''.join(
    '{0}:{1:.3f}  '.format(j, p) + ('\n' if j == 9 else '')
    for j, p in enumerate(topic_dist[i])
)
print(s)

# Show topics

In [None]:
# Dot product of every topic vector with every word vector
# (rows follow topic_vectors, columns follow resulted_word_vectors).
similarity = topic_vectors @ resulted_word_vectors.T
# Column indices of the 15 highest-scoring words per row, in ascending score order.
most = np.argsort(similarity, axis=1)[:, -15:]

In [None]:
# Print the top words of each topic, highest-scoring word first.
for topic_id in range(n_topics):
    best_first = most[topic_id][::-1]
    topic_words = ' '.join(decoder[word_id] for word_id in best_first)
    print('topic', topic_id, ':', topic_words)

In [None]:
# The five largest similarity values in each row (ascending).
np.sort(similarity, axis=1)[:, -5:]

In [None]:
# The five smallest similarity values in each row (ascending).
np.sort(similarity, axis=1)[:, :5]