In [1]:
import numpy as np
from scipy import sparse
import artm
from base_regularizer import BaseRegularizer
from smoothing_regularizer import SmoothingRegularizer
from combined_smoothing_sparsing_regularizer import CombinedSmoothingSparsingRegularizer
from covariance_topics_regularizer import CovarianceTopicsRegularizer

In [2]:
def generate_word_in_doc_freqs(words_count, docs_count, density=0.001, max_freq=5):
    """Generate a random sparse word-by-document frequency matrix.

    ``int(density * words_count * docs_count)`` (word, document) cells are
    drawn uniformly at random from the global NumPy random state, and each
    drawn cell is assigned a frequency drawn uniformly from ``1..max_freq``.
    Cells may be drawn more than once; a later draw overwrites the earlier
    value, so the number of stored entries can be smaller than the number of
    draws.

    Parameters
    ----------
    words_count : int
        Number of rows (words) of the matrix.
    docs_count : int
        Number of columns (documents) of the matrix.
    density : float, optional
        Fraction of the ``words_count * docs_count`` cells to draw.
        Defaults to ``0.001``.
    max_freq : int, optional
        Largest frequency that can be assigned to a cell. Defaults to ``5``.

    Returns
    -------
    scipy.sparse.dok_matrix
        Integer matrix of shape ``(words_count, docs_count)``.
    """
    word_in_doc_freqs = sparse.dok_matrix((words_count, docs_count), dtype=int)

    draws_count = int(density * words_count * docs_count)

    for _ in range(draws_count):
        # The draw order (word, document, frequency) is kept fixed so that
        # results under a given seed stay reproducible.
        word_index = np.random.choice(words_count)
        doc_index = np.random.choice(docs_count)

        word_in_doc_freqs[word_index, doc_index] = np.random.choice(max_freq) + 1

    return word_in_doc_freqs

In [3]:
class ZeroRegularizer(BaseRegularizer):
    """Regularizer that contributes nothing: zero value and all-zero gradients."""

    def __init__(self, words_count, docs_count, topics_count):
        """Allocate the zero gradients once so every call can return them.

        The first gradient has shape ``(words_count, topics_count)`` and the
        second has shape ``(topics_count, docs_count)``.
        """
        word_topic_shape = (words_count, topics_count)
        topic_doc_shape = (topics_count, docs_count)

        self._word_in_topics_probs_grad = np.zeros(word_topic_shape)
        self._topic_in_doc_probs_grad = np.zeros(topic_doc_shape)

    def get_value(self, word_in_topics_probs, topic_in_doc_probs):
        """Return ``0.0`` regardless of the arguments."""
        return 0.0

    def get_gradient(self, word_in_topics_probs, topic_in_doc_probs):
        """Return the pre-allocated zero gradients as a 2-tuple.

        The same array objects are returned on every call; the arguments are
        not inspected.
        """
        gradients = (
            self._word_in_topics_probs_grad,
            self._topic_in_doc_probs_grad,
        )
        return gradients

In [4]:
# Fix the global NumPy random state so the generated data is reproducible.
np.random.seed(seed=0)

# Problem dimensions.
words_count, docs_count, topics_count = 10000, 100, 10

# Number of trailing documents kept out of training.
holdout_docs_count = 10

# Synthetic corpus; the vocabulary is just the word indices rendered as strings.
word_in_doc_freqs = generate_word_in_doc_freqs(words_count, docs_count)
words_list = np.array(list(map(str, range(words_count))))

# The regularizer is sized for the training documents only.
zero_regularizer = ZeroRegularizer(
    words_count=words_count,
    docs_count=docs_count - holdout_docs_count,
    topics_count=topics_count,
)

# NOTE(review): the trailing list presumably holds one weight per regularizer
# -- confirm against artm.ARTM.
artm_model = artm.ARTM(
    topics_count,
    [zero_regularizer],
    [1],
)

In [5]:
# Split on explicit column indices rather than negative slices: with
# holdout_docs_count == 0, `[:, :-0]` would yield an empty training set and
# `[:, -0:]` would yield every document.
train_docs_count = max(word_in_doc_freqs.shape[1] - holdout_docs_count, 0)
overfit_start = max(train_docs_count - holdout_docs_count, 0)

# Leading documents are used for training, trailing ones are held out.
train_word_in_doc_freqs = word_in_doc_freqs[:, :train_docs_count]
holdout_word_in_doc_freqs = word_in_doc_freqs[:, train_docs_count:]

# Deliberately leaky "holdout": the last holdout_docs_count training documents.
holdout_word_in_doc_freqs_overfit = train_word_in_doc_freqs[:, overfit_start:]

In [6]:
# Fit on the training split only; verbose=True prints the per-iteration
# log-likelihood shown below.
train_result = artm_model.train(
    train_word_in_doc_freqs,
    words_list,
    iterations_count=10,
    verbose=True,
)

iter#1: loglike=-17407.22636104274
iter#2: loglike=-16553.85754382502
iter#3: loglike=-15337.18379882692
iter#4: loglike=-14122.195581624563
iter#5: loglike=-13240.240062229397
iter#6: loglike=-12707.388352003167
iter#7: loglike=-12423.625911423469
iter#8: loglike=-12271.463457269032
iter#9: loglike=-12184.984019664067
iter#10: loglike=-12133.017414077396


In [7]:
train_perplexity = train_result.get_train_perplexity()
print('Train perplexity: {}'.format(train_perplexity))

# The held-out documents are independent random draws, so the perplexity
# on them is very large.
holdout_perplexity = train_result.get_holdout_perplexity(
    holdout_word_in_doc_freqs,
    iterations_count=10,
)[1]
print('Hold out perplexity: {}'.format(holdout_perplexity))

# A "holdout" taken from the training documents gives a perplexity close to
# the training one.
leaked_perplexity = train_result.get_holdout_perplexity(
    holdout_word_in_doc_freqs_overfit,
    iterations_count=10,
)[1]
print('Hold out perplexity (train data leak): {}'.format(leaked_perplexity))

Train perplexity: 83.50267830167759
Hold out perplexity: 3483097210.613652
Hold out perplexity (train data leak): 76.9329014289757


In [8]:
# Display the top 10 words of each topic for the unregularized model
# (one row per topic in the output below).
train_result.get_top_words_in_topics(10)

array([['4545', '1663', '5103', '7726', '4471', '4419', '3863', '7782',
        '5666', '2251'],
       ['2593', '3877', '7315', '5192', '1725', '8291', '9483', '5878',
        '4300', '2467'],
       ['2597', '6538', '6781', '7762', '960', '6870', '1562', '6514',
        '1609', '4998'],
       ['2659', '1004', '2273', '4562', '4331', '6117', '8889', '6216',
        '8819', '1099'],
       ['1981', '5339', '1401', '9576', '7627', '75', '1440', '2502',
        '4573', '7970'],
       ['815', '2582', '2010', '876', '1614', '2317', '2257', '77', '6447',
        '7526'],
       ['3918', '2003', '1610', '8769', '1820', '6769', '2392', '3041',
        '6687', '3317'],
       ['7444', '7751', '4005', '8837', '8131', '9686', '3012', '8246',
        '7318', '4332'],
       ['4814', '4690', '3945', '127', '6337', '6273', '8393', '7160',
        '4751', '4057'],
       ['5703', '8809', '4047', '7434', '493', '8948', '9987', '1851',
        '3845', '523']],
      dtype='<U4')

### Smoothing regularizer

In [9]:
# Constant vectors: one entry per word for beta, one per topic for alpha.
smoothing_beta = np.full(words_count, 1e-4)
smoothing_alpha = np.full(topics_count, 1e-4)

smoothing_regularizer = SmoothingRegularizer(
    beta_0=0.5,
    alpha_0=0.5,
    beta=smoothing_beta,
    alpha=smoothing_alpha,
    num_topics=topics_count,
    num_words=words_count,
    num_docs=docs_count,
)

In [10]:
# Rebuild the model with the smoothing regularizer as its only regularizer.
# NOTE(review): the trailing list presumably holds one weight per regularizer
# -- confirm against artm.ARTM.
artm_model = artm.ARTM(
    topics_count,
    [smoothing_regularizer],
    [1.0],
)

In [11]:
# Fit on the full corpus (no holdout in this section); verbose=True prints the
# per-iteration log-likelihood shown below.
train_result = artm_model.train(
    word_in_doc_freqs,
    words_list,
    iterations_count=10,
    verbose=True,
)

iter#1: loglike=-19482.763378626827
iter#2: loglike=-18520.359458412888
iter#3: loglike=-17228.643654167827
iter#4: loglike=-15972.047206415315
iter#5: loglike=-15031.23507602721
iter#6: loglike=-14380.564561543266
iter#7: loglike=-13968.665122778333
iter#8: loglike=-13763.744803667347
iter#9: loglike=-13676.81786171648
iter#10: loglike=-13634.93516738866


### Combined smoothing and sparsing regularizer

In [12]:
# Split the topic indices in two: the first half is passed as domain-specific
# topics, the rest as background topics. Deriving the split from topics_count
# (instead of hard-coding 5 and 10) keeps the index ranges valid if
# topics_count is changed; for topics_count == 10 the arrays are unchanged.
domain_topics_count = topics_count // 2

sparse_smooth_reg = CombinedSmoothingSparsingRegularizer(
    beta_0=0.5,
    alpha_0=0.5,
    # Constant vectors: one entry per word for beta, one per topic for alpha.
    beta=np.full(words_count, 1e-4),
    alpha=np.full(topics_count, 1e-4),
    num_topics=topics_count,
    num_words=words_count,
    num_docs=docs_count,
    domain_specific_topics=np.arange(domain_topics_count),
    background_topics=np.arange(domain_topics_count, topics_count),
)

In [13]:
# Rebuild the model with the combined smoothing/sparsing regularizer only.
# NOTE(review): the trailing list presumably holds one weight per regularizer
# -- confirm against artm.ARTM.
artm_model = artm.ARTM(
    topics_count,
    [sparse_smooth_reg],
    [1.0],
)

In [14]:
# Fit on the full corpus; verbose=True prints the per-iteration
# log-likelihood shown below.
train_result = artm_model.train(
    word_in_doc_freqs,
    words_list,
    iterations_count=10,
    verbose=True,
)

iter#1: loglike=-19353.851157493616
iter#2: loglike=-18382.01480925508
iter#3: loglike=-17084.132102431704
iter#4: loglike=-15825.865801640357
iter#5: loglike=-14911.744544324698
iter#6: loglike=-14354.1965844772
iter#7: loglike=-14023.507620175966
iter#8: loglike=-13810.019306692513
iter#9: loglike=-13671.359443310223
iter#10: loglike=-13607.86162922442


In [15]:
# Display the top 10 words of each topic for the model trained with the
# combined smoothing/sparsing regularizer.
train_result.get_top_words_in_topics(10)

array([['4419', '3419', '4471', '9599', '4335', '9316', '3877', '2251',
        '2721', '8255'],
       ['7315', '1221', '165', '1562', '8809', '6870', '6213', '7160',
        '3946', '6117'],
       ['7268', '9809', '6538', '8889', '62', '75', '5145', '1851', '8216',
        '2317'],
       ['8527', '5346', '4690', '3863', '2352', '8948', '9392', '3041',
        '2392', '6273'],
       ['4648', '5830', '1004', '9483', '6559', '3275', '9299', '8246',
        '3012', '3599'],
       ['1950', '4545', '2003', '7627', '6687', '4922', '5249', '1981',
        '2593', '178'],
       ['3659', '4331', '7726', '8131', '6447', '6572', '7434', '7444',
        '2597', '7233'],
       ['1610', '493', '7762', '8146', '4300', '8837', '127', '4751',
        '3317', '5703'],
       ['2273', '2467', '5192', '9673', '5666', '8769', '5674', '3845',
        '9987', '2742'],
       ['7949', '4998', '876', '2654', '889', '1097', '8294', '2656',
        '947', '3328']],
      dtype='<U4')

### Covariance topics regularizer

In [16]:
# Regularizer sized for the full corpus (all docs_count documents).
covariance_regularizer = CovarianceTopicsRegularizer(
    tau=0.5,
    num_topics=topics_count,
    num_words=words_count,
    num_docs=docs_count,
)

In [17]:
# Rebuild the model with the covariance regularizer as its only regularizer.
# NOTE(review): the trailing list presumably holds one weight per regularizer
# -- confirm against artm.ARTM.
artm_model = artm.ARTM(
    topics_count,
    [covariance_regularizer],
    [1.0],
)

In [18]:
# Fit on the full corpus; verbose=True prints the per-iteration
# log-likelihood shown below.
train_result = artm_model.train(
    word_in_doc_freqs,
    words_list,
    iterations_count=10,
    verbose=True,
)

iter#1: loglike=-19364.56568899796
iter#2: loglike=-18364.448409010456
iter#3: loglike=-17026.670811239164
iter#4: loglike=-15821.971973761776
iter#5: loglike=-14962.085452076823
iter#6: loglike=-14363.729678237405
iter#7: loglike=-13994.68252844139
iter#8: loglike=-13809.889967255864
iter#9: loglike=-13699.14051890597
iter#10: loglike=-13628.15489578


In [19]:
# Display the top 10 words of each topic for the model trained with the
# covariance topics regularizer.
train_result.get_top_words_in_topics(10)

array([['215', '7315', '5037', '4471', '9599', '3877', '5103', '9264',
        '2251', '7726'],
       ['2169', '2273', '1221', '6564', '3419', '2127', '2659', '3845',
        '1820', '876'],
       ['947', '1610', '3135', '3317', '8291', '5666', '2981', '3041',
        '1614', '5192'],
       ['1711', '1344', '9809', '1825', '4419', '6447', '3441', '2742',
        '6337', '5703'],
       ['7463', '80', '8216', '3275', '6128', '9987', '3963', '8246',
        '7233', '4998'],
       ['8924', '9392', '127', '7213', '4573', '6687', '2463', '1968',
        '815', '9576'],
       ['523', '6213', '3946', '3945', '5346', '4300', '4584', '4562',
        '4814', '5249'],
       ['6572', '3863', '7434', '8948', '5830', '5339', '3236', '41',
        '3918', '7762'],
       ['4922', '5145', '2721', '493', '178', '2582', '1071', '4751',
        '7579', '2467'],
       ['6703', '9299', '762', '6232', '4690', '7751', '1401', '1851',
        '2392', '960']],
      dtype='<U4')