From ecd071383a6d544a948294692d6eadaae6a2dfa0 Mon Sep 17 00:00:00 2001 From: Attractadore Date: Fri, 26 May 2017 09:07:10 +0300 Subject: [PATCH] =?UTF-8?q?[MRG+1]=20Added=20n=5Fcomponents=20parameter=20?= =?UTF-8?q?to=20LatentDirichletAllocation=20to=20replace=20=E2=80=A6=20(#8?= =?UTF-8?q?922)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [MRG+2] Added n_components parameter to LatentDirichletAllocation to replace … --- .../plot_topics_extraction_with_nmf_lda.py | 10 +- sklearn/decomposition/online_lda.py | 60 ++++++----- .../decomposition/tests/test_online_lda.py | 101 ++++++++++-------- 3 files changed, 95 insertions(+), 76 deletions(-) diff --git a/examples/applications/plot_topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py index e1a6f0bdbacd9..04ab2809f36b1 100644 --- a/examples/applications/plot_topics_extraction_with_nmf_lda.py +++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py @@ -14,7 +14,7 @@ functions: the Frobenius norm, and the generalized Kullback-Leibler divergence. The latter is equivalent to Probabilistic Latent Semantic Indexing. -The default parameters (n_samples / n_features / n_topics) should make +The default parameters (n_samples / n_features / n_components) should make the example runnable in a couple of tens of seconds. You can try to increase the dimensions of the problem, but be aware that the time complexity is polynomial in NMF. In LDA, the time complexity is @@ -36,7 +36,7 @@ n_samples = 2000 n_features = 1000 -n_topics = 10 +n_components = 10 n_top_words = 20 @@ -85,7 +85,7 @@ def print_top_words(model, feature_names, n_top_words): "n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() -nmf = NMF(n_components=n_topics, random_state=1, +nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) @@ -98,7 +98,7 @@ def print_top_words(model, feature_names, n_top_words): "tf-idf features, n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() -nmf = NMF(n_components=n_topics, random_state=1, beta_loss='kullback-leibler', +nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) @@ -109,7 +109,7 @@ def print_top_words(model, feature_names, n_top_words): print("Fitting LDA models with tf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) -lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, +lda = LatentDirichletAllocation(n_components=n_components, max_iter=5, learning_method='online', learning_offset=50., random_state=0) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index 4717bd5af80a3..657ce3ece7e3f 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -143,17 +143,17 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): Parameters ---------- - n_topics : int, optional (default=10) + n_components : int, optional (default=10) Number of topics. doc_topic_prior : float, optional (default=None) Prior of document topic distribution `theta`. If the value is None, - defaults to `1 / n_topics`. + defaults to `1 / n_components`. In the literature, this is called `alpha`. topic_word_prior : float, optional (default=None) Prior of topic word distribution `beta`. 
If the value is None, defaults - to `1 / n_topics`. + to `1 / n_components`. In the literature, this is called `eta`. learning_method : 'batch' | 'online', default='online' @@ -224,10 +224,15 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - + + n_topics : int, optional (default=None) + This parameter has been renamed to n_components and will + be removed in version 0.21. + .. deprecated:: 0.19 + Attributes ---------- - components_ : array, [n_topics, n_features] + components_ : array, [n_components, n_features] Variational parameters for topic word distribution. Since the complete conditional for topic word distribution is a Dirichlet, ``components_[i, j]`` can be viewed as pseudocount that represents the @@ -255,13 +260,13 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin): """ - def __init__(self, n_topics=10, doc_topic_prior=None, + def __init__(self, n_components=10, doc_topic_prior=None, topic_word_prior=None, learning_method=None, learning_decay=.7, learning_offset=10., max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1e6, perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100, - n_jobs=1, verbose=0, random_state=None): - self.n_topics = n_topics + n_jobs=1, verbose=0, random_state=None, n_topics=None): + self.n_components = n_components self.doc_topic_prior = doc_topic_prior self.topic_word_prior = topic_word_prior self.learning_method = learning_method @@ -277,13 +282,20 @@ def __init__(self, n_topics=10, doc_topic_prior=None, self.n_jobs = n_jobs self.verbose = verbose self.random_state = random_state + self.n_topics = n_topics def _check_params(self): """Check model parameters.""" + if self.n_topics is not None: + self._n_components = self.n_topics + warnings.warn("n_topics has been renamed to n_components in version 0.19 " + "and will be removed in 0.21", DeprecationWarning) + else: + self._n_components = self.n_components - if self.n_topics <= 0: - raise ValueError("Invalid 'n_topics' parameter: %r" - % self.n_topics) + if self._n_components <= 0: + raise ValueError("Invalid 'n_components' parameter: %r" + % self._n_components) if self.total_samples <= 0: raise ValueError("Invalid 'total_samples' parameter: %r" @@ -305,12 +317,12 @@ def _init_latent_vars(self, n_features): self.n_iter_ = 0 if self.doc_topic_prior is None: - self.doc_topic_prior_ = 1. / self.n_topics + self.doc_topic_prior_ = 1. / self._n_components else: self.doc_topic_prior_ = self.doc_topic_prior if self.topic_word_prior is None: - self.topic_word_prior_ = 1. / self.n_topics + self.topic_word_prior_ = 1. / self._n_components else: self.topic_word_prior_ = self.topic_word_prior @@ -318,7 +330,7 @@ def _init_latent_vars(self, n_features): init_var = 1. / init_gamma # In the literature, this is called `lambda` self.components_ = self.random_state_.gamma( - init_gamma, init_var, (self.n_topics, n_features)) + init_gamma, init_var, (self._n_components, n_features)) # In the literature, this is `exp(E[log(beta)])` self.exp_dirichlet_component_ = np.exp( @@ -409,7 +421,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): Returns ------- - doc_topic_distr : array, shape=(n_samples, n_topics) + doc_topic_distr : array, shape=(n_samples, n_components) Unnormalized document topic distribution. 
""" @@ -569,7 +581,7 @@ def _unnormalized_transform(self, X): Returns ------- - doc_topic_distr : shape=(n_samples, n_topics) + doc_topic_distr : shape=(n_samples, n_components) Document topic distribution for X. """ if not hasattr(self, 'components_'): @@ -603,7 +615,7 @@ def transform(self, X): Returns ------- - doc_topic_distr : shape=(n_samples, n_topics) + doc_topic_distr : shape=(n_samples, n_components) Document topic distribution for X. """ doc_topic_distr = self._unnormalized_transform(X) @@ -622,7 +634,7 @@ def _approx_bound(self, X, doc_topic_distr, sub_sampling): X : array-like or sparse matrix, shape=(n_samples, n_features) Document word matrix. - doc_topic_distr : array, shape=(n_samples, n_topics) + doc_topic_distr : array, shape=(n_samples, n_components) Document topic distribution. In the literature, this is called gamma. @@ -644,7 +656,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size): return score is_sparse_x = sp.issparse(X) - n_samples, n_topics = doc_topic_distr.shape + n_samples, n_components = doc_topic_distr.shape n_features = self.components_.shape[1] score = 0 @@ -673,7 +685,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size): # compute E[log p(theta | alpha) - log q(theta | gamma)] score += _loglikelihood(doc_topic_prior, doc_topic_distr, - dirichlet_doc_topic, self.n_topics) + dirichlet_doc_topic, self._n_components) # Compensate for the subsampling of the population of documents if sub_sampling: @@ -717,7 +729,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, X : array-like or sparse matrix, [n_samples, n_features] Document word matrix. - doc_topic_distr : None or array, shape=(n_samples, n_topics) + doc_topic_distr : None or array, shape=(n_samples, n_components) Document topic distribution. If it is None, it will be generated by applying transform on X. @@ -736,12 +748,12 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, if doc_topic_distr is None: doc_topic_distr = self._unnormalized_transform(X) else: - n_samples, n_topics = doc_topic_distr.shape + n_samples, n_components = doc_topic_distr.shape if n_samples != X.shape[0]: raise ValueError("Number of samples in X and doc_topic_distr" " do not match.") - if n_topics != self.n_topics: + if n_components != self._n_components: raise ValueError("Number of topics does not match.") current_samples = X.shape[0] @@ -769,7 +781,7 @@ def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False): X : array-like or sparse matrix, [n_samples, n_features] Document word matrix. - doc_topic_distr : None or array, shape=(n_samples, n_topics) + doc_topic_distr : None or array, shape=(n_samples, n_components) Document topic distribution. This argument is deprecated and is currently being ignored. diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index c3a221fe4800a..597681dcf8118 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -2,6 +2,7 @@ from scipy.linalg import block_diag from scipy.sparse import csr_matrix from scipy.special import psi +import warnings from sklearn.decomposition import LatentDirichletAllocation from sklearn.decomposition._online_lda import (_dirichlet_expectation_1d, @@ -23,22 +24,22 @@ def _build_sparse_mtx(): # Create 3 topics and each topic has 3 distinct words. # (Each word only belongs to a single topic.) 
- n_topics = 3 - block = n_topics * np.ones((3, 3)) - blocks = [block] * n_topics + n_components = 3 + block = n_components * np.ones((3, 3)) + blocks = [block] * n_components X = block_diag(*blocks) X = csr_matrix(X) - return (n_topics, X) + return (n_components, X) def test_lda_default_prior_params(): # default prior parameter should be `1 / topics` # and verbose params should not affect result - n_topics, X = _build_sparse_mtx() - prior = 1. / n_topics - lda_1 = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=prior, + n_components, X = _build_sparse_mtx() + prior = 1. / n_components + lda_1 = LatentDirichletAllocation(n_components=n_components, doc_topic_prior=prior, topic_word_prior=prior, random_state=0) - lda_2 = LatentDirichletAllocation(n_topics=n_topics, random_state=0) + lda_2 = LatentDirichletAllocation(n_components=n_components, random_state=0) topic_distr_1 = lda_1.fit_transform(X) topic_distr_2 = lda_2.fit_transform(X) @@ -48,8 +49,8 @@ def test_lda_default_prior_params(): def test_lda_fit_batch(): # Test LDA batch learning_offset (`fit` method with 'batch' learning) rng = np.random.RandomState(0) - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, evaluate_every=1, + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, evaluate_every=1, learning_method='batch', random_state=rng) lda.fit(X) @@ -63,8 +64,8 @@ def test_lda_fit_batch(): def test_lda_fit_online(): # Test LDA online learning (`fit` method with 'online' learning) rng = np.random.RandomState(0) - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10., + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, learning_offset=10., evaluate_every=1, learning_method='online', random_state=rng) lda.fit(X) @@ -80,8 +81,8 @@ def test_lda_partial_fit(): # Test LDA online learning (`partial_fit` method) # (same as test_lda_batch) rng = np.random.RandomState(0) - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10., + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, learning_offset=10., total_samples=100, random_state=rng) for i in xrange(3): lda.partial_fit(X) @@ -95,8 +96,8 @@ def test_lda_partial_fit(): def test_lda_dense_input(): # Test LDA with dense input. 
rng = np.random.RandomState(0) - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='batch', + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, learning_method='batch', random_state=rng) lda.fit(X.toarray()) @@ -112,8 +113,8 @@ def test_lda_transform(): # Transform result cannot be negative and should be normalized rng = np.random.RandomState(0) X = rng.randint(5, size=(20, 10)) - n_topics = 3 - lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng) + n_components = 3 + lda = LatentDirichletAllocation(n_components=n_components, random_state=rng) X_trans = lda.fit_transform(X) assert_true((X_trans > 0.0).any()) assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0])) @@ -125,7 +126,7 @@ def test_lda_fit_transform(): for method in ('online', 'batch'): rng = np.random.RandomState(0) X = rng.randint(10, size=(50, 20)) - lda = LatentDirichletAllocation(n_topics=5, learning_method=method, + lda = LatentDirichletAllocation(n_components=5, learning_method=method, random_state=rng) X_fit = lda.fit_transform(X) X_trans = lda.transform(X) @@ -135,11 +136,11 @@ def test_lda_fit_transform(): def test_lda_partial_fit_dim_mismatch(): # test `n_features` mismatch in `partial_fit` rng = np.random.RandomState(0) - n_topics = rng.randint(3, 6) + n_components = rng.randint(3, 6) n_col = rng.randint(6, 10) X_1 = np.random.randint(4, size=(10, n_col)) X_2 = np.random.randint(4, size=(10, n_col + 1)) - lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5., + lda = LatentDirichletAllocation(n_components=n_components, learning_offset=5., total_samples=20, random_state=rng) lda.partial_fit(X_1) assert_raises_regexp(ValueError, r"^The provided data has", @@ -151,7 +152,7 @@ def test_invalid_params(): X = np.ones((5, 10)) invalid_models = ( - ('n_topics', LatentDirichletAllocation(n_topics=0)), + ('n_components', LatentDirichletAllocation(n_components=0)), ('learning_method', LatentDirichletAllocation(learning_method='unknown')), ('total_samples', LatentDirichletAllocation(total_samples=0)), @@ -186,8 +187,8 @@ def test_lda_transform_mismatch(): X = rng.randint(4, size=(20, 10)) X_2 = rng.randint(4, size=(10, 8)) - n_topics = rng.randint(3, 6) - lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng) + n_components = rng.randint(3, 6) + lda = LatentDirichletAllocation(n_components=n_components, random_state=rng) lda.partial_fit(X) assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2) @@ -195,11 +196,11 @@ def test_lda_transform_mismatch(): @if_safe_multiprocessing_with_blas def test_lda_multi_jobs(): - n_topics, X = _build_sparse_mtx() + n_components, X = _build_sparse_mtx() # Test LDA batch training with multi CPU for method in ('online', 'batch'): rng = np.random.RandomState(0) - lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=2, + lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2, learning_method=method, evaluate_every=1, random_state=rng) @@ -215,8 +216,8 @@ def test_lda_multi_jobs(): def test_lda_partial_fit_multi_jobs(): # Test LDA online training with multi CPU rng = np.random.RandomState(0) - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=2, + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2, learning_offset=5., total_samples=30, random_state=rng) for i in range(2): @@ -231,31 +232,31 @@ 
def test_lda_partial_fit_multi_jobs(): def test_lda_preplexity_mismatch(): # test dimension mismatch in `perplexity` method rng = np.random.RandomState(0) - n_topics = rng.randint(3, 6) + n_components = rng.randint(3, 6) n_samples = rng.randint(6, 10) X = np.random.randint(4, size=(n_samples, 10)) - lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5., + lda = LatentDirichletAllocation(n_components=n_components, learning_offset=5., total_samples=20, random_state=rng) lda.fit(X) # invalid samples - invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics)) + invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components)) assert_raises_regexp(ValueError, r'Number of samples', lda._perplexity_precomp_distr, X, invalid_n_samples) # invalid topic number - invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1)) + invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1)) assert_raises_regexp(ValueError, r'Number of topics', - lda._perplexity_precomp_distr, X, invalid_n_topics) + lda._perplexity_precomp_distr, X, invalid_n_components) def test_lda_perplexity(): # Test LDA perplexity for batch training # perplexity should be lower after each iteration - n_topics, X = _build_sparse_mtx() + n_components, X = _build_sparse_mtx() for method in ('online', 'batch'): - lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, + lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method=method, total_samples=100, random_state=0) - lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, + lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10, learning_method=method, total_samples=100, random_state=0) lda_1.fit(X) @@ -273,12 +274,12 @@ def test_lda_perplexity(): def test_lda_score(): # Test LDA score for batch training # score should be higher after each iteration - n_topics, X = _build_sparse_mtx() + n_components, X = _build_sparse_mtx() for method in ('online', 'batch'): - lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, + lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method=method, total_samples=100, random_state=0) - lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, + lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10, learning_method=method, total_samples=100, random_state=0) lda_1.fit_transform(X) @@ -292,8 +293,8 @@ def test_lda_score(): def test_perplexity_input_format(): # Test LDA perplexity for sparse and dense input # score should be the same for both dense and sparse input - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method='batch', total_samples=100, random_state=0) lda.fit(X) @@ -304,8 +305,8 @@ def test_perplexity_input_format(): def test_lda_score_perplexity(): # Test the relationship between LDA score and perplexity - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, max_iter=10, random_state=0) lda.fit(X) perplexity_1 = lda.perplexity(X, sub_sampling=False) @@ -318,8 +319,8 @@ def test_lda_score_perplexity(): def test_lda_fit_perplexity(): # Test that the perplexity computed during fit is consistent with what is # returned by the perplexity method - 
n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method='batch', random_state=0, evaluate_every=1) lda.fit(X) @@ -336,8 +337,8 @@ def test_lda_fit_perplexity(): def test_doc_topic_distr_deprecation(): # Test that the appropriate warning message is displayed when a user # attempts to pass the doc_topic_distr argument to the perplexity method - n_topics, X = _build_sparse_mtx() - lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method='batch', total_samples=100, random_state=0) distr1 = lda.fit_transform(X) @@ -367,3 +368,9 @@ def test_dirichlet_expectation(): assert_allclose(_dirichlet_expectation_2d(x), psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]), rtol=1e-11, atol=3e-9) + + +def test_lda_n_topics_deprecation(): + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_topics=10, learning_method='batch') + assert_warns(DeprecationWarning, lda.fit, X) \ No newline at end of file
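

For readers applying this patch, here is a minimal usage sketch (not part of the diff above) of the intended behaviour after the rename: `n_components` is the supported keyword, while the legacy `n_topics` keyword still works but causes `fit` to emit a `DeprecationWarning` (removal scheduled for 0.21). The sketch assumes a scikit-learn tree with this patch applied; the toy document-term matrix mirrors the block-diagonal fixture used in the tests, and the exact topic assignments depend on the random state.

import warnings

import numpy as np
from scipy.linalg import block_diag
from scipy.sparse import csr_matrix
from sklearn.decomposition import LatentDirichletAllocation

# Same 3-topic block-diagonal corpus as the test fixture:
# each of the 3 topics owns 3 distinct words.
n_components = 3
X = csr_matrix(block_diag(*[n_components * np.ones((3, 3))] * n_components))

# Preferred spelling after this patch.
lda = LatentDirichletAllocation(n_components=n_components,
                                learning_method='batch',
                                random_state=0)
doc_topic = lda.fit_transform(X)
print(doc_topic.shape)  # (9, 3): one row per document, one column per topic

# Legacy spelling still works, but fit() warns that n_topics was renamed
# to n_components and will be removed in 0.21.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = LatentDirichletAllocation(n_topics=n_components,
                                       learning_method='batch',
                                       random_state=0)
    legacy.fit(X)
print(any(issubclass(w.category, DeprecationWarning) for w in caught))  # True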