[MRG+1] Added n_components parameter to LatentDirichletAllocation to replace … (scikit-learn#8922)
Attractadore authored and Sundrique committed Jun 14, 2017
1 parent 4a4f315 commit ecd0713
Showing 3 changed files with 95 additions and 76 deletions.
10 changes: 5 additions & 5 deletions examples/applications/plot_topics_extraction_with_nmf_lda.py
@@ -14,7 +14,7 @@
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.
- The default parameters (n_samples / n_features / n_topics) should make
+ The default parameters (n_samples / n_features / n_components) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
@@ -36,7 +36,7 @@

n_samples = 2000
n_features = 1000
- n_topics = 10
+ n_components = 10
n_top_words = 20


@@ -85,7 +85,7 @@ def print_top_words(model, feature_names, n_top_words):
"n_samples=%d and n_features=%d..."
% (n_samples, n_features))
t0 = time()
- nmf = NMF(n_components=n_topics, random_state=1,
+ nmf = NMF(n_components=n_components, random_state=1,
alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

@@ -98,7 +98,7 @@ def print_top_words(model, feature_names, n_top_words):
"tf-idf features, n_samples=%d and n_features=%d..."
% (n_samples, n_features))
t0 = time()
- nmf = NMF(n_components=n_topics, random_state=1, beta_loss='kullback-leibler',
+ nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler',
solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

@@ -109,7 +109,7 @@ def print_top_words(model, feature_names, n_top_words):
print("Fitting LDA models with tf features, "
"n_samples=%d and n_features=%d..."
% (n_samples, n_features))
- lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
+ lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
learning_method='online',
learning_offset=50.,
random_state=0)
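For context, a minimal sketch of how the updated example script now passes the topic count. The toy corpus and vectorizer settings below are illustrative stand-ins for the 20 newsgroups data used in the real example, not part of this commit:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Tiny stand-in corpus; the real example fetches the 20 newsgroups dataset.
docs = ["the cat sat on the mat",
        "dogs and cats make good pets",
        "the stock market rose sharply today",
        "investors bought shares of the company"]

tf = CountVectorizer(max_features=1000, stop_words='english').fit_transform(docs)

# After this commit the number of topics is passed as n_components.
lda = LatentDirichletAllocation(n_components=2, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
doc_topic = lda.fit_transform(tf)
print(doc_topic.shape)  # (4, 2): one topic distribution per document
```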
60 changes: 36 additions & 24 deletions sklearn/decomposition/online_lda.py
@@ -143,17 +143,17 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
Parameters
----------
- n_topics : int, optional (default=10)
+ n_components : int, optional (default=10)
Number of topics.
doc_topic_prior : float, optional (default=None)
Prior of document topic distribution `theta`. If the value is None,
- defaults to `1 / n_topics`.
+ defaults to `1 / n_components`.
In the literature, this is called `alpha`.
topic_word_prior : float, optional (default=None)
Prior of topic word distribution `beta`. If the value is None, defaults
- to `1 / n_topics`.
+ to `1 / n_components`.
In the literature, this is called `eta`.
learning_method : 'batch' | 'online', default='online'
@@ -224,10 +224,15 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
+ n_topics : int, optional (default=None)
+ This parameter has been renamed to n_components and will
+ be removed in version 0.21.
+ .. deprecated:: 0.19
Attributes
----------
- components_ : array, [n_topics, n_features]
+ components_ : array, [n_components, n_features]
Variational parameters for topic word distribution. Since the complete
conditional for topic word distribution is a Dirichlet,
``components_[i, j]`` can be viewed as pseudocount that represents the
@@ -255,13 +260,13 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
"""

- def __init__(self, n_topics=10, doc_topic_prior=None,
+ def __init__(self, n_components=10, doc_topic_prior=None,
topic_word_prior=None, learning_method=None,
learning_decay=.7, learning_offset=10., max_iter=10,
batch_size=128, evaluate_every=-1, total_samples=1e6,
perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100,
- n_jobs=1, verbose=0, random_state=None):
- self.n_topics = n_topics
+ n_jobs=1, verbose=0, random_state=None, n_topics=None):
+ self.n_components = n_components
self.doc_topic_prior = doc_topic_prior
self.topic_word_prior = topic_word_prior
self.learning_method = learning_method
@@ -277,13 +282,20 @@ def __init__(self, n_topics=10, doc_topic_prior=None,
self.n_jobs = n_jobs
self.verbose = verbose
self.random_state = random_state
+ self.n_topics = n_topics

def _check_params(self):
"""Check model parameters."""
+ if self.n_topics is not None:
+ self._n_components = self.n_topics
+ warnings.warn("n_topics has been renamed to n_components in version 0.19 "
+ "and will be removed in 0.21", DeprecationWarning)
+ else:
+ self._n_components = self.n_components

- if self.n_topics <= 0:
- raise ValueError("Invalid 'n_topics' parameter: %r"
- % self.n_topics)
+ if self._n_components <= 0:
+ raise ValueError("Invalid 'n_components' parameter: %r"
+ % self._n_components)

if self.total_samples <= 0:
raise ValueError("Invalid 'total_samples' parameter: %r"
@@ -305,20 +317,20 @@ def _init_latent_vars(self, n_features):
self.n_iter_ = 0

if self.doc_topic_prior is None:
- self.doc_topic_prior_ = 1. / self.n_topics
+ self.doc_topic_prior_ = 1. / self._n_components
else:
self.doc_topic_prior_ = self.doc_topic_prior

if self.topic_word_prior is None:
- self.topic_word_prior_ = 1. / self.n_topics
+ self.topic_word_prior_ = 1. / self._n_components
else:
self.topic_word_prior_ = self.topic_word_prior

init_gamma = 100.
init_var = 1. / init_gamma
# In the literature, this is called `lambda`
self.components_ = self.random_state_.gamma(
- init_gamma, init_var, (self.n_topics, n_features))
+ init_gamma, init_var, (self._n_components, n_features))

# In the literature, this is `exp(E[log(beta)])`
self.exp_dirichlet_component_ = np.exp(
@@ -409,7 +421,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None):
Returns
-------
- doc_topic_distr : array, shape=(n_samples, n_topics)
+ doc_topic_distr : array, shape=(n_samples, n_components)
Unnormalized document topic distribution.
"""

@@ -569,7 +581,7 @@ def _unnormalized_transform(self, X):
Returns
-------
- doc_topic_distr : shape=(n_samples, n_topics)
+ doc_topic_distr : shape=(n_samples, n_components)
Document topic distribution for X.
"""
if not hasattr(self, 'components_'):
@@ -603,7 +615,7 @@ def transform(self, X):
Returns
-------
- doc_topic_distr : shape=(n_samples, n_topics)
+ doc_topic_distr : shape=(n_samples, n_components)
Document topic distribution for X.
"""
doc_topic_distr = self._unnormalized_transform(X)
@@ -622,7 +634,7 @@ def _approx_bound(self, X, doc_topic_distr, sub_sampling):
X : array-like or sparse matrix, shape=(n_samples, n_features)
Document word matrix.
- doc_topic_distr : array, shape=(n_samples, n_topics)
+ doc_topic_distr : array, shape=(n_samples, n_components)
Document topic distribution. In the literature, this is called
gamma.
@@ -644,7 +656,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size):
return score

is_sparse_x = sp.issparse(X)
- n_samples, n_topics = doc_topic_distr.shape
+ n_samples, n_components = doc_topic_distr.shape
n_features = self.components_.shape[1]
score = 0

@@ -673,7 +685,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size):

# compute E[log p(theta | alpha) - log q(theta | gamma)]
score += _loglikelihood(doc_topic_prior, doc_topic_distr,
- dirichlet_doc_topic, self.n_topics)
+ dirichlet_doc_topic, self._n_components)

# Compensate for the subsampling of the population of documents
if sub_sampling:
@@ -717,7 +729,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
X : array-like or sparse matrix, [n_samples, n_features]
Document word matrix.
- doc_topic_distr : None or array, shape=(n_samples, n_topics)
+ doc_topic_distr : None or array, shape=(n_samples, n_components)
Document topic distribution.
If it is None, it will be generated by applying transform on X.
@@ -736,12 +748,12 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
if doc_topic_distr is None:
doc_topic_distr = self._unnormalized_transform(X)
else:
- n_samples, n_topics = doc_topic_distr.shape
+ n_samples, n_components = doc_topic_distr.shape
if n_samples != X.shape[0]:
raise ValueError("Number of samples in X and doc_topic_distr"
" do not match.")

- if n_topics != self.n_topics:
+ if n_components != self._n_components:
raise ValueError("Number of topics does not match.")

current_samples = X.shape[0]
@@ -769,7 +781,7 @@ def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False):
X : array-like or sparse matrix, [n_samples, n_features]
Document word matrix.
- doc_topic_distr : None or array, shape=(n_samples, n_topics)
+ doc_topic_distr : None or array, shape=(n_samples, n_components)
Document topic distribution.
This argument is deprecated and is currently being ignored.
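A short sketch of the backward-compatibility path added above, assuming scikit-learn 0.19: passing the old n_topics keyword still works but emits a DeprecationWarning, while n_components is the supported spelling going forward. The toy count matrix is made up for illustration.

```python
import warnings
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# Toy document-word count matrix (20 documents, 30 terms).
X = np.random.RandomState(0).randint(0, 5, size=(20, 30))

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Deprecated spelling: still accepted, warns, and is slated for removal in 0.21.
    lda_old = LatentDirichletAllocation(n_topics=7, learning_method='batch',
                                        random_state=0).fit(X)

assert any("n_topics has been renamed" in str(w.message) for w in caught)
print(lda_old.components_.shape)  # (7, 30): the deprecated value is honored

# Spelling introduced by this commit.
lda_new = LatentDirichletAllocation(n_components=7, learning_method='batch',
                                    random_state=0).fit(X)
print(lda_new.components_.shape)  # (7, 30)
```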
