[MRG+1] Added n_components parameter to LatentDirichletAllocation to replace … (scikit-learn#8922)
Attractadore authored and Sundrique committed Jun 14, 2017
1 parent 4a4f315 commit ecd0713
Showing 3 changed files with 95 additions and 76 deletions.
10 changes: 5 additions & 5 deletions examples/applications/plot_topics_extraction_with_nmf_lda.py
@@ -14,7 +14,7 @@
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.
- The default parameters (n_samples / n_features / n_topics) should make
+ The default parameters (n_samples / n_features / n_components) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
@@ -36,7 +36,7 @@

n_samples = 2000
n_features = 1000
- n_topics = 10
+ n_components = 10
n_top_words = 20


@@ -85,7 +85,7 @@ def print_top_words(model, feature_names, n_top_words):
"n_samples=%d and n_features=%d..."
% (n_samples, n_features))
t0 = time()
- nmf = NMF(n_components=n_topics, random_state=1,
+ nmf = NMF(n_components=n_components, random_state=1,
alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

@@ -98,7 +98,7 @@ def print_top_words(model, feature_names, n_top_words):
"tf-idf features, n_samples=%d and n_features=%d..."
% (n_samples, n_features))
t0 = time()
- nmf = NMF(n_components=n_topics, random_state=1, beta_loss='kullback-leibler',
+ nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler',
solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

@@ -109,7 +109,7 @@ def print_top_words(model, feature_names, n_top_words):
print("Fitting LDA models with tf features, "
"n_samples=%d and n_features=%d..."
% (n_samples, n_features))
- lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
+ lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
learning_method='online',
learning_offset=50.,
random_state=0)
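For context, a minimal sketch of how the updated example script now passes the topic count. The toy corpus and vectorizer settings below are illustrative stand-ins for the 20 newsgroups data used in the real example, not part of this commit:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Tiny stand-in corpus; the real example fetches the 20 newsgroups dataset.
docs = ["the cat sat on the mat",
        "dogs and cats make good pets",
        "the stock market rose sharply today",
        "investors bought shares of the company"]

tf = CountVectorizer(max_features=1000, stop_words='english').fit_transform(docs)

# After this commit the number of topics is passed as n_components.
lda = LatentDirichletAllocation(n_components=2, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
doc_topic = lda.fit_transform(tf)
print(doc_topic.shape)  # (4, 2): one topic distribution per document
```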
60 changes: 36 additions & 24 deletions sklearn/decomposition/online_lda.py
@@ -143,17 +143,17 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
Parameters
----------
- n_topics : int, optional (default=10)
+ n_components : int, optional (default=10)
Number of topics.
doc_topic_prior : float, optional (default=None)
Prior of document topic distribution `theta`. If the value is None,
- defaults to `1 / n_topics`.
+ defaults to `1 / n_components`.
In the literature, this is called `alpha`.
topic_word_prior : float, optional (default=None)
Prior of topic word distribution `beta`. If the value is None, defaults
- to `1 / n_topics`.
+ to `1 / n_components`.
In the literature, this is called `eta`.
learning_method : 'batch' | 'online', default='online'
@@ -224,10 +224,15 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
+ n_topics : int, optional (default=None)
+ This parameter has been renamed to n_components and will
+ be removed in version 0.21.
+ .. deprecated:: 0.19
Attributes
----------
- components_ : array, [n_topics, n_features]
+ components_ : array, [n_components, n_features]
Variational parameters for topic word distribution. Since the complete
conditional for topic word distribution is a Dirichlet,
``components_[i, j]`` can be viewed as pseudocount that represents the
@@ -255,13 +260,13 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
"""

- def __init__(self, n_topics=10, doc_topic_prior=None,
+ def __init__(self, n_components=10, doc_topic_prior=None,
topic_word_prior=None, learning_method=None,
learning_decay=.7, learning_offset=10., max_iter=10,
batch_size=128, evaluate_every=-1, total_samples=1e6,
perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100,
- n_jobs=1, verbose=0, random_state=None):
- self.n_topics = n_topics
+ n_jobs=1, verbose=0, random_state=None, n_topics=None):
+ self.n_components = n_components
self.doc_topic_prior = doc_topic_prior
self.topic_word_prior = topic_word_prior
self.learning_method = learning_method
@@ -277,13 +282,20 @@ def __init__(self, n_topics=10, doc_topic_prior=None,
self.n_jobs = n_jobs
self.verbose = verbose
self.random_state = random_state
+ self.n_topics = n_topics

def _check_params(self):
"""Check model parameters."""
+ if self.n_topics is not None:
+ self._n_components = self.n_topics
+ warnings.warn("n_topics has been renamed to n_components in version 0.19 "
+ "and will be removed in 0.21", DeprecationWarning)
+ else:
+ self._n_components = self.n_components

- if self.n_topics <= 0:
- raise ValueError("Invalid 'n_topics' parameter: %r"
- % self.n_topics)
+ if self._n_components <= 0:
+ raise ValueError("Invalid 'n_components' parameter: %r"
+ % self._n_components)

if self.total_samples <= 0:
raise ValueError("Invalid 'total_samples' parameter: %r"
@@ -305,20 +317,20 @@ def _init_latent_vars(self, n_features):
self.n_iter_ = 0

if self.doc_topic_prior is None:
- self.doc_topic_prior_ = 1. / self.n_topics
+ self.doc_topic_prior_ = 1. / self._n_components
else:
self.doc_topic_prior_ = self.doc_topic_prior

if self.topic_word_prior is None:
- self.topic_word_prior_ = 1. / self.n_topics
+ self.topic_word_prior_ = 1. / self._n_components
else:
self.topic_word_prior_ = self.topic_word_prior

init_gamma = 100.
init_var = 1. / init_gamma
# In the literature, this is called `lambda`
self.components_ = self.random_state_.gamma(
- init_gamma, init_var, (self.n_topics, n_features))
+ init_gamma, init_var, (self._n_components, n_features))

# In the literature, this is `exp(E[log(beta)])`
self.exp_dirichlet_component_ = np.exp(
@@ -409,7 +421,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None):
Returns
-------
- doc_topic_distr : array, shape=(n_samples, n_topics)
+ doc_topic_distr : array, shape=(n_samples, n_components)
Unnormalized document topic distribution.
"""

@@ -569,7 +581,7 @@ def _unnormalized_transform(self, X):
Returns
-------
- doc_topic_distr : shape=(n_samples, n_topics)
+ doc_topic_distr : shape=(n_samples, n_components)
Document topic distribution for X.
"""
if not hasattr(self, 'components_'):
@@ -603,7 +615,7 @@ def transform(self, X):
Returns
-------
- doc_topic_distr : shape=(n_samples, n_topics)
+ doc_topic_distr : shape=(n_samples, n_components)
Document topic distribution for X.
"""
doc_topic_distr = self._unnormalized_transform(X)
@@ -622,7 +634,7 @@ def _approx_bound(self, X, doc_topic_distr, sub_sampling):
X : array-like or sparse matrix, shape=(n_samples, n_features)
Document word matrix.
- doc_topic_distr : array, shape=(n_samples, n_topics)
+ doc_topic_distr : array, shape=(n_samples, n_components)
Document topic distribution. In the literature, this is called
gamma.
@@ -644,7 +656,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size):
return score

is_sparse_x = sp.issparse(X)
- n_samples, n_topics = doc_topic_distr.shape
+ n_samples, n_components = doc_topic_distr.shape
n_features = self.components_.shape[1]
score = 0

@@ -673,7 +685,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size):

# compute E[log p(theta | alpha) - log q(theta | gamma)]
score += _loglikelihood(doc_topic_prior, doc_topic_distr,
- dirichlet_doc_topic, self.n_topics)
+ dirichlet_doc_topic, self._n_components)

# Compensate for the subsampling of the population of documents
if sub_sampling:
@@ -717,7 +729,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
X : array-like or sparse matrix, [n_samples, n_features]
Document word matrix.
- doc_topic_distr : None or array, shape=(n_samples, n_topics)
+ doc_topic_distr : None or array, shape=(n_samples, n_components)
Document topic distribution.
If it is None, it will be generated by applying transform on X.
@@ -736,12 +748,12 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
if doc_topic_distr is None:
doc_topic_distr = self._unnormalized_transform(X)
else:
- n_samples, n_topics = doc_topic_distr.shape
+ n_samples, n_components = doc_topic_distr.shape
if n_samples != X.shape[0]:
raise ValueError("Number of samples in X and doc_topic_distr"
" do not match.")

- if n_topics != self.n_topics:
+ if n_components != self._n_components:
raise ValueError("Number of topics does not match.")

current_samples = X.shape[0]
@@ -769,7 +781,7 @@ def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False):
X : array-like or sparse matrix, [n_samples, n_features]
Document word matrix.
- doc_topic_distr : None or array, shape=(n_samples, n_topics)
+ doc_topic_distr : None or array, shape=(n_samples, n_components)
Document topic distribution.
This argument is deprecated and is currently being ignored.
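A short sketch of the backward-compatibility path added above, assuming scikit-learn 0.19: passing the old n_topics keyword still works but emits a DeprecationWarning, while n_components is the supported spelling going forward. The toy count matrix is made up for illustration.

```python
import warnings
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# Toy document-word count matrix (20 documents, 30 terms).
X = np.random.RandomState(0).randint(0, 5, size=(20, 30))

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Deprecated spelling: still accepted, warns, and is slated for removal in 0.21.
    lda_old = LatentDirichletAllocation(n_topics=7, learning_method='batch',
                                        random_state=0).fit(X)

assert any("n_topics has been renamed" in str(w.message) for w in caught)
print(lda_old.components_.shape)  # (7, 30): the deprecated value is honored

# Spelling introduced by this commit.
lda_new = LatentDirichletAllocation(n_components=7, learning_method='batch',
                                    random_state=0).fit(X)
print(lda_new.components_.shape)  # (7, 30)
```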
