In [1]:
import string
import logging
import itertools
from gensim.corpora.dictionary import Dictionary
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
import numpy as np
from scipy.special import psi, polygamma, gammaln

from lda.utils import dict_from_corpus, get_random_state

In [2]:
logger = logging.getLogger(__name__)

This example is taken from gensim, it has been modified only for learning purposes (to have everything in one page)

## Data

In [3]:
cats = ['comp.windows.x', 'talk.religion.misc']
sw_nltk = stopwords.words('english')
texts=fetch_20newsgroups(subset='train', categories=cats).data

def remove_punct(text):
    text=text.lower()
    text=text.translate(str.maketrans('', '', string.punctuation))
    return text

texts_toks=[remove_punct(t).split() for t in texts]
texts_toks_nostop=[[tok for tok in text_toks if tok not in sw_nltk] for text_toks in texts_toks]

common_dictionary = Dictionary(texts_toks_nostop)
common_corpus = [common_dictionary.doc2bow(text) for text in texts_toks_nostop]


## State class

In [4]:
class LdaState():
    """Encapsulate information for distributed computation of :class:`~gensim.models.ldamodel.LdaModel` objects.

    Objects of this class are sent over the network, so try to keep them lean to
    reduce traffic.

    """

    def __init__(self, eta, shape, dtype=np.float32):
        """

        Parameters
        ----------
        eta : numpy.ndarray
            The prior probabilities assigned to each term.
        shape : tuple of (int, int)
            Shape of the sufficient statistics: (number of topics to be found, number of terms in the vocabulary).
        dtype : type
            Overrides the numpy array default types.

        """
        self.eta = eta.astype(dtype, copy=False)
        self.sstats = np.zeros(shape, dtype=dtype)
        self.numdocs = 0
        self.dtype = dtype

    def reset(self):
        """Prepare the state for a new EM iteration (reset sufficient stats)."""
        self.sstats[:] = 0.0
        self.numdocs = 0

    def merge(self, other):
        """Merge the result of an E step from one node with that of another node (summing up sufficient statistics).

        The merging is trivial and after merging all cluster nodes, we have the
        exact same result as if the computation was run on a single node (no
        approximation).

        Parameters
        ----------
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.

        """
        assert other is not None
        self.sstats += other.sstats
        self.numdocs += other.numdocs

    def blend(self, rhot, other, targetsize=None):
        """Merge the current state with another one using a weighted average for the sufficient statistics.

        The number of documents is stretched in both state objects, so that they are of comparable magnitude.
        This procedure corresponds to the stochastic gradient update from
        `'Online Learning for LDA' by Hoffman et al.`_, see equations (5) and (9).

        Parameters
        ----------
        rhot : float
            Weight of the `other` state in the computed average. A value of 0.0 means that `other`
            is completely ignored. A value of 1.0 means `self` is completely ignored.
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.
        targetsize : int, optional
            The number of documents to stretch both states to.

        """
        assert other is not None
        if targetsize is None:
            targetsize = self.numdocs

        # stretch the current model's expected n*phi counts to target size
        if self.numdocs == 0 or targetsize == self.numdocs:
            scale = 1.0
        else:
            scale = 1.0 * targetsize / self.numdocs
        self.sstats *= (1.0 - rhot) * scale

        # stretch the incoming n*phi counts to target size
        if other.numdocs == 0 or targetsize == other.numdocs:
            scale = 1.0
        else:
            logger.info("merging changes from %i documents into a model of %i documents", other.numdocs, targetsize)
            scale = 1.0 * targetsize / other.numdocs
        self.sstats += rhot * scale * other.sstats

        self.numdocs = targetsize

    def blend2(self, rhot, other, targetsize=None):
        """Merge the current state with another one using a weighted sum for the sufficient statistics.

        In contrast to :meth:`~gensim.models.ldamodel.LdaState.blend`, the sufficient statistics are not scaled
        prior to aggregation.

        Parameters
        ----------
        rhot : float
            Unused.
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.
        targetsize : int, optional
            The number of documents to stretch both states to.

        """
        assert other is not None
        if targetsize is None:
            targetsize = self.numdocs

        # merge the two matrices by summing
        self.sstats += other.sstats
        self.numdocs = targetsize

    def get_lambda(self):
        """Get the parameters of the posterior over the topics, also referred to as "the topics".

        Returns
        -------
        numpy.ndarray
            Parameters of the posterior probability over topics.

        """
        return self.eta + self.sstats

    def get_Elogbeta(self):
        """Get the log (posterior) probabilities for each topic.

        Returns
        -------
        numpy.ndarray
            Posterior probabilities for each topic.
        """
        return dirichlet_expectation(self.get_lambda())

    @classmethod
    def load(cls, fname, *args, **kwargs):
        """Load a previously stored state from disk.

        Overrides :class:`~gensim.utils.SaveLoad.load` by enforcing the `dtype` parameter
        to ensure backwards compatibility.

        Parameters
        ----------
        fname : str
            Path to file that contains the needed object.
        args : object
            Positional parameters to be propagated to class:`~gensim.utils.SaveLoad.load`
        kwargs : object
            Key-word parameters to be propagated to class:`~gensim.utils.SaveLoad.load`

        Returns
        -------
        :class:`~gensim.models.ldamodel.LdaState`
            The state loaded from the given file.

        """
        result = super(LdaState, cls).load(fname, *args, **kwargs)

        # dtype could be absent in old models
        if not hasattr(result, 'dtype'):
            result.dtype = np.float64  # float64 was implicitly used before (because it's the default in numpy)
            logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)

        return result


## Helpers

In [5]:
def dirichlet_expectation(alpha):
    """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.

    Returns
    -------
    numpy.ndarray
        Log of expected values, dimension same as `alpha.ndim`.

    """
    if len(alpha.shape) == 1:
        result = psi(alpha) - psi(np.sum(alpha))
    else:
        result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
    return result.astype(alpha.dtype, copy=False)  # keep the same precision as input


In [6]:
def chunkize_serial(iterable, chunksize, as_numpy=False, dtype=np.float32):
    """Yield elements from `iterable` in "chunksize"-ed groups.

    The last returned element may be smaller if the length of collection is not divisible by `chunksize`.

    Parameters
    ----------
    iterable : iterable of object
        An iterable.
    chunksize : int
        Split iterable into chunks of this size.
    as_numpy : bool, optional
        Yield chunks as `np.ndarray` instead of lists.

    Yields
    ------
    list OR np.ndarray
        "chunksize"-ed chunks of elements from `iterable`.

    Examples
    --------
    .. sourcecode:: pycon

        >>> print(list(grouper(range(10), 3)))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    """
    it = iter(iterable)
    while True:
        if as_numpy:
            # convert each document to a 2d numpy array (~6x faster when transmitting
            # chunk data over the wire, in Pyro)
            wrapped_chunk = [[np.array(doc, dtype=dtype) for doc in itertools.islice(it, int(chunksize))]]
        else:
            wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
        if not wrapped_chunk[0]:
            break
        # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference
        yield wrapped_chunk.pop()
        
grouper=chunkize_serial

In [7]:
def mean_absolute_difference(a, b):
    """Mean absolute difference between two arrays.

    Parameters
    ----------
    a : numpy.ndarray
        Input 1d array.
    b : numpy.ndarray
        Input 1d array.

    Returns
    -------
    float
        mean(abs(a - b)).

    """
    return np.mean(np.abs(a - b))

## Init model params

In [8]:
id2word = dict_from_corpus(common_corpus)
num_terms = len(id2word)
num_topics = 3
chunksize = 2000
decay = 0.5
offset = 1.0
minimum_probability = 0.01
num_updates = 0

passes = 10
update_every = 1
eval_every = 10
minimum_phi_value = 0.01
per_word_topics = False

dtype = np.finfo(np.float32).dtype
iterations = 50
gamma_threshold = 0.001

random_state = get_random_state(None)

#### init alpha and eta

In [9]:
#simple uniform priors
alpha =  init_prior = np.fromiter(
            (1.0 / num_topics for i in range(num_topics)),
            dtype=dtype, count=num_topics,
        )
eta =  init_prior = np.fromiter(
            (1.0 / num_topics for i in range(num_terms)),
            dtype=dtype, count=num_terms,
        )

In [10]:
alpha

array([0.33333334, 0.33333334, 0.33333334], dtype=float32)

In [11]:
eta

array([0.33333334, 0.33333334, 0.33333334, ..., 0.33333334, 0.33333334,
       0.33333334], dtype=float32)

In [12]:
eta.shape

(24296,)

In [13]:
# Initialize the variational distribution q(beta|lambda)
state = LdaState(eta, (num_topics, num_terms), dtype=dtype)
state.sstats= np.zeros((num_topics, num_terms), dtype=dtype)
state.sstats[...] = np.random.mtrand._rand.gamma(100., 1. / 100., (num_topics, num_terms))
expElogbeta = np.exp(dirichlet_expectation(state.sstats))

In [14]:
state.sstats.shape

(3, 24296)

In [15]:
state.sstats

array([[1.0780917 , 1.1127311 , 0.90114397, ..., 1.0393286 , 0.90416795,
        1.0334735 ],
       [1.003914  , 0.9444049 , 1.1655728 , ..., 0.88193494, 1.1274736 ,
        1.0421051 ],
       [0.9466089 , 0.88825935, 1.1553314 , ..., 1.1808109 , 1.0925727 ,
        1.0331155 ]], dtype=float32)

In [16]:
expElogbeta.shape

(3, 24296)

## More helpers

In [17]:
def do_estep(chunk, expElogbeta, dtype, num_topics,iterations, alpha, gamma_threshold, state=None):
    """Perform inference on a chunk of documents, and accumulate the collected sufficient statistics.

    Parameters
    ----------
    chunk : list of list of (int, float)
        The corpus chunk on which the inference step will be performed.
    state : :class:`~gensim.models.ldamodel.LdaState`, optional
        The state to be updated with the newly accumulated sufficient statistics. If none, the models
        `self.state` is updated.

    Returns
    -------
    numpy.ndarray
        Gamma parameters controlling the topic weights, shape (`len(chunk)`, `self.num_topics`).

    """
#     gamma, sstats = inference(chunk, collect_sstats=True)
    gamma, sstats =  inference(chunk, expElogbeta, dtype, num_topics,iterations, alpha, gamma_threshold,
              collect_sstats=True)
    state.sstats += sstats
    state.numdocs += gamma.shape[0]  # avoids calling len(chunk) on a generator
    assert gamma.dtype == dtype
    return gamma, state

In [18]:
def inference(chunk, expElogbeta, dtype, num_topics,iterations, alpha, gamma_threshold,
              collect_sstats=False):
    """Given a chunk of sparse document vectors, estimate gamma (parameters controlling the topic weights)
    for each document in the chunk.

    This function does not modify the model. The whole input chunk of document is assumed to fit in RAM;
    chunking of a large corpus must be done earlier in the pipeline. Avoids computing the `phi` variational
    parameter directly using the optimization presented in
    `Lee, Seung: Algorithms for non-negative matrix factorization"
    <https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_.

    Parameters
    ----------
    chunk : list of list of (int, float)
        The corpus chunk on which the inference step will be performed.
    collect_sstats : bool, optional
        If set to True, also collect (and return) sufficient statistics needed to update the model's topic-word
        distributions.

    Returns
    -------
    (numpy.ndarray, {numpy.ndarray, None})
        The first element is always returned and it corresponds to the states gamma matrix. The second element is
        only returned if `collect_sstats` == True and corresponds to the sufficient statistics for the M step.

    """
    try:
        len(chunk)
    except TypeError:
        # convert iterators/generators to plain list, so we have len() etc.
        chunk = list(chunk)


    # Initialize the variational distribution q(theta|gamma) for the chunk
#     gamma = random_state.gamma(100., 1. / 100., (len(chunk), num_topics)).astype(dtype, copy=False)
    gamma =  np.random.mtrand._rand.gamma(100., 1. / 100., (len(chunk), num_topics)).astype(dtype, copy=False)
    Elogtheta = dirichlet_expectation(gamma)
    expElogtheta = np.exp(Elogtheta)

    assert Elogtheta.dtype == dtype
    assert expElogtheta.dtype == dtype

    if collect_sstats:
        sstats = np.zeros_like(expElogbeta, dtype=dtype)
    else:
        sstats = None
    converged = 0

    # Now, for each document d update that document's gamma and phi
    # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
    # Lee&Seung trick which speeds things up by an order of magnitude, compared
    # to Blei's original LDA-C code, cool!).
    integer_types = (int, np.integer,)
    epsilon = np.finfo(dtype).eps
    for d, doc in enumerate(chunk):
        if len(doc) > 0 and not isinstance(doc[0][0], integer_types):
            # make sure the term IDs are ints, otherwise np will get upset
            ids = [int(idx) for idx, _ in doc]
        else:
            ids = [idx for idx, _ in doc]
        cts = np.fromiter((cnt for _, cnt in doc), dtype=dtype, count=len(doc))
        gammad = gamma[d, :]
        Elogthetad = Elogtheta[d, :]
        expElogthetad = expElogtheta[d, :]
        #expElogbeta is global!
        expElogbetad = expElogbeta[:, ids]

        # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_kw.
        # phinorm is the normalizer.
        # TODO treat zeros explicitly, instead of adding epsilon?
        phinorm = np.dot(expElogthetad, expElogbetad) + epsilon

        # Iterate between gamma and phi until convergence
        for _ in range(iterations):
            lastgamma = gammad
            # We represent phi implicitly to save memory and time.
            # Substituting the value of the optimal phi back into
            # the update for gamma gives this update. Cf. Lee&Seung 2001.
            gammad = alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
            Elogthetad = dirichlet_expectation(gammad)
            expElogthetad = np.exp(Elogthetad)
            phinorm = np.dot(expElogthetad, expElogbetad) + epsilon
            # If gamma hasn't changed much, we're done.
            meanchange = mean_absolute_difference(gammad, lastgamma)
            if meanchange < gamma_threshold:
                converged += 1
                break
        gamma[d, :] = gammad
        assert gammad.dtype == dtype
        if collect_sstats:
            # Contribution of document d to the expected sufficient
            # statistics for the M step.
            sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)

#     if len(chunk) > 1:
#         logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations)

    if collect_sstats:
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
        # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats *= expElogbeta
        assert sstats.dtype == dtype

    assert gamma.dtype == dtype
    return gamma, sstats

In [19]:
def do_mstep(rho, state, other,random_state, num_updates, alpha, id2word, extra_pass=False):
    """Maximization step: use linear interpolation between the existing topics and
    collected sufficient statistics in `other` to update the topics.

    Parameters
    ----------
    rho : float
        Learning rate.
    other : :class:`~gensim.models.ldamodel.LdaModel`
        The model whose sufficient statistics will be used to update the topics.
    extra_pass : bool, optional
        Whether this step required an additional pass over the corpus.

    """
    logger.debug("updating topics")
    # update self with the new blend; also keep track of how much did
    # the topics change through this update, to assess convergence
    previous_Elogbeta = state.get_Elogbeta()
    state.blend(rho, other)

    current_Elogbeta = state.get_Elogbeta()
#     sync_state(current_Elogbeta)
    expElogbeta=sync_state(state, dtype, current_Elogbeta=current_Elogbeta)

    # print out some debug info at the end of each EM iteration
    # self.print_topics(5)
    show_topics(num_topics=num_topics, num_words=20, alpha=alpha, id2word=id2word, random_state=random_state, state=state,
                log=True)
    diff = mean_absolute_difference(previous_Elogbeta.ravel(), current_Elogbeta.ravel())
    logger.info("topic diff=%f, rho=%f", diff, rho)

    # if self.optimize_eta:
    #     self.update_eta(self.state.get_lambda(), rho)

    if not extra_pass:
        # only update if this isn't an additional pass
        num_updates += other.numdocs
    return num_updates, expElogbeta

In [20]:
def show_topics(num_topics, num_words, alpha, id2word, random_state, state, log=False, formatted=True):
    """Get a representation for selected topics.

    Parameters
    ----------
    num_topics : int, optional
        Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in LDA.
        The returned topics subset of all topics is therefore arbitrary and may change between two LDA
        training runs.
    num_words : int, optional
        Number of words to be presented for each topic. These will be the most relevant words (assigned the highest
        probability for each topic).
    log : bool, optional
        Whether the output is also logged, besides being returned.
    formatted : bool, optional
        Whether the topic representations should be formatted as strings. If False, they are returned as
        2 tuples of (word, probability).

    Returns
    -------
    list of {str, tuple of (str, float)}
        a list of topics, each represented either as a string (when `formatted` == True) or word-probability
        pairs.

    """
#     if num_topics < 0 or num_topics >= self.num_topics:
#         num_topics = self.num_topics
#         chosen_topics = range(num_topics)
#     else:
#         num_topics = min(num_topics, self.num_topics)

    # add a little random jitter, to randomize results around the same alpha
    sort_alpha = alpha + 0.0001 * random_state.rand(len(alpha))
    # random_state.rand returns float64, but converting back to dtype won't speed up anything

    sorted_topics = list(argsort(sort_alpha))
    chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]

    shown = []

    topic = state.get_lambda()
    for i in chosen_topics:
        topic_ = topic[i]
        topic_ = topic_ / topic_.sum()  # normalize to probability distribution
        bestn = argsort(topic_, num_words, reverse=True)
        topic_ = [(id2word[id], topic_[id]) for id in bestn]
        if formatted:
            topic_ = ' + '.join('%.3f*"%s"' % (v, k) for k, v in topic_)

        shown.append((i, topic_))
        logger.info("topic #%i (%.3f): %s", i, alpha[i], topic_)

    return shown

In [21]:
def sync_state(state, dtype, current_Elogbeta=None):
    """Propagate the states topic probabilities to the inner object's attribute.

    Parameters
    ----------
    current_Elogbeta: numpy.ndarray
        Posterior probabilities for each topic, optional.
        If omitted, it will get Elogbeta from state.

    """
    if current_Elogbeta is None:
        current_Elogbeta = state.get_Elogbeta()
    expElogbeta = np.exp(current_Elogbeta)
    assert expElogbeta.dtype == dtype
    return expElogbeta

In [22]:
def argsort(x, topn=None, reverse=False):
    """Efficiently calculate indices of the `topn` smallest elements in array `x`.

    Parameters
    ----------
    x : array_like
        Array to get the smallest element indices from.
    topn : int, optional
        Number of indices of the smallest (greatest) elements to be returned.
        If not given, indices of all elements will be returned in ascending (descending) order.
    reverse : bool, optional
        Return the `topn` greatest elements in descending order,
        instead of smallest elements in ascending order?

    Returns
    -------
    numpy.ndarray
        Array of `topn` indices that sort the array in the requested order.

    """
    x = np.asarray(x)  # unify code path for when `x` is not a np array (list, tuple...)
    if topn is None:
        topn = x.size
    if topn <= 0:
        return []
    if reverse:
        x = -x
    if topn >= x.size or not hasattr(np, 'argpartition'):
        return np.argsort(x)[:topn]
    # np >= 1.8 has a fast partial argsort, use that!
    most_extreme = np.argpartition(x, topn)[:topn]
    return most_extreme.take(np.argsort(x.take(most_extreme)))  # resort topn into order



In [23]:
def get_topic_terms(state, topicid, topn=10):
    """Get the representation for a single topic. Words the integer IDs, in constrast to
    :meth:`~gensim.models.ldamodel.LdaModel.show_topic` that represents words by the actual strings.

    Parameters
    ----------
    topicid : int
        The ID of the topic to be returned
    topn : int, optional
        Number of the most significant words that are associated with the topic.

    Returns
    -------
    list of (int, float)
        Word ID - probability pairs for the most relevant words generated by the topic.

    """
    topic = get_topics(state)[topicid]
    topic = topic / topic.sum()  # normalize to probability distribution
    bestn = argsort(topic, topn, reverse=True)
    return [(idx, topic[idx]) for idx in bestn]


In [24]:
def get_topics(state):
    """Get the term-topic matrix learned during inference.


    Returns
    -------
    numpy.ndarray
        The probability for each word in each topic, shape (`num_topics`, `vocabulary_size`).

    """
    topics = state.get_lambda()
    return topics / topics.sum(axis=1)[:, None]

## Training

In [25]:
lencorpus = len(common_corpus)
chunksize = min(lencorpus, chunksize)

In [26]:
numworkers=1
updateafter = min(lencorpus, update_every * numworkers * chunksize)
updateafter

970

In [27]:
evalafter = min(lencorpus, (eval_every or 0) * numworkers * chunksize)
evalafter

970

In [28]:
updates_per_pass = max(1, lencorpus / updateafter)
updates_per_pass

1

In [29]:
chunks_as_numpy=False

In [30]:
# rho is the "speed" of updating; TODO try other fncs
# pass_ + num_updates handles increasing the starting t for each pass,
# while allowing it to "reset" on the first pass of each update
def rho():
    return pow(offset + pass_ + (num_updates / chunksize), -decay)

In [31]:
passes

10

In [32]:
for pass_ in range(passes):
    #todo! can get without this class?
    other = LdaState(eta, state.sstats.shape, dtype)
    dirty = False
    reallen = 0
    chunks = grouper(common_corpus, chunksize, as_numpy=chunks_as_numpy, dtype=dtype)
    for chunk_no, chunk in enumerate(chunks):
        reallen += len(chunk)  # keep track of how many documents we've processed so far
        gammat, state = do_estep(chunk, expElogbeta, dtype, num_topics,iterations, alpha, gamma_threshold, other)
        dirty = True
        del chunk
        # perform an M step. determine when based on update_every, don't do this after every chunk
        if update_every and (chunk_no + 1) % (update_every * numworkers) == 0:
            num_updates, expElogbeta=do_mstep(rho(), state, other,random_state, num_updates, alpha, id2word, pass_ > 0)
            del other 
            other = LdaState(eta, state.sstats.shape, dtype)
    if dirty:
        # finish any remaining updates
        num_updates, expElogbeta=do_mstep(rho(), state, other,random_state, num_updates, alpha, id2word, pass_ > 0)
        del other
        dirty = False

In [33]:
for topic_id in range(3):
    print()
    print(f'topic nr {topic_id}')
    topic_words=get_topic_terms(state, topic_id)
    for word_weight in topic_words:
        print(common_dictionary[word_weight[0]])


topic nr 0
x
file
entry
program
0
c
output
entries
oname
rules

topic nr 1
one
subject
lines
organization
would
people
god
writes
jesus
article

topic nr 2
x
window
subject
lines
organization
server
use
get
motif
available
