In [2]:
import numpy as np
import scipy as sp
from scipy.special import gammaln
import scipy.misc
import random

def index_sample(p):
    """
    Desc: Samples from n topics distributed multinomially and returns topic number
    input: p - A one dimensional array of float64 type that contains the probability for each topic
    output: an Integer specifying which topic was chosen from a multinomial distribution
    """
    r = random.random()
    for i in range(len(p)):
        r = r - p[i]
        if r < 0:
            return i
    return len(p) - 1

def word_indices(vec):
    """
    Desc: Take a vector of word counts from a document and create a generator for word indices
    input: A vector from a Document Term Frequency matrix for one document.
    output: A generator object to store the word indices when called
    """
    for idx in vec.nonzero()[0]:
        for i in xrange(int(vec[idx])):
            yield idx

def log_multi_beta(alpha, K = None):
    """
    Desc: Compute the logarithm of the multinomial beta function
    input: alpha - A vector with type float64 or a scaler of float64
           K - An integer that, if alpha is a scalar, multiplies the log by K
    output: a float64 with value of the logarithm of the multinomial beta
    """

    if K is None:
        return np.sum(gammaln(alpha) - gammaln(np.sum(alpha)))
    else:
        return K * gammaln(alpha) - gammaln(K * alpha)

class LdaSampler(object):

    def __init__(self,  data, ntopics, alpha = .1, beta = .1):
        """
        Desc: Initialize values for our class object
        alpha: a float scalar
        beta: a float scalar
        ntopics: an integer for the number of topics
        """
        if not isinstance(alpha, float):
            raise Exception(" Initial value for alpha must be a floating point number (.3)")

        if not isinstance(beta, float):
            raise Exception(" Initial value for beta must be a floating point number (.3)")

        if not isinstance(ntopics, int):
            raise Exception(" The number of topics must be an integer")

        self.matrix = data
        self.ntopics = ntopics
        self.alpha = alpha
        self.beta = beta
        self._initialize()
    def _initialize(self):
        """
        Initialize:
        NZM: size(#Docs X #Topics) numpy array with type float 64
            The number of times document M and topic Z interact

        NZW: size(#Topics X #Words) numpy array with type float64
            The number of times topic Z and word W interact

        NM:  size(#Docs) numpy array with type float64
            Sum of documents occurances by topic and word

        NZ:  size(#Topics) numpy array with type float64
            Sum of Topic occurences by word and document

        Topics: size(?) An empty set
           Will come back to this
        """
        ndocs, vsize = self.matrix.shape

        self.NZM = np.zeros((ndocs, self.ntopics))
        self.NZW = np.zeros((self.ntopics, vsize))
        self.NM  = np.zeros(ndocs)
        self.NZ  = np.zeros(self.ntopics)
        self.topics = {}

        for m in xrange(ndocs):
            # Iterates over i, doc_length - 1, and w, the size of unique_words - 1
            for i, w, in enumerate(word_indices(self.matrix[m,:])):
                # Initialize a random topic for each word
                z = np.random.randint(self.ntopics)
                self.NZM[m,z] += 1
                # Why is NM being +1'd for each i,w?
                self.NM[m] += 1
                self.NZW[z,w] += 1
                self.NZ[z] += 1
                self.topics[(m,i)] = z

    def _conditional_distribution(self, m, w):
        """
        Desc: Compute the conditional distribution of words in document and topic
        Input: m: An integer representing the column index of the document
               w: The generator object from word_indices

        Output: p_z: An array size(w X 1) containing probabilities for topics of word
        """
        vsize = self.NZW.shape[1]
        left = (self.NZW[:,w] + self.beta) / (self.NZ + self.beta * vsize)
        right = (self.NZM[m,:] + self.alpha) / (self.NM[m] + self.alpha * self.ntopics)
        p_z = abs(left * right)
        p_z /= np.sum(p_z)
        return p_z

    def loglikelihood(self):
        """
        Desc: Compute the log likelihood that the model generated the data
        Input: self references
        Output: lik: float of the log likelihood
        """
        # Why are these being repeated here?
        vsize = self.NZW.shape[1]
        ndocs = self.NZM.shape[0]
        lik = 0

        for z in xrange(self.ntopics):
            lik += log_multi_beta(self.NZW[z,:] + self.beta)
            lik -= log_multi_beta(self.beta, vsize)

        for m in xrange(ndocs):
            lik += log_multi_beta(self.NZM[m,:] + self.alpha)
            lik -= log_multi_beta(self.alpha, self.ntopics)

        return lik

    def phi_theta(self):
        """
        Desc: Compute phi and theta, our topic by word probs and document by topic probs
        Input: Self references
        Output: Two arrays, holding
            [0] phi: Probability of topic by word
            [1] theta: Probability of document by topic
        """
        num_phi = self.NZW + self.beta
        num_phi /= np.sum(num_phi, axis = 1)[:, np.newaxis]

        num_theta = self.NZM + self.alpha
        num_theta /= np.sum(num_theta,axis = 1)[:, np.newaxis]

        return num_phi, num_theta


    def run(self, maxiter = 30, burnin= 0):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        n_docs, vsize = self.matrix.shape



        for iteration in xrange(maxiter):
            for m in xrange(n_docs):
                for i,w in enumerate(word_indices(self.matrix[m,:])):
                    z = self.topics[(m,i)]

                    self.NZM[m,z] -= 1
                    self.NM[m] -= 1
                    self.NZW[z,w] -= 1
                    self.NZ[z] -= 1

                    p_z = self._conditional_distribution(m,w)
                    z = index_sample(p_z)

                    self.NZM[m,z] += 1
                    self.NM[m] += 1
                    self.NZW[z,w] += 1
                    self.NZ[z] += 1

            if iteration > burnin:
                yield self.phi_theta()

    def prn(self,x = None):
        print x

    # For some reason this returns (maxiter - burnin) - 2 iterations?
    def update(self, maxiter = 20, burnin = 0):
        """
        Desc: Runs gibbs sampler for maxiter iterations
            Input: maxiter - integer specifying maximum number of iterations
                   burnin  - integer specifying number of iterations to burn through.
                                should be set to zero after initial burnin
            Output: phi_theta() Two arrays, holding
                [0] Probability of topic by word
                [1] Probability of document by topic
        """
        for iteration, phi_theta in enumerate(self.run( maxiter, burnin)):
            self.prn(iteration)
            self.prn(self.loglikelihood())
        return self.phi_theta()

    def __call__(self):
        self.NZM = self.NZM
        self.NM = self.NM
        self.NZW = self.NZW
        self.NZ = self.NZ

In [4]:
import numpy as np
import pandas as pd
import os
import glob
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln

DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))

p = .5
rand_sample = [ allfiles[i] for i in sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))) ]
rand_sample

np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())

comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)
big_frame.columns = ['words','count','source']
big_frame = big_frame.fillna(value = 0)
big_frame = big_frame.pivot(index = 'source',columns = 'words', values = 'count')
big_frame = big_frame.fillna(value = 0)

In [6]:
print big_frame.head()

words                                                  0    a  aa  aaa  \
source                                                                   
data_folder/wordcounts/wordcounts_10.2307_2276732.CSV  0    7   0    0   
data_folder/wordcounts/wordcounts_10.2307_2276856.CSV  0  181   0    0   
data_folder/wordcounts/wordcounts_10.2307_2276867.CSV  0    4   0    0   
data_folder/wordcounts/wordcounts_10.2307_2276929.CSV  0   28   0    0   
data_folder/wordcounts/wordcounts_10.2307_2276953.CSV  0   78   0    0   

words                                                  aaaaaaemh  aaaai  aaas  \
source                                                                          
data_folder/wordcounts/wordcounts_10.2307_2276732.CSV          0      0     0   
data_folder/wordcounts/wordcounts_10.2307_2276856.CSV          0      0     0   
data_folder/wordcounts/wordcounts_10.2307_2276867.CSV          0      0     0   
data_folder/wordcounts/wordcounts_10.2307_2276929.CSV          0      0     

In [8]:
import timeit
start_time = timeit.default_timer()

sampler = LdaSampler(data = big_frame.values, ntopics = 5, alpha = .1, beta = .1)
LDAtest = sampler.update(maxiter = 20)

elapsed = timeit.default_timer() - start_time
print "time elapsed: ", elapsed, " seconds\n"

0
-545292582237.0
1
-545293011821.0
2
-545312574588.0
3
-545400695781.0
4
-545489208551.0
5
-545535945630.0
6
-545552886080.0
7
-545554314346.0
8
-545550136627.0
9
-545532161016.0
10
-545501929152.0
11
-545471553852.0
12
-545452008458.0
13
-545436938583.0
14
-545426989614.0
15
-545427185471.0
16
-545428018413.0
17
-545432969125.0
18
-545446207762.0
time elapsed:  505.472303152  seconds



In [9]:
print LDAtest

(array([[ -3.86560200e-05,  -1.15987193e-01,  -1.98746608e-04, ...,
         -1.47986178e-04,  -8.94164503e-05,   1.60090588e-05],
       [  1.74904761e-04,  -1.09700197e-01,   7.60202492e-04, ...,
         -3.73084519e-05,   3.42279375e-07,   3.42279375e-07],
       [  4.07698988e-07,  -1.13894382e-01,  -1.13748018e-04, ...,
          4.40722606e-04,   4.07698988e-07,   4.07698988e-07],
       [ -3.61769284e-05,  -1.06260268e-01,  -2.78462779e-04, ...,
         -1.12513566e-04,   1.33091269e-04,   3.31898426e-07],
       [ -1.00036560e-04,   6.11975059e-01,  -8.39664295e-05, ...,
         -4.78086369e-05,  -5.18261694e-05,  -1.16508443e-05]]), array([[ 5.96203606, -1.23414634, -1.18536585, -1.43563097, -1.1068929 ],
       [-0.9226317 , -1.01635503, -0.99463108,  4.36854682, -0.43492901],
       [-1.14200937,  1.96262363, -0.99521083, -1.05143155,  2.22602811],
       ..., 
       [ 6.20363224, -1.32342036, -1.29920545, -1.31812334, -1.26288309],
       [ 1.90867925, -1.01039832, -0.9