In [7]:
import os
import sys
sys.path.append('./fortran')
import gibbsSampler
print gibbsSampler.gibbs_sampler.gibbssampler.__doc__
import numpy as np
import scipy as sp
from scipy.special import gammaln
import scipy.misc
import random
import numpy as np
import pandas as pd
import os
import glob
import random
random.seed(1234)
#
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln

def index_sample(p):
    """
    Desc: Samples from n topics distributed multinomially and returns topic number
    input: p - A one dimensional array of float64 type that contains the probability for each topic
    output: an Integer specifying which topic was chosen from a multinomial distribution
    """
    r = random.random()
    for i in range(len(p)):
        r = r - p[i]
        if r < 0:
            return i
    return len(p) - 1

def word_indices(vec):
    """
    Desc: Take a vector of word counts from a document and create a generator for word indices
    input: A vector from a Document Term Frequency matrix for one document.
    output: A generator object to store the word indices when called
    """
    for idx in vec.nonzero()[0]:
        for i in xrange(int(vec[idx])):
            yield idx

def log_multi_beta(alpha, K = None):
    """
    Desc: Compute the logarithm of the multinomial beta function
    input: alpha - A vector with type float64 or a scaler of float64
           K - An integer that, if alpha is a scalar, multiplies the log by K
    output: a float64 with value of the logarithm of the multinomial beta
    """

    if K is None:
        return np.sum(gammaln(alpha) - gammaln(np.sum(alpha)))
    else:
        return K * gammaln(alpha) - gammaln(K * alpha)

class LdaSampler(object):

    def __init__(self,  data, ntopics, alpha = .1, beta = .1):
        """
        Desc: Initialize values for our class object
        alpha: a float scalar
        beta: a float scalar
        ntopics: an integer for the number of topics
        """
        if not isinstance(alpha, float):
            raise Exception(" Initial value for alpha must be a floating point number (.3)")

        if not isinstance(beta, float):
            raise Exception(" Initial value for beta must be a floating point number (.3)")

        if not isinstance(ntopics, int):
            raise Exception(" The number of topics must be an integer")

        self.matrix = data
        self.ntopics = ntopics
        self.alpha = alpha
        self.beta = beta
        self._initialize()
    def _initialize(self):
        """
        Initialize:
        NZM: size(#Docs X #Topics) numpy array with type float 64
            The number of times document M and topic Z interact

        NZW: size(#Topics X #Words) numpy array with type float64
            The number of times topic Z and word W interact

        NM:  size(#Docs) numpy array with type float64
            Sum of documents occurances by topic and word

        NZ:  size(#Topics) numpy array with type float64
            Sum of Topic occurences by word and document

        Topics: size(?) An empty set
           Will come back to this
        """
        ndocs, vsize = self.matrix.shape

        self.NZM = np.zeros((ndocs, self.ntopics))
        self.NZW = np.zeros((self.ntopics, vsize))
        self.NM  = np.zeros(ndocs)
        self.NZ  = np.zeros(self.ntopics)
        self.topics = np.zeros((ndocs,vsize))
        self.logL = []
        
        for m in xrange(ndocs):
            # Iterates over i, doc_length - 1, and w, the size of unique_words - 1
            for n in xrange(vsize):
                if self.matrix[m,n] == 0:
                    continue
                for w in xrange(self.matrix[m,n]):
                # Initialize a random topic for each word
                    z = np.random.randint(self.ntopics)
                    self.topics[m,n] = z
                    self.NZM[m,z] += 1
                # Why is NM being +1'd for each i,w?
                    self.NM[m] += 1
                    self.NZW[z,n] += 1
                    self.NZ[z] += 1
                # Keep document, iterator for word, word index, and assignment
                #self.topics.append([i,w,z])
        
        #self.topics = np.vstack(self.topics)
    def _conditional_distribution(self, m, n):
        """
        Desc: Compute the conditional distribution of words in document and topic
        Input: m: An integer representing the column index of the document
               w: The generator object from word_indices

        Output: p_z: An array size(w X 1) containing probabilities for topics of word
        
        The formula is:
        ((n_{k,-i}^(t) + \beta_t)/\sum_{t=1}^V(n_{k,-i}^(t) + \beta_t)) *
        ((n_{m,-i}^(t) + \alpha_k)/(\sum_{k=1}^K(n_m^k + \alpha_k) - 1))
        """
        vsize = self.NZW.shape[1]
        left = (np.delete(self.NZW,n,axis=1) + self.beta) / (self.NZ + self.beta * vsize)[:,None]

        # Changed NZM _ one top to matrix to be concurrent with equation 79.
        # Also, on bottom changed NM[m] to NZM[m,:] to sum over interaction of topic and doc
        right = (np.delete(self.matrix[m,:],n) + self.alpha) / (self.NZM[m,:] + self.alpha * self.ntopics)[:,None]

        # Left and Right are 4x3, so do 4x3 * 3x4 for 4x4 Z transition matrix
        
        p_z = np.dot(left , right.transpose())
        p_z = abs(p_z)
        p_z /= np.sum(p_z,axis=1)[:,None]

        return p_z

    def loglikelihood(self):
        """
        Desc: Compute the log likelihood that the model generated the data
        Input: self references
        Output: lik: float of the log likelihood
        """
        # Why are these being repeated here?
        vsize = self.NZW.shape[1]
        ndocs = self.NZM.shape[0]
        lik = 0

        for z in xrange(self.ntopics):
            lik += log_multi_beta(self.NZW[z,:] + self.beta)
            lik -= log_multi_beta(self.beta, vsize)

        for m in xrange(ndocs):
            lik += log_multi_beta(self.NZM[m,:] + self.alpha)
            lik -= log_multi_beta(self.alpha, self.ntopics)

        return lik

    def phi_theta(self):
        """
        Desc: Compute phi and theta, our topic by word probs and document by topic probs
        Input: Self references
        Output: Two arrays, holding
            [0] phi: Probability of topic by word
            [1] theta: Probability of document by topic
        """
        

        num_phi = self.NZW + self.beta
        num_phi /= np.sum(num_phi, axis = 0)[np.newaxis,:]

        num_theta = self.NZM + self.alpha
        num_theta /= np.sum(num_theta,axis = 0)[ np.newaxis,:]

        return num_phi, num_theta


    def run(self, maxiter = 30, burnin= 0):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        n_docs, vsize = self.matrix.shape
        topics2 = self.topics


        for iteration in xrange(maxiter + 2):
            # Idea: After each iteration we now want to
            # make assignments relative to the newly generated topics
            if iteration > 1:
                self.topics = topics2
            for m in xrange(n_docs):
                for n in xrange(vsize):
                    if self.matrix[m,n] == 0:
                        continue
                    for w in xrange(self.matrix[m,n]):
                        
                        z = self.topics[m,n]
                    
                        self.NZM[m,z] -= 1
                        self.NM[m] -= 1
                        self.NZW[z,n] -= 1
                        self.NZ[z] -= 1

                        p_z = self._conditional_distribution(m,n)
                        # Choosing a random topic row
                        ind_z = np.random.randint(self.ntopics)
                    
                        # Sampling random topic
                        z = index_sample(p_z[ind_z,:])
                        
                        #Self.topics needs to change after we iterate over this word
                        # Otherwise at each iteration we subtract one from that space w*n times
                        # giving us a negative number
                        topics2[m,n] = z

                        self.NZM[m,z] += 1
                        self.NM[m] += 1
                        self.NZW[z,n] += 1
                        self.NZ[z] += 1

            if iteration > burnin:
                yield self.phi_theta()

                
    def runfort(self, maxiter = 30, burnin= 0):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        M,N = self.matrix.shape

        p_z = np.zeros(self.ntopics)
        p_z += 1./self.ntopics
        gibbsSampler.gibbs_sampler.gibbssampler(matrix = self.matrix, nzw = self.NZW.transpose(),
                                                nzm = self.NZM.transpose(),
                                                nz = self.NZ,
                                                nm = self.NM,
                                                max_iter = maxiter,
                                                p_z = p_z)

        #if iteration > burnin:
        return self.phi_theta()
                
                

    def prn(self,x = None):
        print x

    # For some reason this returns (maxiter - burnin) - 2 iterations?
    def update(self, maxiter = 20, burnin = 0):
        """
        Desc: Runs gibbs sampler for maxiter iterations
            Input: maxiter - integer specifying maximum number of iterations
                   burnin  - integer specifying number of iterations to burn through.
                                should be set to zero after initial burnin
            Output: phi_theta() Two arrays, holding
                [0] Probability of topic by word
                [1] Probability of document by topic
        """
        
        for iteration, phi_theta in enumerate(self.run( maxiter, burnin)):
            self.prn(iteration)
            self.prn(self.loglikelihood())
            self.logL.append(self.loglikelihood())
        return self.phi_theta(), self.logL

    def __call__(self):
        self.NZM = self.NZM
        self.NM = self.NM
        self.NZW = self.NZW
        self.NZ = self.NZ
        self.logL = self.logL

gibbssampler(matrix,nzw,nzm,nz,nm,max_iter,p_z,[ntopics,m,n])

Wrapper for ``gibbssampler``.

Parameters
----------
matrix : input rank-2 array('i') with bounds (m,n)
nzw : input rank-2 array('i') with bounds (n,ntopics)
nzm : input rank-2 array('i') with bounds (ntopics,m)
nz : input rank-1 array('i') with bounds (ntopics)
nm : input rank-1 array('i') with bounds (m)
max_iter : input int
p_z : input rank-1 array('f') with bounds (ntopics)

Other Parameters
----------------
ntopics : input int, optional
    Default: shape(nzw,1)
m : input int, optional
    Default: shape(matrix,0)
n : input int, optional
    Default: shape(matrix,1)



In [57]:
arr = np.array([[1,2,3,4],[1,2,3,4]])
print arr
arr2 = np.delete(arr, 1,axis=1)
print arr2

[[1 2 3 4]
 [1 2 3 4]]
[[1 3 4]
 [1 3 4]]


In [33]:

#
DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))
p=.5
# sample files for train
gen_sample = np.array(sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))))
rand_sample = [ allfiles[i] for i in gen_sample ]
#
# take rest for test
rand_sample2 = []
for i in xrange(len(allfiles)):
    if i not in gen_sample:
        rand_sample2.append(allfiles[i])
#
# train data

np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())
#
# test data
np_array_list_test = []
for file_ in rand_sample2:
    df = pd.read_csv(file_, index_col = None, header = 0)
    df['source'] = file_
    np_array_list_test.append(df.as_matrix())
    
#
# train data frame
comb_np_array = np.vstack(np_array_list)
train_frame = pd.DataFrame(comb_np_array)
train_frame.columns = ['words','count','source']
subless = (train_frame['words'].str.len() > 2)
submore = (train_frame['words'].str.len() < 20)
train_frame = train_frame.loc[subless]
train_frame = train_frame.loc[submore]
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.pivot(index = 'source',columns = 'words', values = 'count')
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.loc[:, (train_frame.sum(axis = 0) > 5)]
#

# test data frame
comb_np_array_test = np.vstack(np_array_list_test)
test_frame = pd.DataFrame(comb_np_array_test)
test_frame.columns = ['words','count','source']
test_frame = test_frame.fillna(value=0)
test_frame = test_frame.pivot(index = 'source', columns = 'words', values = 'count')
test_frame = test_frame.fillna(value = 0)

In [8]:
testing_frame = np.array([[1,0,4,2],[2,4,0,5]])
sampler = LdaSampler(data = testing_frame, ntopics = 4, alpha = .1, beta = .1)
p_z = np.zeros(4) + .25
M,N = testing_frame.shape

In [143]:
testtt = np.array([[ 1.125,       1.125 ,      0.32142857 , 0.32142857],
 [ 1.30357143 , 1.30357143 , 0.37244898  ,0.37244898],
 [ 8.44642857,  8.44642857,  2.41326531 , 2.41326531],
 [ 5.03125,     5.03125 ,    1.4375 ,     1.4375    ]])

In [10]:
sampler.update( maxiter = 400, burnin = 0)



0
-183.501728817
1
-182.235122429
2
-179.837227156
3
-178.521550362
4
-185.926272032
5
-184.316130973
6
-183.064846598
7
-182.385250451
8
-180.067741613
9
-180.45720638
10
-180.067741613
11
-179.810579215
12
-178.705938272
13
-184.772530638
14
-180.45720638
15
-176.743634313
16
-179.253339457
17
-182.186052023
18
-187.103491013
19
-187.103491013
20
-182.47041238
21
-181.349158519
22
-184.148355982
23
-184.738564722
24
-182.035924002
25
-184.049130002
26
-181.65123473
27
-180.067741613
28
-179.899966622
29
-183.937320131
30
-182.855101652
31
-182.855101652
32
-181.31357849
33
-180.345396509
34
-180.299909022
35
-178.9689791
36
-180.45720638
37
-180.067741613
38
-180.45720638
39
-179.788156751
40
-183.552630859
41
-177.240133456
42
-183.570277829
43
-181.588495264
44
-186.7559962
45
-183.879248754
46
-182.018208148
47
-182.517445413
48
-182.035924002
49
-179.638028729
50
-183.311462478
51
-182.843156822
52
-179.837227156
53
-176.1240977
54
-185.89962409
55
-180.45720638
56
-177.622428887

((array([[ 0.61764706,  0.25      ,  0.02272727,  0.28378378],
         [ 0.02941176,  0.02272727,  0.47727273,  0.41891892],
         [ 0.02941176,  0.47727273,  0.25      ,  0.28378378],
         [ 0.32352941,  0.25      ,  0.25      ,  0.01351351]]),
  array([[ 0.21153846,  0.59615385,  0.40384615,  0.34375   ],
         [ 0.78846154,  0.40384615,  0.59615385,  0.65625   ]])),
 [-185.68054567822381,
  -178.70593827173474,
  -185.89962409012986,
  -182.46563688564473,
  -178.05931110680967,
  -179.638028729024,
  -182.29786189446409,
  -181.65123472953903,
  -183.87924875385471,
  -177.20154068450424,
  -179.71135323476389,
  -183.5017288173315,
  -182.23512242896427,
  -179.83722715616588,
  -178.52155036225994,
  -185.92627203161274,
  -184.31613097343714,
  -183.06484659774904,
  -182.38525045056744,
  -180.06774161284633,
  -180.45720637960804,
  -180.06774161284633,
  -179.810579214683,
  -178.70593827173474,
  -184.7725306377084,
  -180.45720637960807,
  -176.74363431290377,
  

In [10]:
sampler.topics

array([[ 0.,  0.,  2.,  2.],
       [ 1.,  1.,  0.,  1.]])

In [None]:
gibbsSampler.gibbs_sampler.gibbssampler(matrix = sampler.matrix, nzw = sampler.NZW.transpose(),
                                                nzm = sampler.NZM.transpose(),
                                                nz = sampler.NZ,
                                                nm = sampler.NM,
                                                max_iter = 10,
                                                p_z = p_z,
                                                m = M,
                                                n = N)

In [None]:
for m in xrange(testing_frame.shape[0]):
    for i,w in enumerate(word_indices(testing_frame[m,:])):
        print i,w

In [None]:
sampler = LdaSampler(data = train_frame.values, ntopics = 4, alpha = .1, beta = .1)
LDAtest = sampler.runfort(maxiter = 8)

In [None]:
gibbsSampler.gibbs_sampler.gibbssampler(matrix = sampler.matrix, nzw = sampler.NZW.transpose(),
                                                nzm = sampler.NZM.transpose(),
                                                nz = sampler.NZ,
                                                nm = sampler.NM,
                                                max_iter = 10,
                                                p_z = p_z,
                                                m = M,
                                                n = N)

In [None]:
sampler.NZM