First we import necessary components. Note that the Fortran is imported as gibbsSampler. The package f2py available in numpy gives convenient access to wrappers from Fortran modules to Python modules.

In [1]:
import os
import sys
sys.path.append('./fortran')
import gibbsSampler
print gibbsSampler.gibbs_sampler.gibbssampler.__doc__
import numpy as np
import scipy as sp
import pandas as pd
from scipy.special import gammaln
import scipy.misc
import random
import os
import glob
import random
random.seed(1234)
#
from scipy.special import gammaln



gibbssampler(matrix,nzw,nzm,nz,nm,p_z,topics,alpha,beta,lik,[ntopics,max_iter,m,n,top_size])

Wrapper for ``gibbssampler``.

Parameters
----------
matrix : input rank-2 array('q') with bounds (n,m)
nzw : input rank-2 array('q') with bounds (ntopics,n)
nzm : input rank-2 array('q') with bounds (ntopics,m)
nz : input rank-1 array('q') with bounds (ntopics)
nm : input rank-1 array('q') with bounds (m)
p_z : input rank-1 array('d') with bounds (ntopics)
topics : input rank-1 array('q') with bounds (top_size)
alpha : input float
beta : input float
lik : in/output rank-1 array('d') with bounds (max_iter)

Other Parameters
----------------
ntopics : input long, optional
    Default: shape(nzw,0)
max_iter : input long, optional
    Default: len(lik)
m : input long, optional
    Default: shape(matrix,1)
n : input long, optional
    Default: shape(matrix,0)
top_size : input long, optional
    Default: len(topics)



This is the class which runs the Latent Dirichlet Allocation

In [6]:
class LdaSampler(object):

    def __init__(self,  data, ntopics, alpha = .1, beta = .1):
        """
        Desc: Initialize values for our class object
        alpha: a float scalar
        beta: a float scalar
        ntopics: an integer for the number of topics
        """
        #if not isinstance(alpha, float):
        #    raise Exception(" Initial value for alpha must be a floating point number (.3)")

        #if not isinstance(beta, float):
        #    raise Exception(" Initial value for beta must be a floating point number (.3)")

        #if not isinstance(ntopics, int):
        #    raise Exception(" The number of topics must be an integer")

        self.matrix = data
        self.ntopics = ntopics
        self.alpha = alpha
        self.beta = beta
        self._initialize()
    def _initialize(self):
        """
        Initialize:
        NZM: size(#Docs X #Topics) numpy array with type float 64
            The number of times document M and topic Z interact

        NZW: size(#Topics X #Words) numpy array with type float64
            The number of times topic Z and word W interact

        NM:  size(#Docs) numpy array with type float64
            Sum of documents occurances by topic and word

        NZ:  size(#Topics) numpy array with type float64
            Sum of Topic occurences by word and document

        Topics: size(?) An empty set
           Will come back to this
        """
        ndocs, vsize = self.matrix.shape

        self.NZM = np.zeros((ndocs, self.ntopics))
        self.NZW = np.zeros((self.ntopics, vsize))
        self.NM  = np.zeros(ndocs)
        self.NZ  = np.zeros(self.ntopics)
        self.topics = []
        self.logL = []
        
        for m in xrange(ndocs):
            # Iterates over i, doc_length - 1, and w, the size of unique_words - 1
            for n in xrange(vsize):
                if self.matrix[m,n] == 0:
                    continue
                for w in xrange(self.matrix[m,n]):
                # Initialize a random topic for each word
                    z = np.random.randint(self.ntopics)
                    self.topics.append(z)
                    self.NZM[m,z] += 1
                # Why is NM being +1'd for each i,w?
                    self.NM[m] += 1
                    self.NZW[z,n] += 1
                    self.NZ[z] += 1
                # Keep document, iterator for word, word index, and assignment
                #self.topics.append([i,w,z])
        
        self.topics = np.vstack(self.topics)


    def phi_theta(self):
        """
        Desc: Compute phi and theta, our topic by word probs and document by topic probs
        Input: Self references
        Output: Two arrays, holding
            [0] phi: Probability of topic by word
            [1] theta: Probability of document by topic
        """
        

        num_phi = self.NZW + self.beta
        num_phi /= np.sum(num_phi, axis = 0)[np.newaxis,:]

        num_theta = self.NZM + self.alpha
        num_theta /= np.sum(num_theta,axis = 1)[:,np.newaxis ]

        return num_phi, num_theta


                
    def run(self, maxiter = 30):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        M,N = self.matrix.shape

        p_z = np.zeros(self.ntopics)
        p_z += 1./self.ntopics
        
 
        # Make everything fortan contiguous
        p_z=p_z.flatten() # Flatten array (Make 1-D)
        p_z=p_z.reshape(self.ntopics, order='F')
        run_matrix = np.array(self.matrix.transpose(),order='F')
        run_NZM = np.array(self.NZM.transpose(),order='F')
        run_NZW = np.array(self.NZW,order='F')
        run_NZ = np.array(self.NZ,order='F')
        run_NM = np.array(self.NM,order='F')
        # index starts at 1 in fortran
        run_topics = np.array(self.topics,order='F') + 1
        run_topics2 = run_topics
        
        
        topics2 = self.topics.transpose()
        loglik = np.zeros(maxiter)
        
        gibbsSampler.gibbs_sampler.gibbssampler(matrix = run_matrix,
                                                nzw = run_NZW,
                                                nzm = run_NZM,
                                                nz = run_NZ,
                                                nm = run_NM,
                                                max_iter = maxiter,
                                                p_z = p_z,
                                                m = M,
                                                n = N,
                                                topics = run_topics,
                                                alpha = self.alpha,
                                                beta = self.beta,
                                                lik = loglik)
                                                
        
        self.NZM = run_NZM.transpose()
        self.matrix = run_matrix.transpose()
        self.NZW = run_NZW
        self.NZ = run_NZ
        self.NM = run_NM

        #if iteration > burnin:
        return self.phi_theta(),loglik
                
  
    def __call__(self):
        self.NZM = self.NZM
        self.NM = self.NM
        self.NZW = self.NZW
        self.NZ = self.NZ
        self.logL = self.logL
        


Now we pull the JSTOR data and perform some manipulations

In [7]:
DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))
# p is the percent of data to take
p=.2
# sample files for train
gen_sample = np.array(sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))))
rand_sample = [ allfiles[i] for i in gen_sample ]
#
# take rest for test
rand_sample2 = []
for i in xrange(len(allfiles)):
    if i not in gen_sample:
        rand_sample2.append(allfiles[i])
        
# train data

np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())

# train data frame
comb_np_array = np.vstack(np_array_list)
train_frame = pd.DataFrame(comb_np_array)
train_frame.columns = ['words','count','source']
subless = (train_frame['words'].str.len() > 2)
submore = (train_frame['words'].str.len() < 20)
train_frame = train_frame.loc[subless]
train_frame = train_frame.loc[submore]
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.pivot(index = 'source',columns = 'words', values = 'count')
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.loc[:, (train_frame.sum(axis = 0) > 10)]

train_frame1 = train_frame.values.astype(int)

Here we make the LDA object and run it for 10 iterations

In [8]:
sampler = LdaSampler(data = train_frame1, ntopics = 5, alpha = .0001, beta = .0001)
test_lda = sampler.run(maxiter =10)

Running this code will let you examine the loglikelihood

In [9]:
likelihood = test_lda[1]
import matplotlib.pyplot as plt
plt.plot(likelihood)
plt.ylabel('Negative Log Likelihood')
plt.show()

Most of the analysis was run in R, but we can get a nice taste of some of the output here. For example, below is the most probable words for topic three.

In [12]:
phi_theta = test_lda[0]
phi = pd.DataFrame(data=phi_theta[0], columns = train_frame.columns)
theta = pd.DataFrame(data=phi_theta[1])
dat_phi = phi.transpose()

col_ = 2
dat_phi = dat_phi.sort_values(col_,ascending=False)
print "topic"
print col_
dat_phi.iloc[1:10,col_]

topic
2


words
drug           0.538449
pronounced     0.538449
dekker         0.538449
acute          0.538449
judged         0.538449
hundred        0.499992
tetrahedron    0.499988
argue          0.499988
perfect        0.499988
Name: 2, dtype: float64