In [4]:
import os
import sys
sys.path.append('./fortran')
import gibbsSampler6th
print gibbsSampler6th.gibbs_sampler.gibbssampler.__doc__
import numpy as np
import scipy as sp
from scipy.special import gammaln
import scipy.misc
import random
import os
import glob
import random
random.seed(1234)
#
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln

def index_sample(p):
    """
    Desc: Samples from n topics distributed multinomially and returns topic number
    input: p - A one dimensional array of float64 type that contains the probability for each topic
    output: an Integer specifying which topic was chosen from a multinomial distribution
    """
    r = random.random()
    for i in range(len(p)):
        r = r - p[i]
        if r < 0:
            return i
    return len(p) - 1

def word_indices(vec):
    """
    Desc: Take a vector of word counts from a document and create a generator for word indices
    input: A vector from a Document Term Frequency matrix for one document.
    output: A generator object to store the word indices when called
    """
    for idx in vec.nonzero()[0]:
        for i in xrange(int(vec[idx])):
            yield idx

def log_multi_beta(alpha, K = None):
    """
    Desc: Compute the logarithm of the multinomial beta function
    input: alpha - A vector with type float64 or a scaler of float64
           K - An integer that, if alpha is a scalar, multiplies the log by K
    output: a float64 with value of the logarithm of the multinomial beta
    """

    if K is None:
        return np.sum(gammaln(alpha) - gammaln(np.sum(alpha)))
    else:
        return K * gammaln(alpha) - gammaln(K * alpha)

class LdaSampler(object):

    def __init__(self,  data, ntopics, alpha = .1, beta = .1):
        """
        Desc: Initialize values for our class object
        alpha: a float scalar
        beta: a float scalar
        ntopics: an integer for the number of topics
        """
        #if not isinstance(alpha, float):
        #    raise Exception(" Initial value for alpha must be a floating point number (.3)")

        #if not isinstance(beta, float):
        #    raise Exception(" Initial value for beta must be a floating point number (.3)")

        #if not isinstance(ntopics, int):
        #    raise Exception(" The number of topics must be an integer")

        self.matrix = data
        self.ntopics = ntopics
        self.alpha = alpha
        self.beta = beta
        self._initialize()
    def _initialize(self):
        """
        Initialize:
        NZM: size(#Docs X #Topics) numpy array with type float 64
            The number of times document M and topic Z interact

        NZW: size(#Topics X #Words) numpy array with type float64
            The number of times topic Z and word W interact

        NM:  size(#Docs) numpy array with type float64
            Sum of documents occurances by topic and word

        NZ:  size(#Topics) numpy array with type float64
            Sum of Topic occurences by word and document

        Topics: size(?) An empty set
           Will come back to this
        """
        ndocs, vsize = self.matrix.shape

        self.NZM = np.zeros((ndocs, self.ntopics))
        self.NZW = np.zeros((self.ntopics, vsize))
        self.NM  = np.zeros(ndocs)
        self.NZ  = np.zeros(self.ntopics)
        self.topics = []
        self.logL = []
        
        for m in xrange(ndocs):
            # Iterates over i, doc_length - 1, and w, the size of unique_words - 1
            for n in xrange(vsize):
                if self.matrix[m,n] == 0:
                    continue
                for w in xrange(self.matrix[m,n]):
                # Initialize a random topic for each word
                    z = np.random.randint(self.ntopics)
                    self.topics.append(z)
                    self.NZM[m,z] += 1
                # Why is NM being +1'd for each i,w?
                    self.NM[m] += 1
                    self.NZW[z,n] += 1
                    self.NZ[z] += 1
                # Keep document, iterator for word, word index, and assignment
                #self.topics.append([i,w,z])
        
        self.topics = np.vstack(self.topics)
    def _conditional_distribution(self, m, n):
        """
        Desc: Compute the conditional distribution of words in document and topic
        Input: m: An integer representing the column index of the document
               w: The generator object from word_indices

        Output: p_z: An array size(1 X ntopics) containing
                  probabilities for topics of word
        
        The formula is:
        ((n_{k,-i}^(t) + \beta_t)/\sum_{t=1}^V(n_{k,-i}^(t) + \beta_t)) *
        ((n_{m,-i}^(t) + \alpha_k)/(\sum_{k=1}^K(n_m^k + \alpha_k) - 1))
        """
        vsize = self.matrix[m,:].nonzero()[0].size
        p_z = np.zeros(self.ntopics)
        for ii in xrange(self.ntopics):
            p_z[ii] = (self.NZM[m,ii] + self.alpha) \
            *(self.NZW[ii,n] + self.beta) \
            / (self.NZ[ii] + vsize * self.beta)
        
        p_z /= np.sum(p_z)

        return p_z

    def loglikelihood(self):
        """
        Desc: Compute the log likelihood that the model generated the data
        Input: self references
        Output: lik: float of the log likelihood
        """
        # Why are these being repeated here?
        vsize = self.matrix[m,:].nonzero()[0].size
        ndocs = self.NZM.shape[0]
        lik = 0

        for z in xrange(self.ntopics):
            lik += log_multi_beta(self.NZW[z,:] + self.beta)
            lik -= log_multi_beta(self.beta, vsize)

        for m in xrange(ndocs):
            lik += log_multi_beta(self.NZM[m,:] + self.alpha)
            lik -= log_multi_beta(self.alpha, self.ntopics)

        return lik

    def phi_theta(self):
        """
        Desc: Compute phi and theta, our topic by word probs and document by topic probs
        Input: Self references
        Output: Two arrays, holding
            [0] phi: Probability of topic by word
            [1] theta: Probability of document by topic
        """
        

        num_phi = self.NZW + self.beta
        num_phi /= np.sum(num_phi, axis = 0)[np.newaxis,:]

        num_theta = self.NZM + self.alpha
        num_theta /= np.sum(num_theta,axis = 1)[:,np.newaxis ]

        return num_phi, num_theta


    def run(self, maxiter = 30, burnin= 0):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        n_docs, vsize = self.matrix.shape
        topics2 = self.topics


        for iteration in xrange(maxiter + 2):
            # Idea: After each iteration we now want to
            # make assignments relative to the newly generated topics
            if iteration > 1:
                self.topics = topics2
            for m in xrange(n_docs):
                for n in xrange(vsize):
                    if self.matrix[m,n] == 0:
                        continue
                    for w in xrange(self.matrix[m,n]):
                        
                        z = self.topics[m,n]
                    
                        self.NZM[m,z] -= 1
                        self.NM[m] -= 1
                        self.NZW[z,n] -= 1
                        self.NZ[z] -= 1

                        p_z = self._conditional_distribution(m,n)
                        # Choosing a random topic row
                        ind_z = np.random.randint(self.ntopics)
                    
                        # Sampling random topic
                        z = index_sample(p_z)
                        
                        #Self.topics needs to change after we iterate over this word
                        # Otherwise at each iteration we subtract one from that space w*n times
                        # giving us a negative number
                        topics2[m,n] = z

                        self.NZM[m,z] += 1
                        self.NM[m] += 1
                        self.NZW[z,n] += 1
                        self.NZ[z] += 1

            if iteration > burnin:
                yield self.phi_theta()

                
    def runfort(self, maxiter = 30, burnin= 0):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        M,N = self.matrix.shape

        p_z = np.zeros(self.ntopics)
        p_z += 1./self.ntopics
        
 
        # Make everything fortan contiguous
        p_z=p_z.flatten() # Flatten array (Make 1-D)
        p_z=p_z.reshape(self.ntopics, order='F')
        run_matrix = np.array(self.matrix.transpose(),order='F')
        run_NZM = np.array(self.NZM.transpose(),order='F')
        run_NZW = np.array(self.NZW,order='F')
        run_NZ = np.array(self.NZ,order='F')
        run_NM = np.array(self.NM,order='F')
        # index starts at 1 in fortran
        run_topics = np.array(self.topics,order='F') + 1
        run_topics2 = run_topics
        
        
        topics2 = self.topics.transpose()
        loglik = np.zeros(maxiter)
        
        gibbsSampler6th.gibbs_sampler.gibbssampler(matrix = run_matrix,
                                                nzw = run_NZW,
                                                nzm = run_NZM,
                                                nz = run_NZ,
                                                nm = run_NM,
                                                max_iter = maxiter,
                                                p_z = p_z,
                                                m = M,
                                                n = N,
                                                topics = run_topics,
                                                alpha = self.alpha,
                                                beta = self.beta,
                                                lik = loglik)
                                                
        
        self.NZM = run_NZM.transpose()
        self.matrix = run_matrix.transpose()
        self.NZW = run_NZW
        self.NZ = run_NZ
        self.NM = run_NM

        #if iteration > burnin:
        return self.phi_theta(),loglik
                
                

    def prn(self,x = None):
        print x

    # For some reason this returns (maxiter - burnin) - 2 iterations?
    def update(self, maxiter = 20, burnin = 0):
        """
        Desc: Runs gibbs sampler for maxiter iterations
            Input: maxiter - integer specifying maximum number of iterations
                   burnin  - integer specifying number of iterations to burn through.
                                should be set to zero after initial burnin
            Output: phi_theta() Two arrays, holding
                [0] Probability of topic by word
                [1] Probability of document by topic
        """
        
        for iteration, phi_theta in enumerate(self.run( maxiter, burnin)):
            self.prn(iteration)
            self.prn(self.loglikelihood())
            self.logL.append(self.loglikelihood())
        return self.phi_theta(), self.logL

    def __call__(self):
        self.NZM = self.NZM
        self.NM = self.NM
        self.NZW = self.NZW
        self.NZ = self.NZ
        self.logL = self.logL
        


gibbssampler(matrix,nzw,nzm,nz,nm,p_z,topics,alpha,beta,lik,[ntopics,max_iter,m,n,top_size])

Wrapper for ``gibbssampler``.

Parameters
----------
matrix : input rank-2 array('q') with bounds (n,m)
nzw : input rank-2 array('q') with bounds (ntopics,n)
nzm : input rank-2 array('q') with bounds (ntopics,m)
nz : input rank-1 array('q') with bounds (ntopics)
nm : input rank-1 array('q') with bounds (m)
p_z : input rank-1 array('d') with bounds (ntopics)
topics : input rank-1 array('q') with bounds (top_size)
alpha : input float
beta : input float
lik : in/output rank-1 array('d') with bounds (max_iter)

Other Parameters
----------------
ntopics : input long, optional
    Default: shape(nzw,0)
max_iter : input long, optional
    Default: len(lik)
m : input long, optional
    Default: shape(matrix,1)
n : input long, optional
    Default: shape(matrix,0)
top_size : input long, optional
    Default: len(topics)



In [5]:
# Little data for testing purposes
testing_frame = np.array([[1,0,4,2],[2,4,0,5]])
sampler = LdaSampler(data = testing_frame, ntopics = 4, alpha =.1,
                     beta = .1)


In [6]:
LDA_fort_test = sampler.runfort(maxiter=10000)

In [7]:
#
DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))
p=.2
# sample files for train
gen_sample = np.array(sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))))
rand_sample = [ allfiles[i] for i in gen_sample ]
#
# take rest for test
rand_sample2 = []
for i in xrange(len(allfiles)):
    if i not in gen_sample:
        rand_sample2.append(allfiles[i])
#
# train data

np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())
#
# test data
np_array_list_test = []
for file_ in rand_sample2:
    df = pd.read_csv(file_, index_col = None, header = 0)
    df['source'] = file_
    np_array_list_test.append(df.as_matrix())
    
#
# train data frame
comb_np_array = np.vstack(np_array_list)
train_frame = pd.DataFrame(comb_np_array)
train_frame.columns = ['words','count','source']
subless = (train_frame['words'].str.len() > 2)
submore = (train_frame['words'].str.len() < 20)
train_frame = train_frame.loc[subless]
train_frame = train_frame.loc[submore]
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.pivot(index = 'source',columns = 'words', values = 'count')
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.loc[:, (train_frame.sum(axis = 0) > 10)]
#

# test data frame
comb_np_array_test = np.vstack(np_array_list_test)
test_frame = pd.DataFrame(comb_np_array_test)
test_frame.columns = ['words','count','source']
test_frame = test_frame.fillna(value=0)
test_frame = test_frame.pivot(index = 'source', columns = 'words', values = 'count')
test_frame = test_frame.fillna(value = 0)

train_frame1 = train_frame.values.astype(int)




In [52]:
citations = pd.read_csv("./data_folder/citations.tsv", sep="\t",index_col=False)

In [102]:
citations.to_csv("./Models/topics5/citations.csv")

In [59]:
train_frame.head()

words,abb,ability,able,about,above,absence,absolute,abstract,academic,academy,...,yobs,yohai,york,you,young,your,youth,zeger,zero,zin
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data_folder/wordcounts/wordcounts_10.2307_2276722.CSV,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
data_folder/wordcounts/wordcounts_10.2307_2276818.CSV,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_folder/wordcounts/wordcounts_10.2307_2276892.CSV,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_folder/wordcounts/wordcounts_10.2307_2277014.CSV,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0
data_folder/wordcounts/wordcounts_10.2307_2277020.CSV,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
for i in xrange(10):
    print train_frame.index[i][34:49]

10.2307_2276722
10.2307_2276818
10.2307_2276892
10.2307_2277014
10.2307_2277020
10.2307_2277078
10.2307_2277130
10.2307_2277258
10.2307_2277454
10.2307_2277672


In [85]:
citations

Unnamed: 0,id,doi,title,author,journaltitle,volume,issue,pubdate,pagerange,publisher,type,reviewed-work,abstract
0,10.2307/2287721,10.2307/2287721,The Well-Calibrated Bayesian: Comment,Joseph B. Kadane,Journal of the American Statistical Association,77,379,1982-09-01T00:00:00Z,pp. 610-611,American Statistical Association,fla,,
1,10.2307/2286109,10.2307/2286109,,Andrew Sterrett,Journal of the American Statistical Association,64,328,1969-12-01T00:00:00Z,pp. 1676-1677,American Statistical Association,brv,Statistics for Mathematicians--An Introduction...,
2,10.2307/2287992,10.2307/2287992,The Equivalence of Regression-Simple and Best-...,Luis A. Escobar,Journal of the American Statistical Association,81,393,1986-03-01T00:00:00Z,pp. 210-214,American Statistical Association,fla,,This article gives necessary and sufficient co...
3,10.2307/2284631,10.2307/2284631,The Distribution by Age of the Frequency of Fi...,"A. J. Coale, D. R. McNeil",Journal of the American Statistical Association,67,340,1972-12-01T00:00:00Z,pp. 743-749,American Statistical Association,fla,,The schedule recording first marriage frequenc...
4,10.2307/2288680,10.2307/2288680,,Roy Gardner,Journal of the American Statistical Association,78,382,1983-06-01T00:00:00Z,p. 502,American Statistical Association,brv,Game Theory (2nd Ed.).|Guillermo Owen,
5,10.2307/2291505,10.2307/2291505,Front Matter,,Journal of the American Statistical Association,90,432,1995-12-01T00:00:00Z,,American Statistical Association,mis,,
6,10.2307/2291642,10.2307/2291642,On Variance Estimation With Imputed Survey Dat...,Robert E. Fay,Journal of the American Statistical Association,91,434,1996-06-01T00:00:00Z,pp. 517-519,American Statistical Association,fla,,
7,10.2307/2282207,10.2307/2282207,Problems Met by Companies that Instruct their ...,Theodore H. Brown,Journal of the American Statistical Association,28,181,1933-03-01T00:00:00Z,pp. 10-14,American Statistical Association,fla,,
8,10.2307/2283290,10.2307/2283290,Notes About Authors,,Journal of the American Statistical Association,58,302,1963-06-01T00:00:00Z,pp. 538-540,American Statistical Association,mis,,
9,10.2307/2281963,10.2307/2281963,Notes on Immigration Statistics of the United ...,E. P. Hutchinson,Journal of the American Statistical Association,53,284,1958-12-01T00:00:00Z,pp. 963-1025,American Statistical Association,fla,,


In [9]:
#sampler1 = LdaSampler(data = train_frame1, ntopics = 8, alpha = .0001, beta = .0001)

In [105]:
test_lda = sampler1.runfort( maxiter =3000, burnin = 0)

In [107]:
phi_theta = test_lda[0]
phi = pd.DataFrame(data=phi_theta[0], columns = train_frame.columns)
theta = pd.DataFrame(data=phi_theta[1])
dat_phi = phi.transpose()

In [93]:
import matplotlib.pyplot as plt
plt.plot(likelihood)
plt.ylabel('Negative Log Likelihood')
plt.show()

In [94]:
theta = pd.DataFrame(data=phi_theta[0], columns = train_frame.columns)
phi = pd.DataFrame(data=phi_theta[1])

In [95]:
theta

words,abb,ability,able,about,above,absence,absolute,abstract,academic,academy,...,yobs,yohai,york,you,young,your,youth,zeger,zero,zin
0,0.052635,0.199999,0.195651,0.10137,0.137255,0.043481,0.214285,6e-06,0.142857,0.23333,...,0.119403,0.090912,0.145455,0.107143,0.149999,0.02778,0.090912,0.083335,0.095478,9e-06
1,5e-06,0.066668,0.086957,0.131507,0.124183,0.130435,0.057144,0.125,0.085715,0.033336,...,0.089553,0.181814,0.142857,0.107143,0.050003,0.111111,0.090912,0.208331,0.100503,9e-06
2,0.210523,0.199999,0.130435,0.153425,0.156863,0.217388,0.085715,0.249994,0.142857,0.133333,...,0.104478,0.181814,0.109091,0.125,0.199997,0.166666,0.181814,0.166665,0.115578,0.090912
3,0.105264,0.066668,0.108696,0.109589,0.104575,0.217388,0.128571,0.125,0.114286,0.066668,...,0.194029,0.090912,0.124675,0.178571,0.100001,0.055557,0.090912,0.208331,0.145729,0.181814
4,0.105264,0.111111,0.086957,0.139726,0.137255,0.086958,0.171428,0.187497,0.114286,0.23333,...,0.149253,9e-06,0.109091,0.125,0.100001,0.166666,0.272717,0.166665,0.125628,0.181814
5,0.210523,0.111111,0.152173,0.123288,0.091503,0.086958,0.071429,0.125,0.199998,0.033336,...,0.059702,0.090912,0.093507,0.053572,0.149999,0.22222,0.181814,0.041669,0.110553,0.090912
6,0.105264,0.155555,0.108696,0.115069,0.130719,0.173911,0.185714,0.125,0.142857,0.133333,...,0.119403,0.181814,0.145455,0.232141,0.050003,0.138889,9e-06,4e-06,0.135678,0.181814
7,0.210523,0.08889,0.130435,0.126027,0.117647,0.043481,0.085715,0.062503,0.057144,0.133333,...,0.164179,0.181814,0.12987,0.071429,0.199997,0.111111,0.090912,0.125,0.170854,0.272717


In [110]:
phi_new = pd.DataFrame(sampler1.NZW/np.sum( (sampler1.NZW + sampler1.beta),axis = 1)[:,np.newaxis],columns = train_frame.columns)
phi_new.to_csv("./Models/topics8/phi_new.csv")
phi.to_csv("./Models/topics8/phi.csv" )
theta.to_csv("./Models/topics8/theta.csv" )
pd.DataFrame(likelihood).to_csv("./Models/topics8/likelihood.csv" )
train_frame.to_csv("./Models/topics8/dtm_dat.csv" )

In [11]:
sampler2 = LdaSampler(data = train_frame1, ntopics = 20, alpha = .1, beta = .1)
test_lda2 = sampler2.runfort( maxiter =12000, burnin = 0)

NameError: name 'test_frame1' is not defined

In [None]:
sampler3 = LdaSampler(data = train_frame1, ntopics = 5, alpha = .1, beta = .1)
test_lda3 = sampler3.runfort( maxiter =12000, burnin = 0)

In [None]:
sampler4 = LdaSampler(data = train_frame1, ntopics = 30, alpha = .1, beta = .1)
test_lda4 = sampler4.runfort( maxiter =12000, burnin = 0)

In [None]:
test_lda3

In [None]:
# Grab made up data

import os
import shutil

N_TOPICS = 10
DOCUMENT_LENGTH = 100
FOLDER = "topicimg"

def vertical_topic(width, topic_index, document_length):
    """
    Generate a topic whose words form a vertical bar.
    """
    m = np.zeros((width, width))
    m[:, topic_index] = int(document_length / width)
    return m.flatten()

def horizontal_topic(width, topic_index, document_length):
    """
    Generate a topic whose words form a horizontal bar.
    """
    m = np.zeros((width, width))
    m[topic_index, :] = int(document_length / width)
    return m.flatten()

def save_document_image(filename, doc, zoom=2):
    """
    Save document as an image.
    doc must be a square matrix
    """
    height, width = doc.shape
    zoom = np.ones((width*zoom, width*zoom))
    # imsave scales pixels between 0 and 255 automatically
    sp.misc.imsave(filename, np.kron(doc, zoom))

def gen_word_distribution(ntopics, document_length):
    """
    Generate a word distribution for each of the ntopics.
    """
    width = ntopics / 2
    vsize = width ** 2
    m = np.zeros((ntopics, vsize))

    for k in range(width):
        m[k,:] = vertical_topic(width, k, document_length)

    for k in range(width):
        m[k+width,:] = horizontal_topic(width, k, document_length)

    m /= m.sum(axis=1)[:, np.newaxis] # turn counts into probabilities

    return m

def gen_document(word_dist, ntopics, vsize, length=DOCUMENT_LENGTH, alpha=0.1):
    """
    Generate a document:
    1) Sample topic proportions from the Dirichlet distribution.
    2) Sample a topic index from the Multinomial with the topic
       proportions from 1).
    3) Sample a word from the Multinomial corresponding to the topic
       index from 2).
    4) Go to 2) if need another word.
    """
    theta = np.random.mtrand.dirichlet([alpha] * ntopics)
    v = np.zeros(vsize)
    for n in range(length):
        z = index_sample(theta)
        w = index_sample(word_dist[z,:])
        v[w] += 1
    return v

def gen_documents(word_dist, ntopics, vsize, n=500):
    """
    Generate a document-term matrix.
    """
    m = np.zeros((n, vsize))
    for i in xrange(n):
        m[i, :] = gen_document(word_dist, ntopics, vsize)
    
    return m

if os.path.exists(FOLDER):
    shutil.rmtree(FOLDER)
os.mkdir(FOLDER)

width = N_TOPICS / 2
vocab_size = width ** 2
word_dist = gen_word_distribution(N_TOPICS, DOCUMENT_LENGTH)
matrix = gen_documents(word_dist, N_TOPICS, vocab_size)
