In [1]:
import os
import sys
sys.path.append('./fortran')
import gibbsSampler6th
print gibbsSampler6th.gibbs_sampler.gibbssampler.__doc__
import numpy as np
import scipy as sp
from scipy.special import gammaln
import scipy.misc
import random
import numpy as np
import pandas as pd
import os
import glob
import random
random.seed(1234)
#
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln

def index_sample(p):
    """
    Desc: Samples from n topics distributed multinomially and returns topic number
    input: p - A one dimensional array of float64 type that contains the probability for each topic
    output: an Integer specifying which topic was chosen from a multinomial distribution
    """
    r = random.random()
    for i in range(len(p)):
        r = r - p[i]
        if r < 0:
            return i
    return len(p) - 1

def word_indices(vec):
    """
    Desc: Take a vector of word counts from a document and create a generator for word indices
    input: A vector from a Document Term Frequency matrix for one document.
    output: A generator object to store the word indices when called
    """
    for idx in vec.nonzero()[0]:
        for i in xrange(int(vec[idx])):
            yield idx

def log_multi_beta(alpha, K = None):
    """
    Desc: Compute the logarithm of the multinomial beta function
    input: alpha - A vector with type float64 or a scaler of float64
           K - An integer that, if alpha is a scalar, multiplies the log by K
    output: a float64 with value of the logarithm of the multinomial beta
    """

    if K is None:
        return np.sum(gammaln(alpha) - gammaln(np.sum(alpha)))
    else:
        return K * gammaln(alpha) - gammaln(K * alpha)

class LdaSampler(object):

    def __init__(self,  data, ntopics, alpha = .1, beta = .1):
        """
        Desc: Initialize values for our class object
        alpha: a float scalar
        beta: a float scalar
        ntopics: an integer for the number of topics
        """
        #if not isinstance(alpha, float):
        #    raise Exception(" Initial value for alpha must be a floating point number (.3)")

        #if not isinstance(beta, float):
        #    raise Exception(" Initial value for beta must be a floating point number (.3)")

        #if not isinstance(ntopics, int):
        #    raise Exception(" The number of topics must be an integer")

        self.matrix = data
        self.ntopics = ntopics
        self.alpha = alpha
        self.beta = beta
        self._initialize()
    def _initialize(self):
        """
        Initialize:
        NZM: size(#Docs X #Topics) numpy array with type float 64
            The number of times document M and topic Z interact

        NZW: size(#Topics X #Words) numpy array with type float64
            The number of times topic Z and word W interact

        NM:  size(#Docs) numpy array with type float64
            Sum of documents occurances by topic and word

        NZ:  size(#Topics) numpy array with type float64
            Sum of Topic occurences by word and document

        Topics: size(?) An empty set
           Will come back to this
        """
        ndocs, vsize = self.matrix.shape

        self.NZM = np.zeros((ndocs, self.ntopics))
        self.NZW = np.zeros((self.ntopics, vsize))
        self.NM  = np.zeros(ndocs)
        self.NZ  = np.zeros(self.ntopics)
        self.topics = []
        self.logL = []
        
        for m in xrange(ndocs):
            # Iterates over i, doc_length - 1, and w, the size of unique_words - 1
            for n in xrange(vsize):
                if self.matrix[m,n] == 0:
                    continue
                for w in xrange(self.matrix[m,n]):
                # Initialize a random topic for each word
                    z = np.random.randint(self.ntopics)
                    self.topics.append(z)
                    self.NZM[m,z] += 1
                # Why is NM being +1'd for each i,w?
                    self.NM[m] += 1
                    self.NZW[z,n] += 1
                    self.NZ[z] += 1
                # Keep document, iterator for word, word index, and assignment
                #self.topics.append([i,w,z])
        
        self.topics = np.vstack(self.topics)
    def _conditional_distribution(self, m, n):
        """
        Desc: Compute the conditional distribution of words in document and topic
        Input: m: An integer representing the column index of the document
               w: The generator object from word_indices

        Output: p_z: An array size(1 X ntopics) containing
                  probabilities for topics of word
        
        The formula is:
        ((n_{k,-i}^(t) + \beta_t)/\sum_{t=1}^V(n_{k,-i}^(t) + \beta_t)) *
        ((n_{m,-i}^(t) + \alpha_k)/(\sum_{k=1}^K(n_m^k + \alpha_k) - 1))
        """
        vsize = self.matrix[m,:].nonzero()[0].size
        p_z = np.zeros(self.ntopics)
        for ii in xrange(self.ntopics):
            p_z[ii] = (self.NZM[m,ii] + self.alpha) \
            *(self.NZW[ii,n] + self.beta) \
            / (self.NZ[ii] + vsize * self.beta)
        
        p_z /= np.sum(p_z)

        return p_z

    def loglikelihood(self):
        """
        Desc: Compute the log likelihood that the model generated the data
        Input: self references
        Output: lik: float of the log likelihood
        """
        # Why are these being repeated here?
        vsize = self.matrix[m,:].nonzero()[0].size
        ndocs = self.NZM.shape[0]
        lik = 0

        for z in xrange(self.ntopics):
            lik += log_multi_beta(self.NZW[z,:] + self.beta)
            lik -= log_multi_beta(self.beta, vsize)

        for m in xrange(ndocs):
            lik += log_multi_beta(self.NZM[m,:] + self.alpha)
            lik -= log_multi_beta(self.alpha, self.ntopics)

        return lik

    def phi_theta(self):
        """
        Desc: Compute phi and theta, our topic by word probs and document by topic probs
        Input: Self references
        Output: Two arrays, holding
            [0] phi: Probability of topic by word
            [1] theta: Probability of document by topic
        """
        

        num_phi = self.NZW + self.beta
        num_phi /= np.sum(num_phi, axis = 0)[np.newaxis,:]

        num_theta = self.NZM + self.alpha
        num_theta /= np.sum(num_theta,axis = 1)[:,np.newaxis ]

        return num_phi, num_theta


    def run(self, maxiter = 30, burnin= 0):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        n_docs, vsize = self.matrix.shape
        topics2 = self.topics


        for iteration in xrange(maxiter + 2):
            # Idea: After each iteration we now want to
            # make assignments relative to the newly generated topics
            if iteration > 1:
                self.topics = topics2
            for m in xrange(n_docs):
                for n in xrange(vsize):
                    if self.matrix[m,n] == 0:
                        continue
                    for w in xrange(self.matrix[m,n]):
                        
                        z = self.topics[m,n]
                    
                        self.NZM[m,z] -= 1
                        self.NM[m] -= 1
                        self.NZW[z,n] -= 1
                        self.NZ[z] -= 1

                        p_z = self._conditional_distribution(m,n)
                        # Choosing a random topic row
                        ind_z = np.random.randint(self.ntopics)
                    
                        # Sampling random topic
                        z = index_sample(p_z)
                        
                        #Self.topics needs to change after we iterate over this word
                        # Otherwise at each iteration we subtract one from that space w*n times
                        # giving us a negative number
                        topics2[m,n] = z

                        self.NZM[m,z] += 1
                        self.NM[m] += 1
                        self.NZW[z,n] += 1
                        self.NZ[z] += 1

            if iteration > burnin:
                yield self.phi_theta()

                
    def runfort(self, maxiter = 30, burnin= 0):
        """
        Desc: Perform Gibbs sampling for maxiter iterations

        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins

        Output: phi_theta() Two arrays, holding
        [0] Probability of topic by word
        [1] Probability of document by topic
        """

        M,N = self.matrix.shape

        p_z = np.zeros(self.ntopics)
        p_z += 1./self.ntopics
        
 
        # Make everything fortan contiguous
        p_z=p_z.flatten() # Flatten array (Make 1-D)
        p_z=p_z.reshape(self.ntopics, order='F')
        run_matrix = np.array(self.matrix.transpose(),order='F')
        run_NZM = np.array(self.NZM.transpose(),order='F')
        run_NZW = np.array(self.NZW,order='F')
        run_NZ = np.array(self.NZ,order='F')
        run_NM = np.array(self.NM,order='F')
        # index starts at 1 in fortran
        run_topics = np.array(self.topics,order='F') + 1
        run_topics2 = run_topics
        
        
        topics2 = self.topics.transpose()
        loglik = np.zeros(maxiter)
        
        gibbsSampler6th.gibbs_sampler.gibbssampler(matrix = run_matrix,
                                                nzw = run_NZW,
                                                nzm = run_NZM,
                                                nz = run_NZ,
                                                nm = run_NM,
                                                max_iter = maxiter,
                                                p_z = p_z,
                                                m = M,
                                                n = N,
                                                topics = run_topics,
                                                alpha = self.alpha,
                                                beta = self.beta,
                                                lik = loglik)
                                                
        
        self.NZM = run_NZM.transpose()
        self.matrix = run_matrix.transpose()
        self.NZW = run_NZW
        self.NZ = run_NZ
        self.NM = run_NM

        #if iteration > burnin:
        return self.phi_theta(),loglik
                
                

    def prn(self,x = None):
        print x

    # For some reason this returns (maxiter - burnin) - 2 iterations?
    def update(self, maxiter = 20, burnin = 0):
        """
        Desc: Runs gibbs sampler for maxiter iterations
            Input: maxiter - integer specifying maximum number of iterations
                   burnin  - integer specifying number of iterations to burn through.
                                should be set to zero after initial burnin
            Output: phi_theta() Two arrays, holding
                [0] Probability of topic by word
                [1] Probability of document by topic
        """
        
        for iteration, phi_theta in enumerate(self.run( maxiter, burnin)):
            self.prn(iteration)
            self.prn(self.loglikelihood())
            self.logL.append(self.loglikelihood())
        return self.phi_theta(), self.logL

    def __call__(self):
        self.NZM = self.NZM
        self.NM = self.NM
        self.NZW = self.NZW
        self.NZ = self.NZ
        self.logL = self.logL
        
        
#
DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))
p=.2
# sample files for train
gen_sample = np.array(sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))))
rand_sample = [ allfiles[i] for i in gen_sample ]
#
# take rest for test
rand_sample2 = []
for i in xrange(len(allfiles)):
    if i not in gen_sample:
        rand_sample2.append(allfiles[i])
#
# train data

np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())
#
# test data
np_array_list_test = []
for file_ in rand_sample2:
    df = pd.read_csv(file_, index_col = None, header = 0)
    df['source'] = file_
    np_array_list_test.append(df.as_matrix())
    
#
# train data frame
comb_np_array = np.vstack(np_array_list)
train_frame = pd.DataFrame(comb_np_array)
train_frame.columns = ['words','count','source']
subless = (train_frame['words'].str.len() > 2)
submore = (train_frame['words'].str.len() < 20)
train_frame = train_frame.loc[subless]
train_frame = train_frame.loc[submore]
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.pivot(index = 'source',columns = 'words', values = 'count')
train_frame = train_frame.fillna(value = 0)
train_frame = train_frame.loc[:, (train_frame.sum(axis = 0) > 10)]
#

# test data frame
comb_np_array_test = np.vstack(np_array_list_test)
test_frame = pd.DataFrame(comb_np_array_test)
test_frame.columns = ['words','count','source']
test_frame = test_frame.fillna(value=0)
test_frame = test_frame.pivot(index = 'source', columns = 'words', values = 'count')
test_frame = test_frame.fillna(value = 0)

train_frame1 = train_frame.values.astype(int)


gibbssampler(matrix,nzw,nzm,nz,nm,p_z,topics,alpha,beta,lik,[ntopics,max_iter,m,n,top_size])

Wrapper for ``gibbssampler``.

Parameters
----------
matrix : input rank-2 array('q') with bounds (n,m)
nzw : input rank-2 array('q') with bounds (ntopics,n)
nzm : input rank-2 array('q') with bounds (ntopics,m)
nz : input rank-1 array('q') with bounds (ntopics)
nm : input rank-1 array('q') with bounds (m)
p_z : input rank-1 array('d') with bounds (ntopics)
topics : input rank-1 array('q') with bounds (top_size)
alpha : input float
beta : input float
lik : in/output rank-1 array('d') with bounds (max_iter)

Other Parameters
----------------
ntopics : input long, optional
    Default: shape(nzw,0)
max_iter : input long, optional
    Default: len(lik)
m : input long, optional
    Default: shape(matrix,1)
n : input long, optional
    Default: shape(matrix,0)
top_size : input long, optional
    Default: len(topics)



In [2]:
sampler2 = LdaSampler(data = train_frame1, ntopics = 5, alpha = .0001, beta = .0001)
test_lda2 = sampler2.runfort( maxiter =12000, burnin = 0)

In [76]:
test_lda3 = sampler2.runfort( maxiter =3000, burnin = 0)

In [69]:
likelihood = test_lda2[1]
import matplotlib.pyplot as plt
plt.plot(likelihood)
plt.ylabel('Negative Log Likelihood')
plt.show()

In [70]:
phi_theta = test_lda2[0]
phi = pd.DataFrame(data=phi_theta[0], columns = train_frame.columns)
theta = pd.DataFrame(data=phi_theta[1])
print phi.shape



(5, 3569)


In [80]:
test_lda2

((array([[  2.10526039e-01,   1.33334074e-01,   2.82607798e-01, ...,
            1.25001562e-01,   2.11055249e-01,   9.09140494e-02],
         [  2.63156233e-01,   3.33331852e-01,   1.52174433e-01, ...,
            1.66667361e-01,   2.41205927e-01,   9.09049589e-06],
         [  2.10526039e-01,   3.33331852e-01,   2.39130009e-01, ...,
            1.66667361e-01,   1.80904571e-01,   9.09049589e-06],
         [  2.10526039e-01,   8.88901234e-02,   1.30435539e-01, ...,
            2.08333160e-01,   1.65829232e-01,   6.36343803e-01],
         [  1.05265651e-01,   1.11112099e-01,   1.95652221e-01, ...,
            3.33330556e-01,   2.01005023e-01,   2.72723967e-01]]),
  array([[ 0.21479713,  0.17303104,  0.24224341,  0.19093079,  0.17899763],
         [ 0.19714286,  0.18142858,  0.21571427,  0.19714286,  0.20857142],
         [ 0.20491803,  0.21639343,  0.19508197,  0.20983606,  0.17377051],
         [ 0.21205357,  0.20535714,  0.20238095,  0.19196429,  0.18824405],
         [ 0.21134492,  

In [83]:
phi_theta = test_lda2[0]
phi = pd.DataFrame(data=phi_theta[0], columns = train_frame.columns)
theta = pd.DataFrame(data=phi_theta[1])
dat_phi = phi.transpose()

In [84]:
phi


words,abb,ability,able,about,above,absence,absolute,abstract,academic,academy,...,yobs,yohai,york,you,young,your,youth,zeger,zero,zin
0,0.210526,0.133334,0.282608,0.191781,0.215686,0.217391,0.142858,0.312496,0.142858,0.166667,...,0.208955,0.363629,0.197403,0.196429,0.050004,0.222222,0.272724,0.125002,0.211055,0.090914
1,0.263156,0.333332,0.152174,0.232877,0.189543,0.217391,0.171429,0.312496,0.228571,0.2,...,0.22388,0.090914,0.184416,0.196429,0.100002,0.249999,0.181819,0.166667,0.241206,9e-06
2,0.210526,0.333332,0.23913,0.219178,0.24183,0.217391,0.2,0.125002,0.257142,0.266666,...,0.119404,0.090914,0.179221,0.214286,0.249999,0.166667,0.272724,0.166667,0.180905,9e-06
3,0.210526,0.08889,0.130436,0.19726,0.215686,0.260868,0.2,0.062504,0.257142,0.166667,...,0.22388,0.363629,0.215584,0.178572,0.399995,0.222222,0.090914,0.208333,0.165829,0.636344
4,0.105266,0.111112,0.195652,0.158904,0.137255,0.086959,0.285714,0.1875,0.114287,0.2,...,0.22388,0.090914,0.223377,0.214286,0.2,0.13889,0.181819,0.333331,0.201005,0.272724


In [88]:
col_ = 2
dat_phi = dat_phi.sort_values(col_,ascending=False)
dat_phi.iloc[1:20,col_]

words
snedecor      0.538449
exposition    0.529402
gets          0.526307
averaging     0.499993
mcleod        0.499989
bls           0.499989
marvin        0.499988
payments      0.499988
residents     0.499988
anything      0.499988
gupta         0.473677
highest       0.470580
norman        0.470580
wilson        0.466658
hastings      0.466658
revealed      0.461528
mgf           0.461528
jersey        0.461528
thesis        0.461528
Name: 2, dtype: float64

In [73]:
phi_new = pd.DataFrame(sampler2.NZW/np.sum( (sampler2.NZW + sampler2.beta),axis = 1)[:,np.newaxis],columns = train_frame.columns)
phi_new.to_csv("./Models/topics5/phi_new.csv")

In [74]:
phi.to_csv("./Models/topics5/phi.csv" )
theta.to_csv("./Models/topics5/theta.csv" )

In [75]:
pd.DataFrame(likelihood).to_csv("./Models/topics5/likelihood.csv" )

In [58]:
train_frame.to_csv("./Models/topics5/dtm_dat.csv" )