In [None]:

import os
import shutil

N_TOPICS = 10
DOCUMENT_LENGTH = 100
FOLDER = "topicimg"

def vertical_topic(width, topic_index, document_length):
    """
    Generate a topic whose words form a vertical bar.
    """
    m = np.zeros((width, width))
    m[:, topic_index] = int(document_length / width)
    return m.flatten()

def horizontal_topic(width, topic_index, document_length):
    """
    Generate a topic whose words form a horizontal bar.
    """
    m = np.zeros((width, width))
    m[topic_index, :] = int(document_length / width)
    return m.flatten()

def save_document_image(filename, doc, zoom=2):
    """
    Save document as an image.
    doc must be a square matrix
    """
    height, width = doc.shape
    zoom = np.ones((width*zoom, width*zoom))
    # imsave scales pixels between 0 and 255 automatically
    sp.misc.imsave(filename, np.kron(doc, zoom))

def gen_word_distribution(ntopics, document_length):
    """
    Generate a word distribution for each of the ntopics.
    """
    width = ntopics / 2
    vsize = width ** 2
    m = np.zeros((ntopics, vsize))

    for k in range(width):
        m[k,:] = vertical_topic(width, k, document_length)

    for k in range(width):
        m[k+width,:] = horizontal_topic(width, k, document_length)

    m /= m.sum(axis=1)[:, np.newaxis] # turn counts into probabilities

    return m

def gen_document(word_dist, ntopics, vsize, length=DOCUMENT_LENGTH, alpha=0.1):
    """
    Generate a document:
    1) Sample topic proportions from the Dirichlet distribution.
    2) Sample a topic index from the Multinomial with the topic
       proportions from 1).
    3) Sample a word from the Multinomial corresponding to the topic
       index from 2).
    4) Go to 2) if need another word.
    """
    theta = np.random.mtrand.dirichlet([alpha] * ntopics)
    v = np.zeros(vsize)
    for n in range(length):
        z = index_sample(theta)
        w = index_sample(word_dist[z,:])
        v[w] += 1
    return v

def gen_documents(word_dist, ntopics, vsize, n=500):
    """
    Generate a document-term matrix.
    """
    m = np.zeros((n, vsize))
    for i in xrange(n):
        m[i, :] = gen_document(word_dist, ntopics, vsize)
    
    return m

if os.path.exists(FOLDER):
    shutil.rmtree(FOLDER)
os.mkdir(FOLDER)


In [19]:

import numpy as np
import scipy as sp
from scipy.special import gammaln
import scipy.misc
import random

def index_sample(p):
    '''
    Desc: Samples from n topics distributed multinomially and returns topic number
    input: p - A one dimensional array of float64 type that contains the probability for each topic
    output: an Integer specifying which topic was chosen from a multinomial distribution
    '''
    r = random.random()
    for i in range(len(p)):
        r = r - p[i]
        if r < 0:
            return i
    return len(p) - 1

def word_indices(vec):
    '''
    Desc: Take a vector of word counts from a document and create a generator for word indices
    input: A vector from a Document Term Frequency matrix for one document.
    output: A generator object to store the word indices when called
    '''
    for idx in vec.nonzero()[0]:
        for i in xrange(int(vec[idx])):
            yield idx

def log_multi_beta(alpha, K = None):
    """
    Desc: Compute the logarithm of the multinomial beta function
    input: alpha - A vector with type float64 or a scaler of float64
           K - An integer that, if alpha is a scalar, multiplies the log by K
    output: a float64 with value of the logarithm of the multinomial beta
    """
    
    if K is None:
        return np.sum(gammaln(alpha) - gammaln(np.sum(alpha)))
    else:
        return K * gammaln(alpha) - gammaln(K * alpha)
    
class LdaSampler(object):
    
    def __init__(self,  data, ntopics, alpha = .1, beta = .1):
        """
        Desc: Initialize values for our class object
        alpha: a float scalar
        beta: a float scalar
        ntopics: an integer for the number of topics
        """
        if not isinstance(alpha, float):
            raise Exception(" Initial value for alpha must be a floating point number (.3)")
        
        if not isinstance(beta, float):
            raise Exception(" Initial value for beta must be a floating point number (.3)")
            
        if not isinstance(ntopics, int):
            raise Exception(" The number of topics must be an integer")
        
        self.matrix = data
        self.ntopics = ntopics
        self.alpha = alpha
        self.beta = beta
        self._initialize()
    def _initialize(self):
        '''
        Initialize:
        NZM: size(#Docs X #Topics) numpy array with type float 64
            The number of times document M and topic Z interact
        
        NZW: size(#Topics X #Words) numpy array with type float64
            The number of times topic Z and word W interact
        
        NM:  size(#Docs) numpy array with type float64
            Sum of documents occurances by topic and word
        
        NZ:  size(#Topics) numpy array with type float64
            Sum of Topic occurences by word and document
        
        Topics: size(?) An empty set
           Will come back to this
        '''
        ndocs, vsize = self.matrix.shape
        
        self.NZM = np.zeros((ndocs, self.ntopics))
        self.NZW = np.zeros((self.ntopics, vsize))
        self.NM  = np.zeros(ndocs)
        self.NZ  = np.zeros(self.ntopics)
        self.topics = {}
        
        for m in xrange(ndocs):
            # Iterates over i, doc_length - 1, and w, the size of unique_words - 1
            for i, w, in enumerate(word_indices(self.matrix[m,:])):
                # Initialize a random topic for each word
                z = np.random.randint(self.ntopics)
                self.NZM[m,z] += 1
                # Why is NM being +1'd for each i,w?
                self.NM[m] += 1
                self.NZW[z,w] += 1
                self.NZ[z] += 1
                self.topics[(m,i)] = z
    
    def _conditional_distribution(self, m, w):
        '''
        Desc: Compute the conditional distribution of words in document and topic
        Input: m: An integer representing the column index of the document
               w: The generator object from word_indices
        
        Output: p_z: An array size(w X 1) containing probabilities for topics of word
        '''
        vsize = self.NZW.shape[1]
        left = (self.NZW[:,w] + self.beta) / (self.NZ + self.beta * vsize)
        right = (self.NZM[m,:] + self.alpha) / (self.NM[m] + self.alpha * self.ntopics)
        p_z = abs(left * right)
        p_z /= np.sum(p_z)
        return p_z
    
    def loglikelihood(self):
        '''
        Desc: Compute the log likelihood that the model generated the data
        Input: self references
        Output: lik: float of the log likelihood
        '''
        # Why are these being repeated here?
        vsize = self.NZW.shape[1]
        ndocs = self.NZM.shape[0]
        lik = 0
        
        for z in xrange(self.ntopics):
            lik += log_multi_beta(self.NZW[z,:] + self.beta)
            lik -= log_multi_beta(self.beta, vsize)
        
        for m in xrange(ndocs):
            lik += log_multi_beta(self.NZM[m,:] + self.alpha)
            lik -= log_multi_beta(self.alpha, self.ntopics)
        
        return lik
    
    def phi_theta(self):
        '''
        Desc: Compute phi and theta, our topic by word probs and document by topic probs
        Input: Self references
        Output: Two arrays, holding 
            [0] phi: Probability of topic by word 
            [1] theta: Probability of document by topic
        '''
        num_phi = self.NZW + self.beta
        num_phi /= np.sum(num_phi, axis = 1)[:, np.newaxis]
        
        num_theta = self.NZM + self.alpha
        num_theta /= np.sum(num_theta,axis = 1)[:, np.newaxis]

        return num_phi, num_theta

        
    def run(self, maxiter = 30, burnin= 0):
        '''
        Desc: Perform Gibbs sampling for maxiter iterations
        
        Input: matrix - An array that is a Document Term Frequency Matrix
               maxiter - An integer with the number of iterations
               Burnin - TBA: An integer of the number of burnins
               
        Output: phi_theta() Two arrays, holding 
        [0] Probability of topic by word 
        [1] Probability of document by topic
        '''
        
        n_docs, vsize = self.matrix.shape
        
    
                    
        for iteration in xrange(maxiter):
            for m in xrange(n_docs):
                for i,w in enumerate(word_indices(self.matrix[m,:])):
                    z = self.topics[(m,i)]
                
                    self.NZM[m,z] -= 1
                    self.NM[m] -= 1
                    self.NZW[z,w] -= 1
                    self.NZ[z] -= 1
                  
                    p_z = self._conditional_distribution(m,w)
                    z = index_sample(p_z)
                    
                    self.NZM[m,z] += 1
                    self.NM[m] += 1
                    self.NZW[z,w] += 1
                    self.NZ[z] += 1
            
            if iteration > burnin:
                yield self.phi_theta()
    
    def prn(self,x = None):
        print x
    
    # For some reason this returns (maxiter - burnin) - 2 iterations?
    def update(self, maxiter = 20, burnin = 0):
        """
        Desc: Runs gibbs sampler for maxiter iterations
            Input: maxiter - integer specifying maximum number of iterations
                   burnin  - integer specifying number of iterations to burn through.
                                should be set to zero after initial burnin
            Output: phi_theta() Two arrays, holding 
                [0] Probability of topic by word 
                [1] Probability of document by topic
        """
        for iteration, phi_theta in enumerate(self.run( maxiter, burnin)):
            self.prn(iteration)
            self.prn(self.loglikelihood())
        return self.phi_theta()
    
    def __call__(self):
        self.NZM = self.NZM
        self.NM = self.NM
        self.NZW = self.NZW
        self.NZ = self.NZ
        

                
                  
                    

In [584]:
sampler = LdaSampler(data = matrix, ntopics = 5, alpha = .1, beta = .1)

In [585]:
LDAtest = sampler.update(maxiter = 20)

0
-10794391.6783
1
-10745505.6829
2
-10687161.3311
3
-10627083.6979
4
-10568265.7428
5
-10509290.0355
6
-10451636.6077
7
-10396654.4931
8
-10341041.371
9
-10288298.7021
10
-10239659.5004
11
-10191174.421
12
-10142940.428
13
-10096202.3979
14
-10048151.8639
15
-9999155.39838
16
-9951478.65329
17
-9903321.1695
18
-9855751.61107


In [None]:
LDAtest = sampler.update(maxiter = 20)

0
-9763120.72008
1
-9718356.74564
2
-9673326.71194
3
-9629062.19547


In [None]:
width = N_TOPICS / 2
vsize = width ** 2
word_dist = gen_word_distribution(N_TOPICS, DOCUMENT_LENGTH)
matrix = gen_documents(word_dist, N_TOPICS, vsize)
sampler = LdaSampler(N_TOPICS)



In [4]:
import numpy as np
import pandas as pd
import os
import glob
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln



In [22]:

DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))

p = .5
rand_sample = [ allfiles[i] for i in sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))) ]
rand_sample
    
np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())
    
comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)
big_frame.columns = ['words','count','source']
big_frame = big_frame.fillna(value = 0)
big_frame = big_frame.pivot(index = 'source',columns = 'words', values = 'count')
big_frame = big_frame.fillna(value = 0)

#big_test = big_frame.head(n=10000)
#big_test = big_test.pivot(index = 'source',columns = 'words', values = 'count')
#big_test = big_test.fillna(value = 0)

In [24]:
big_frame = big_frame.loc[:, (big_frame.sum(axis = 0) > 2)]
big_frame = big_frame.loc[:, (big_frame.sum(axis = 0) < 20)]
big_frame.head()


words,aac,aacm,aalen,aam,aamse,aar,aare,aaron,abandon,abandoned,abandonment,abb,abbey,abbott,abbreviated,abbreviations,aberdeen,abilities,ables,ably,Unnamed: 21_level_0
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data_folder/wordcounts/wordcounts_10.2307_2276708.CSV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
data_folder/wordcounts/wordcounts_10.2307_2276722.CSV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
data_folder/wordcounts/wordcounts_10.2307_2276742.CSV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
data_folder/wordcounts/wordcounts_10.2307_2276818.CSV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...
data_folder/wordcounts/wordcounts_10.2307_2276953.CSV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...


In [104]:
sampler = LdaSampler(data = big_frame.values, ntopics = 5, alpha = .1, beta = .1)
LDAtest = sampler.update(maxiter = 20)

(500, 34934)

In [30]:
LDAtest[1].shape

(500, 5)