In [83]:
import nltk
import numpy as np
import pandas as pd
import os
import glob
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln



In [84]:

# Load a random subset of the per-document word-count CSVs and pivot them
# into a single source-by-word count table (`big_frame`).
DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))

# Fraction of the matched files to load.
p = .5
# Sample file positions without replacement; sorting the positions keeps the
# chosen files in the same relative order as `allfiles`.
# `range` is used instead of the Python-2-only `xrange` (works on 2 and 3).
sample_positions = sorted(random.sample(range(len(allfiles)), int(p * len(allfiles))))
rand_sample = [allfiles[i] for i in sample_positions]

# Each file contributes a raw array of its rows plus the path it came from,
# so rows stay attributable to their source after stacking.
# Assumes every CSV has exactly two columns (word, count) -- TODO confirm.
np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_, index_col=None, header=0)
    df['source'] = file_
    # `.values` replaces `DataFrame.as_matrix()`, which pandas 1.0 removed.
    np_array_list.append(df.values)

comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)
big_frame.columns = ['words','count','source']

# One row per source file, one column per word; a word missing from a file
# becomes NaN in the pivot and is then replaced by 0.
big_frame = big_frame.pivot(index = 'source',columns = 'words', values = 'count')
big_frame = big_frame.fillna(value = 0)

In [85]:
# Keep only the words whose count, summed over all rows, is strictly
# between 2 and 20. Dropping a column never changes another column's total,
# so one combined mask selects the same columns as two successive filters.
word_totals = big_frame.sum(axis = 0)
within_bounds = (word_totals > 2) & (word_totals < 20)
big_frame = big_frame.loc[:, within_bounds]
big_frame.head()


words,nan,aac,aacm,aalen,aam,aare,aaron,abandon,abandoned,abandonment,...,zurich,zv,zvi,zw,zwet,zx,zy,zygmund,zygosity,zz
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data_folder/wordcounts/wordcounts_10.2307_2276742.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276818.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276825.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276843.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276856.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
def log_multinomial_beta(alpha):
    '''
    Logarithm of the multinomial beta function of ``alpha``, i.e.
    sum(log Gamma(alpha_i)) - log Gamma(sum(alpha_i)).
    '''
    log_gamma_terms = gammaln(alpha)
    alpha_total = np.sum(alpha)
    return np.sum(log_gamma_terms) - gammaln(alpha_total)

# Smoke test: ten distinct random integers drawn from 1..99, held in a
# single-column DataFrame. `range` replaces the Python-2-only `xrange`
# and behaves the same here on Python 2 and 3.
testAlpha = pd.DataFrame(random.sample(range(1,100),10),index = range(10), columns = range(1))
log_multinomial_beta(testAlpha)


0   -970.881264
dtype: float64

In [89]:
# Dimensions of big_frame as (rows, columns).
big_frame.shape


(500, 10158)

In [105]:
#Run this, but not the one that looks the same below it
M, N = big_frame.shape

# Dimensions
# M: Number of documents
# N: Number of words
# ntopics: Number of topics

alpha = .3       # constant added to every document-topic count
beta = .4        # constant added to every word-topic count
burn_in = 10     # iterations to run before read-outs are collected
max_iter = 100
ntopics = 6

# Number of times topic z and word w interact.
# Stored as (N, ntopics): the sampler indexes it as NZW[row, topic] and
# reads whole topics as NZW[:, k], so topics must be the second axis.
NZW = np.zeros((N,ntopics), dtype=np.float64) + beta

# Number of times document m and topic z interact
NZM = np.zeros((ntopics,M), dtype = np.float64) + alpha

# Summing over words: one total per topic (length ntopics)
NZ = NZW.sum(axis=0)

# Result and read-out buffers. Theta has the same (ntopics, M) layout as
# read_out_Theta, from which it is eventually computed.
Phi = np.zeros((N,ntopics), dtype = np.float64)
Theta = np.zeros((ntopics,M), dtype = np.float64)
topicdraw = np.ones((1,ntopics), dtype = np.float64) / ntopics
read_out_Phi = np.zeros((N,ntopics), dtype = np.float64)
read_out_Theta = np.zeros((ntopics,M))
read_out_sampling_num = 0
logPw_z = np.zeros(max_iter, dtype = np.float64)
# Prior vector matched against a single topic column NZW[:, k], which has
# one entry per word, so it needs N entries rather than ntopics.
betaVec = np.ones(N, dtype = np.float64) * beta
sampling_lag = 10

# Initial topic for every (document, word) cell, drawn uniformly from
# 0..ntopics-1. Drawing the ids directly avoids building an
# (M*N, ntopics) one-hot array just to read the position of the 1.
Z = np.random.randint(ntopics, size = (M,N))
# Flat, row-major view of the assignments (document-major order).
Z_index = Z.ravel()
# Word-major view: Z_iter[:, m] is the assignment vector of document m.
# A transpose is required here; reshape(N, M) would mix documents together.
Z_iter = Z.T

big_frame_index = big_frame.stack()

In [112]:
# Number of occurrences of each topic id present in column 1 of Z_iter.
# np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which
# SciPy has deprecated and which this notebook only imports in a later cell.
np.unique(Z_iter[:,1], return_counts=True)[1]

array([ 1780.,  1648.,  1681.,  1683.,  1693.,  1673.])

In [113]:
from scipy.stats import itemfreq

# index_counts[k, m] = number of entries of Z_iter[:, m] equal to topic k.
index_counts = np.zeros((ntopics,M),dtype = np.float64)

for m in range(M):
    # bincount with minlength always returns ntopics entries, with a 0 for
    # any topic that does not occur in this column. A frequency table of
    # only the observed values would be shorter in that case and the
    # column assignment would fail with a shape mismatch.
    index_counts[:,m] = np.bincount(Z_iter[:,m], minlength = ntopics)


In [114]:
# Inspect the (ntopics, M) table of per-column topic counts.
index_counts

array([[ 1681.,  1780.,  1751., ...,  1651.,  1705.,  1650.],
       [ 1742.,  1648.,  1681., ...,  1680.,  1721.,  1650.],
       [ 1696.,  1681.,  1699., ...,  1707.,  1651.,  1719.],
       [ 1619.,  1683.,  1675., ...,  1746.,  1680.,  1663.],
       [ 1669.,  1693.,  1707., ...,  1695.,  1744.,  1709.],
       [ 1751.,  1673.,  1645., ...,  1679.,  1657.,  1767.]])

In [115]:
# Inspect the document-topic table; it is created as (ntopics, M) filled with alpha.
NZM

array([[ 0.3,  0.3,  0.3, ...,  0.3,  0.3,  0.3],
       [ 0.3,  0.3,  0.3, ...,  0.3,  0.3,  0.3],
       [ 0.3,  0.3,  0.3, ...,  0.3,  0.3,  0.3],
       [ 0.3,  0.3,  0.3, ...,  0.3,  0.3,  0.3],
       [ 0.3,  0.3,  0.3, ...,  0.3,  0.3,  0.3],
       [ 0.3,  0.3,  0.3, ...,  0.3,  0.3,  0.3]])

In [117]:
# Check the layout of the word-topic count table.
NZW.shape

(10158, 6)

In [82]:
# I keep getting memory errors, so had to use a loop to iterate
# is there some way to get around this?
#
# Yes: none of the three updates needs a Python loop or a large temporary.

# index_counts[k, m] already holds how often topic k was assigned in
# document m, so the whole table is alpha plus those counts. (Selecting
# NZM[Z_index, m] picks M*N cells and cannot receive ntopics values.)
NZM[:] = alpha + index_counts

# Row indices for NZW, flattened in the same order as Z_index.
word_rows = np.asarray(big_frame_index, dtype = np.intp)

# `NZW[rows, cols] += 1` applies each distinct (row, col) pair only once,
# however often it repeats; np.add.at accumulates every occurrence.
np.add.at(NZW, (word_rows, Z_index), 1)

# Per-topic totals: add how many cells were assigned to each topic.
NZ += np.bincount(Z_index, minlength = NZ.shape[0])


(6, 500)

In [88]:
# Alternative version of the one above which causes
# a memory error
#
# NOTE(review): Z_index has M*N entries and NZM has M columns, so
# NZM[Z_index,:] materialises an (M*N, M) temporary before the addition --
# presumably the source of the MemoryError recorded below.
# NOTE(review): `+=` through an integer index array applies each distinct
# index once, so repeated indices are not accumulated; np.add.at is the
# accumulating form.

NZM[Z_index,:] += 1    
NZW[big_frame_index,Z_index] +=1
NZ[Z_index] += 1

MemoryError: 

In [None]:
# Alternative2 of the one above
# This one is very slow

# Draw the initial starting points
# Positional access needs the underlying array: `big_frame[m,n]` on a
# DataFrame is a column lookup for the label (m, n) and raises KeyError.
cell_values = big_frame.values
for m in range(M):
    for n in range(N):
        topic = Z[m,n]
        NZM[topic,m] += 1
        NZW[cell_values[m,n],topic] += 1
        NZ[topic] += 1

In [60]:
# This is the meat and potatoes where we
# calculate the posterior distribution for
# Phi and Theta

Z.setflags(write=True)

# Plain ndarray of the frame's values, fetched once: calling
# DataFrame.iloc[m, n] inside the innermost loop is far slower than
# indexing an array and returns the same element.
cell_values = big_frame.values

for iteration in range(max_iter):
    for m in range(M):
        for n in range(N):
            # NOTE(review): a collapsed Gibbs step normally removes the
            # current assignment from the counts before resampling; these
            # decrements are disabled -- confirm that is intended.
            #NZM[Z[m,n],m] -= 1
            #NZW[big_frame.iloc[m,n],Z[m,n]] -= 1
            #NZ[Z[m,n]] -= 1

            # NOTE(review): NZW is row-indexed by the cell's value rather
            # than by the word position n -- confirm which is meant.
            row = cell_values[m,n]

            # Unnormalised weight of every topic for this cell, computed
            # for all topics at once (same arithmetic as a loop over k).
            p = NZW[row,:] / NZ * NZM[:,m]
            p = p / np.sum(p)

            # One multinomial trial yields a one-hot vector; argmax turns
            # it into a scalar topic id, so a scalar (not a 1-element
            # array) is stored in Z.
            topic = np.random.multinomial(1,p).argmax()
            Z[m,n] = topic
            NZM[topic,m] += 1
            NZW[row,topic] += 1
            NZ[topic] += 1

    # Accumulate, over topics, the log multinomial-beta of that topic's
    # word counts minus that of the prior vector.
    for ZZ in range(ntopics):
        logPw_z[iteration] += log_multinomial_beta(NZW[:,ZZ]) - log_multinomial_beta(betaVec)

    # After burn-in, collect a read-out on iterations whose remainder
    # modulo sampling_lag is 0 or 1.
    if iteration % sampling_lag == 0 or iteration % sampling_lag == 1:
        if iteration >= burn_in:
            read_out_sampling_num = read_out_sampling_num + 1
            # Column k of NZW divided by NZ[k], for every topic at once.
            read_out_Phi += NZW / NZ
            # Column m of NZM divided by that column's sum, for every document.
            read_out_Theta += NZM / NZM.sum(axis = 0)

# Average the collected read-outs. Fail loudly instead of filling Phi and
# Theta with NaN/inf when no iteration qualified for a read-out.
if read_out_sampling_num == 0:
    raise ValueError("no read-outs were collected; check max_iter, burn_in and sampling_lag")
Phi = read_out_Phi / read_out_sampling_num
Theta = read_out_Theta / read_out_sampling_num
        
      
        
        


ValueError: sum(pvals[:-1]) > 1.0

In [92]:
# Inspect the value currently bound to `p`.
p

0.5

In [23]:
# NOTE(review): `.iloc` is a pandas indexer, so this only works when NZW is a
# pandas object; the setup cell builds NZW as a NumPy array -- confirm or remove.
NZW.iloc[Z,1]

0    0.01
Name: 1, dtype: float64