In [11]:
import nltk
import numpy as np
import pandas as pd
import os
import glob
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln


In [3]:

DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))

p = .5
rand_sample = [ allfiles[i] for i in sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))) ]
rand_sample
    
np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())
    
comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)
big_frame.columns = ['words','count','source']

big_frame = big_frame.pivot(index = 'source',columns = 'words', values = 'count')
big_frame = big_frame.fillna(value = 0)

In [4]:
# There are some nonsense words in here, can we use some sort of dictionary to sweep them out?
''' 
My best guess as to why this is happening is:
1. As JSTOR parses the data things like equations and tables become jibberish.
2. Maybe we are using the wrong character format? Maybe it's unicode?
3. Aliens
'''
big_frame.head()


words,nan,a,aa,aaberge,aac,aachen,aacm,aaericai,aaerican,aai,...,zy,zyeg,zygmund,zyl,zymax,zymin,zyskind,zz,zzi,zzzt
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data_folder/wordcounts/wordcounts_10.2307_2276742.CSV,0,22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276818.CSV,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276856.CSV,0,181,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276867.CSV,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276869.CSV,0,28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# We could do a couple of shady things like this to help remove those
big_frame = big_frame.loc[:, (big_frame.sum(axis = 0) > 2)]
big_frame = big_frame.loc[:, (big_frame.sum(axis = 0) < 20)]
big_frame.head()



words,nan,aac,aacm,aai,aalen,aam,aamse,aare,aaron,abandon,...,zurich,zv,zvi,zw,zwet,zx,zy,zygmund,zz,zzi
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data_folder/wordcounts/wordcounts_10.2307_2276742.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276818.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276856.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276867.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276869.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
class LDA:
    ''' Latent Dirichlet Allocation (LDA) 
    Parameters
    ----------
    words: A D x N DataFrame holding the count of unique words in each document
    
    ntopics: Intiger Number of topics
    
    alpha: Numeric hyperparameter for Dirichlet prior for theta, the prior count of topics in a document
    
    beta: Numeric hyperparameter of Dirichlet prior for Phi, prior on words in a topic
    
    max_iter: Integer Maximum number of iterations
    
    burn_in: Integer burin-in iterations
    
    Output
    ---------
    Phi: An N x K Dataframe with each row being words and columns being probability of word being in topic
    
    Theta: A K x M Dataframe with each row being topics and columns being probability of document in topic
    '''
    
    
        
        

In [32]:
def log_multinomial_beta(alpha):
    '''
    Function to take the logarithm of the multinomial beta function
    '''
    return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))

testAlpha = pd.DataFrame(random.sample(range(1,100),10),index = xrange(10), columns = xrange(1))
log_multinomial_beta(testAlpha)

0   -1388.21078
dtype: float64

In [363]:
M, N = big_frame.shape

# Dimensions
# M: Number of documents
# N: Number of words
# ntopics: Number of topics

alpha = .01
beta = .01
burn_in = 10
max_iter = 100
ntopics = 6

NWZ = np.zeros((M,ntopics), dtype=np.float64) + beta
NZM = np.zeros((ntopics,M), dtype = np.float64) + alpha
NZ = NWZ.sum(axis=0)
Z = np.zeros((M,N), dtype = np.float64)
Phi = np.zeros((M,N), dtype = np.float64)
Theta = np.zeros((N,ntopics), dtype = np.float64)
topicdraw = np.ones((1,ntopics), dtype = np.float64) / ntopics
read_out_Phi = np.zeros((N,ntopics), dtype = np.float64)
read_out_Theta = np.zeros((ntopics,M))
read_out_sampling_num = 0
logPw_z = np.zeros(max_iter, dtype = np.float64)
betaVec = np.ones(ntopics, dtype = np.float64) * beta
sampling_lag = 10

Z = np.where(np.random.multinomial(1,[1./ntopics]*ntopics,size = M*N )==1)[1]
Z_index = Z.reshape(M*N)
Z = Z.reshape(M,N)
big_frame_index = big_frame.stack()

NWZ[Z_index,:] += 1
NWZ[big_frame_index,Z_index] +=1
NZ[Z_index] += 1

In [359]:
M, N = big_frame.shape

# Dimensions
# M: Number of documents
# N: Number of words
# ntopics: Number of topics
alpha = .01
beta = .01
burn_in = 10
max_iter = 100
ntopics = 6

NWZ = np.zeros((m,ntopics), dtype=np.float64) + beta
NZM = np.zeros((ntopics,M), dtype = np.float64) + alpha
NZ = NWZ.sum(axis=0)
Z = np.zeros((M,N), dtype = np.float64)
Phi = np.zeros((M,N), dtype = np.float64)
Theta = np.zeros((N,ntopics), dtype = np.float64)
topicdraw = np.ones((1,ntopics), dtype = np.float64) / ntopics
read_out_Phi = np.zeros((N,ntopics), dtype = np.float64)
read_out_Theta = np.zeros((ntopics,M))
read_out_sampling_num = 0
logPw_z = np.zeros(max_iter, dtype = np.float64)
betaVec = np.ones(ntopics, dtype = np.float64) * beta
sampling_lag = 10


Z = np.where(np.random.multinomial(1,[1./ntopics]*ntopics,size = M*N )==1)[1]
Z = pd.DataFrame(Z.reshape(M,N))

# Draw the initial starting points
for m in xrange(M):
    for n in xrange(N):
        NZM[Z[m,n],m] = NZM[Z[m,n],m] + 1
        NWZ[big_frame[m,n],Z[m,n]] = NWZ[big_frame[m,n],Z[m,n]] + 1
        NZ[Z[m,n]] = NZ[Z[m,n]] + 1

    

KeyError: (0, 0)

In [239]:
m = 1
n=1
big_frame.iloc[m,n]

0

In [354]:
## I had to remove the subtraction because it was giving me negative numbers?
NZM[Z[m,n],m] = NZM[Z[m,n],m]
NWZ[big_frame.iloc[m,n],Z[m,n]] = NWZ[big_frame.iloc[m,n],Z[m,n]] 
NZ[Z[m,n]] = NZ[Z[m,n]] 
p = np.zeros(ntopics, dtype = np.float64)
for k in xrange(ntopics):
    p[k] = NWZ[big_frame.iloc[m,n],k]/NZ[k] * NZM[k,m]

In [362]:
Z[m,n]

4

In [None]:
Z.setflags(write=True)

for iteration in xrange(max_iter):
        for m in xrange(M):
            for n in xrange(N):
                NZM[Z[m,n],m] = NZM[Z[m,n],m]
                NWZ[big_frame.iloc[m,n],Z[m,n]] = NWZ[big_frame.iloc[m,n],Z[m,n]] 
                NZ[Z[m,n]] = NZ[Z[m,n]] 
                p = np.zeros(ntopics, dtype = np.float64)
                for k in xrange(ntopics):
                    p[k] = NWZ[big_frame.iloc[m,n],k]/NZ[k] * NZM[k,m]
                
                p = p / np.sum(p)
                Z[m,n] = np.where(np.random.multinomial(1,p,size = 1 ) == 1)[1]
                NZM[Z[m,n],m] = NZM[Z[m,n],m] + 1
                NWZ[big_frame.iloc[m,n],Z[m,n]] = NWZ[big_frame.iloc[m,n],Z[m,n]] + 1
                NZ[Z[m,n]] = NZ[Z[m,n]] + 1
        
        
        for ZZ in xrange(ntopics):
            logPw_z[iteration] = logPw_z[iteration] + log_multinomial_beta(NWZ[:,ZZ]) - log_multinomial_beta(betaVec)
        
        if iteration % sampling_lag == 0 or iteration % sampling_lag == 1:
            if iteration >= burn_in:
                read_out_sampling_num = read_out_sampling_num + 1
                for k in xrange(K):
                    read_out_Phi[:,k] = read_out_Phi[:,k] + NWZ[:,k] / NZ[k]
                
                for m in xrange(M):
                    read_out_Theta[:,m] = read_out_Theta[:,m] + NZM[:,m]/sum(NZM[:,m])
                
Phi = read_out_Phi / read_out_sampling_num
Theta = read_out_Theta / read_out_sampling_num
        
        
        
        
        

In [59]:
import matplotlib.pyplot as plt


plt.plot(logPw_z.transpose())
plt.show()

KeyboardInterrupt: 