In [1]:
import nltk
import numpy as np
import pandas as pd
import os
import glob
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.special import gammaln



In [2]:

DIR = r'data_folder/wordcounts'
allfiles = glob.glob(os.path.join(DIR,"*.CSV"))

p = .5
rand_sample = [ allfiles[i] for i in sorted(random.sample(xrange(len(allfiles)), int(p * len(allfiles)))) ]
rand_sample
    
np_array_list = []
for file_ in rand_sample:
    df = pd.read_csv(file_,index_col=None, header=0)
    df['source'] = file_
    np_array_list.append(df.as_matrix())
    
comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)
big_frame.columns = ['words','count','source']

big_frame = big_frame.pivot(index = 'source',columns = 'words', values = 'count')
big_frame = big_frame.fillna(value = 0)

In [3]:
big_frame = big_frame.loc[:, (big_frame.sum(axis = 0) > 2)]
big_frame = big_frame.loc[:, (big_frame.sum(axis = 0) < 20)]
big_frame.head()


words,aaby,aalen,aamse,aaron,abab,ababab,abandon,abandoned,abandonment,abba,...,zurich,zurnal,zv,zvi,zw,zwet,zy,zygmund,zyskind,zz
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data_folder/wordcounts/wordcounts_10.2307_2276722.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276732.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276742.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276818.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
data_folder/wordcounts/wordcounts_10.2307_2276825.CSV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def log_multinomial_beta(alpha):
    '''
    Function to take the logarithm of the multinomial beta function
    '''
    return np.sum(gammaln(alpha)) - gammaln(np.sum(alpha))

testAlpha = pd.DataFrame(random.sample(range(1,100),10),index = xrange(10), columns = xrange(1))
log_multinomial_beta(testAlpha)


0   -923.993245
dtype: float64

In [5]:
M, N = big_frame.shape

# Dimensions
# M: Number of documents
# N: Number of words
# ntopics: Number of topics

alpha = .01
beta = .01
burn_in = 10
max_iter = 100
ntopics = 6

NWZ = np.zeros((M,ntopics), dtype=np.float64) + beta
NZM = np.zeros((ntopics,M), dtype = np.float64) + alpha
NZ = NWZ.sum(axis=0)
Z = np.zeros((M,N), dtype = np.float64)
Phi = np.zeros((M,N), dtype = np.float64)
Theta = np.zeros((N,ntopics), dtype = np.float64)
topicdraw = np.ones((1,ntopics), dtype = np.float64) / ntopics
read_out_Phi = np.zeros((N,ntopics), dtype = np.float64)
read_out_Theta = np.zeros((ntopics,M))
read_out_sampling_num = 0
logPw_z = np.zeros(max_iter, dtype = np.float64)
betaVec = np.ones(ntopics, dtype = np.float64) * beta
sampling_lag = 10

Z = np.where(np.random.multinomial(1,[1./ntopics]*ntopics,size = M*N )==1)[1]
Z_index = Z.reshape(M*N)
Z = Z.reshape(M,N)
big_frame_index = big_frame.stack()

NWZ[Z_index,:] += 1
NWZ[big_frame_index,Z_index] +=1
NZ[Z_index] += 1

In [None]:
        NZM.iloc[Z.iloc[m,n],m] = NZM.iloc[Z.iloc[m,n],m] + 1
        NWZ.iloc[big_frame.iloc[m,n],Z.iloc[m,n]] = NWZ.iloc[big_frame.iloc[m,n],Z.iloc[m,n]] + 1
        NZ.iloc[Z.iloc[m,n]] = NZ.iloc[Z.iloc[m,n]] + 1

In [35]:
Z = np.where(np.random.multinomial(1,[1./ntopics]*ntopics,size = M*N )==1)[1]
Zz = pd.Index(Z.reshape(M*N,1))
Zz

Int64Index([[5], [0], [3], [4], [1], [5], [1], [3], [4], [4],
            ...
            [3], [1], [4], [1], [4], [3], [0], [2], [4], [1]],
           dtype='int64', length=4996500)

In [23]:
NWZ.iloc[Z,1]

0    0.01
Name: 1, dtype: float64