In [26]:
import pymc as pm
import numpy as np
import pandas as pd
import gensim
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models

In [108]:
def documents_to_corpus(path='irony-labeled.csv', nrows=30, number_of_topics=5):
    """Read comments from a CSV file and encode each one as a list of word ids.

    Every value of the ``comment_text`` column is lower-cased, split into
    ``\\w+`` tokens, stripped of English stop words and Porter-stemmed. The
    stemmed documents are then mapped to integer ids through a gensim
    ``Dictionary``.

    Parameters
    ----------
    path : str
        CSV file to read; it must contain a ``comment_text`` column.
    nrows : int or None
        Number of rows to read from the file (passed to ``pd.read_csv``).
    number_of_topics : int
        Topic count, returned unchanged for the caller's convenience.

    Returns
    -------
    tuple
        ``(corpus, number_of_words, number_of_documents, number_of_topics)``
        where ``corpus`` is a list with one list of word ids per document.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    # A set gives constant-time membership tests in the stop-word filter.
    en_stop = set(get_stop_words('en'))

    p_stemmer = PorterStemmer()

    df = pd.read_csv(path, nrows=nrows)

    texts = []
    # Iterate over the column values directly: Series.iteritems() was removed
    # in pandas 2.0, and only the value (not the index label) is needed here.
    for comment in df['comment_text']:
        # str() keeps non-string cells from raising on .lower(); such cells
        # are tokenized as their string form.
        raw = str(comment).lower()
        tokens = tokenizer.tokenize(raw)

        stopped_tokens = [token for token in tokens if token not in en_stop]
        stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

        texts.append(stemmed_tokens)

    # id <-> term mapping built from every stemmed document
    dictionary = corpora.Dictionary(texts)

    # each document becomes the sequence of ids of its tokens
    corpus = [dictionary.doc2idx(text) for text in texts]

    # The dictionary numbers its tokens consecutively from 0, so its length is
    # the largest id + 1. Unlike taking the max over the flattened corpus,
    # this also works (yielding 0) when no tokens survive preprocessing.
    number_of_words = len(dictionary)
    number_of_documents = len(corpus)

    # NOTE(review): the label says "last 10" but the whole corpus is printed.
    print("\nCorpus of Documents(last 10):")
    print(corpus)
    print("\nNumber of words:")
    print(number_of_words)
    print("\nNumber of documents:")
    print(number_of_documents)
    print("\nNumber of topics:")
    print(number_of_topics)

    return corpus, number_of_words, number_of_documents, number_of_topics

In [109]:
# Build the corpus and unpack its dimensions:
#   data - list of per-document word-id lists
#   V    - number of words (vocabulary size)
#   D    - number of documents
#   K    - number of topics
data, V, D, K = documents_to_corpus()


Corpus of Documents(last 10):
[[32, 3, 28, 13, 20, 24, 25, 23, 0, 30, 11, 27, 2, 19, 38, 16, 10, 1, 24, 37, 33, 5, 6, 34, 7, 26, 35, 22, 17, 4, 18, 15, 29, 12, 20, 36, 8, 9, 31, 3, 31, 21, 14], [31, 53, 42, 71, 65, 73, 48, 69, 70, 56, 40, 39, 41, 43, 64, 66, 75, 67, 58, 52, 59, 46, 49, 62, 31, 64, 18, 45, 72, 18, 68, 60, 59, 46, 64, 44, 74, 57, 54, 51, 63, 55, 61, 50, 47], [98, 83, 90, 87, 97, 80, 95, 77, 81, 89, 92, 94, 67, 31, 86, 76, 91, 78, 96, 93, 79, 6, 99, 82, 63, 92, 14, 88, 85, 84, 91], [100, 114, 104, 101, 108, 110, 111, 114, 102, 105, 115, 109, 114, 106, 117, 55, 110, 112, 116, 113, 105, 115, 107, 117, 103], [122, 138, 12, 129, 132, 127, 140, 125, 96, 126, 137, 136, 18, 119, 121, 131, 120, 131, 139, 118, 130, 123, 124, 128, 134, 133, 135], [50, 78, 96, 141, 142, 143, 144], [31, 150, 146, 147, 22, 151, 149, 152, 145, 130, 154, 153, 148, 76], [155], [158, 125, 96, 18, 107, 1, 156, 157], [163, 150, 162, 168, 161, 159, 164, 31, 160, 163, 31, 165, 167, 166], [176, 174, 177, 180,

In [110]:
# Latent Dirichlet Allocation model built from the encoded corpus.
# All-ones Dirichlet parameters: length K for the per-document topic weights,
# length V for the per-topic word weights.
alpha = np.ones(K)
beta = np.ones(V)

# theta: one Dirichlet variable per document (D in total), each parameterised
# by the length-K vector alpha, i.e. the topic weights of that document.
# NOTE(review): CompletedDirichlet presumably exposes the full K-element
# probability vector of the wrapped Dirichlet - confirm against the PyMC 2 docs.
theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i, pm.Dirichlet("ptheta_%s" % i, theta=alpha)) for i in range(D)])

# phi: one Dirichlet variable per topic (K in total), each parameterised by the
# length-V vector beta, i.e. the word weights of that topic.
phi = pm.Container([pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)])

# number of word ids in each document
Wd = [len(doc) for doc in data]

# z: for every document d, one categorical topic assignment per word, drawn
# with probabilities theta[d]. Initial values come from NumPy's global RNG,
# which this cell does not seed.
z = pm.Container([pm.Categorical('z_%i' % d, 
                     p = theta[d], # topic weights of document d
                     size=Wd[d], # one assignment per word of document d
                     value=np.random.randint(K, size=Wd[d])) # random initial topic in [0, K)
                  for d in range(D)])

# w: one observed categorical variable per word position (d, i). Its
# probabilities are the row of phi selected by that word's topic z[d][i], and
# its value is fixed to the observed word id data[d][i].
# z[d][i] and phi are bound as lambda default arguments so that each Lambda
# refers to its own word position rather than the loop's final (d, i).
w = pm.Container([pm.Categorical("w_%i_%i" % (d,i),
                    p = pm.Lambda('phi_z_%i_%i' % (d,i), 
                              lambda z=z[d][i], phi=phi: phi[z]),
                    value=data[d][i], 
                    observed=True)
                  for d in range(D) for i in range(Wd[d])])

# Assemble the model and run the sampler.
# NOTE(review): the positional arguments are presumably iter=5000, burn=1000
# (the next cell reads sample index 3999) - confirm against the PyMC 2 docs.
model = pm.Model([theta, phi, z, w])
mcmc = pm.MCMC(model)
mcmc.sample(5000,1000)


 [-----------------100%-----------------] 5000 of 5000 complete in 65.2 sec

In [111]:
# Visualise the final retained MCMC sample of every traced variable.

def _last_sample(name):
    """Return the last retained sample of the trace called `name` as an array."""
    # [:] materialises the whole trace, so [-1] is always the final sample.
    # A hard-coded index such as 3999 is only valid for exactly 4000 retained
    # samples and breaks as soon as the iteration or burn-in count changes.
    return np.array(mcmc.trace(name)[:][-1])


# one row of topic weights per document
print("\nTheta's:")
for i in range(D):
    theta_temp = _last_sample("theta_%i" % i)
    print(theta_temp)

# one row of word weights per topic
print("\nPhi's: \n")
for j in range(K):
    phi_temp = _last_sample("phi_%i" % j)
    print(phi_temp)
print("\n")

# topic assigned to each word of each document
print("Predicts:\n")
for k in range(D):
    z_temp = _last_sample("z_%i" % k)
    print(z_temp)


Theta's:
[[0.28556685 0.09316993 0.02393619 0.00722636 0.59010068]]
[[0.31385709 0.03192219 0.19223481 0.08181271 0.3801732 ]]
[[0.32315142 0.13383716 0.18883118 0.02377549 0.33040474]]
[[0.26000389 0.48944677 0.05788395 0.17370505 0.01896033]]
[[0.03128032 0.0279415  0.32704427 0.02980902 0.5839249 ]]
[[0.41256961 0.08102726 0.16980975 0.25529865 0.08129473]]
[[0.28640134 0.252367   0.21556731 0.17368316 0.07198118]]
[[0.11452481 0.07305272 0.24653491 0.15870091 0.40718666]]
[[0.20754541 0.18677474 0.17228119 0.39995438 0.03344428]]
[[0.04733854 0.27835953 0.18209423 0.35919559 0.13301211]]
[[0.23532387 0.48603497 0.07075798 0.09111515 0.11676803]]
[[0.04660966 0.09617451 0.18223815 0.30038759 0.37459009]]
[[0.06263188 0.10395849 0.03323872 0.50601842 0.29415249]]
[[0.05738144 0.11805053 0.64830818 0.16888825 0.0073716 ]]
[[0.78055842 0.05592985 0.0455533  0.01451188 0.10344656]]
[[0.51608849 0.1818921  0.25564828 0.01275757 0.03361356]]
[[0.01570348 0.12033193 0.39207144 0.20503599 