# Topic Modeling

In [3]:
from nltk.stem import *
import numpy as np
import textmining
import re
import lda
import lda.datasets
import json
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix
# English stop words held in a set so the per-token membership test used
# while cleaning the reviews is a hash lookup rather than a list scan.
stopWords = set(stopwords.words('english'))
# Snowball stemmer for English.
# NOTE(review): in the visible cells this is only referenced from a
# commented-out line, so it currently has no effect on the pipeline.
stemmer = SnowballStemmer("english")

In [4]:
# Load the reviews (one JSON object per line) and reduce each review text to
# a single-space-separated string of lowercase, letters-only tokens with the
# English stop words removed.  Result: reviewDocs maps review_id -> text.

# Compiled once, outside the loop.
whitespace_run = re.compile(r'\s+')
non_letters = re.compile('[^A-Za-z ]+')

reviewDocs = dict()
# JSON text is UTF-8; do not depend on the platform default encoding.
with open("../input/review.json", encoding="utf-8") as json_file:
    for line in json_file:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        review = json.loads(line)
        # Turn newlines/tabs into plain spaces BEFORE stripping non-letters;
        # otherwise "great\nfood" would be fused into the token "greatfood".
        text = whitespace_run.sub(' ', review['text'])
        text = non_letters.sub('', text).lower()
        # split() with no argument never yields empty tokens, so the stored
        # text has exactly one space between words.
        text = ' '.join(word for word in text.split() if word not in stopWords)
        # Optional stemming variant (disabled):
#         text = ' '.join([stemmer.stem(word) for word in text.split() if stemmer.stem(word) not in stopWords])
        reviewDocs[review['review_id']] = text
    print("Processed all reviews!")

Processed all reviews!


In [None]:
# Prepare the building blocks of a sparse document-term matrix for LDA.

# Pass 1: collect the corpus vocabulary and count how many (document, term)
# pairs exist, i.e. how many non-zero cells the matrix will have.
vocab = set()
n_nonzero = 0
for doc_text in reviewDocs.values():
    doc_terms = set(doc_text.split())   # distinct terms of this document
    vocab.update(doc_terms)             # grow the corpus vocabulary
    n_nonzero += len(doc_terms)         # one non-zero cell per distinct term

# Row labels: review ids, in reviewDocs iteration order.
docnames = np.array(list(reviewDocs.keys()))
# Column labels: the vocabulary as an array (arbitrary set order) ...
vocab = np.array(list(vocab))
# ... plus the permutation that sorts it, for use with np.searchsorted.
vocab_sorter = vocab.argsort()

ndocs = docnames.shape[0]
nvocab = vocab.shape[0]
print(ndocs, nvocab, n_nonzero)

# Uninitialised COO triplet buffers, one slot per non-zero cell:
#   data[k] - term frequency stored in that cell
#   rows[k] - row (document) index of the cell
#   cols[k] - column (vocabulary) index of the cell
data, rows, cols = (np.empty(n_nonzero, dtype=np.intc) for _ in range(3))
print(rows.size, cols.size, data.size)

In [None]:
# Fill the COO triplets (data, rows, cols) one document at a time, then build
# the sparse document-term matrix `dtm` of shape (ndocs, nvocab).
ind = 0     # next free position in data / rows / cols
# `docnames` was built from reviewDocs.keys(), so the enumeration position of
# a document IS its row index; no per-document scan of `docnames` is needed
# (the former np.where lookup made this loop quadratic in the corpus size).
for doc_idx, terms in enumerate(reviewDocs.values()):
    tokens = terms.split()
    if not tokens:
        # Review that became empty after cleaning: it contributes no cells
        # (and added 0 to n_nonzero), so leave its row all-zero.
        continue

    # Find indices into `vocab` such that, if the corresponding elements in
    # `tokens` were inserted before the indices, the order of `vocab` would be
    # preserved.  Every token is in `vocab` (it was built from these same
    # documents), so mapping back through `vocab_sorter` gives each token's
    # position in the unsorted `vocab` array.
    term_indices = vocab_sorter[np.searchsorted(vocab, tokens, sorter=vocab_sorter)]

    # count the unique terms of the document and get their vocabulary indices
    uniq_indices, counts = np.unique(term_indices, return_counts=True)
    n_vals = len(uniq_indices)  # = number of unique terms
    ind_end = ind + n_vals      # `ind` to `ind_end` is the slice filled for this doc

    data[ind:ind_end] = counts          # term frequencies
    cols[ind:ind_end] = uniq_indices    # column index: position in `vocab`
    rows[ind:ind_end] = doc_idx         # row index, broadcast over the slice

    ind = ind_end  # resume with next document -> add data to the end

dtm = coo_matrix((data, (rows, cols)), shape=(ndocs, nvocab), dtype=np.intc)
print("type(X): {}".format(type(dtm)))
print("shape: {}".format(dtm.shape))


In [None]:
# Fit a 50-topic LDA model on the document-term matrix and print, for every
# topic, its highest-weighted vocabulary terms.
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
model.fit(dtm)

# Per-document topic distribution (kept for later inspection; not printed).
doc_topic = model.doc_topic_

# Per-topic word distribution: one row of word weights per topic.
topic_word = model.topic_word_
print("Topic Word Distribution")
n_top_words = 10

# Convert once instead of on every loop iteration.
vocab_terms = np.asarray(vocab)
for topic_id, word_weights in enumerate(topic_word):
    # argsort is ascending, so walk the last n_top_words entries backwards to
    # list the heaviest words first.
    ranked = np.argsort(word_weights)
    best_terms = vocab_terms[ranked][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(topic_id, ' '.join(best_terms)))