In [16]:
import numpy as np
import math
import re

In [17]:
def normalize(input_matrix):
    """
    Normalizes the rows of a 2d input_matrix so they sum to 1
    """

    row_sums = input_matrix.sum(axis=1)
    try:
        assert (np.count_nonzero(row_sums)==np.shape(row_sums)[0]) # no row should sum to zero
    except Exception:
        raise Exception("Error while normalizing. Row(s) sum to zero")
    new_matrix = input_matrix / row_sums[:, np.newaxis]
    return new_matrix

In [22]:
class Corpus(object):

    """
    A collection of documents.
    """

    def __init__(self, documents_path):
        """
        Initialize empty document list.
        """
        self.documents = []
        self.vocabulary = []
        self.likelihoods = []
        self.documents_path = documents_path
        self.term_doc_matrix = None
        self.document_topic_prob = None  # P(z | d)
        self.topic_word_prob = None  # P(w | z)
        self.topic_prob = None  # P(z | d, w)

        self.number_of_documents = 0
        self.vocabulary_size = 0

    def build_corpus(self):
        """
        Read document, fill in self.documents, a list of list of word
        self.documents = [["the", "day", "is", "nice", "the", ...], [], []...]
        
        Update self.number_of_documents
        """
        # #############################
        # your code here
        with open(self.documents_path, "r+") as path:
            lines = path.readlines()
        document = []
        for line in lines:
            document = re.split("\t|\n| ", line)
            self.documents.append(document)
        self.number_of_documents = len(self.documents)

        path.close()
       #print(self.number_of_documents)

#         print(self.documents_path)
#         with open(self.documents_path, 'r') as file:
#             for line in file.readlines():
#                 doc = list()
#                 doc.extend(line.split())
#                 self.documents.append(doc)
#                 # self.documents.append(doc)
#                 self.number_of_documents += 1

#         # print(self.documents)
#         print(len(self.documents))
#         print(self.number_of_documents)

    def build_vocabulary(self):
        """
        Construct a list of unique words in the whole corpus. Put it in self.vocabulary
        for example: ["rain", "the", ...]
        Update self.vocabulary_size
        """
        # #############################
        # your code here
        for document in self.documents:
            for word in document:
                if word not in self.vocabulary and word != "":
                    self.vocabulary.append(word)
        self.vocabulary_size = len(self.vocabulary)
        #print(self.vocabulary)

#         res = set()
#         for doc in self.documents:
#             res.update(doc)
#         self.vocabulary = res
#         self.vocabulary_size = len(res)
#         self.vocabulary_dist = {k: i for i, k in enumerate(self.vocabulary)}

    def build_term_doc_matrix(self):
        """
        Construct the term-document matrix where each row represents a document,
        and each column represents a vocabulary term.
        self.term_doc_matrix[i][j] is the count of term j in document i
        """
        # ############################
        # your code here
        self.term_doc_matrix = np.zeros([self.number_of_documents, self.vocabulary_size], dtype = np.int64)
        for index_doc, document in enumerate(self.documents):
            term_count = np.zeros([self.vocabulary_size])
            for word in document:
                if word in self.vocabulary:
                    index_term = self.vocabulary.index(word)
                    term_count[index_term] +=1
            self.term_doc_matrix[index_doc] = term_count
#         print(self.term_doc_matrix)

#         self.term_doc_matrix = np.zeros(shape=(self.number_of_documents, self.vocabulary_size))

#         for i, doc in enumerate(self.documents):
#             for term in doc:
#                 self.term_doc_matrix[i][self.vocabulary_dist[term]] += 1
#         # print(self.term_doc_matrix)


    def initialize_randomly(self, number_of_topics):
        """
        Randomly initialize the matrices: document_topic_prob and topic_word_prob
        which hold the probability distributions for P(z | d) and P(w | z): self.document_topic_prob, and self.topic_word_prob
        Don't forget to normalize!
        HINT: you will find numpy's random matrix useful [https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.random.html]
        """
        # ############################
        self.document_topic_prob = np.random.rand(self.number_of_documents,number_of_topics)
        # normalize
        self.document_topic_prob = normalize(self.document_topic_prob)
        
        self.topic_word_prob = np.random.rand(number_of_topics, self.vocabulary_size)
        # normalize
        self.topic_word_prob = normalize(self.topic_word_prob)

        

    def initialize_uniformly(self, number_of_topics):
        """
        Initializes the matrices: self.document_topic_prob and self.topic_word_prob with a uniform
        probability distribution. This is used for testing purposes.
        DO NOT CHANGE THIS FUNCTION
        """
        self.document_topic_prob = np.ones((self.number_of_documents, number_of_topics))
        self.document_topic_prob = normalize(self.document_topic_prob)

        self.topic_word_prob = np.ones((number_of_topics, len(self.vocabulary)))
        self.topic_word_prob = normalize(self.topic_word_prob)

    def initialize(self, number_of_topics, random=False):
        """ Call the functions to initialize the matrices document_topic_prob and topic_word_prob
        """
        print("Initializing...")

        if random:
            self.initialize_randomly(number_of_topics)
        else:
            self.initialize_uniformly(number_of_topics)

    def expectation_step(self, number_of_topics):
        """ The E-step updates P(z | w, d)
        """
        print("E step:")
        
        # ############################
        # your code here
#        for index_doc in range(len(self.documents)):
#            for index_word in range(len(self.vocabulary)):
#                denomitor = 0
#                for index_topic in range(number_of_topics):
#                    self.topic_prob[index_doc, index_topic, index_word] = self.document_topic_prob[index_doc, index_topic] * self.topic_word_prob[index_topic, index_word]
#                    denomitor += self.topic_prob[index_doc, index_topic, index_word]
#                for index_topic in range(number_of_topics):
#                    self.topic_prob[index_doc, index_topic, index_word] /= denomitor

        self.topic_word_prob = np.nan_to_num(self.topic_word_prob)
        for doc in range(self.topic_prob.shape[0]):
            for voc in range(self.topic_prob.shape[2]):
                self.topic_prob[doc, :, voc] = self.document_topic_prob[doc, :] * self.topic_word_prob[:, voc]
                self.topic_prob[doc, :, voc] /= self.topic_prob[doc, :, voc].sum()
        self.topic_word_prob = np.nan_to_num(self.topic_word_prob)
        
    def maximization_step(self, number_of_topics):
        """ The M-step updates P(w | z)
        """
        print("M step:")
        
        # update P(w | z)
        
        # ############################
        # your code here
#        for index_topic in range(number_of_topics):
#            for index_word in range(len(self.vocabulary)):
#
#                for index_doc in range(len(self.documents)):
#                    count = self.term_doc_matrix[index_doc, index_word]
#                    self.topic_word_prob[index_topic, index_word] += count * self.topic_prob[index_doc, index_topic, index_word]
#            self.topic_word_prob[index_topic, :] /= self.topic_word_prob[index_topic, :].sum()
            
        for topic in range(self.topic_prob.shape[1]):
            for voc in range(self.topic_prob.shape[2]):
                self.topic_word_prob[topic, voc] = self.term_doc_matrix[:, voc].dot(self.topic_prob[:, topic, voc])
            self.topic_word_prob[topic, :] /= self.topic_word_prob[topic, :].sum()
        self.topic_word_prob = np.nan_to_num(self.topic_word_prob)
        
        # update P(z | d)  Pi

        # ############################
        # your code here
#        for index_doc in range(len(self.documents)):
#            for index_topic in range(number_of_topics):
#
#                for index_word in range(len(self.vocabulary)):
#                    count = self.term_doc_matrix[index_doc, index_word]
#                    self.document_topic_prob[index_doc, index_topic] += count * self.topic_prob[index_doc, index_topic, index_word]
#            self.document_topic_prob[index_doc, :] /= self.document_topic_prob[index_doc, :].sum()

        for doc in range(self.topic_prob.shape[0]):
            for topic in range(self.topic_prob.shape[1]):
                self.document_topic_prob[doc, topic] = self.term_doc_matrix[doc, :].dot(self.topic_prob[doc, topic, :])
            self.document_topic_prob[doc, :] /= self.document_topic_prob[doc, :].sum()
        self.document_topic_prob = np.nan_to_num(self.document_topic_prob)

    def calculate_likelihood(self, number_of_topics):
        """ Calculate the current log-likelihood of the model using
        the model's updated probability matrices
        
        Append the calculated log-likelihood to self.likelihoods
        """
        # ############################
        # your code here
#         likelihood = 0.0
#         for index_doc in range(len(self.documents)):
#             sum = 0
#             for index_word in range(len(self.vocabulary)):
#                 sum1 = 0;
#                 for index_topic in range(number_of_topics):
#                     sum1 += self.document_topic_prob[index_doc,index_topic]*self.topic_word_prob[index_topic,index_word]
#                 sum1 = np.log(sum1)
#                 sum1 *= self.term_doc_matrix[index_doc, index_word]
#             sum += sum1
#         likelihood = sum
                
        
#         self.likelihoods.append(np.sum(np.log(self.document_topic_prob @ self.topic_word_prob) * self.term_doc_matrix))
        likelihood = np.sum(self.term_doc_matrix * np.log(np.matmul(self.document_topic_prob, self.topic_word_prob)))
        self.likelihoods.append(likelihood)
        
        print(self.likelihoods[-1])
        return self.likelihoods[-1]

    def plsa(self, number_of_topics, max_iter, epsilon):

        """
        Model topics.
        """
        print ("EM iteration begins...")
        
        # build term-doc matrix
        self.build_term_doc_matrix()
        
        # Create the counter arrays.
        
        # P(z | d, w)
        self.topic_prob = np.zeros([self.number_of_documents, number_of_topics, self.vocabulary_size], dtype=float)

        # P(z | d) P(w | z)
        self.initialize(number_of_topics, random=True)

        # Run the EM algorithm
        current_likelihood = 0.0
        last_topic_prob = self.topic_prob.copy()
        
        for iteration in range(max_iter):
            print("Iteration #" + str(iteration + 1) + "...")

            # ############################
            # your code here
            self.expectation_step(number_of_topics)
            diff = abs(self.topic_prob - last_topic_prob)
            L1 = diff.sum()
            print ("L1: ", L1)
            print (last_topic_prob)
            # assert L1 > 0
            last_topic_prob = self.topic_prob.copy()

            self.maximization_step(number_of_topics)
            self.calculate_likelihood(number_of_topics)
            tmp_likelihood = self.calculate_likelihood(number_of_topics)
            if iteration > 100 and abs(current_likelihood - tmp_likelihood) < epsilon/10:
                print('Stopping', tmp_likelihood)
                return tmp_likelihood
            current_likelihood = tmp_likelihood
            print(max(self.likelihoods))
            
#             self.maximization_step(number_of_topics)
#             self.calculate_likelihood(number_of_topics)
            
#             gap = np.abs(self.calculate_likelihood(number_of_topics) - current_likelihood)
            
#             if gap < epsilon:
#                 break;
#             else:
#                 current_likelihood = self.calculate_likelihood(number_of_topics)
                
#         return self.topic_word_prob, self.document_topic_prob

In [23]:
def main():
    documents_path = '/Users/sunnie/Desktop/School/UIUC/CS410/MP3/data/test.txt'
    corpus = Corpus(documents_path)  # instantiate corpus
    corpus.build_corpus()
    corpus.build_vocabulary()
    print(corpus.vocabulary)
    print("Vocabulary size:" + str(len(corpus.vocabulary)))
    print("Number of documents:" + str(len(corpus.documents)))
    number_of_topics = 2
    max_iterations = 50
    epsilon = 0.001
    corpus.plsa(number_of_topics, max_iterations, epsilon)



if __name__ == '__main__':
    main()

['0', 'mount', 'rainier', 'seattle', '1', 'willis', 'tower', 'chicago']
Vocabulary size:8
Number of documents:1000
EM iteration begins...
Initializing...
Iteration #1...
E step:
L1:  8000.0
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
M step:
-179053.05628623615
-179053.05628623615
-179053.05628623615
Iteration #2...
E step:
L1:  908.1129633527162
[[[0.22918171 0.53312337 0.211445   ... 0.40481909 0.84402709 0.54911599]
  [0.77081829 0.46687663 0.788555   ... 0.59518091 0.15597291 0.45088401]]

 [[0.86601316 0.96127532 0.85356734 ... 0.93665214 0.99157088 0.96360309]
  [0.13398684 0.03872468 0.14643266 ... 0.06334786 0.00842912 0.03639691]]

 [[0.01798139 0.06570313 0.0162453  ... 0.04020375 0.24995856 0.0

L1:  309.1260342109958
[[[4.05581234e-06 4.61787532e-03 1.81436003e-03 ... 1.03132958e-01
   1.53588210e-01 1.06324412e-01]
  [9.99995944e-01 9.95382125e-01 9.98185640e-01 ... 8.96867042e-01
   8.46411790e-01 8.93675588e-01]]

 [[1.23792939e-05 1.39626347e-02 5.51735585e-03 ... 2.59800435e-01
   3.56440222e-01 2.66399925e-01]
  [9.99987621e-01 9.86037365e-01 9.94482644e-01 ... 7.40199565e-01
   6.43559778e-01 7.33600075e-01]]

 [[7.26264683e-03 8.93256186e-01 7.66280613e-01 ... 9.95201993e-01
   9.96954083e-01 9.95361827e-01]
  [9.92737353e-01 1.06743814e-01 2.33719387e-01 ... 4.79800700e-03
   3.04591677e-03 4.63817295e-03]]

 ...

 [[9.17098407e-02 9.91415958e-01 9.78378636e-01 ... 9.99650806e-01
   9.99778682e-01 9.99662488e-01]
  [9.08290159e-01 8.58404178e-03 2.16213643e-02 ... 3.49194338e-04
   2.21317615e-04 3.37511508e-04]]

 [[5.53558657e-03 8.64262859e-01 7.13847048e-01 ... 9.93703595e-01
   9.96000657e-01 9.93913039e-01]
  [9.94464413e-01 1.35737141e-01 2.86152952e-01 ... 6.

L1:  70.83695480762177
[[[5.38328100e-53 2.13186166e-09 4.22223769e-10 ... 4.21400408e-07
   4.54413805e-07 3.35620057e-07]
  [1.00000000e+00 9.99999998e-01 1.00000000e+00 ... 9.99999579e-01
   9.99999546e-01 9.99999664e-01]]

 [[2.60426174e-53 1.03132751e-09 2.04258557e-10 ... 2.03860285e-07
   2.19831134e-07 1.62362438e-07]
  [1.00000000e+00 9.99999999e-01 1.00000000e+00 ... 9.99999796e-01
   9.99999780e-01 9.99999838e-01]]

 [[6.55770416e-41 9.99615081e-01 9.98059521e-01 ... 9.99998052e-01
   9.99998193e-01 9.99997554e-01]
  [1.00000000e+00 3.84918699e-04 1.94047921e-03 ... 1.94804619e-06
   1.80651982e-06 2.44594177e-06]]

 ...

 [[4.46368758e-38 9.99999434e-01 9.99997144e-01 ... 9.99999997e-01
   9.99999997e-01 9.99999996e-01]
  [1.00000000e+00 5.65710180e-07 2.85633631e-06 ... 2.86192418e-09
   2.65400383e-09 3.59339704e-09]]

 [[1.06956821e-43 8.09001886e-01 4.56193166e-01 ... 9.98807042e-01
   9.98893615e-01 9.98502594e-01]
  [1.00000000e+00 1.90998114e-01 5.43806834e-01 ... 1.

L1:  37.631541644964415
[[[9.50440456e-149 8.05663273e-016 9.12475303e-017 ... 4.76158306e-013
   4.42845722e-013 3.04640940e-013]
  [1.00000000e+000 1.00000000e+000 1.00000000e+000 ... 1.00000000e+000
   1.00000000e+000 1.00000000e+000]]

 [[5.62290216e-150 4.76638566e-017 5.39829647e-018 ... 2.81700085e-014
   2.61992022e-014 1.80228671e-014]
  [1.00000000e+000 1.00000000e+000 1.00000000e+000 ... 1.00000000e+000
   1.00000000e+000 1.00000000e+000]]

 [[1.80140612e-128 9.99993451e-001 9.99942181e-001 ... 9.99999989e-001
   9.99999988e-001 9.99999983e-001]
  [1.00000000e+000 6.54872674e-006 5.78185305e-005 ... 1.10805651e-008
   1.19140885e-008 1.73190874e-008]]

 ...

 [[6.95057904e-124 1.00000000e+000 9.99999999e-001 ... 1.00000000e+000
   1.00000000e+000 1.00000000e+000]
  [1.00000000e+000 1.69726776e-010 1.49858992e-009 ... 2.87178923e-013
   3.08781643e-013 4.48864915e-013]]

 [[2.59849461e-133 6.87761037e-001 1.99660545e-001 ... 9.99232430e-001
   9.99174738e-001 9.98800794e-001]

-159752.37612876086
-159752.37612876086
-159752.37612876086
Iteration #35...
E step:
L1:  18.701621009225423
[[[2.68700218e-299 1.38364485e-022 8.42549723e-024 ... 2.41761042e-019
   1.92383320e-019 1.08446850e-019]
  [1.00000000e+000 1.00000000e+000 1.00000000e+000 ... 1.00000000e+000
   1.00000000e+000 1.00000000e+000]]

 [[1.46374353e-301 7.53740066e-025 4.58978679e-026 ... 1.31699247e-021
   1.04800749e-021 5.90763852e-022]
  [1.00000000e+000 1.00000000e+000 1.00000000e+000 ... 1.00000000e+000
   1.00000000e+000 1.00000000e+000]]

 [[1.07931066e-270 9.99999820e-001 9.99997045e-001 ... 1.00000000e+000
   1.00000000e+000 1.00000000e+000]
  [1.00000000e+000 1.79927209e-007 2.95477712e-006 ... 1.02975814e-010
   1.29405917e-010 2.29564436e-010]]

 ...

 [[2.60058773e-264 1.00000000e+000 1.00000000e+000 ... 1.00000000e+000
   1.00000000e+000 1.00000000e+000]
  [1.00000000e+000 7.46744234e-014 1.22631197e-012 ... 4.27376058e-017
   5.37067773e-017 9.52751335e-017]]

 [[3.45746437e-277 6.

L1:  13.302011273213932
[[[0.00000000e+00 1.54191471e-29 5.35051257e-31 ... 8.67968403e-26
   5.92343285e-26 1.98476513e-26]
  [1.00000000e+00 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
   1.00000000e+00 1.00000000e+00]]

 [[0.00000000e+00 6.53227507e-33 2.26672848e-34 ... 3.67712190e-29
   2.50944442e-29 8.40839749e-30]
  [1.00000000e+00 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
   1.00000000e+00 1.00000000e+00]]

 [[0.00000000e+00 9.99999993e-01 9.99999812e-01 ... 1.00000000e+00
   1.00000000e+00 1.00000000e+00]
  [1.00000000e+00 6.53686101e-09 1.88379715e-07 ... 1.16124990e-12
   1.70159475e-12 5.07832492e-12]]

 ...

 [[0.00000000e+00 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
   1.00000000e+00 1.00000000e+00]
  [1.00000000e+00 4.19598778e-17 1.20920290e-15 ... 7.45402168e-21
   1.09224760e-20 3.25975864e-20]]

 [[0.00000000e+00 6.18672207e-01 5.32979495e-02 ... 9.99890517e-01
   9.99839581e-01 9.99521391e-01]
  [1.00000000e+00 3.81327793e-01 9.46702051e-01 ... 1