In [49]:
import numpy as np
import math
import re

In [61]:
def normalize(input_matrix): #(2d array)
    """
    Normalizes the rows of a 2d input_matrix so they sum to 1
    """

    row_sums = input_matrix.sum(axis=1) #(row,)
    try:
        assert (np.count_nonzero(row_sums)==np.shape(row_sums)[0]) # no row should sum to zero
    except Exception:
        raise Exception("Error while normalizing. Row(s) sum to zero")
    new_matrix = input_matrix / row_sums[:, np.newaxis] #(row,col)/(row,1)
    return new_matrix #(row,col)


In [95]:
class Corpus(object):

    """
    A collection of documents.
    """

    def __init__(self, documents_path):
        """
        Initialize empty document list.
        """
        self.documents = []
        self.vocabulary = []
        self.likelihoods = []
        self.documents_path = documents_path
        self.term_doc_matrix = None 
        self.document_topic_prob = None  # P(z | d)
        self.topic_word_prob = None  # P(w | z)
        self.topic_prob = None  # P(z | d, w)

        self.number_of_documents = 0
        self.vocabulary_size = 0

    def build_corpus(self):
        """
        Read document, fill in self.documents, a list of list of word
        self.documents = [["the", "day", "is", "nice", "the", ...], [], []...]
        
        Update self.number_of_documents
        """
        # #############################
        # your code here
        with open(self.documents_path, "r+") as path:
            lines = path.readlines()   
        document = []
        for line in lines:
            document = re.split("\t|\n| |0|1", line)
            self.documents.append(document)        
        self.number_of_documents = len(self.documents)
        
        path.close()
        #print(self.number_of_documents)

    def build_vocabulary(self):
        """
        Construct a list of unique words in the whole corpus. Put it in self.vocabulary
        for example: ["rain", "the", ...]
        Update self.vocabulary_size
        """
        # #############################
        # your code here
        for document in self.documents:
            for word in document:
                if word not in self.vocabulary and word != "":
                    self.vocabulary.append(word)
        self.vocabulary_size = len(self.vocabulary)
        #print(self.vocabulary)

    def build_term_doc_matrix(self):
        """
        Construct the term-document matrix where each row represents a document, 
        and each column represents a vocabulary term.
        self.term_doc_matrix[i][j] is the count of term j in document i
        """
        # ############################
        # your code here
        self.term_doc_matrix = np.zeros([self.number_of_documents, self.vocabulary_size], dtype = np.int64)
        for index_doc, document in enumerate(self.documents):
            term_count = np.zeros([self.vocabulary_size])
            for word in document:
                if word in self.vocabulary:
                    index_term = self.vocabulary.index(word)
                    term_count[index_term] +=1        
            self.term_doc_matrix[index_doc] = term_count
#         print(self.term_doc_matrix)


    def initialize_randomly(self, number_of_topics):
        """
        Randomly initialize the matrices: document_topic_prob and topic_word_prob
        which hold the probability distributions for P(z | d) and P(w | z): self.document_topic_prob, and self.topic_word_prob
        Don't forget to normalize! 
        HINT: you will find numpy's random matrix useful [https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.random.html]
        """
        # ############################
        document_topic_prob = np.random.rand(self.number_of_documents,number_of_topics)
        # normalize
        document_topic_prob = normalize(document_topic_prob)
#         norm_d = np.linalg.norm(document_topic_prob)
#         document_topic_prob = document_topic_prob/norm_d
        
        topic_word_prob = np.random.rand(number_of_topics, self.vocabulary_size)
        # normalize
        topic_word_prob = normalize(topic_word_prob)
#         norm_t = np.linalg.norm(topic_word_prob)
#         topic_word_prob = topic_word_prob/norm_t
        print(document_topic_prob)
        

    def initialize_uniformly(self, number_of_topics):
        """
        Initializes the matrices: self.document_topic_prob and self.topic_word_prob with a uniform 
        probability distribution. This is used for testing purposes.
        DO NOT CHANGE THIS FUNCTION
        """
        self.document_topic_prob = np.ones((self.number_of_documents, number_of_topics))
        self.document_topic_prob = normalize(self.document_topic_prob)

        self.topic_word_prob = np.ones((number_of_topics, len(self.vocabulary)))
        self.topic_word_prob = normalize(self.topic_word_prob)

    def initialize(self, number_of_topics, random=False):
        """ Call the functions to initialize the matrices document_topic_prob and topic_word_prob
        """
        print("Initializing...")

        if random:
            self.initialize_randomly(number_of_topics)
        else:
            self.initialize_uniformly(number_of_topics)

    def expectation_step(self):
        """ The E-step updates P(z | w, d)
        """
        print("E step:")
        
        # ############################
        # your code here
        for index_doc, document in enumerate(self.documents):
            for index_topic in range(len(self.vocabulary)):
                prob=self.document_topic_prob[index_doc,:]*self.topic_word_prob[;,index_topic]
                prob=normalize(prob)
            
            

    def maximization_step(self, number_of_topics):
        """ The M-step updates P(w | z)
        """
        print("M step:")
        
        # update P(w | z) prob. of word w in topic theta_z
        
        # ############################
        # your code here
        for z in range(number_of_topics):
                for w_index in range(vocabulary_size):
                    s = 0
                    for d_index in range(len(self.documents)):
                        count = term_doc_matrix[d_index][w_index]
                        s = s + count * self.topic_prob[d_index, w_index, z]
                    self.topic_word_prob[z][w_index] = s
                normalize(self.topic_word_prob[z])

        
        # update P(z | d) coverage of topic theta_z in doc_d/prob. of doc_d covering topic theta_z

        # ############################
        # your code here
        for d_index in range(len(self.documents)):
                for z in range(number_of_topics):
                    s = 0
                    for w_index in range(vocabulary_size):
                        count = term_doc_matrix[d_index][w_index]
                        s = s + count * self.topic_prob[d_index, w_index, z]
                    self.document_topic_prob[d_index][z] = s
#                print self.document_topic_prob[d_index]
#                assert(sum(self.document_topic_prob[d_index]) != 0)
                normalize(self.document_topic_prob[d_index])


    def calculate_likelihood(self, number_of_topics):
        """ Calculate the current log-likelihood of the model using
        the model's updated probability matrices
        
        Append the calculated log-likelihood to self.likelihoods
        """
        # ############################
        # your code here
        
        
        return

    def plsa(self, number_of_topics, max_iter, epsilon):

        """
        Model topics.
        """
        print ("EM iteration begins...")
        
        # build term-doc matrix
        self.build_term_doc_matrix()
        
        # Create the counter arrays.
        
        # P(z | d, w)
        self.topic_prob = np.zeros([self.number_of_documents, number_of_topics, self.vocabulary_size], dtype=float)

        # P(z | d) P(w | z)
        self.initialize(number_of_topics, random=True)

        # Run the EM algorithm
        current_likelihood = 0.0

        for iteration in range(max_iter):
            print("Iteration #" + str(iteration + 1) + "...")

            # ############################
            # your code here
            # ############################

            pass    # REMOVE THIS


In [113]:
def main():
    documents_path = 'data/test.txt'
    corpus = Corpus(documents_path)  # instantiate corpus
    corpus.build_corpus()
    corpus.build_vocabulary()
    print(corpus.vocabulary)    
    print("Vocabulary size:" + str(len(corpus.vocabulary)))
    print("Number of documents:" + str(len(corpus.documents)))
    number_of_topics = 2
    max_iterations = 50
    epsilon = 0.001
    corpus.plsa(number_of_topics, max_iterations, epsilon)



if __name__ == '__main__':
    main()
# https://github.com/hitalex/PLSA/blob/master/plsa_multi.py
# https://github.com/laserwave/plsa/blob/master/plsa.py

['mount', 'rainier', 'seattle', 'willis', 'tower', 'chicago']
Vocabulary size:6
Number of documents:1000
EM iteration begins...
Initializing...
[[0.52017882 0.47982118]
 [0.66538646 0.33461354]
 [0.62164561 0.37835439]
 ...
 [0.19904426 0.80095574]
 [0.73286062 0.26713938]
 [0.75899453 0.24100547]]
Iteration #1...
Iteration #2...
Iteration #3...
Iteration #4...
Iteration #5...
Iteration #6...
Iteration #7...
Iteration #8...
Iteration #9...
Iteration #10...
Iteration #11...
Iteration #12...
Iteration #13...
Iteration #14...
Iteration #15...
Iteration #16...
Iteration #17...
Iteration #18...
Iteration #19...
Iteration #20...
Iteration #21...
Iteration #22...
Iteration #23...
Iteration #24...
Iteration #25...
Iteration #26...
Iteration #27...
Iteration #28...
Iteration #29...
Iteration #30...
Iteration #31...
Iteration #32...
Iteration #33...
Iteration #34...
Iteration #35...
Iteration #36...
Iteration #37...
Iteration #38...
Iteration #39...
Iteration #40...
Iteration #41...
Iteration #4

In [107]:
a=np.random.rand(2,3)
a=normalize(a)
a

array([[0.02751781, 0.38542806, 0.58705413],
       [0.25619937, 0.39515258, 0.34864805]])

In [108]:
b=np.random.rand(3,4)
b=normalize(b)
b

array([[0.14575388, 0.32697533, 0.35833856, 0.16893223],
       [0.22992773, 0.12777884, 0.32228198, 0.32001145],
       [0.36997853, 0.14392686, 0.24522885, 0.24086577]])

In [111]:
c=a[0,:]*b[:,0]
c=c.reshape(1,-1)
c

array([[0.00401083, 0.0886206 , 0.21719742]])

In [112]:
c=normalize(c)
c

array([[0.0129453 , 0.28603082, 0.70102388]])