In [9]:
pip install indic-nlp-library



In [10]:
from indicnlp.tokenize import indic_tokenize

In [11]:
from numpy import zeros, int8, log
from pylab import random
import sys
import re
import time
import codecs
import nltk
from nltk.tokenize import word_tokenize

In [14]:
# segmentation, stopwords filtering and document-word matrix generating
# [return]:
# N : number of documents
# M : length of dictionary
# word2id : a map mapping terms to their corresponding ids
# id2word : a map mapping ids to terms
# X : document-word matrix, N*M, each line is the number of terms that show up in the document
def preprocessing_plsa(datasetFilePath, stopwordsFilePath):

    # read the stopwords file
    file = codecs.open(stopwordsFilePath, 'r', 'utf-8')
    stopwords = [line.strip() for line in file]
    file.close()

    # read the documents
    file = codecs.open(datasetFilePath, 'r', 'utf-8')
    documents = [document.strip() for document in file]
    file.close()

    # number of documents
    N = len(documents)

    wordCounts = [];
    word2id = {}
    id2word = {}
    currentId = 0;
    # generate the word2id and id2word maps and count the number of times of words showing up in documents
    for document in documents:
        segList = word_tokenize(document)
        wordCount = {}
        for word in segList:
            word = word.lower().strip()
            if len(word) > 1 and not re.search('[0-9]', word) and word not in stopwords:
                if word not in word2id.keys():
                    word2id[word] = currentId;
                    id2word[currentId] = word;
                    currentId += 1;
                if word in wordCount:
                    wordCount[word] += 1
                else:
                    wordCount[word] = 1
        wordCounts.append(wordCount);
    # print(wordCount[:10])

    # length of dictionary
    M = len(word2id)

    # generate the document-word matrix
    X = zeros([N, M], int8)
    for word in word2id.keys():
        j = word2id[word]
        for i in range(0, N):
            if word in wordCounts[i]:
                X[i, j] = wordCounts[i][word];

    return N, M, word2id, id2word, X

def initializeParameters_plsa():
    for i in range(0, N):
        normalization = sum(lamda[i, :])
        for j in range(0, K):
            lamda[i, j] /= normalization;

    for i in range(0, K):
        normalization = sum(theta[i, :])
        for j in range(0, M):
            theta[i, j] /= normalization;

In [15]:
beta = 0.05
def EStep_plsa():
    for i in range(0, N):
        for j in range(0, M):
            denominator = 0;
            for k in range(0, K):
                p[i, j, k] = theta[k, j] * lamda[i, k];
                denominator += p[i, j, k];
            if denominator == 0:
                for k in range(0, K):
                    p[i, j, k] = 0;
            else:
                for k in range(0, K):
                    p[i, j, k] /= denominator;

def MStep_plsa():
    # update theta
    for k in range(0, K):
        denominator = 0
        for j in range(0, M):
            theta[k, j] = 0
            for i in range(0, N):
                theta[k, j] += X[i, j] * p[i, j, k]
            denominator += theta[k, j]
        if denominator == 0:
            for j in range(0, M):
                theta[k, j] = 1.0 / M
        else:
            for j in range(0, M):
                theta[k, j] /= denominator + beta

    # update lamda
    for i in range(0, N):
        for k in range(0, K):
            lamda[i, k] = 0
            denominator = 0
            for j in range(0, M):
                lamda[i, k] += X[i, j] * p[i, j, k]
                denominator += X[i, j];
            if denominator == 0:
                lamda[i, k] = 1.0 / K
            else:
                lamda[i, k] /= denominator + beta

# calculate the log likelihood
def LogLikelihood_plsa():
    loglikelihood = 0
    for i in range(0, N):
        for j in range(0, M):
            tmp = 0
            for k in range(0, K):
                tmp += theta[k, j] * lamda[i, k]
            if tmp > 0:
                loglikelihood += X[i, j] * log(tmp)
    return loglikelihood

# output the params of model and top words of topics to files
def output_plsa():
    # document-topic distribution
    file = codecs.open(docTopicDist,'w','utf-8')
    for i in range(0, N):
        tmp = ''
        for j in range(0, K):
            tmp += str(lamda[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # topic-word distribution
    file = codecs.open(topicWordDist,'w','utf-8')
    for i in range(0, K):
        tmp = ''
        for j in range(0, M):
            tmp += str(theta[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # dictionary
    file = codecs.open(dictionary,'w','utf-8')
    for i in range(0, M):
        file.write(id2word[i] + '\n')
    file.close()

    # top words of each topic
    file = codecs.open(topicWords,'w','utf-8')
    for i in range(0, K):
        topicword = []
        ids = theta[i, :].argsort()
        for j in ids:
            topicword.insert(0, id2word[j])
        tmp = ''
        for word in topicword[0:min(topicWordsNum, len(topicword))]:
            tmp += word + ' '
        file.write(tmp + '\n')
    file.close()

In [16]:
# set the default params and read the params from cmd
datasetFilePath = '/content/paragraphs_output.txt'
stopwordsFilePath = '/content/stopwords.dic'
K = 10    # number of topic
maxIteration = 30
threshold = 5.0
topicWordsNum = 10
docTopicDist = 'docTopicDistribution_eng_plsa1.txt'
topicWordDist = 'topicWordDistribution_eng_plsa1.txt'
dictionary = 'dictionary_eng_plsa1.dic'
topicWords = 'topics_eng_plsa1.txt'
if(len(sys.argv) == 11):
    datasetFilePath = sys.argv[1]
    stopwordsFilePath = sys.argv[2]
    K = int(sys.argv[3])
    maxIteration = int(sys.argv[4])
    threshold = float(sys.argv[5])
    topicWordsNum = int(sys.argv[6])
    docTopicDist = sys.argv[7]
    topicWordDist = sys.argv[8]
    dictionary = sys.argv[9]
    topicWords = sys.argv[10]

# preprocessing
N, M, word2id, id2word, X = preprocessing_plsa(datasetFilePath, stopwordsFilePath)

# lamda[i, j] : p(zj|di)
lamda = random([N, K])

# theta[i, j] : p(wj|zi)
theta = random([K, M])

# p[i, j, k] : p(zk|di,wj)
p = zeros([N, M, K])

initializeParameters_plsa()

# EM algorithm
oldLoglikelihood = 1
newLoglikelihood = 1
for i in range(0, maxIteration):
    EStep_plsa()
    MStep_plsa()
    newLoglikelihood = LogLikelihood_plsa()
    print("[", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), "] ", i+1, " iteration  ", str(newLoglikelihood))
    if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold):
        break
    oldLoglikelihood = newLoglikelihood

print(f"The latent space dimension is {K}")
output_plsa()

[ 2024-11-27 22:20:29 ]  1  iteration   -5294.924421439035
[ 2024-11-27 22:20:31 ]  2  iteration   -5110.571736525161
[ 2024-11-27 22:20:33 ]  3  iteration   -4865.420227563053
[ 2024-11-27 22:20:35 ]  4  iteration   -4619.572319167931
[ 2024-11-27 22:20:36 ]  5  iteration   -4417.756379328368
[ 2024-11-27 22:20:39 ]  6  iteration   -4271.481623628496
[ 2024-11-27 22:20:41 ]  7  iteration   -4176.235229866569
[ 2024-11-27 22:20:43 ]  8  iteration   -4113.174386300537
[ 2024-11-27 22:20:45 ]  9  iteration   -4067.440883371375
[ 2024-11-27 22:20:47 ]  10  iteration   -4033.924164772707
[ 2024-11-27 22:20:49 ]  11  iteration   -4007.3260892149415
[ 2024-11-27 22:20:52 ]  12  iteration   -3983.357414010418
[ 2024-11-27 22:20:54 ]  13  iteration   -3964.6403796353966
[ 2024-11-27 22:20:56 ]  14  iteration   -3954.8241815164893
[ 2024-11-27 22:20:58 ]  15  iteration   -3949.4828721988997
[ 2024-11-27 22:21:00 ]  16  iteration   -3944.4412733809
[ 2024-11-27 22:21:02 ]  17  iteration   -3938.

In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [17]:
# segmentation, stopwords filtering and document-word matrix generating
# [return]:
# N : number of documents
# M : length of dictionary
# word2id : a map mapping terms to their corresponding ids
# id2word : a map mapping ids to terms
# X : document-word matrix, N*M, each line is the number of terms that show up in the document
def preprocessing_hindi(datasetFilePath, stopwordsFilePath):

    # read the stopwords file
    file = codecs.open(stopwordsFilePath, 'r', 'utf-8')
    stopwords = [line.strip() for line in file]
    file.close()

    # read the documents
    file = codecs.open(datasetFilePath, 'r', 'utf-8')
    documents = [document.strip() for document in file]
    file.close()

    # number of documents
    N = len(documents)

    wordCounts = [];
    word2id = {}
    id2word = {}
    currentId = 0;
    # generate the word2id and id2word maps and count the number of times of words showing up in documents
    for document in documents:
        segList = word_tokenize(document)
        wordCount = {}
        for word in segList:
            word = word.lower().strip()
            if len(word) > 1 and not re.search('[0-9\u0966-\u096F]', word) and word not in stopwords:
                if word not in word2id.keys():
                    word2id[word] = currentId;
                    id2word[currentId] = word;
                    currentId += 1;
                if word in wordCount:
                    wordCount[word] += 1
                else:
                    wordCount[word] = 1
        wordCounts.append(wordCount);

    # length of dictionary
    M = len(word2id)

    # generate the document-word matrix
    X = zeros([N, M], int8)
    for word in word2id.keys():
        j = word2id[word]
        for i in range(0, N):
            if word in wordCounts[i]:
                X[i, j] = wordCounts[i][word];

    return N, M, word2id, id2word, X

def initializeParameters_hindi():
    for i in range(0, N):
        normalization = sum(lamda[i, :])
        for j in range(0, K):
            lamda[i, j] /= normalization;

    for i in range(0, K):
        normalization = sum(theta[i, :])
        for j in range(0, M):
            theta[i, j] /= normalization;

def EStep_hindi():
    for i in range(0, N):
        for j in range(0, M):
            denominator = 0;
            for k in range(0, K):
                p[i, j, k] = theta[k, j] * lamda[i, k];
                denominator += p[i, j, k];
            if denominator == 0:
                for k in range(0, K):
                    p[i, j, k] = 0;
            else:
                for k in range(0, K):
                    p[i, j, k] /= denominator;

def MStep_hindi():
    # update theta
    for k in range(0, K):
        denominator = 0
        for j in range(0, M):
            theta[k, j] = 0
            for i in range(0, N):
                theta[k, j] += X[i, j] * p[i, j, k]
            denominator += theta[k, j]
        if denominator == 0:
            for j in range(0, M):
                theta[k, j] = 1.0 / M
        else:
            for j in range(0, M):
                theta[k, j] /= denominator

    # update lamda
    for i in range(0, N):
        for k in range(0, K):
            lamda[i, k] = 0
            denominator = 0
            for j in range(0, M):
                lamda[i, k] += X[i, j] * p[i, j, k]
                denominator += X[i, j];
            if denominator == 0:
                lamda[i, k] = 1.0 / K
            else:
                lamda[i, k] /= denominator

# calculate the log likelihood
def LogLikelihood_hindi():
    loglikelihood = 0
    for i in range(0, N):
        for j in range(0, M):
            tmp = 0
            for k in range(0, K):
                tmp += theta[k, j] * lamda[i, k]
            if tmp > 0:
                loglikelihood += X[i, j] * log(tmp)
    return loglikelihood

# output the params of model and top words of topics to files
def output_hindi():
    # document-topic distribution
    file = codecs.open(docTopicDist,'w','utf-8')
    for i in range(0, N):
        tmp = ''
        for j in range(0, K):
            tmp += str(lamda[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # topic-word distribution
    file = codecs.open(topicWordDist,'w','utf-8')
    for i in range(0, K):
        tmp = ''
        for j in range(0, M):
            tmp += str(theta[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # dictionary
    file = codecs.open(dictionary,'w','utf-8')
    for i in range(0, M):
        file.write(id2word[i] + '\n')
    file.close()

    # top words of each topic
    file = codecs.open(topicWords,'w','utf-8')
    for i in range(0, K):
        topicword = []
        ids = theta[i, :].argsort()
        for j in ids:
            topicword.insert(0, id2word[j])
        tmp = ''
        for word in topicword[0:min(topicWordsNum, len(topicword))]:
            tmp += word + ' '
        file.write(tmp + '\n')
    file.close()

# set the default params and read the params from cmd
datasetFilePath = '/content/hindi.txt'
stopwordsFilePath = '/content/Hindi_stopwords.txt'
K = 10    # number of topic
maxIteration = 20
threshold = 1.0
topicWordsNum = 10
docTopicDist = 'docTopicDistribution_hindi.txt'
topicWordDist = 'topicWordDistribution_hindi.txt'
dictionary = 'dictionary_hindi.dic'
topicWords = 'topics_hindi.txt'
if(len(sys.argv) == 11):
    datasetFilePath = sys.argv[1]
    stopwordsFilePath = sys.argv[2]
    K = int(sys.argv[3])
    maxIteration = int(sys.argv[4])
    threshold = float(sys.argv[5])
    topicWordsNum = int(sys.argv[6])
    docTopicDist = sys.argv[7]
    topicWordDist = sys.argv[8]
    dictionary = sys.argv[9]
    topicWords = sys.argv[10]

# preprocessing
N, M, word2id, id2word, X = preprocessing_hindi(datasetFilePath, stopwordsFilePath)

# lamda[i, j] : p(zj|di)
lamda = random([N, K])

# theta[i, j] : p(wj|zi)
theta = random([K, M])

# p[i, j, k] : p(zk|di,wj)
p = zeros([N, M, K])

initializeParameters_hindi()

# EM algorithm
oldLoglikelihood = 1
newLoglikelihood = 1
for i in range(0, maxIteration):
    EStep_hindi()
    MStep_hindi()
    newLoglikelihood = LogLikelihood_hindi()
    print("[", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), "] ", i+1, " iteration  ", str(newLoglikelihood))
    if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold):
        break
    oldLoglikelihood = newLoglikelihood

print(f"The latent space dimension is {K}")
output_hindi()

[ 2024-11-27 22:25:14 ]  1  iteration   -7618.86813718769
[ 2024-11-27 22:25:15 ]  2  iteration   -7469.451531227769
[ 2024-11-27 22:25:16 ]  3  iteration   -7260.341798862027
[ 2024-11-27 22:25:17 ]  4  iteration   -7013.581053483326
[ 2024-11-27 22:25:19 ]  5  iteration   -6786.210089117876
[ 2024-11-27 22:25:20 ]  6  iteration   -6600.357207200243
[ 2024-11-27 22:25:21 ]  7  iteration   -6442.373263623059
[ 2024-11-27 22:25:23 ]  8  iteration   -6304.695756519337
[ 2024-11-27 22:25:24 ]  9  iteration   -6188.455909107235
[ 2024-11-27 22:25:27 ]  10  iteration   -6107.351404932233
[ 2024-11-27 22:25:28 ]  11  iteration   -6058.021252177478
[ 2024-11-27 22:25:29 ]  12  iteration   -6027.845924912417
[ 2024-11-27 22:25:30 ]  13  iteration   -6008.982738960089
[ 2024-11-27 22:25:32 ]  14  iteration   -5994.5306043044475
[ 2024-11-27 22:25:33 ]  15  iteration   -5979.995892705672
[ 2024-11-27 22:25:34 ]  16  iteration   -5969.110917694076
[ 2024-11-27 22:25:35 ]  17  iteration   -5961.64

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import TruncatedSVD
# from sklearn.pipeline import Pipeline
# documents = ["doc1.txt", "doc2.txt", "doc3.txt"]

# # raw documents to tf-idf matrix:
# vectorizer = TfidfVectorizer(stop_words='english',
#                              use_idf=True,
#                              smooth_idf=True)
# # SVD to reduce dimensionality:
# svd_model = TruncatedSVD(n_components=100,         // num dimensions
#                          algorithm='randomized',
#                          n_iter=10)
# # pipeline of tf-idf + SVD, fit to and applied to documents:
# svd_transformer = Pipeline([('tfidf', vectorizer),
#                             ('svd', svd_model)])
# svd_matrix = svd_transformer.fit_transform(documents)

# # svd_matrix can later be used to compare documents, compare words, or compare queries with documents

In [None]:
# import numpy as np
# from numpy.linalg import svd
# import jieba
# import re
# import codecs

# # Preprocessing Function
# def preprocessing(datasetFilePath, stopwordsFilePath):
#     """
#     Preprocess the dataset and generate a document-word matrix.

#     Args:
#     datasetFilePath : str
#         Path to the dataset file.
#     stopwordsFilePath : str
#         Path to the stopwords file.

#     Returns:
#     N : int
#         Number of documents.
#     M : int
#         Number of unique terms.
#     word2id : dict
#         Mapping from words to their IDs.
#     id2word : dict
#         Mapping from IDs to words.
#     X : numpy array
#         Document-word matrix.
#     """
#     # Load stopwords
#     with codecs.open(stopwordsFilePath, 'r', 'utf-8') as file:
#         stopwords = set(line.strip() for line in file)

#     # Load documents
#     with codecs.open(datasetFilePath, 'r', 'utf-8') as file:
#         documents = [line.strip() for line in file]

#     word2id = {}
#     id2word = {}
#     currentId = 0
#     wordCounts = []

#     for document in documents:
#         segList = jieba.cut(document)
#         wordCount = {}
#         for word in segList:
#             word = word.lower().strip()
#             if len(word) > 1 and word not in stopwords and not re.search(r'\d', word):
#                 if word not in word2id:
#                     word2id[word] = currentId
#                     id2word[currentId] = word
#                     currentId += 1
#                 wordCount[word] = wordCount.get(word, 0) + 1
#         wordCounts.append(wordCount)

#     N = len(documents)
#     M = len(word2id)
#     X = np.zeros((N, M), dtype=int)

#     for i, wordCount in enumerate(wordCounts):
#         for word, count in wordCount.items():
#             X[i, word2id[word]] = count

#     return N, M, word2id, id2word, X

# # LSA Function
# def perform_lsa(X, k):
#     """
#     Perform LSA on document-word matrix X and reduce to k dimensions.

#     Args:
#     X : numpy array
#         Document-word matrix.
#     k : int
#         Number of latent topics to retain.

#     Returns:
#     U_k : numpy array
#         Truncated document-topic matrix.
#     S_k : numpy array
#         Truncated singular values.
#     Vt_k : numpy array
#         Truncated topic-word matrix (transposed).
#     """
#     U, S, Vt = svd(X, full_matrices=False)
#     U_k = U[:, :k]
#     S_k = np.diag(S[:k])
#     Vt_k = Vt[:k, :]
#     return U_k, S_k, Vt_k

# # Save Results Function
# def save_lsa_results(U_k, S_k, Vt_k, id2word, docTopicDist, topicWordDist, topicWords, topicWordsNum):
#     """
#     Save LSA results to files.

#     Args:
#     U_k : numpy array
#         Truncated document-topic matrix.
#     S_k : numpy array
#         Truncated singular values.
#     Vt_k : numpy array
#         Truncated topic-word matrix (transposed).
#     id2word : dict
#         Dictionary mapping word IDs to terms.
#     docTopicDist : str
#         Path to save document-topic distribution.
#     topicWordDist : str
#         Path to save topic-word distribution.
#     topicWords : str
#         Path to save top words for each topic.
#     topicWordsNum : int
#         Number of top words to save for each topic.
#     """
#     with codecs.open(docTopicDist, 'w', 'utf-8') as file:
#         for i in range(U_k.shape[0]):
#             file.write(' '.join(map(str, U_k[i, :])) + '\n')

#     with codecs.open(topicWordDist, 'w', 'utf-8') as file:
#         for i in range(Vt_k.shape[0]):
#             file.write(' '.join(map(str, Vt_k[i, :])) + '\n')

#     with codecs.open(topicWords, 'w', 'utf-8') as file:
#         for i in range(Vt_k.shape[0]):
#             topic_words = [id2word[j] for j in np.argsort(Vt_k[i, :])[-topicWordsNum:][::-1]]
#             file.write(' '.join(topic_words) + '\n')

# # Main Script
# if __name__ == '__main__':
#     datasetFilePath = '/content/dataset1.txt'  # Path to your dataset
#     stopwordsFilePath = '/content/stopwords.dic'  # Path to your stopwords file
#     docTopicDist = 'docTopicDistribution.txt'
#     topicWordDist = 'topicWordDistribution.txt'
#     topicWords = 'topics.txt'
#     topicWordsNum = 10  # Number of top words per topic
#     K = 10  # Number of topics

#     # Preprocess dataset
#     N, M, word2id, id2word, X = preprocessing(datasetFilePath, stopwordsFilePath)

#     # Perform LSA
#     U_k, S_k, Vt_k = perform_lsa(X, K)

#     # Save results
#     save_lsa_results(U_k, S_k, Vt_k, id2word, docTopicDist, topicWordDist, topicWords, topicWordsNum)

#     print("LSA completed and results saved.")


In [None]:
# import matplotlib.pyplot as plot
# # Perplexity Calculation
# def calculate_perplexity(X, U_k, S_k, Vt_k):
#     reconstructed_X = np.dot(np.dot(U_k, S_k), Vt_k)
#     epsilon = 1e-4  # Avoid log(0)
#     prob_X = reconstructed_X / np.sum(reconstructed_X)
#     log_likelihood = np.sum(X * np.log(prob_X + epsilon))
#     perplexity = np.exp(-log_likelihood / np.sum(X))
#     return perplexity

# # Compare Perplexity vs Latent Space Dimensions
# def compare_datasets(dataset1_path, dataset2_path, stopwords_path, latent_dims):
#     results = {"dataset1": [], "dataset2": []}

#     for k in latent_dims:
#         for dataset, dataset_path in zip(["dataset1", "dataset2"], [dataset1_path, dataset2_path]):
#             N, M, word2id, id2word, X = preprocessing(dataset_path, stopwords_path)
#             U_k, S_k, Vt_k = perform_lsa(X, k)
#             perplexity = calculate_perplexity(X, U_k, S_k, Vt_k)
#             results[dataset].append(perplexity)

#     return results

# # Precision-Recall Placeholder (requires ground truth, e.g., topic labels)
# def plot_precision_recall():
#     # This part requires a ground truth and predicted labels for evaluation.
#     pass

# # Plot Results
# def plot_perplexity(results, latent_dims):
#     plt.figure()
#     for dataset, perplexities in results.items():
#         plt.plot(latent_dims, perplexities, label=dataset)
#     plt.xlabel("Latent Dimensions")
#     plt.ylabel("Perplexity")
#     plt.title("Perplexity vs Latent Dimensions")
#     plt.legend()
#     plt.show()

# # Main Execution
# if __name__ == '__main__':
#     dataset1_path = '/content/dataset1.txt'  # Path to first dataset
#     dataset2_path = '/content/dataset2.txt'  # Path to second dataset
#     stopwords_path = '/content/stopwords.dic'  # Path to stopwords file

#     latent_dims = range(2, 21, 2)  # Test various latent dimensions (e.g., 2, 4, ..., 20)

#     # Compare datasets based on perplexity
#     results = compare_datasets(dataset1_path, dataset2_path, stopwords_path, latent_dims)

#     # Plot perplexity vs latent dimensions
#     plot_perplexity(results, latent_dims)

In [None]:
def sparse_priors(alpha=0.2):
    for k in range(0, K):
        denominator = 0
        for j in range(0, M):
            theta[k, j] = 0
            for i in range(0, N):
                theta[k, j] = theta[k, j] + X[i, j] * p[i, j, k] + alpha
            denominator += theta[k, j]
        if denominator == 0:
            for j in range(0, M):
                theta[k, j] = 1.0 / M
        else:
            for j in range(0, M):
                theta[k, j] /= denominator


In [None]:
def gibbs_sampling():
    for i in range(0, N):
        for j in range(0, M):
            topic_probs = np.zeros(K)
            for k in range(K):
                topic_probs[k] = theta[k, j] * lamda[i, k]
            p[i, j, :] = topic_probs / (np.sum(topic_probs) + 1e-10)


In [None]:
def entropy_regularization():
    entropy_loss = 0
    for k in range(0, K):
        entropy_loss += -np.sum(theta[k, :] * np.log(theta[k, :] + 1e-10))
    return entropy_loss


In [45]:
import numpy as np
def initializeParameters_gibbs():
    for i in range(0, N):
        normalization = sum(lamda[i, :])
        for j in range(0, K):
            lamda[i, j] /= normalization;

    for i in range(0, K):
        normalization = sum(theta[i, :])
        for j in range(0, M):
            theta[i, j] /= normalization;

# beta = 0.1
def EStep_gibbs(alpha=0.001):
    for i in range(0, N):
        for j in range(0, M):
            denominator = 0;
            for k in range(0, K):
                p[i, j, k] = theta[k, j] * lamda[i, k];
                denominator += p[i, j, k];
            if denominator == 0:
                for k in range(0, K):
                    p[i, j, k] = 0;
            else:
                for k in range(0, K):
                    p[i, j, k] /= denominator;

def MStep_gibbs(alpha=0.001):
    # sparse_priors(alpha=0.1)  # Add Dirichlet priors for sparsity
    # gibbs_sampling()  # Perform Gibbs sampling for Bayesian inference

    # Update theta and lamda as before
    for k in range(0, K):
        denominator = 0
        for j in range(0, M):
            theta[k, j] = 0
            for i in range(0, N):
                theta[k, j] += alpha + X[i, j] * p[i, j, k]
            denominator += theta[k, j]
        if denominator == 0:
            for j in range(0, M):
                theta[k, j] = 1.0 / M
        else:
            for j in range(0, M):
                theta[k, j] /= denominator

    for i in range(0, N):
        for k in range(0, K):
            lamda[i, k] = 0
            denominator = 0
            for j in range(0, M):
                lamda[i, k] += alpha + X[i, j] * p[i, j, k]
                denominator += X[i, j];
            if denominator == 0:
                lamda[i, k] = 1.0 / K
            else:
                lamda[i, k] /= denominator

# Calculate the log likelihood
def LogLikelihood_gibbs():
    loglikelihood = 0
    for i in range(0, N):
        for j in range(0, M):
            tmp = 0
            for k in range(0, K):
                tmp += theta[k, j] * lamda[i, k]
            if tmp > 0:
                loglikelihood += X[i, j] * log(tmp)
    return loglikelihood

# output the params of model and top words of topics to files
def output_gibbs():
    # document-topic distribution
    file = codecs.open(docTopicDist,'w','utf-8')
    for i in range(0, N):
        tmp = ''
        for j in range(0, K):
            tmp += str(lamda[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # topic-word distribution
    file = codecs.open(topicWordDist,'w','utf-8')
    for i in range(0, K):
        tmp = ''
        for j in range(0, M):
            tmp += str(theta[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # dictionary
    file = codecs.open(dictionary,'w','utf-8')
    for i in range(0, M):
        file.write(id2word[i] + '\n')
    file.close()

    # top words of each topic
    file = codecs.open(topicWords,'w','utf-8')
    for i in range(0, K):
        topicword = []
        ids = theta[i, :].argsort()
        for j in ids:
            topicword.insert(0, id2word[j])
        tmp = ''
        for word in topicword[0:min(topicWordsNum, len(topicword))]:
            tmp += word + ' '
        file.write(tmp + '\n')
    file.close()

In [28]:
# set the default params and read the params from cmd
datasetFilePath = '/content/paragraphs_output.txt'
stopwordsFilePath = '/content/stopwords.dic'
K = 10    # number of topic
maxIteration = 50
threshold = 0
topicWordsNum = 10
docTopicDist = 'docTopicDistribution_eng_gibbs.txt'
topicWordDist = 'topicWordDistribution_eng_gibbs.txt'
dictionary = 'dictionary_eng_gibbs.dic'
topicWords = 'topics_eng_gibbs.txt'
if(len(sys.argv) == 11):
    datasetFilePath = sys.argv[1]
    stopwordsFilePath = sys.argv[2]
    K = int(sys.argv[3])
    maxIteration = int(sys.argv[4])
    threshold = float(sys.argv[5])
    topicWordsNum = int(sys.argv[6])
    docTopicDist = sys.argv[7]
    topicWordDist = sys.argv[8]
    dictionary = sys.argv[9]
    topicWords = sys.argv[10]

# preprocessing
N, M, word2id, id2word, X = preprocessing_plsa(datasetFilePath, stopwordsFilePath)

# lamda[i, j] : p(zj|di)
lamda = random([N, K])

# theta[i, j] : p(wj|zi)
theta = random([K, M])

# p[i, j, k] : p(zk|di,wj)
p = zeros([N, M, K])

initializeParameters_gibbs()

# EM algorithm
oldLoglikelihood = 1
newLoglikelihood = 1
for i in range(0, maxIteration):
    EStep_gibbs(alpha=0.001)
    MStep_gibbs(alpha=0.001)
    newLoglikelihood = LogLikelihood_gibbs()
    print("[", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), "] ", i+1, " iteration  ", str(newLoglikelihood))
    if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold):
        break
    oldLoglikelihood = newLoglikelihood

print(f"The latent space dimension is {K}")
output_gibbs()

[ 2024-11-27 22:39:10 ]  1  iteration   -5127.499914296321
[ 2024-11-27 22:39:12 ]  2  iteration   -5090.387039993633
[ 2024-11-27 22:39:14 ]  3  iteration   -5030.785950979097
[ 2024-11-27 22:39:16 ]  4  iteration   -4935.238139394036
[ 2024-11-27 22:39:18 ]  5  iteration   -4797.701143633509
[ 2024-11-27 22:39:20 ]  6  iteration   -4639.674176575345
[ 2024-11-27 22:39:23 ]  7  iteration   -4503.5997291529675
[ 2024-11-27 22:39:25 ]  8  iteration   -4406.223656771497
[ 2024-11-27 22:39:27 ]  9  iteration   -4340.238629798519
[ 2024-11-27 22:39:28 ]  10  iteration   -4293.61495367788
[ 2024-11-27 22:39:30 ]  11  iteration   -4256.153294576782
[ 2024-11-27 22:39:33 ]  12  iteration   -4226.421060972108
[ 2024-11-27 22:39:35 ]  13  iteration   -4205.591416272595
[ 2024-11-27 22:39:37 ]  14  iteration   -4190.39825882792
[ 2024-11-27 22:39:39 ]  15  iteration   -4178.210418249416
[ 2024-11-27 22:39:42 ]  16  iteration   -4168.307531765497
[ 2024-11-27 22:39:44 ]  17  iteration   -4160.494

In [31]:
import numpy as np
def initializeParameters_diri():
    for i in range(0, N):
        normalization = sum(lamda[i, :])
        for j in range(0, K):
            lamda[i, j] /= normalization;

    for i in range(0, K):
        normalization = sum(theta[i, :])
        for j in range(0, M):
            theta[i, j] /= normalization;

beta = 0.05
def EStep_diri(alpha=0.001):
    for i in range(0, N):
        for j in range(0, M):
            denominator = 0;
            for k in range(0, K):
                p[i, j, k] = theta[k, j] * lamda[i, k];
                denominator += p[i, j, k];
            if denominator == 0:
                for k in range(0, K):
                    p[i, j, k] = 0;
            else:
                for k in range(0, K):
                    p[i, j, k] /= denominator;

def MStep_diri(alpha=0.001):
    # sparse_priors(alpha=0.1)  # Add Dirichlet priors for sparsity
    # gibbs_sampling()  # Perform Gibbs sampling for Bayesian inference

    # Update theta and lamda as before
    for k in range(0, K):
        denominator = 0
        for j in range(0, M):
            theta[k, j] = 0
            for i in range(0, N):
                theta[k, j] += alpha + X[i, j] * p[i, j, k]
            denominator += theta[k, j]
        if denominator == 0:
            for j in range(0, M):
                theta[k, j] = 1.0 / M
        else:
            for j in range(0, M):
                theta[k, j] /= denominator + beta

    for i in range(0, N):
        for k in range(0, K):
            lamda[i, k] = 0
            denominator = 0
            for j in range(0, M):
                lamda[i, k] += alpha + X[i, j] * p[i, j, k]
                denominator += X[i, j];
            if denominator == 0:
                lamda[i, k] = 1.0 / K
            else:
                lamda[i, k] /= denominator + beta

# Calculate the log likelihood
def LogLikelihood_diri():
    loglikelihood = 0
    for i in range(0, N):
        for j in range(0, M):
            tmp = 0
            for k in range(0, K):
                tmp += theta[k, j] * lamda[i, k]
            if tmp > 0:
                loglikelihood += X[i, j] * log(tmp)
    return loglikelihood

# output the params of model and top words of topics to files
def output_diri():
    # document-topic distribution
    file = codecs.open(docTopicDist,'w','utf-8')
    for i in range(0, N):
        tmp = ''
        for j in range(0, K):
            tmp += str(lamda[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # topic-word distribution
    file = codecs.open(topicWordDist,'w','utf-8')
    for i in range(0, K):
        tmp = ''
        for j in range(0, M):
            tmp += str(theta[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # dictionary
    file = codecs.open(dictionary,'w','utf-8')
    for i in range(0, M):
        file.write(id2word[i] + '\n')
    file.close()

    # top words of each topic
    file = codecs.open(topicWords,'w','utf-8')
    for i in range(0, K):
        topicword = []
        ids = theta[i, :].argsort()
        for j in ids:
            topicword.insert(0, id2word[j])
        tmp = ''
        for word in topicword[0:min(topicWordsNum, len(topicword))]:
            tmp += word + ' '
        file.write(tmp + '\n')
    file.close()

In [32]:
# set the default params and read the params from cmd
datasetFilePath = '/content/paragraphs_output.txt'
stopwordsFilePath = '/content/stopwords.dic'
K = 10    # number of topic
maxIteration = 50
threshold = 0.1
topicWordsNum = 10
docTopicDist = 'docTopicDistribution_eng_diri.txt'
topicWordDist = 'topicWordDistribution_eng_diri.txt'
dictionary = 'dictionary_eng_diri.dic'
topicWords = 'topics_eng_diri.txt'
if(len(sys.argv) == 11):
    datasetFilePath = sys.argv[1]
    stopwordsFilePath = sys.argv[2]
    K = int(sys.argv[3])
    maxIteration = int(sys.argv[4])
    threshold = float(sys.argv[5])
    topicWordsNum = int(sys.argv[6])
    docTopicDist = sys.argv[7]
    topicWordDist = sys.argv[8]
    dictionary = sys.argv[9]
    topicWords = sys.argv[10]

# preprocessing
N, M, word2id, id2word, X = preprocessing_plsa(datasetFilePath, stopwordsFilePath)

# lamda[i, j] : p(zj|di)
lamda = random([N, K])

# theta[i, j] : p(wj|zi)
theta = random([K, M])

# p[i, j, k] : p(zk|di,wj)
p = zeros([N, M, K])

initializeParameters_diri()

# EM algorithm
oldLoglikelihood = 1
newLoglikelihood = 1
for i in range(0, maxIteration):
    EStep_diri(alpha=0.001)
    MStep_diri(alpha=0.001)
    newLoglikelihood = LogLikelihood_diri()
    print("[", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), "] ", i+1, " iteration  ", str(newLoglikelihood))
    if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold):
        break
    oldLoglikelihood = newLoglikelihood

print(f"The latent space dimension is {K}")
output_diri()

[ 2024-11-27 22:56:48 ]  1  iteration   -5143.008538888668
[ 2024-11-27 22:56:50 ]  2  iteration   -5116.713824116789
[ 2024-11-27 22:56:52 ]  3  iteration   -5075.309566106987
[ 2024-11-27 22:56:54 ]  4  iteration   -5010.09107907706
[ 2024-11-27 22:56:57 ]  5  iteration   -4915.103699028177
[ 2024-11-27 22:56:59 ]  6  iteration   -4791.225539777555
[ 2024-11-27 22:57:01 ]  7  iteration   -4650.908600143332
[ 2024-11-27 22:57:03 ]  8  iteration   -4517.324191780743
[ 2024-11-27 22:57:05 ]  9  iteration   -4407.398128428811
[ 2024-11-27 22:57:07 ]  10  iteration   -4328.465358248906
[ 2024-11-27 22:57:10 ]  11  iteration   -4277.192664701674
[ 2024-11-27 22:57:12 ]  12  iteration   -4244.802149955324
[ 2024-11-27 22:57:14 ]  13  iteration   -4223.019289465597
[ 2024-11-27 22:57:16 ]  14  iteration   -4207.071896238039
[ 2024-11-27 22:57:18 ]  15  iteration   -4194.898158981157
[ 2024-11-27 22:57:20 ]  16  iteration   -4185.448784289185
[ 2024-11-27 22:57:23 ]  17  iteration   -4178.002

In [41]:
#Testing cell dsjdnsunusentusentunseutnestuesrtj


datasetFilePath = '/content/mammals.txt'
stopwordsFilePath = '/content/stopwords.dic'
K = 10    # number of topic
maxIteration = 50
threshold = 0.1
topicWordsNum = 10
docTopicDist = 'docTopicDistribution_eng_diri_test.txt'
topicWordDist = 'topicWordDistribution_eng_diri_test.txt'
dictionary = 'dictionary_eng_diri_test.dic'
topicWords = 'topics_eng_diri_test.txt'
if(len(sys.argv) == 11):
    datasetFilePath = sys.argv[1]
    stopwordsFilePath = sys.argv[2]
    K = int(sys.argv[3])
    maxIteration = int(sys.argv[4])
    threshold = float(sys.argv[5])
    topicWordsNum = int(sys.argv[6])
    docTopicDist = sys.argv[7]
    topicWordDist = sys.argv[8]
    dictionary = sys.argv[9]
    topicWords = sys.argv[10]

def output_diri_test():
    # Document-topic distribution
    with codecs.open(docTopicDist, 'w', 'utf-8') as file:
        for i in range(0, N):
            tmp = ' '.join(str(lamda[i, j]) for j in range(0, K))
            file.write(tmp + '\n')

    # Topic-word distribution
    with codecs.open(topicWordDist, 'w', 'utf-8') as file:
        for i in range(0, K):
            tmp = ' '.join(str(theta[i, j]) for j in range(0, M))
            print(type(tmp))
            file.write(tmp + '\n')

    # Dictionary
    with codecs.open(dictionary, 'w', 'utf-8') as file:
        for i in range(0, M):
            file.write(id2word[i] + '\n')

    # Top words of each topic
    with codecs.open(topicWords, 'w', 'utf-8') as file:
        for i in range(0, K):
            topicword = []
            ids = theta[i, :].argsort()  # Get sorted word indices for topic i
            for j in ids:
                # Ensure j is within the valid range for id2word
                if j < M:  # Check if index is valid
                    topicword.insert(0, id2word.get(j, "UNKNOWN"))  # Add "UNKNOWN" if id2word[j] does not exist

            tmp = ' '.join(topicword[0:min(topicWordsNum, len(topicword))])
            file.write(tmp + '\n')


# preprocessing
# N, M, word2id, id2word, X = preprocessing_plsa(datasetFilePath, stopwordsFilePath)

print(f"The latent space dimension is {K}")
output_diri_test()

The latent space dimension is 10
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [None]:
#Testing cell dsjdnsunusentusentunseutnestuesrtj


datasetFilePath = '/content/mammals.txt'
stopwordsFilePath = '/content/stopwords.dic'
K = 10    # number of topic
maxIteration = 50
threshold = 0.1
topicWordsNum = 10
docTopicDist = 'docTopicDistribution_eng_diri_test.txt'
topicWordDist = 'topicWordDistribution_eng_diri_test.txt'
dictionary = 'dictionary_eng_diri_test.dic'
topicWords = 'topics_eng_diri_test.txt'
if(len(sys.argv) == 11):
    datasetFilePath = sys.argv[1]
    stopwordsFilePath = sys.argv[2]
    K = int(sys.argv[3])
    maxIteration = int(sys.argv[4])
    threshold = float(sys.argv[5])
    topicWordsNum = int(sys.argv[6])
    docTopicDist = sys.argv[7]
    topicWordDist = sys.argv[8]
    dictionary = sys.argv[9]
    topicWords = sys.argv[10]

def output_diri_test():
    # Document-topic distribution
    with codecs.open(docTopicDist, 'w', 'utf-8') as file:
        for i in range(0, N):
            tmp = ' '.join(str(lamda[i, j]) for j in range(0, K))
            file.write(tmp + '\n')

    # Topic-word distribution
    with codecs.open(topicWordDist, 'w', 'utf-8') as file:
        for i in range(0, K):
            tmp = ' '.join(str(theta[i, j]) for j in range(0, M))
            print(type(tmp))
            file.write(tmp + '\n')

    # Dictionary
    with codecs.open(dictionary, 'w', 'utf-8') as file:
        for i in range(0, M):
            file.write(id2word[i] + '\n')

    # Top words of each topic
    with codecs.open(topicWords, 'w', 'utf-8') as file:
        for i in range(0, K):
            topicword = []
            ids = theta[i, :].argsort()  # Get sorted word indices for topic i
            for j in ids:
                # Ensure j is within the valid range for id2word
                if j < M:  # Check if index is valid
                    topicword.insert(0, id2word.get(j, "UNKNOWN"))  # Add "UNKNOWN" if id2word[j] does not exist

            tmp = ' '.join(topicword[0:min(topicWordsNum, len(topicword))])
            file.write(tmp + '\n')


# preprocessing
# N, M, word2id, id2word, X = preprocessing_plsa(datasetFilePath, stopwordsFilePath)

print(f"The latent space dimension is {K}")
output_diri_test()

In [36]:
def evaluate_topic_coherence(theta, word2id, X, top_n=10):
    """
    Evaluate the coherence of topics based on the top words.

    :param theta: Topic-word distribution matrix.
    :param word2id: Mapping of words to ids.
    :param X: Document-term matrix.
    :param top_n: Number of top words to consider for each topic.
    :return: List of coherence scores for each topic.
    """
    coherence_scores = []

    for k in range(len(theta)):  # Iterate over topics
        # Get the top N words for the current topic
        top_word_ids = np.argsort(theta[k, :])[-top_n:][::-1]
        top_words = [id2word[word_id] for word_id in top_word_ids]

        # Calculate the PMI matrix for the top words in the topic
        pmi_matrix = calculate_pmi(top_words, word2id, X)

        # Sum the upper triangle of the PMI matrix (to avoid double counting)
        coherence = np.sum(np.triu(pmi_matrix, k=1))  # Upper triangle of matrix
        coherence_scores.append(coherence)

    return coherence_scores


In [37]:
def output_plsa_with_coherence():
    # Document-topic distribution
    file = codecs.open(docTopicDist, 'w', 'utf-8')
    for i in range(0, N):
        tmp = ''
        for j in range(0, K):
            tmp += str(lamda[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # Topic-word distribution
    file = codecs.open(topicWordDist, 'w', 'utf-8')
    for i in range(0, K):
        tmp = ''
        for j in range(0, M):
            tmp += str(theta[i, j]) + ' '
        file.write(tmp + '\n')
    file.close()

    # Dictionary
    file = codecs.open(dictionary, 'w', 'utf-8')
    for i in range(0, M):
        file.write(id2word[i] + '\n')
    file.close()

    # Top words of each topic
    file = codecs.open(topicWords, 'w', 'utf-8')
    for i in range(0, K):
        topicword = []
        ids = theta[i, :].argsort()
        for j in ids:
            topicword.insert(0, id2word[j])
        tmp = ''
        for word in topicword[0:min(topicWordsNum, len(topicword))]:
            tmp += word + ' '
        file.write(tmp + '\n')
    file.close()

    # Calculate and output coherence scores
    coherence_scores = evaluate_topic_coherence(theta, word2id, X, top_n=topicWordsNum)
    file = codecs.open('topic_coherence.txt', 'w', 'utf-8')
    for i in range(K):
        file.write(f"Topic {i+1} coherence score: {coherence_scores[i]}\n")
    file.close()


In [39]:
import random
import numpy as np
import codecs
import time
import re
from nltk.tokenize import word_tokenize
from numpy import zeros, random
from math import log
import sys

# Preprocessing function (no change, same as your current one)
def preprocessing_da(datasetFilePath, stopwordsFilePath):
    file = codecs.open(stopwordsFilePath, 'r', 'utf-8')
    stopwords = [line.strip() for line in file]
    file.close()

    file = codecs.open(datasetFilePath, 'r', 'utf-8')
    documents = [document.strip() for document in file]
    file.close()

    N = len(documents)
    wordCounts = []
    word2id = {}
    id2word = {}
    currentId = 0
    for document in documents:
        segList = word_tokenize(document)
        wordCount = {}
        for word in segList:
            word = word.lower().strip()
            if len(word) > 1 and not re.search('[0-9]', word) and word not in stopwords:
                if word not in word2id:
                    word2id[word] = currentId
                    id2word[currentId] = word
                    currentId += 1
                if word in wordCount:
                    wordCount[word] += 1
                else:
                    wordCount[word] = 1
        wordCounts.append(wordCount)

    M = len(word2id)
    X = zeros([N, M], int)
    for word in word2id:
        j = word2id[word]
        for i in range(N):
            if word in wordCounts[i]:
                X[i, j] = wordCounts[i][word]

    return N, M, word2id, id2word, X


# Initialize parameters for LDA (Dirichlet sampling)
def initializeParameters_da(N, M, K, alpha=0.1, beta=0.01):
    # Initialize topic distribution for documents (theta)
    theta = random.dirichlet([alpha] * K, N)  # [N, K]

    # Initialize word distribution for topics (phi)
    phi = random.dirichlet([beta] * M, K)  # [K, M]

    # p[i, j, k]: P(z_k | d_i, w_j) (used in E-step)
    p = np.zeros([N, M, K])
    return theta, phi, p

# E-step of LDA (update the responsibilities p[i, j, k])
def EStep_da(X, theta, phi, N, M, K):
    p = np.zeros([N, M, K])
    for i in range(N):
        for j in range(M):
            denominator = 0
            for k in range(K):
                p[i, j, k] = theta[i, k] * phi[k, j]
                denominator += p[i, j, k]
            if denominator > 0:
                p[i, j, :] /= denominator
            else:
                p[i, j, :] = 1.0 / K
    return p

# M-step of LDA (update theta and phi)
def MStep_da(X, p, N, M, K, beta=0.01):
    # Update phi (topic-word distribution)
    phi = np.zeros([K, M])
    for k in range(K):
        for j in range(M):
            numerator = sum(X[i, j] * p[i, j, k] for i in range(N))
            denominator = sum(X[i, j] for i in range(N))
            if denominator > 0:
                phi[k, j] = numerator / denominator
            else:
                phi[k, j] = 1.0 / M
    # Normalize phi
    for k in range(K):
        phi[k, :] /= np.sum(phi[k, :]) + beta  # Adding beta for smoothing

    # Update theta (document-topic distribution)
    theta = np.zeros([N, K])
    for i in range(N):
        for k in range(K):
            numerator = sum(X[i, j] * p[i, j, k] for j in range(M))
            denominator = sum(X[i, j] for j in range(M))
            if denominator > 0:
                theta[i, k] = numerator / denominator
            else:
                theta[i, k] = 1.0 / K
    # Normalize theta
    for i in range(N):
        theta[i, :] /= np.sum(theta[i, :]) + beta  # Normalize across topics

    return theta, phi

# Log-Likelihood of the LDA model
def LogLikelihood_da(X, theta, phi, N, M, K):
    loglikelihood = 0
    for i in range(N):
        for j in range(M):
            tmp = 0
            for k in range(K):
                tmp += theta[i, k] * phi[k, j]
            if tmp > 0:
                loglikelihood += X[i, j] * log(tmp)
    return loglikelihood

# Output the results of LDA model
def output_da(theta, phi, id2word, K, M, N, docTopicDist, topicWordDist, dictionary, topicWords, topicWordsNum=10):
    # Document-topic distribution
    with codecs.open(docTopicDist, 'w', 'utf-8') as file:
        for i in range(N):
            file.write(' '.join(map(str, theta[i, :])) + '\n')

    # Topic-word distribution
    with codecs.open(topicWordDist, 'w', 'utf-8') as file:
        for i in range(K):
            file.write(' '.join(map(str, phi[i, :])) + '\n')

    # Dictionary
    with codecs.open(dictionary, 'w', 'utf-8') as file:
        for word in id2word.values():
            file.write(word + '\n')

    # Top words of each topic
    with codecs.open(topicWords, 'w', 'utf-8') as file:
        for i in range(K):
            topicwords = np.argsort(phi[i, :])[::-1]
            topwords = [id2word[j] for j in topicwords[:topicWordsNum]]
            file.write(' '.join(topwords) + '\n')


# LDA Algorithm (EM-based)
def da_algorithm(datasetFilePath, stopwordsFilePath, K, maxIteration, threshold, docTopicDist, topicWordDist, dictionary, topicWords, topicWordsNum=10):
    N, M, word2id, id2word, X = preprocessing_da(datasetFilePath, stopwordsFilePath)

    # Initialize parameters with Dirichlet distribution
    theta, phi, p = initializeParameters_da(N, M, K)

    oldLoglikelihood = -np.inf
    for iteration in range(maxIteration):
        # E-step: update responsibilities
        p = EStep_da(X, theta, phi, N, M, K)

        # M-step: update theta and phi
        theta, phi = MStep_da(X, p, N, M, K)

        # Calculate Log-Likelihood
        newLoglikelihood = LogLikelihood_da(X, theta, phi, N, M, K)

        print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Iteration {iteration+1}, Log-Likelihood: {newLoglikelihood}")

        # Check for convergence
        if abs(newLoglikelihood - oldLoglikelihood) < threshold:
            break
        oldLoglikelihood = newLoglikelihood

    # Output results
    output_da(theta, phi, id2word, K, M, N, docTopicDist, topicWordDist, dictionary, topicWords, topicWordsNum)

# Set parameters
datasetFilePath = '/content/paragraphs_output.txt'
stopwordsFilePath = '/content/stopwords.dic'
K = 10  # Number of topics
maxIteration = 200
threshold = 1.0
topicWordsNum = 10
docTopicDist = 'docTopicDistribution_da.txt'
topicWordDist = 'topicWordDistribution_da.txt'
dictionary = 'dictionary_da.dic'
topicWords = 'topics_da.txt'

# Run LDA
da_algorithm(datasetFilePath, stopwordsFilePath, K, maxIteration, threshold, docTopicDist, topicWordDist, dictionary, topicWords)


[2024-11-27 23:34:18] Iteration 1, Log-Likelihood: -5579.276268721349
[2024-11-27 23:34:20] Iteration 2, Log-Likelihood: -5536.527340787083
[2024-11-27 23:34:23] Iteration 3, Log-Likelihood: -5511.815440098001
[2024-11-27 23:34:25] Iteration 4, Log-Likelihood: -5493.710616277124
[2024-11-27 23:34:27] Iteration 5, Log-Likelihood: -5480.70310434348
[2024-11-27 23:34:29] Iteration 6, Log-Likelihood: -5468.897374455879
[2024-11-27 23:34:30] Iteration 7, Log-Likelihood: -5456.867933255942
[2024-11-27 23:34:32] Iteration 8, Log-Likelihood: -5444.392617716661
[2024-11-27 23:34:34] Iteration 9, Log-Likelihood: -5431.053067493016
[2024-11-27 23:34:35] Iteration 10, Log-Likelihood: -5417.567572582202
[2024-11-27 23:34:37] Iteration 11, Log-Likelihood: -5405.254086615993
[2024-11-27 23:34:40] Iteration 12, Log-Likelihood: -5395.96200737337
[2024-11-27 23:34:42] Iteration 13, Log-Likelihood: -5387.2774867175085
[2024-11-27 23:34:43] Iteration 14, Log-Likelihood: -5378.417718405353
[2024-11-27 23:3

In [47]:
import numpy as np
import os
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from math import log

def load_topic_words(file_path):
    """ Load the topic words from the given file.
    Each line in the file contains words for a topic.
    Args:
        file_path (str): Path to the file containing topic words.
    Returns:
        List of lists of words (one list per topic).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        topics = [line.strip().split() for line in f.readlines()]
    return topics

def compute_coherence(topics, corpus):
    """
    Compute UMass coherence score for the given topics.

    Args:
        topics (list of lists): Each list contains the top words for a topic.
        corpus (list of lists): Each list contains words of a document in the corpus.

    Returns:
        float: The average coherence score across all topics.
    """
    word_pairs_count = defaultdict(int)
    doc_count = len(corpus)

    # Build word co-occurrence statistics for the corpus
    for doc in corpus:
        seen_words = set(doc)
        for word1 in seen_words:
            for word2 in seen_words:
                if word1 != word2:
                    word_pairs_count[(word1, word2)] += 1

    # Calculate coherence score for each topic
    coherence_scores = []
    for topic in topics:
        coherence = 0
        for i in range(len(topic)):
            for j in range(i + 1, len(topic)):
                word1, word2 = topic[i], topic[j]
                co_occurrence = word_pairs_count.get((word1, word2), 0)
                coherence += np.log((co_occurrence + 1) / (doc_count + 1))
        coherence_scores.append(coherence)

    return np.mean(coherence_scores)

def compute_topic_similarity(topics1, topics2):
    """ Calculate the cosine similarity between the topic-word distributions of two models.
    Args:
        topics1 (list of lists): Top words for each topic in model 1.
        topics2 (list of lists): Top words for each topic in model 2.
    Returns:
        float: The average cosine similarity between corresponding topics from both models.
    """
    num_topics = min(len(topics1), len(topics2))  # Ensure we only compare the minimum number of topics
    similarities = []

    for i in range(num_topics):
        topic1 = topics1[i]
        topic2 = topics2[i]

        # Convert words in topics to a word vector based on word counts
        word_set = set(topic1).union(set(topic2))  # All words in both topics
        word_vector1 = np.zeros(len(word_set))  # Word vector for topic1
        word_vector2 = np.zeros(len(word_set))  # Word vector for topic2

        word_index = {word: idx for idx, word in enumerate(word_set)}  # Create an index map

        for word in topic1:
            word_vector1[word_index[word]] += 1  # Fill word vector for topic1

        for word in topic2:
            word_vector2[word_index[word]] += 1  # Fill word vector for topic2

        print("1", word_vector1)
        print("2", word_vector2)
        # Calculate cosine similarity between word vectors
        similarity = cosine_similarity([word_vector1], [word_vector2])[0][0]
        similarities.append(similarity)

    return np.mean(similarities)


def compute_topic_diversity(topics):
    """ Calculate the diversity of topics based on the uniqueness of top words.
    Args:
        topics (list of lists): Each list contains the top words for a topic.
    Returns:
        float: The average diversity score (1 means high diversity, 0 means low diversity).
    """
    unique_words = set()  # To track unique words
    total_words = 0  # To track total words across topics

    # Loop through topics and add words to the unique set
    for topic in topics:
        total_words += len(topic)
        unique_words.update(topic)  # Add all words from the topic to the unique set

    if total_words == 0:
        return 0  # Avoid division by zero if no words are present (empty topics)

    # Return the ratio of unique words to total words
    return len(unique_words) / total_words

def evaluate_topic_model(model1_file, model3_file, model4_file, model5_file, model6_file, corpus1, corpus2):
    """ Evaluate two topic models based on coherence, similarity, and diversity.
    Args:
        model1_file (str): File containing the top words for the topics of the first model.
        model2_file (str): File containing the top words for the topics of the second model.
        corpus (list of lists): The corpus of documents.
    """
    # Load topics for both models
    topics1 = load_topic_words(model1_file)
    topics3 = load_topic_words(model3_file)
    topics4 = load_topic_words(model4_file)
    topics5 = load_topic_words(model5_file)
    topics6 = load_topic_words(model6_file)


    # 1. Coherence Score (UMass)
    coherence1 = compute_coherence(topics1, corpus1)
    coherence3 = compute_coherence(topics3, corpus1)
    coherence4 = compute_coherence(topics4, corpus2)
    coherence5 = compute_coherence(topics5, corpus1)
    coherence6 = compute_coherence(topics6, corpus1)
    print(f"Coherence Score Dirichlet Distribution: {coherence1}")
    print(f"Coherence Score Dirichlet Priors: {coherence3}")
    print(f"Coherence Score Dirichlet Priors test: {coherence4}")
    print(f"Coherence Score Gibbs Sampling: {coherence5}")
    print(f"Coherence Score PLSA: {coherence6}")

    # 3. Topic Diversity
    diversity1 = compute_topic_diversity(topics1)
    diversity3 = compute_topic_diversity(topics3)
    diversity4 = compute_topic_diversity(topics4)
    diversity5 = compute_topic_diversity(topics5)
    diversity6 = compute_topic_diversity(topics6)

    print(f"Topic Diversity Dirichlet Distribution: {diversity1}")
    print(f"Topic Diversity Dirichlet Priors: {diversity3}")
    print(f"Topic Diversity Dirichlet Priors test: {diversity4}")
    print(f"Topic Diversity Gibbs Sampling: {diversity5}")
    print(f"Topic Diversity PLSA: {diversity6}")

corpus1 = [line.strip().split() for line in open('/content/paragraphs_output.txt', 'r', encoding='utf-8')]
corpus2 = [line.strip().split() for line in open('/content/mammals.txt', 'r', encoding='utf-8')]

# Paths to topic model output files
model1_file = "/content/topics_da.txt"
model3_file = "/content/topics_eng_diri.txt"
model4_file = "/content/topics_eng_diri_test.txt"
model5_file = "/content/topics_eng_gibbs.txt"
model6_file = "/content/topics_eng_plsa1.txt"
# model3_file = "/content/topics_hindi.txt"

evaluate_topic_model(model1_file, model3_file, model4_file, model5_file, model6_file, corpus1, corpus2)


Coherence Score Dirichlet Distribution: -204.79478481695554
Coherence Score Dirichlet Priors: -197.111048610292
Coherence Score Dirichlet Priors test: -141.24504678015742
Coherence Score Gibbs Sampling: -196.73570681276684
Coherence Score PLSA: -200.65266593459944
Topic Diversity Dirichlet Distribution: 1.0
Topic Diversity Dirichlet Priors: 0.76
Topic Diversity Dirichlet Priors test: 0.76
Topic Diversity Gibbs Sampling: 0.78
Topic Diversity PLSA: 0.78
