In [1]:
import gensim
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary, MmCorpus
import matplotlib.pyplot as plt
import os
import time
import json


In [2]:
# --- Experiment configuration ------------------------------------------------
# Topic counts to sweep over and the number of training passes for each model.
num_topics_list = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
total_passes = 500

# Sample identifier; it selects the CorpusDict{sample} asset folder used below.
sample = 25

# Where the coherence comparison figure is written.
plot_save_path = '/data1-6tb/sp2023stock/TopicModeling/Model_Comparison.png'

# Point JOBLIB at a dedicated temp folder via its environment variable.
os.environ['JOBLIB_TEMP_FOLDER'] = '/data1-6tb/sp2023stock/TopicModeling/data/temp'

# --- Data loading ------------------------------------------------------------
print("Dataset loading - dictionary - corpus")

# Saved gensim dictionary and the serialized .mm corpus for this sample.
dictionary = Dictionary.load(f'/data1-6tb/sp2023stock/TopicModeling/model/ModelingAssets/CorpusDict{sample}/dictionary.gensim')
corpus = MmCorpus(f'/data1-6tb/sp2023stock/TopicModeling/model/ModelingAssets/CorpusDict{sample}/corpus.mm')

# JSON dataset stored alongside the dictionary and corpus.
json_file_path = f'/data1-6tb/sp2023stock/TopicModeling/model/ModelingAssets/CorpusDict{sample}/dataset_{sample}.json'
with open(json_file_path, mode='r', encoding='utf-8') as json_file:
    dataset = json.loads(json_file.read())

print("Dataset loaded - dictionary - corpus")



Dataset loading - dictionary - corpus


In [6]:
# Per-topic-count result accumulators: c_v coherence, u_mass coherence, and
# elapsed seconds. Each name is bound to its own independent empty list.
coherences_cv, coherences_umass, total_time = [], [], []

In [7]:
# Sweep over the candidate topic counts: train one LDA model per count and
# record its c_v coherence, u_mass coherence, and elapsed wall-clock seconds.
#
# The result lists are (re)created here so the cell is idempotent: the loop
# always starts from the first entry of num_topics_list, so re-running it must
# not append a second sweep onto results left over from an earlier run. This
# keeps each result list aligned one-to-one with num_topics_list.
coherences_cv = []
coherences_umass = []
total_time = []

for topics in num_topics_list:
    print(f'Starting evaluation for {topics} topics')
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments during a long training run.
    start_time = time.perf_counter()

    # Initialize and train the LDA model (fixed random_state for repeatability).
    lda = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=topics,
        random_state=42,
        passes=total_passes,
        workers=4,
    )

    # Coherence c_v is computed from the `dataset` texts.
    coherence_model_lda_cv = CoherenceModel(model=lda, texts=dataset, dictionary=dictionary, coherence='c_v')
    coherence_lda_cv = coherence_model_lda_cv.get_coherence()
    coherences_cv.append(coherence_lda_cv)

    # Coherence u_mass is computed from the bag-of-words corpus.
    coherence_model_lda_umass = CoherenceModel(model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherence_lda_umass = coherence_model_lda_umass.get_coherence()
    coherences_umass.append(coherence_lda_umass)

    print(f"Topics: {topics}, Coherence c_v: {coherence_lda_cv}, Coherence u_mass: {coherence_lda_umass}")

    # The recorded time covers training plus both coherence evaluations.
    end_time = time.perf_counter()
    time_taken = end_time - start_time
    total_time.append(time_taken)
    print(f"Number of Topics: {topics}, Time taken: {time_taken} seconds")




MmCorpus(506010 documents, 137788 features, 25186257 non-zero entries)


In [None]:
# Plot both coherence metrics against the number of topics, side by side,
# save the figure to plot_save_path, and then display it.
fig, (ax_cv, ax_umass) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: c_v coherence.
ax_cv.plot(num_topics_list, coherences_cv, marker='o', color='b', label='c_v')
ax_cv.set_title('C_V Coherence Scores for Different Number of Topics')
ax_cv.set_xlabel('Number of Topics')
ax_cv.set_ylabel('Coherence Score (c_v)')
# Each panel gets its own legend so the series label is shown on both axes.
ax_cv.legend()

# Right panel: u_mass coherence.
ax_umass.plot(num_topics_list, coherences_umass, marker='o', color='r', label='u_mass')
ax_umass.set_title('U_Mass Coherence Scores for Different Number of Topics')
ax_umass.set_xlabel('Number of Topics')
ax_umass.set_ylabel('Coherence Score (u_mass)')
ax_umass.legend()

fig.tight_layout()
# Save before show(): some backends release the figure once it has been shown.
fig.savefig(plot_save_path)
plt.show()

print("Evaluation complete.")