In [1]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaMulticore, CoherenceModel, Phrases, LdaModel
import matplotlib.pyplot as plt
import csv
import os

In [3]:
# Path to the original input CSV (left blank here; set it before running).
data_path = ""

# LDA parameters
random_state = 69                    # seed handed to the LDA model
passes = 500                         # value given to the model's `passes` argument
alpha = 0.01                         # value given to the model's `alpha` argument
beta = 0.1                           # value given to the model's `eta` argument
num_topics_range = range(5, 60, 5)   # topic counts to try: 5, 10, ..., 55


In [4]:

def assign_topics_to_documents(lda_model, corpus, data):
    """Label every document with its most probable topic.

    Args:
        lda_model: Object exposing ``get_document_topics(bow)`` that returns
            ``(topic_id, probability)`` pairs for one document.
        corpus: Iterable of bag-of-words documents, one per row of ``data``
            and in the same order.
        data: DataFrame with one row per document. It is left unmodified;
            the topic column is added to a copy.

    Returns:
        tuple: ``(sorted_data, topic_counts)``. ``sorted_data`` is a copy of
        ``data`` with an added ``'Assigned_Topic'`` column, ordered by that
        column (rows sharing a topic keep their original relative order).
        ``topic_counts`` is the number of documents per assigned topic.
        Documents for which the model reports no topics get ``None``.
    """
    document_topics = []
    for doc_bow in corpus:
        doc_topics = lda_model.get_document_topics(doc_bow)
        # max() returns the first pair with the highest probability, which is
        # the same winner a stable descending sort would put in position 0.
        best_pair = max(doc_topics, key=lambda pair: pair[1], default=None)
        document_topics.append(best_pair[0] if best_pair is not None else None)

    # assign() works on a copy, so repeated calls with the same frame (one per
    # model in a sweep) never overwrite each other's labels on the input.
    annotated = data.assign(Assigned_Topic=document_topics)

    # Count the number of posts per topic
    topic_counts = annotated['Assigned_Topic'].value_counts()

    # Sort by 'Assigned_Topic'; mergesort is stable, so output is deterministic
    sorted_data = annotated.sort_values(by='Assigned_Topic', kind='mergesort')
    return sorted_data, topic_counts


def save_topics(lda_model, num_topics, directory, topic_counts):
    """Write the model's topics and their post counts to a CSV file.

    The file is ``<directory>/<num_topics>_topics.csv`` with the columns
    ``Topic ID``, ``Topics`` and ``Post Count``; each topic is described by
    its 15 top words.

    Args:
        lda_model: Object exposing ``print_topics(num_topics, num_words)``
            that returns ``(topic_id, topic_description)`` pairs.
        num_topics: Number of topics to request; also used in the file name.
        directory: Directory to write into. It must already exist.
        topic_counts: Mapping-like object supporting ``.get(topic_id, default)``
            that gives the number of posts assigned to each topic.
    """
    topics = lda_model.print_topics(num_topics=num_topics, num_words=15)
    output_path = os.path.join(directory, f'{num_topics}_topics.csv')
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Topic ID', 'Topics', 'Post Count'])  # Column headings change to fit data
        # Each entry is a (topic_id, description) pair. Unpack it so the row
        # carries the model's own topic id and only the description text,
        # instead of a list position and the repr of the whole tuple.
        for topic_id, topic_terms in topics:
            count = topic_counts.get(topic_id, 0)  # Get count for each topic, default to 0
            writer.writerow([topic_id, topic_terms, count])



In [1]:
# Read the input CSV and split the chosen text column on whitespace.
data = pd.read_csv(data_path)
texts = data[''].apply(lambda raw_text: raw_text.split())  # input col name

# Fit the phrase model on the token lists, then apply it to every document.
bigram = Phrases(texts, min_count=5, threshold=100)
texts_bigram = [bigram[tokens] for tokens in texts]

# Vocabulary and bag-of-words corpus built from the phrase-merged documents.
dictionary = corpora.Dictionary(texts_bigram)
corpus = list(map(dictionary.doc2bow, texts_bigram))

coherence_scores = []


In [2]:
import os
from gensim.models.ldamulticore import LdaMulticore
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np


def topic_cosine_similarity(lda_model):
    """Return the mean cosine similarity between distinct topics of a model.

    The topic-word matrix from ``lda_model.get_topics()`` is compared row by
    row; self-similarities on the diagonal are zeroed, and the remaining sum
    is divided by ``num_topics * (num_topics - 1)``, the number of ordered
    pairs of distinct topics.

    Args:
        lda_model: Object exposing ``get_topics()`` and ``num_topics``.

    Returns:
        The average off-diagonal cosine similarity.
    """
    # Rows are topics, columns are vocabulary entries.
    pairwise = cosine_similarity(lda_model.get_topics())
    np.fill_diagonal(pairwise, 0)  # exclude each topic's similarity to itself

    topic_total = lda_model.num_topics
    ordered_pairs = topic_total * (topic_total - 1)
    return np.sum(pairwise) / ordered_pairs

# Per-model metrics, appended in the same order as num_topics_range.
perplexity_values = []
average_similarities = []

for num_topics in num_topics_range:
    print("Running for topic: ", num_topics)
    lda_model = LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        alpha=alpha,
        eta=beta,
        passes=passes,
        workers=8,  # was min(8, 9), which always evaluates to 8
        random_state=random_state,
    )

    # log_perplexity() returns the per-word likelihood bound, not perplexity.
    # Convert it with 2 ** (-bound) so the stored values match the
    # "Perplexity" label they are plotted under.
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity_values.append(2.0 ** (-per_word_bound))
    average_similarities.append(topic_cosine_similarity(lda_model))

    # One output folder per topic count; exist_ok makes re-runs safe without
    # a separate existence check.
    directory = f"{num_topics}"
    os.makedirs(directory, exist_ok=True)

    updated_data, topic_counts = assign_topics_to_documents(lda_model, corpus, data)
    save_topics(lda_model, num_topics, directory, topic_counts)
    updated_data.to_csv(os.path.join(directory, f'{num_topics}_annotated_data.csv'), index=False)

# Plotting the results: two side-by-side panels sharing the same x values.
# Each entry: (y values, title, y-axis label, extra keyword arguments for plot).
panel_specs = [
    (perplexity_values, "Perplexity vs Number of Topics", "Perplexity", {}),
    (
        average_similarities,
        "Average Cosine Similarity vs Number of Topics",
        "Average Cosine Similarity",
        {"color": 'r'},
    ),
]

plt.figure(figsize=(12, 6))
for panel_index, (y_values, panel_title, y_label, plot_kwargs) in enumerate(panel_specs, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(num_topics_range, y_values, marker='o', **plot_kwargs)
    plt.title(panel_title)
    plt.xlabel("Number of Topics")
    plt.ylabel(y_label)

plt.tight_layout()
plt.show()


In [3]:
#method of visualizing data
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis

#lda_model = LdaMulticore(corpus,num_topics=25,id2word=dictionary,alpha=alpha,eta=beta,passes=passes,workers = min(8,9),random_state=random_state)
#vis_data = gensimvis.prepare(lda_model,corpus,dictionary)
#pyLDAvis.display(vis_data)