In [None]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaMulticore, CoherenceModel, Phrases
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np
import csv
import os


In [None]:
# --- Input ---
# CSV file read with pd.read_csv in the loading cell (placeholder; set before running).
data_path = "PATH/TO/PREPROCESSED_DATA.csv"
# Column of that CSV holding the text; each value is split on whitespace into tokens.
text_column = "concatenated_text"

# --- Model selection sweep / LDA hyperparameters ---
# Topic counts tried in the sweep: 15, 20, ..., 250.
num_topics_range = list(range(15, 251, 5))
# Passed to LdaMulticore as `alpha`.
alpha = 0.01
# Passed to LdaMulticore as `eta`.
beta = 0.1
# Passed to LdaMulticore as `passes`.
passes = 50
# Seed passed to LdaMulticore as `random_state` for every model trained here.
random_state = 69

# Topic counts for which a final model is always trained and exported; the
# elbow found by the sweep is appended to this list if it is not already in it.
selected_topic_counts = [50]

# Root output directory (placeholder); each final model writes into
# <output_root>/<k>/.
output_root = "PATH/TO/LDA_OUTPUTS"


In [None]:
def assign_topics_to_documents(lda_model, corpus, data):
    """Label every document with its single most probable topic.

    Args:
        lda_model: Object exposing ``get_document_topics(bow)``, which returns
            ``(topic_id, probability)`` pairs for one document.
        corpus: Iterable of documents, in the same row order as ``data``.
        data: DataFrame with one row per document. It is not modified.

    Returns:
        ``(annotated, topic_counts)`` where ``annotated`` is a copy of ``data``
        with an added ``Assigned_Topic`` column, sorted by that column, and
        ``topic_counts`` is the ``value_counts()`` of that column. A document
        for which the model returns no topics gets ``None``.
    """
    def dominant_topic(bow):
        candidates = list(lda_model.get_document_topics(bow))
        if not candidates:
            return None
        # max() keeps the first pair among equal probabilities.
        return max(candidates, key=lambda pair: pair[1])[0]

    annotated = data.assign(Assigned_Topic=[dominant_topic(bow) for bow in corpus])
    per_topic = annotated['Assigned_Topic'].value_counts()
    return annotated.sort_values(by='Assigned_Topic'), per_topic


def save_topics(lda_model, num_topics, directory, topic_counts):
    """Write each topic's top words and document count to a CSV file.

    The file is ``<directory>/<num_topics>_topics.csv`` with the columns
    ``Topic ID``, ``Topics`` and ``Post Count``. ``directory`` is created if
    it does not exist.

    Args:
        lda_model: Model exposing ``print_topics(num_topics=..., num_words=...)``
            that returns ``(topic_id, topic_string)`` pairs.
        num_topics: Number of topics to request; also used in the file name.
        directory: Destination directory.
        topic_counts: Mapping-like object with ``.get(topic_id, default)``
            (for example a dict or a pandas Series) giving documents per topic.
    """
    os.makedirs(directory, exist_ok=True)
    output_path = os.path.join(directory, f'{num_topics}_topics.csv')
    topics = lda_model.print_topics(num_topics=num_topics, num_words=15)
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Topic ID', 'Topics', 'Post Count'])
        # Each entry is already an (id, words) pair. Unpack it instead of
        # enumerating, so the CSV holds the word string rather than the tuple
        # repr and the count lookup uses the model's real topic id.
        for topic_id, topic_terms in topics:
            writer.writerow([topic_id, topic_terms, topic_counts.get(topic_id, 0)])


def topic_cosine_similarity(lda_model):
    """Return the average cosine similarity between distinct topics.

    Rows of ``lda_model.get_topics()`` are compared pairwise. The diagonal
    (each topic against itself) is zeroed so the sum covers only the
    ``k * (k - 1)`` off-diagonal entries, where ``k`` is
    ``lda_model.num_topics``.
    """
    k = lda_model.num_topics
    pairwise = cosine_similarity(lda_model.get_topics())
    np.fill_diagonal(pairwise, 0)  # drop self-similarity
    off_diagonal_total = np.sum(pairwise)
    return float(off_diagonal_total / (k * (k - 1)))


def find_elbow_point(x_values, y_values):
    """Locate the elbow of a curve as the point farthest from its chord.

    The chord is the straight line from the first to the last point. It is
    evaluated at the actual x positions, so unevenly spaced ``x_values`` are
    handled correctly. For evenly spaced x this matches a plain ``linspace``
    between the end y values.

    Args:
        x_values: Sequence of x coordinates (e.g. topic counts).
        y_values: Sequence of y coordinates, same length as ``x_values``.

    Returns:
        The x value (truncated to ``int``) with the largest vertical distance
        from the chord, or ``None`` if fewer than three points are given.

    Raises:
        ValueError: If three or more x values are given and the two sequences
            differ in length.
    """
    if len(x_values) < 3:
        return None
    if len(x_values) != len(y_values):
        raise ValueError(
            f"x_values and y_values must have the same length "
            f"({len(x_values)} != {len(y_values)})"
        )
    x = np.asarray(x_values, dtype=float)
    y = np.asarray(y_values, dtype=float)

    x_span = x[-1] - x[0]
    if x_span == 0:
        # Degenerate x range: fall back to treating the points as evenly spaced.
        chord = np.linspace(y[0], y[-1], num=len(y))
    else:
        chord = y[0] + (y[-1] - y[0]) * (x - x[0]) / x_span

    distances = np.abs(y - chord)
    return int(x[int(np.argmax(distances))])


In [None]:
# Load the preprocessed documents. Rows must not be dropped or reordered from
# here on: topic labels are later attached to `data` by position in `corpus`.
data = pd.read_csv(data_path)

# Empty text cells are read back as NaN; without fillna, astype(str) would turn
# them into the literal token "nan" and add it to the vocabulary. Filling with
# "" yields an empty token list for those rows instead.
texts = data[text_column].fillna("").astype(str).apply(str.split)

# Learn frequent bigrams and merge them into single tokens.
bigram = Phrases(texts, min_count=5, threshold=100)
texts_bigram = [bigram[doc] for doc in texts]

# Vocabulary and bag-of-words corpus, one entry per row of `data`.
dictionary = corpora.Dictionary(texts_bigram)
corpus = [dictionary.doc2bow(text) for text in texts_bigram]


In [None]:
# Sweep over candidate topic counts, recording c_v coherence and the average
# inter-topic cosine similarity for each model.
coherence_scores = []
cosine_scores = []

# The previous `min(8, 9)` always evaluated to 8, whatever the machine. Use at
# most 8 workers, but no more than the available cores minus one.
num_workers = max(1, min(8, (os.cpu_count() or 2) - 1))

for num_topics in num_topics_range:
    print(f"Training {num_topics} topics...")
    lda_model = LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        alpha=alpha,
        eta=beta,
        passes=passes,
        workers=num_workers,
        random_state=random_state,
    )
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=texts_bigram,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    cosine_sim = topic_cosine_similarity(lda_model)

    coherence_scores.append(coherence)
    cosine_scores.append(cosine_sim)
    print(f"  coherence={coherence:.4f}, cosine_similarity={cosine_sim:.4f}")

# The elbow is taken on the cosine-similarity curve; None if it cannot be found.
elbow_topics = find_elbow_point(num_topics_range, cosine_scores)
print(f"Elbow topic count (cosine similarity): {elbow_topics}")

fig, (ax_coherence, ax_cosine) = plt.subplots(1, 2, figsize=(12, 5))

ax_coherence.plot(num_topics_range, coherence_scores, marker='o')
ax_coherence.set_title('Coherence vs Number of Topics')
ax_coherence.set_xlabel('Number of Topics')
ax_coherence.set_ylabel('Coherence (c_v)')
ax_coherence.grid(True)

ax_cosine.plot(num_topics_range, cosine_scores, marker='o', color='r')
if elbow_topics is not None:
    ax_cosine.axvline(elbow_topics, color='gray', linestyle='--', label=f'Elbow ≈ {elbow_topics}')
    ax_cosine.legend()
ax_cosine.set_title('Average Cosine Similarity vs Number of Topics')
ax_cosine.set_xlabel('Number of Topics')
ax_cosine.set_ylabel('Average Cosine Similarity')
ax_cosine.grid(True)

fig.tight_layout()
plt.show()


In [None]:
# Train and export the final models: every manually selected topic count plus
# the elbow from the sweep (when one was found and is not already listed).
os.makedirs(output_root, exist_ok=True)
final_topic_counts = list(selected_topic_counts)
if elbow_topics and elbow_topics not in final_topic_counts:
    final_topic_counts.append(elbow_topics)

# The previous `min(8, 9)` always evaluated to 8, whatever the machine. Use at
# most 8 workers, but no more than the available cores minus one.
num_workers = max(1, min(8, (os.cpu_count() or 2) - 1))

for k in final_topic_counts:
    print(f"Training final model with {k} topics...")
    lda_model = LdaMulticore(
        corpus,
        num_topics=k,
        id2word=dictionary,
        alpha=alpha,
        eta=beta,
        passes=passes,
        workers=num_workers,
        random_state=random_state,
    )
    directory = os.path.join(output_root, f"{k}")
    # Create the per-model directory here rather than relying on save_topics
    # having run before the annotated data is written.
    os.makedirs(directory, exist_ok=True)
    updated_data, topic_counts = assign_topics_to_documents(lda_model, corpus, data)
    save_topics(lda_model, k, directory, topic_counts)
    updated_data.to_csv(os.path.join(directory, f'{k}_annotated_data.csv'), index=False)
