In [1]:
# Block 1: Import Libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
import pickle
import time

# Add gensim imports for coherence calculation
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel


In [2]:
# Block 2: Define Paths for Data and Output
# Input location. All paths are relative to the current working directory.
project_root = '../../'
processed_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'LDA_NMF_Data')

# Every LDA artifact is written beneath one output tree.
base_output_dir = os.path.join(project_root, 'outputs', 'topic_modeling', 'lda')
model_output_dir = os.path.join(base_output_dir, 'models')
evaluation_output_dir = os.path.join(base_output_dir, 'evaluations')
viz_output_dir = os.path.join(base_output_dir, 'visualizations', 'wordclouds')
# doc_topics_output_dir = os.path.join(base_output_dir, 'document_topics')

# Create the output folders up front; exist_ok=True keeps this cell safe to re-run.
for output_dir in (model_output_dir, evaluation_output_dir, viz_output_dir):
    os.makedirs(output_dir, exist_ok=True)
# os.makedirs(doc_topics_output_dir, exist_ok=True)

# Seed NumPy's global random state for reproducibility.
np.random.seed(42)


In [None]:
# Block 3: Load and Merge Data from JSON Files
# Build one DataFrame per matching subreddit file, then stack them into `all_posts`.
# The listing is sorted because os.listdir() returns entries in arbitrary order; the
# resulting row order feeds the document-term matrix and the online LDA fit, so it
# must be stable for runs to be reproducible.
dfs = []
for fname in sorted(os.listdir(processed_dir)):
    if fname.startswith('lda_nmf_r_') and fname.endswith('.json'):
        file_path = os.path.join(processed_dir, fname)
        # The subreddit name is whatever remains once the fixed prefix and extension are removed.
        subreddit_name = fname.replace('lda_nmf_r_', '').replace('.json', '')
        with open(file_path, 'r', encoding='utf-8') as f:
            data_json = json.load(f)
        df = pd.DataFrame(data_json)
        df['subreddit'] = subreddit_name
        dfs.append(df)

# Fail with an actionable message instead of pandas' generic "No objects to concatenate".
if not dfs:
    raise FileNotFoundError(
        f"No 'lda_nmf_r_*.json' files found in {processed_dir!r}; check the data path."
    )
all_posts = pd.concat(dfs, ignore_index=True)

print(f"Total number of posts: {len(all_posts)}")


In [None]:
# Block 4: Data Preparation for Topic Modeling
# Join each post's token list into a single whitespace-separated string for CountVectorizer.
all_posts['text_for_topic'] = all_posts['processed_tokens_lda_nmf'].apply(lambda tokens: ' '.join(tokens))

# Create document-term matrix using Count Vectorizer for scikit-learn LDA
print("Creating document-term matrix for scikit-learn LDA...")
vectorizer = CountVectorizer(
    max_df=0.95,        # drop terms present in more than 95% of documents
    min_df=5,           # drop terms present in fewer than 5 documents
    max_features=5000,  # cap the vocabulary size
    ngram_range=(1, 2)  # unigrams and bigrams
)
dtm = vectorizer.fit_transform(all_posts['text_for_topic'])
feature_names = vectorizer.get_feature_names_out()
print(f"Document-term matrix shape: {dtm.shape}")

# Tokenize the Gensim texts with the vectorizer's own analyzer instead of reusing the raw
# token lists. The vectorizer's vocabulary contains bigrams ("word1 word2") and applies its
# own lowercasing and token pattern, so topic terms taken from `feature_names` are only
# guaranteed to exist in the Gensim dictionary when it is built from the same tokenization.
# Terms missing from the dictionary cannot be scored by CoherenceModel (depending on the
# Gensim version they are dropped or raise an error).
analyzer = vectorizer.build_analyzer()
texts = [analyzer(doc) for doc in all_posts['text_for_topic']]  # Needed for Gensim coherence

# Create Gensim dictionary and corpus (required for coherence calculation later)
print("Creating Gensim dictionary and corpus for coherence evaluation...")
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(f"Gensim dictionary size: {len(dictionary)}")
print(f"Gensim corpus size: {len(corpus)}")


In [None]:
# Block 5: Hyperparameter Tuning for LDA Model using Grid Search and NPMI Coherence

def get_topics_list(model, feature_names, n_top_words=15):
    """Return the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated one topic-weight row at a time.
        feature_names: Sequence mapping a column index to its vocabulary term.
        n_top_words: Number of terms to keep per topic (default 15).

    Returns:
        A list with one entry per topic; each entry lists that topic's terms
        ordered from highest to lowest weight.
    """
    return [
        # argsort is ascending, so reverse it before taking the leading n_top_words columns.
        [feature_names[col] for col in weights.argsort()[::-1][:n_top_words]]
        for weights in model.components_
    ]

# Hyperparameter grid: number of topics x learning-rate decay of the online LDA updates.
search_params = {
    'n_components': [10, 15, 20],
    'learning_decay': [0.7, 0.9]
}
n_components_list = search_params['n_components']
learning_decay_list = search_params['learning_decay']

# Flatten the grid once; the order matches nested loops with n_components outermost.
param_grid = [
    (k, decay)
    for k in n_components_list
    for decay in learning_decay_list
]

best_score, best_params = -np.inf, {}
coherence_scores = {}

start_time = time.time()

for n_components, learning_decay in param_grid:
    param_key = f"k={n_components}, decay={learning_decay}"
    print(f"  Testing {param_key}...")
    try:
        lda_temp = LatentDirichletAllocation(
            n_components=n_components,
            learning_decay=learning_decay,
            max_iter=20,
            learning_method='online',
            random_state=42,
            n_jobs=-1
        )
        lda_temp.fit(dtm)

        # Topic terms come from the CountVectorizer vocabulary (Block 4).
        topics = get_topics_list(lda_temp, feature_names)

        # Coherence is scored against the Gensim dictionary and texts (Block 4).
        # `corpus` is not passed here; it is optional for c_npmi.
        coherence_model = CoherenceModel(
            topics=topics,
            texts=texts,
            dictionary=dictionary,
            coherence='c_npmi'
        )
        coherence_score = coherence_model.get_coherence()
    except Exception as e:
        # Best-effort search: record the failure and move on to the next combination.
        print(f"    Failed for {param_key}: {e}")
        coherence_scores[param_key] = np.nan
        continue

    coherence_scores[param_key] = coherence_score
    print(f"    NPMI Coherence: {coherence_score:.4f}")

    # Strict '>' keeps the first combination on ties and never selects a NaN score.
    if coherence_score > best_score:
        best_score = coherence_score
        best_params = {'n_components': n_components, 'learning_decay': learning_decay}

end_time = time.time()
print(f"\nTuning finished in {end_time - start_time:.2f} seconds.")
print(f"All Coherence Scores: {coherence_scores}")

if best_params:
    print(f"Best NPMI Score: {best_score:.4f}")
    print(f"Best LDA Parameters found: {best_params}")
    best_lda_params = best_params
else:
    # No combination produced a usable score; continue with fixed defaults.
    print("Error: No best parameters found. LDA tuning failed.")
    best_lda_params = {'n_components': 10, 'learning_decay': 0.7} # Fallback parameters


In [None]:
# Block 6: Train Final LDA Model with Best Parameters

# Fit the final model on the full document-term matrix, using the tuned parameters
# from the grid search or the fallback defaults when tuning failed.
print("Training final LDA model...")
final_lda_kwargs = dict(
    n_components=best_lda_params['n_components'],
    learning_decay=best_lda_params['learning_decay'],
    max_iter=20,
    learning_method='online',
    random_state=42,
)
final_lda = LatentDirichletAllocation(**final_lda_kwargs)
final_lda.fit(dtm)


In [None]:
# Block 7: Save Trained LDA Model and Vectorizer
print("Saving LDA model and vectorizer...")
# Persist the fitted model together with the vectorizer that produced its input matrix.
artifacts = (
    ('lda_model.pkl', final_lda),
    ('count_vectorizer.pkl', vectorizer),
)
for artifact_name, artifact in artifacts:
    with open(os.path.join(model_output_dir, artifact_name), 'wb') as f:
        pickle.dump(artifact, f)
print(f"LDA model and CountVectorizer saved to {model_output_dir}")


In [None]:
# Block 8: Model Evaluation and Coherence Analysis

def get_lda_topics(model, feature_names, n_top_words=15):
    """Collect the top-weighted vocabulary terms for each topic of an LDA model.

    Args:
        model: Object whose ``components_`` yields one weight vector per topic.
        feature_names: Index-to-term lookup aligned with the weight vector columns.
        n_top_words: How many terms to return per topic (default 15).

    Returns:
        A list of term lists, one per topic, each sorted by descending weight.
    """
    def _top_terms(weights):
        # Column indices ordered from largest to smallest weight.
        ranked_cols = weights.argsort()[::-1]
        return [feature_names[col] for col in ranked_cols[:n_top_words]]

    return [_top_terms(weights) for weights in model.components_]

# Top terms per topic of the final model (a list of word lists).
lda_topics = get_lda_topics(final_lda, feature_names)

# NPMI coherence of the final model, scored against the Gensim texts/dictionary/corpus.
coherence_model = CoherenceModel(
    topics=lda_topics,
    texts=texts,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_npmi'
)
lda_coherence = coherence_model.get_coherence()

# Calculate topic diversity (proportion of unique words across all topics).
# The denominator is the number of top words actually extracted rather than a
# hard-coded 15, so the value stays correct if n_top_words changes or a topic
# yields fewer terms than requested.
unique_top_words = {word for topic in lda_topics for word in topic}
total_top_words = sum(len(topic) for topic in lda_topics)
topic_diversity = len(unique_top_words) / total_top_words if total_top_words else 0.0

evaluation_results = {
    'model_type': 'LDA',
    'coherence_score': lda_coherence,
    'n_topics': final_lda.n_components,
    'topic_diversity': topic_diversity,
    'parameters': {
        'n_components': final_lda.n_components,
        'learning_decay': final_lda.learning_decay,
        'max_iter': final_lda.max_iter
    }
}

# Persist the metrics next to the other evaluation outputs.
with open(os.path.join(evaluation_output_dir, 'lda_evaluation.json'), 'w') as f:
    json.dump(evaluation_results, f, indent=4)

print("\nModel Evaluation Results:")
print(f"NPMI Coherence Score: {lda_coherence:.4f}")
print(f"Topic Diversity: {topic_diversity:.4f}")


In [None]:
# Block 9: Function to Display Top Words for each Topic
def display_topics(model, feature_names, n_top_words=15):
    """Print the top terms of each topic and return them keyed by topic index.

    Args:
        model: Object whose ``components_`` yields one weight vector per topic.
        feature_names: Index-to-term lookup aligned with the weight vector columns.
        n_top_words: Number of terms to show per topic (default 15).

    Returns:
        A dict mapping the 0-based topic index to its list of top terms,
        ordered by descending weight.
    """
    topics = {}
    for topic_number, weights in enumerate(model.components_, start=1):
        ranked_cols = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[col] for col in ranked_cols]
        # Printed labels are 1-based; the returned dict keys stay 0-based.
        print(f"Topic {topic_number}: {' '.join(top_words)}")
        topics[topic_number - 1] = top_words
    return topics

print("\nLDA Topics:")
# NOTE: this rebinds `lda_topics` from the list of word lists built in Block 8 to a dict keyed
# by 0-based topic index. Block 10 iterates it with .items(), so this cell must run before Block 10.
lda_topics = display_topics(final_lda, feature_names)


In [None]:
# Block 10: Create Word Clouds for Visualization
def create_wordcloud(topics_dict, model_name):
    """Render one word-cloud PNG per topic into ``viz_output_dir``.

    Args:
        topics_dict: Mapping of 0-based topic index to that topic's list of words.
        model_name: Label used in the figure title and the output file name.

    Every word is given the same frequency (1), so the cloud shows which words
    belong to the topic rather than their relative weights.
    """
    for topic_idx, words in topics_dict.items():
        topic_number = topic_idx + 1  # 1-based label for titles and file names
        uniform_freq = dict.fromkeys(words, 1)

        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=100
        )
        cloud.generate_from_frequencies(uniform_freq)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(cloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f'{model_name} Topic {topic_number}')

        out_path = os.path.join(viz_output_dir, f'{model_name}_topic_{topic_number}_wordcloud.png')
        fig.savefig(out_path, bbox_inches='tight')
        # Close each figure so a long topic loop does not accumulate open figures.
        plt.close(fig)
        print(f"Saved word cloud for {model_name} Topic {topic_number}")

# Write one PNG per LDA topic; `lda_topics` must be the dict returned by display_topics() in Block 9.
create_wordcloud(lda_topics, 'LDA')


In [None]:
# Block 11: Assign Topics to Documents
# Per-document topic weights from the final model (rows are documents; axis 1 runs over topics).
lda_doc_topic_dist = final_lda.transform(dtm)

# The dominant topic is the column with the largest weight, shifted by one so topics are
# 1-indexed for readability; the weight itself is kept as the assignment confidence.
all_posts['lda_dominant_topic'] = lda_doc_topic_dist.argmax(axis=1) + 1
all_posts['lda_topic_confidence'] = lda_doc_topic_dist.max(axis=1)

print(all_posts[['lda_dominant_topic', 'lda_topic_confidence']].head(10))


In [None]:
# Block 12: Analyze Topic Distribution by Subreddit
# Row-normalized cross-tabulation: for each subreddit, the share of its posts per dominant topic.
subreddit_topic_share = pd.crosstab(
    index=all_posts['subreddit'],
    columns=all_posts['lda_dominant_topic'],
    normalize='index',
)
lda_topic_by_subreddit = subreddit_topic_share.mul(100)  # fractions -> percentages

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(lda_topic_by_subreddit, annot=True, cmap='YlGnBu', fmt='.1f')
heatmap_ax.set_title('LDA Topic Distribution by Subreddit (%)')
heatmap_ax.set_xlabel('Topic')
heatmap_ax.set_ylabel('Subreddit')

heatmap_path = os.path.join(evaluation_output_dir, 'lda_topic_by_subreddit.png')
plt.savefig(heatmap_path)
plt.close()

print("Topic distribution analysis by subreddit completed and saved")


In [None]:
# Block 13: Analyze Overall Topic Distribution
# Note: Confidence values are model-specific and should NOT be used to compare between different models (e.g., LDA vs NMF).
# These values are useful for: 1) filtering unreliable topic assignments, 2) identifying which topics have more
# consistent assignments within this specific model, and 3) potentially flagging uncertain classifications in applications.

# Percentage of posts whose dominant topic is each topic.
topic_distribution = all_posts['lda_dominant_topic'].value_counts(normalize=True).mul(100)
topic_distribution_by_id = topic_distribution.sort_index()  # order bars and printout by topic number

plt.figure(figsize=(15, 6))
topic_distribution_by_id.plot.bar()
plt.title('Distribution of Topics Across All Posts')
plt.xlabel('Topic Number')
plt.ylabel('Percentage of Posts')
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
distribution_path = os.path.join(evaluation_output_dir, 'lda_topic_distribution.png')
plt.savefig(distribution_path)
plt.close()

print("\nTopic Distribution Statistics:")
print(topic_distribution_by_id)

# Assignment-strength summaries below are only meaningful WITHIN this model.
avg_confidence = all_posts['lda_topic_confidence'].mean()
print(f"\nAverage LDA Topic Assignment Strength: {avg_confidence:.3f}")

# Mean confidence of the posts assigned to each dominant topic.
confidence_by_topic = all_posts.groupby('lda_dominant_topic')['lda_topic_confidence']
topic_confidence = confidence_by_topic.mean()
print("\nTop 5 Topics with Highest Assignment Strength:")
print(topic_confidence.nlargest(5))
print("\nTop 5 Topics with Lowest Assignment Strength:")
print(topic_confidence.nsmallest(5))

# When selecting between models (LDA, NMF, BERTopic, etc.), prioritize coherence scores, topic diversity,
# and qualitative evaluation of topic interpretability rather than these confidence values.


In [14]:
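# Block 14: Process Each Subreddit Separately and Save Results (Commented Out)
# NOTE: this disabled block is out of sync with the active cells: it filters on 'processed_r_*.json' and
# reads 'processed_tokens_ml', whereas Blocks 3-4 load 'lda_nmf_r_*.json' and 'processed_tokens_lda_nmf',
# and it writes to doc_topics_output_dir, which is commented out in Block 2. Update these before re-enabling.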
# for fname in os.listdir(processed_dir):
#     if fname.startswith('processed_r_') and fname.endswith('.json'):
#         file_path = os.path.join(processed_dir, fname)
#         with open(file_path, 'r', encoding='utf-8') as f:
#             df = pd.DataFrame(json.load(f))
#
#         if 'processed_tokens_ml' not in df.columns:
#             continue
#
#         df['text_for_topic'] = df['processed_tokens_ml'].apply(lambda tokens: ' '.join(tokens))
#         dtm_subreddit = vectorizer.transform(df['text_for_topic'])
#         doc_topic_dist = final_lda.transform(dtm_subreddit)
#
#         df['lda_dominant_topic'] = np.argmax(doc_topic_dist, axis=1) + 1
#         df['lda_topic_confidence'] = np.max(doc_topic_dist, axis=1)
#
#         out_file = fname.replace('.json', '_lda_topics.csv')
#         out_path = os.path.join(doc_topics_output_dir, out_file)
#         df[['id', 'title', 'lda_dominant_topic', 'lda_topic_confidence']].to_csv(out_path, index=False)
#         print("Saved:", out_file)
