In [None]:
# Block 1: Import Libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import time # Optional: To time the tuning process

# Add gensim imports
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import Nmf
from gensim.models import TfidfModel


In [None]:
# Block 2: Define Paths for Data and Output
# All locations are resolved relative to the notebook's working directory.
project_root = '../../'

# Input: per-subreddit JSON files read in Block 3.
processed_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'LDA_NMF_Data')

# Outputs: everything produced by this notebook lives under base_output_dir.
base_output_dir = os.path.join(project_root, 'outputs', 'topic_modeling', 'nmf')
model_output_dir = os.path.join(base_output_dir, 'models')
evaluation_output_dir = os.path.join(base_output_dir, 'evaluations')
viz_output_dir = os.path.join(base_output_dir, 'visualizations', 'wordclouds')
# doc_topics_output_dir = os.path.join(base_output_dir, 'document_topics')

# Create the output folders up front; existing folders are left as they are.
for output_dir in (model_output_dir, evaluation_output_dir, viz_output_dir):
    os.makedirs(output_dir, exist_ok=True)
# os.makedirs(doc_topics_output_dir, exist_ok=True)

# Set random seed for reproducibility (seeds NumPy's global generator)
np.random.seed(42)


In [None]:
# Block 3: Load and Merge Data from JSON Files
# Every file named 'lda_nmf_r_<subreddit>.json' in processed_dir is loaded into
# a DataFrame, tagged with its subreddit name, and concatenated into all_posts.
file_prefix = 'lda_nmf_r_'
file_suffix = '.json'

dfs = []
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the row order of all_posts (and everything derived from it) reproducible.
for fname in sorted(os.listdir(processed_dir)):
    if not (fname.startswith(file_prefix) and fname.endswith(file_suffix)):
        continue
    file_path = os.path.join(processed_dir, fname)
    # Slice off the prefix/suffix instead of str.replace so that a subreddit
    # name that itself contains 'lda_nmf_r_' or '.json' is left intact.
    subreddit_name = fname[len(file_prefix):-len(file_suffix)]
    with open(file_path, 'r', encoding='utf-8') as f:
        data_json = json.load(f)
    df = pd.DataFrame(data_json)
    df['subreddit'] = subreddit_name
    dfs.append(df)

# Fail with an actionable message rather than pandas' "No objects to concatenate".
if not dfs:
    raise FileNotFoundError(
        f"No '{file_prefix}*{file_suffix}' files found in {processed_dir!r}"
    )
all_posts = pd.concat(dfs, ignore_index=True)

print(f"Total number of posts: {len(all_posts)}")


In [None]:
# Block 4: Data Preparation for Topic Modeling (Gensim NMF with TF-IDF)
print("Preparing data for Gensim NMF...")
# One entry per post, taken from the preprocessed token column.
texts = all_posts['processed_tokens_lda_nmf'].tolist()

print("Creating Gensim dictionary and corpus (BoW)...")
dictionary = Dictionary(documents=texts)
vocab_size_before = len(dictionary)
print(f"Original dictionary size: {vocab_size_before}")

# Prune the vocabulary in place using document-frequency thresholds;
# keep_n=None places no cap on the number of surviving tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=None)
vocab_size_after = len(dictionary)
print(f"Filtered dictionary size: {vocab_size_after}")

# Bag-of-Words corpus built against the filtered dictionary; it feeds the
# TF-IDF model here and is passed to the coherence models in later cells.
corpus = list(map(dictionary.doc2bow, texts))
print(f"Number of documents in BoW corpus: {len(corpus)}")
print(f"Number of unique tokens in filtered dictionary: {vocab_size_after}")

# Fit TF-IDF on the BoW corpus, then wrap the corpus with the transformation.
print("Creating TF-IDF representation using Gensim...")
tfidf_model = TfidfModel(corpus=corpus, id2word=dictionary)
corpus_tfidf = tfidf_model[corpus]
print("TF-IDF corpus created.")
# Note: corpus_tfidf will be used as input for NMF model


In [None]:
# Block 5: Hyperparameter Tuning for NMF Model using NPMI Coherence

# --- Helper Function ---
def get_gensim_nmf_topics(model, topn=15):
    """Return the top-weighted words of every topic in a trained model.

    Args:
        model: Object exposing ``get_topics()`` (a topics x terms weight
            matrix), ``num_topics`` and an ``id2word`` mapping from term id
            to token.
        topn: Number of highest-weighted words to keep per topic.

    Returns:
        A list with one entry per topic (in topic-index order); each entry is
        a list of up to ``topn`` tokens ordered from highest to lowest weight.
    """
    term_weights = model.get_topics()
    return [
        [
            model.id2word[term_id]
            # argsort is ascending, so reverse it before taking the head.
            for term_id in np.argsort(term_weights[row, :])[::-1][:topn]
        ]
        for row in range(model.num_topics)
    ]

# --- Define Hyperparameter Grid ---
search_params_nmf = {
    # 'num_topics': [10, 15, 20, 25, 30]
    'num_topics': [18, 19, 20, 21, 22]  # Include the current best (20) and neighbors
}
num_topics_list = search_params_nmf['num_topics']

# Tuning state: best score so far, the parameters that produced it, and the
# score of every candidate (NaN marks a failed run).
best_score, best_params, coherence_scores = -np.inf, {}, {}
start_time = time.time()
# Leave one core free when more than one is available; otherwise use one worker.
num_workers = max(1, (os.cpu_count() or 1) - 1)

# --- Grid Search ---
print(f"Starting NMF hyperparameter tuning with k = {num_topics_list} (using TF-IDF input)...")
for k in num_topics_list:
    param_key = f"k={k}"
    print(f"  Testing {param_key}...")

    try:
        # Temporary model for this candidate k, trained on the TF-IDF corpus.
        nmf_temp = Nmf(
            corpus=corpus_tfidf,
            id2word=dictionary,
            num_topics=k,
            random_state=42,
            chunksize=100,
            passes=10,
            eval_every=None,
            normalize=True,
        )

        # Top 15 words per topic are what the coherence model scores.
        topics = get_gensim_nmf_topics(nmf_temp, 15)

        coherence_model = CoherenceModel(
            topics=topics,
            texts=texts,
            dictionary=dictionary,
            corpus=corpus,
            coherence='c_npmi',
            processes=num_workers,
        )
        coherence_score = coherence_model.get_coherence()
        coherence_scores[param_key] = coherence_score
        print(f"    NPMI Coherence: {coherence_score:.4f}")

        # Keep the candidate only if it strictly improves on the best so far.
        if coherence_score > best_score:
            best_score = coherence_score
            best_params = dict(num_topics=k)

    except Exception as e:
        # Best-effort tuning: record the failure and move on to the next k.
        print(f"    Failed for {param_key}: {e}")
        coherence_scores[param_key] = np.nan

# --- Post-tuning steps ---
end_time = time.time()
print(f"\nTuning finished in {end_time - start_time:.2f} seconds.")


def _rank_value(entry):
    """Sort key for (param_key, score) pairs that pushes failed (NaN) runs last."""
    score = entry[1]
    return -np.inf if np.isnan(score) else score


sorted_scores = sorted(coherence_scores.items(), key=_rank_value, reverse=True)
print(f"\nAll Coherence Scores:")
for params, score in sorted_scores:
    score_str = "Failed" if np.isnan(score) else f"{score:.4f}"
    print(f"  {params}: {score_str}")

if best_params:
    print(f"\nBest NPMI Score achieved during NMF tuning: {best_score:.4f}")
    print(f"Best NMF Parameters found: {best_params}")
    best_nmf_params = best_params
else:
    # No candidate produced a usable score; fall back to a fixed topic count.
    print("\nError: All NMF tuning runs failed. Using fallback.")
    best_nmf_params = {'num_topics': 15}

print(f"\nParameters selected for final NMF model training in Block 6: {best_nmf_params}")


In [None]:
# Block 6: Train Final NMF Model with Best Parameters

print("\nTraining final Gensim NMF model using best parameters (with TF-IDF input)...")

# Same settings as the tuning runs in Block 5, except for a larger number of
# passes; the topic count comes from the tuning result.
final_nmf_settings = {
    'corpus': corpus_tfidf,
    'id2word': dictionary,
    'num_topics': best_nmf_params['num_topics'],
    'random_state': 42,
    'chunksize': 100,
    'passes': 20,
    'eval_every': None,
    'normalize': True,
}
final_nmf = Nmf(**final_nmf_settings)

print("\nFinal NMF Model Configuration:")
print(final_nmf)


In [None]:
# Block 7: Save Trained Gensim NMF Model, Dictionary, and TF-IDF Model

print("\nSaving Gensim NMF model, dictionary, and TF-IDF model...")

# Destination files, all inside the model output folder. The same file names
# are used by the (commented-out) loading code in Block 14.
model_path = os.path.join(model_output_dir, 'gensim_nmf_tfidf.model')
dict_path = os.path.join(model_output_dir, 'gensim_dictionary.dict')
tfidf_model_path = os.path.join(model_output_dir, 'gensim_tfidf.model')

# NMF model (trained on TF-IDF input)
final_nmf.save(model_path)
print(f"Gensim NMF model saved to {model_path}")

# Dictionary (token <-> id mapping used to build the corpus)
dictionary.save(dict_path)
print(f"Gensim dictionary saved to {dict_path}")

# TF-IDF model (needed for applying NMF to new data later)
tfidf_model.save(tfidf_model_path)
print(f"Gensim TF-IDF model saved to {tfidf_model_path}")


In [None]:
# Block 8: Model Evaluation and Coherence Analysis
# Computes topic diversity plus C_NPMI and C_V coherence for the final model,
# then stores the results as JSON in the evaluation output folder.

print("\nEvaluating the final Gensim NMF model (trained on TF-IDF)...")

# Number of top words per topic used for every metric in this cell.
top_n_words = 15
nmf_topics = get_gensim_nmf_topics(final_nmf, top_n_words)

# Topic diversity = share of unique words among all extracted top words.
# The denominator is the number of words actually extracted, so the ratio
# stays correct even if a topic yields fewer than top_n_words words.
all_topic_words = [word for topic in nmf_topics for word in topic]
topic_diversity = len(set(all_topic_words)) / len(all_topic_words) if all_topic_words else 0

# Calculate Coherence Scores for the final model
print("Calculating coherence for the final model...")
# NOTE(review): c_npmi is a sliding-window measure in gensim and appears to be
# estimated from `texts`; the BoW `corpus` argument is kept as in the tuning
# cell but is presumably not used for this measure - confirm against the docs.
coherence_model_npmi = CoherenceModel(
    topics=nmf_topics,
    texts=texts,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_npmi',
    processes=num_workers
)
nmf_coherence_npmi = coherence_model_npmi.get_coherence()

# C_V coherence is computed from the tokenized texts.
print("Calculating C_V coherence...")
coherence_model_cv = CoherenceModel(
    topics=nmf_topics,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
    processes=num_workers
)
nmf_coherence_cv = coherence_model_cv.get_coherence()


# Store evaluation results
used_random_seed = 42
evaluation_results = {
    'model_type': 'Gensim NMF (TF-IDF Input)',
    'coherence_score_npmi': nmf_coherence_npmi,
    'coherence_score_cv': nmf_coherence_cv,
    'topic_diversity': topic_diversity,
    'n_topics': final_nmf.num_topics,
    'parameters': {
        'num_topics': final_nmf.num_topics,
        'passes': final_nmf.passes,
        'chunksize': final_nmf.chunksize,
        'normalize': final_nmf.normalize,
        'random_state_seed': used_random_seed
    }
}


def _json_default(value):
    """Convert NumPy scalars to built-in Python values for json; reject anything else."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


eval_path = os.path.join(evaluation_output_dir, 'gensim_nmf_tfidf_evaluation.json')
try:
    # Serialize fully in memory first so that a serialization error cannot
    # leave a truncated JSON file on disk.
    evaluation_json = json.dumps(evaluation_results, indent=4, default=_json_default)
except TypeError as e:
    print(f"Error saving evaluation results to JSON: {e}")
else:
    with open(eval_path, 'w', encoding='utf-8') as f:
        f.write(evaluation_json)
    print(f"Evaluation results saved to {eval_path}")

print("\nModel Evaluation Results:")
print(f"NPMI Coherence Score: {nmf_coherence_npmi:.4f}")
print(f"C_V Coherence Score: {nmf_coherence_cv:.4f}")
print(f"Topic Diversity: {topic_diversity:.4f}")


In [None]:
# Block 9: Function to Display Top Words for each Topic
def display_topics(model, n_top_words=15):
    """Print the top words of each topic and return them keyed by topic index.

    Args:
        model: Trained model accepted by ``get_gensim_nmf_topics``.
        n_top_words: Number of top words to show per topic.

    Returns:
        dict mapping the 0-based topic index to that topic's list of words.
        The printed labels are 1-based.
    """
    ranked_topics = get_gensim_nmf_topics(model, n_top_words)
    print("\nGensim NMF Topics:")
    # Keys stay 0-based for internal use; only the printed label is shifted.
    topics_dict = dict(enumerate(ranked_topics))
    for topic_id, topic_words in topics_dict.items():
        print(f"Topic {topic_id + 1}: {' '.join(topic_words)}")
    return topics_dict

# Display NMF topics from the final model and keep the returned mapping
# (0-based topic index -> list of top words) for the word-cloud cell.
nmf_topics_dict = display_topics(final_nmf)


In [None]:
# Block 10: Create Word Clouds for Visualization (Combined Plot)
def create_combined_wordcloud(topics_dict, model_name):
    """Draw one word cloud per topic on a shared figure and save it as a PNG.

    Args:
        topics_dict: Mapping of 0-based topic index to that topic's words.
        model_name: Label used in each subplot title and in the output
            file name (``<model_name>_all_topics_wordclouds.png``).

    Returns:
        None. The figure is written into ``viz_output_dir`` and then closed.
        Nothing is drawn or saved when ``topics_dict`` is empty.
    """
    num_topics = len(topics_dict)
    if not num_topics:
        print("No topics to generate word clouds for.")
        return

    # Grid with at most 5 columns; the row count is the ceiling of topics / columns.
    n_cols = min(5, num_topics)
    n_rows = -(-num_topics // n_cols)
    fig = plt.figure(figsize=(n_cols * 5, n_rows * 3))

    print(f"\nGenerating combined word cloud for {num_topics} topics...")
    for topic_idx, words in topics_dict.items():
        if not words:
            print(f"Skipping word cloud for empty Topic {topic_idx + 1}")
            continue
        # Every word gets the same weight, so all words share one frequency.
        word_freq = dict.fromkeys(words, 1)
        try:
            cloud_builder = WordCloud(
                width=400, height=200, background_color='white', max_words=100
            )
            wordcloud = cloud_builder.generate_from_frequencies(word_freq)

            # Subplot positions are 1-based, hence the + 1.
            ax = fig.add_subplot(n_rows, n_cols, topic_idx + 1)
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            ax.set_title(f'{model_name} Topic {topic_idx + 1}')
        except Exception as e:
            # A failing topic is reported and skipped; the rest are still drawn.
            print(f"Error generating word cloud for Topic {topic_idx + 1}: {e}")

    fig.tight_layout(pad=2.0)
    out_path_all = os.path.join(viz_output_dir, f'{model_name}_all_topics_wordclouds.png')
    fig.savefig(out_path_all, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved combined word cloud figure to {out_path_all}")

# Use the 0-indexed dictionary from display_topics; 'Gensim_NMF' is the label
# that appears in the subplot titles and in the saved PNG's file name.
create_combined_wordcloud(nmf_topics_dict, 'Gensim_NMF')


In [None]:
# Block 11: Assign Topics to Documents
# Adds two columns to all_posts:
#   nmf_dominant_topic   - 1-based index of the highest-weighted topic (Int64, <NA> if none)
#   nmf_topic_confidence - weight of that topic (float32, NaN if none)


def _dominant_topic(doc_vector):
    """Return ``(topic_index, weight)`` of the strongest topic in one document vector.

    Supported inputs:
      * a list of ``(topic_id, weight)`` tuples - the pair with the largest weight wins;
      * a dense NumPy array or a plain list of weights - the argmax position wins.

    Empty or unrecognised inputs yield ``(-1, 0.0)``, the "no topic" sentinel.
    """
    if isinstance(doc_vector, np.ndarray):
        # Checked via .size: the truth value of a multi-element array is ambiguous.
        if doc_vector.size == 0:
            return -1, 0.0
        return int(np.argmax(doc_vector)), float(np.max(doc_vector))
    if isinstance(doc_vector, list) and doc_vector:
        if isinstance(doc_vector[0], tuple):
            topic_idx, weight = max(doc_vector, key=lambda item: item[1])
            return int(topic_idx), float(weight)
        return int(np.argmax(doc_vector)), float(np.max(doc_vector))
    return -1, 0.0


print("\nAssigning topics to documents using Gensim NMF model (TF-IDF input)...")

# Apply the trained NMF model to the TF-IDF corpus
doc_topic_dist = final_nmf[corpus_tfidf]

# 0-based dominant topic (-1 = none) and its weight, one entry per document.
dominant_topics = []
topic_confidences = []
for doc_vector in doc_topic_dist:
    dominant_topic_idx, max_prob = _dominant_topic(doc_vector)
    dominant_topics.append(dominant_topic_idx)
    topic_confidences.append(max_prob)

# Build the nullable columns directly (1-based topic ids, missing where no
# topic was assigned) instead of writing NaN into an integer column afterwards.
all_posts['nmf_dominant_topic'] = pd.array(
    [idx + 1 if idx >= 0 else pd.NA for idx in dominant_topics],
    dtype='Int64',
)
all_posts['nmf_topic_confidence'] = np.array(
    [weight if idx >= 0 else np.nan for idx, weight in zip(dominant_topics, topic_confidences)],
    dtype='float32',
)

print("\nSample of assigned topics (1-based index):")
print(all_posts[['nmf_dominant_topic', 'nmf_topic_confidence']].head(10))
print("\nData types after assignment:")
print(all_posts[['nmf_dominant_topic', 'nmf_topic_confidence']].dtypes)


In [None]:
# Block 12: Analyze Topic Distribution by Subreddit
print("\nAnalyzing NMF topic distribution by subreddit...")

# Posts without an assigned topic (NaN from Block 11) are dropped before the
# cross-tabulation; each row is normalised and scaled to percentages.
assigned_topics = all_posts['nmf_dominant_topic'].dropna()
subreddit_topic_share = pd.crosstab(
    all_posts['subreddit'],
    assigned_topics,
    normalize='index',
)
nmf_topic_by_subreddit = subreddit_topic_share * 100

# Heatmap: one row per subreddit, one column per (1-based) topic.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(nmf_topic_by_subreddit, annot=True, cmap='YlGnBu', fmt='.1f', ax=heatmap_ax)
heatmap_ax.set_title('Gensim NMF Topic Distribution by Subreddit (%)')
heatmap_ax.set_xlabel('Topic (1-based)')
heatmap_ax.set_ylabel('Subreddit')
heatmap_fig.savefig(os.path.join(evaluation_output_dir, 'gensim_nmf_topic_by_subreddit.png'))
plt.close(heatmap_fig)

print("Topic distribution analysis by subreddit completed and saved.")


In [None]:
# Block 13: Analyze Overall Topic Distribution
print("\nAnalyzing overall Gensim NMF topic distribution...")
# Note: Assignment "confidence" (weight) values are model-specific.

# Share of posts per dominant topic, in percent.
topic_distribution = all_posts['nmf_dominant_topic'].value_counts(normalize=True) * 100
distribution_by_topic = topic_distribution.sort_index()

# Bar chart ordered by topic number.
dist_fig, dist_ax = plt.subplots(figsize=(15, 6))
distribution_by_topic.plot(kind='bar', ax=dist_ax)
dist_ax.set_title('Gensim NMF: Distribution of Topics Across All Posts')
dist_ax.set_xlabel('Topic Number (1-based)')
dist_ax.set_ylabel('Percentage of Posts (%)')
plt.setp(dist_ax.get_xticklabels(), rotation=0)
dist_ax.grid(True, axis='y', alpha=0.3)
dist_fig.tight_layout()
dist_fig.savefig(os.path.join(evaluation_output_dir, 'gensim_nmf_topic_distribution.png'))
plt.close(dist_fig)

print("\nNMF Topic Distribution Statistics (%):")
# Printed in topic order rather than by frequency.
print(distribution_by_topic)

# Assignment strength (confidence/weight) WITHIN this Gensim NMF model
avg_confidence = all_posts['nmf_topic_confidence'].mean()
print(f"\nAverage Gensim NMF Topic Assignment Strength: {avg_confidence:.3f}")

# Only rows with both a topic and a weight take part in the per-topic average.
assigned_posts = all_posts.dropna(subset=['nmf_dominant_topic', 'nmf_topic_confidence'])
topic_confidence = assigned_posts.groupby('nmf_dominant_topic')['nmf_topic_confidence'].mean()

print("\nTop 5 Topics with Highest Average Assignment Strength:")
print(topic_confidence.nlargest(5))
print("\nTop 5 Topics with Lowest Average Assignment Strength:")
print(topic_confidence.nsmallest(5))

print("\nRemember: Prioritize coherence, diversity, and interpretability over assignment strength for model comparison.")


In [None]:
# Block 14: Process Each Subreddit Separately (Commented Out - Adapted for TF-IDF)

# def process_subreddit_nmf_tfidf(file_path, dictionary, tfidf_model_g, nmf_model_g):
#      with open(file_path, 'r', encoding='utf-8') as f:
#          df = pd.DataFrame(json.load(f))
#      if 'processed_tokens_lda_nmf' not in df.columns:
#          return None
#
#      texts_subreddit = df['processed_tokens_lda_nmf'].tolist()
#      # Create BoW corpus first
#      corpus_subreddit_bow = [dictionary.doc2bow(text) for text in texts_subreddit]
#      if not corpus_subreddit_bow: return None
#
#      # Apply TF-IDF transformation
#      corpus_subreddit_tfidf = tfidf_model_g[corpus_subreddit_bow]
#
#      # Get NMF topic distribution using TF-IDF input
#      doc_topic_dist_sub = nmf_model_g[corpus_subreddit_tfidf] # *** Use TF-IDF ***
#
#      # --- Extract dominant topic/confidence (same logic as Block 11) ---
#      dominant_topics_sub = []
#      topic_confidences_sub = []
#      for doc_vector_sub in doc_topic_dist_sub:
#          # ... (same extraction logic) ...
#          if not doc_vector_sub:
#             dominant_topic_idx_sub = -1; max_prob_sub = 0.0
#          elif isinstance(doc_vector_sub, list) and len(doc_vector_sub) > 0 and isinstance(doc_vector_sub[0], tuple):
#              dominant_topic_tuple_sub = max(doc_vector_sub, key=lambda item: item[1])
#              dominant_topic_idx_sub = dominant_topic_tuple_sub[0]; max_prob_sub = dominant_topic_tuple_sub[1]
#          else: # Handle other formats or empty lists
#              dominant_topic_idx_sub = -1; max_prob_sub = 0.0
#          dominant_topics_sub.append(dominant_topic_idx_sub)
#          topic_confidences_sub.append(max_prob_sub)
#
#      df['nmf_dominant_topic'] = dominant_topics_sub
#      df['nmf_topic_confidence'] = topic_confidences_sub
#      df['nmf_dominant_topic'] = df['nmf_dominant_topic'] + 1
#
#      df.loc[df['nmf_dominant_topic'] == 0, ['nmf_dominant_topic', 'nmf_topic_confidence']] = [np.nan, np.nan]
#      df['nmf_dominant_topic'] = df['nmf_dominant_topic'].astype('Int64')
#      df['nmf_topic_confidence'] = df['nmf_topic_confidence'].astype('float32')
#
#      return df[['id', 'title', 'nmf_dominant_topic', 'nmf_topic_confidence']]
#
# # --- Loop (requires loading tfidf_model and final_nmf) ---
# # print("\nProcessing subreddits individually for NMF (TF-IDF input)...")
# # Load models if running separately:
# # dictionary = Dictionary.load(os.path.join(model_output_dir, 'gensim_dictionary.dict'))
# # tfidf_model_loaded = TfidfModel.load(os.path.join(model_output_dir, 'gensim_tfidf.model'))
# # final_nmf_loaded = Nmf.load(os.path.join(model_output_dir, 'gensim_nmf_tfidf.model'))
# #
# # for fname in os.listdir(processed_dir):
# #      if fname.startswith('lda_nmf_r_') and fname.endswith('.json'):
# #          print(f"Processing {fname} for NMF (TF-IDF)...")
# #          file_path = os.path.join(processed_dir, fname)
# #          results_df = process_subreddit_nmf_tfidf(file_path, dictionary, tfidf_model_loaded, final_nmf_loaded)
# #          if results_df is not None:
# #              out_file = fname.replace('.json', '_gensim_nmf_tfidf_topics.csv')
# #              out_path = os.path.join(doc_topics_output_dir, out_file) # Ensure doc_topics_output_dir exists
# #              results_df.to_csv(out_path, index=False)
# #              print(f"  Saved NMF results: {out_file}")
# #          else:
# #              print(f"  Skipped NMF processing for {fname} (missing data or empty)")
# # print("Finished processing individual subreddits for NMF (TF-IDF).")
