In [1]:
# Block 1: Import Libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import time

# Gensim imports
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaMulticore


In [2]:
# Block 2: Define Paths for Data and Output
# Every location below is built from project_root, which is relative to the
# directory the notebook is run from.
project_root = '../../'

# Input directory read by the data-loading cell.
processed_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'LDA_NMF_Data')

# Output tree: <project_root>/outputs/topic_modeling/lda/{models,evaluations,visualizations/wordclouds}
base_output_dir = os.path.join(project_root, 'outputs', 'topic_modeling', 'lda')
model_output_dir = os.path.join(base_output_dir, 'models')
evaluation_output_dir = os.path.join(base_output_dir, 'evaluations')
viz_output_dir = os.path.join(base_output_dir, 'visualizations', 'wordclouds')
# doc_topics_output_dir = os.path.join(base_output_dir, 'document_topics')

# Create the output directories up front; exist_ok makes re-running the cell harmless.
for _output_dir in (model_output_dir, evaluation_output_dir, viz_output_dir):
    os.makedirs(_output_dir, exist_ok=True)
# os.makedirs(doc_topics_output_dir, exist_ok=True)

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)


In [None]:
# Block 3: Load and Merge Data from JSON Files
# Every file named 'lda_nmf_r_<subreddit>.json' in processed_dir is read into a
# DataFrame, tagged with its subreddit name, and stacked into `all_posts`.
file_prefix = 'lda_nmf_r_'
file_suffix = '.json'

dfs = []
# os.listdir() returns names in arbitrary order. Sorting keeps the row order of
# all_posts (and of the corpus built from it) identical across runs and machines.
for fname in sorted(os.listdir(processed_dir)):
    if not (fname.startswith(file_prefix) and fname.endswith(file_suffix)):
        continue
    file_path = os.path.join(processed_dir, fname)
    # Slice off only the leading prefix and trailing extension; str.replace would
    # also remove the same substrings if they occurred inside the subreddit name.
    subreddit_name = fname[len(file_prefix):-len(file_suffix)]
    with open(file_path, 'r', encoding='utf-8') as f:
        data_json = json.load(f)
    df = pd.DataFrame(data_json)
    df['subreddit'] = subreddit_name
    dfs.append(df)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not dfs:
    raise FileNotFoundError(
        f"No '{file_prefix}*{file_suffix}' files found in {processed_dir!r}"
    )
all_posts = pd.concat(dfs, ignore_index=True)

print(f"Total number of posts: {len(all_posts)}")


In [None]:
# Block 4: Data Preparation for Topic Modeling (Gensim Only)
# Builds the three inputs used by the later cells: `texts` (token lists),
# `dictionary` (token <-> id mapping) and `corpus` (bag-of-words per document).
print("Preparing data for Gensim LDA...")
texts = all_posts['processed_tokens_lda_nmf'].tolist()

print("Creating Gensim dictionary and corpus...")
dictionary = Dictionary(texts)

# Filter extremes:
# - no_below=15 : drop tokens that appear in fewer than 15 documents
# - no_above=0.5: drop tokens that appear in more than 50% of documents
# - keep_n=None : do not cap the vocabulary size after the two filters above
print(f"Original dictionary size: {len(dictionary)}")
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=None)
print(f"Filtered dictionary size: {len(dictionary)}")

# One bag-of-words per row of all_posts, in the same order.
corpus = [dictionary.doc2bow(text) for text in texts]
print(f"Number of documents in corpus: {len(corpus)}")
print(f"Number of unique tokens in filtered dictionary: {len(dictionary)}")

# Documents whose tokens were all filtered out become empty bags-of-words. They are
# kept so that corpus positions stay aligned with all_posts rows, but they carry no
# evidence for topic assignment, so make their number visible.
empty_doc_count = sum(1 for bow in corpus if not bow)
if empty_doc_count:
    print(f"Warning: {empty_doc_count} documents have no tokens left after dictionary filtering.")


In [None]:
# Block 5: Comprehensive Hyperparameter Tuning for LDA Model
# Grid-searches the number of topics and the eta prior (alpha is fixed to
# 'symmetric'), scoring every candidate with NPMI coherence. The cell leaves
# behind `best_lda_params` and `num_workers` for the cells that follow.

# --- Define Hyperparameter Grid ---
search_params = {
    'num_topics': [10, 15, 20],
    'eta': [None, 'auto', 0.1]
}
num_topics_list = search_params['num_topics']
eta_settings = search_params['eta']

# --- Tracking state ---
best_score = -np.inf       # highest coherence seen so far
best_params = {}           # parameter combination that produced best_score
coherence_scores = {}      # param_key -> coherence (NaN when the run failed)

start_time = time.time()
# Leave one core free, but never go below a single worker.
num_workers = max(1, (os.cpu_count() or 1) - 1)

# --- Grid Search ---
print(f"Starting comprehensive hyperparameter tuning with k = {num_topics_list}...")
for n_topics in num_topics_list:
    for eta_choice in eta_settings:
        # Human-readable label for this combination; a None eta is logged as
        # 'symmetric_default'.
        eta_key = 'symmetric_default' if eta_choice is None else eta_choice
        param_key = f"k={n_topics}, alpha=symmetric, eta={eta_key}"
        print(f"  Testing {param_key}...")

        try:
            # Train a throw-away model for this grid point.
            trial_model = LdaMulticore(
                corpus=corpus,
                id2word=dictionary,
                num_topics=n_topics,
                random_state=42,
                chunksize=100,
                passes=10,
                alpha='symmetric',
                eta=eta_choice,
                workers=num_workers
            )

            # Score it with NPMI coherence over the tokenized texts.
            coherence_score = CoherenceModel(
                model=trial_model,
                texts=texts,
                dictionary=dictionary,
                coherence='c_npmi',
                processes=num_workers
            ).get_coherence()
            coherence_scores[param_key] = coherence_score
            print(f"    NPMI Coherence: {coherence_score:.4f}")

            # Keep this combination if it beats everything seen so far.
            if coherence_score > best_score:
                best_score = coherence_score
                best_params = {
                    'num_topics': n_topics,
                    'alpha': 'symmetric',
                    'eta': eta_choice
                }

        except Exception as e:
            # Best effort: one failing grid point is logged and recorded as NaN
            # so the remaining combinations still run.
            print(f"    Failed for {param_key}: {e}")
            coherence_scores[param_key] = np.nan

end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"\nTuning finished in {elapsed_seconds:.2f} seconds.")

# --- Report: best first, failed (NaN) runs last ---
print("\nAll Coherence Scores:")
sorted_scores = sorted(
    coherence_scores.items(),
    key=lambda entry: -np.inf if np.isnan(entry[1]) else entry[1],
    reverse=True
)
for params, score in sorted_scores:
    score_str = "Failed" if np.isnan(score) else f"{score:.4f}"
    print(f"  {params}: {score_str}")

# --- Select Best Parameters ---
if best_params:
    print(f"\nBest NPMI Score achieved during tuning: {best_score:.4f}")
    print(f"Best LDA Parameters found: {best_params}")
    best_lda_params = best_params
else:
    # No grid point produced a usable score; fall back to fixed defaults.
    print("\nError: All tuning runs failed or no valid best parameters found. Using fallback.")
    best_lda_params = {'num_topics': 20, 'alpha': 'symmetric', 'eta': None}

# best_lda_params is read by the final-training cell (Block 6).
print(f"\nParameters selected for final model training in Block 6: {best_lda_params}")


In [None]:
# Block 6: Train Final LDA Model with Best Parameters
# Re-trains on the full corpus with the combination selected in Block 5, using
# more passes than the tuning runs. Requires `best_lda_params` and `num_workers`
# from Block 5.

print("\nTraining final Gensim LDA model using best parameters found during tuning...")
final_lda_settings = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=best_lda_params['num_topics'],
    random_state=42,
    chunksize=100,
    passes=20,  # tuning used 10; the final model gets more
    alpha=best_lda_params['alpha'],
    eta=best_lda_params['eta'],
    workers=num_workers,
)
final_lda = LdaMulticore(**final_lda_settings)

print("\nFinal Model Configuration:")
print(final_lda)


In [None]:
# Block 7: Save Trained Gensim LDA Model and Dictionary
# Both artifacts are written into model_output_dir.
print("Saving Gensim LDA model and dictionary...")

# Resolve both destinations first, then persist.
model_path = os.path.join(model_output_dir, 'gensim_lda.model')
dict_path = os.path.join(model_output_dir, 'gensim_dictionary.dict')

final_lda.save(model_path)
dictionary.save(dict_path)

print(f"Gensim LDA model saved to {model_path}")
print(f"Gensim dictionary saved to {dict_path}")


In [None]:
# Block 8: Model Evaluation and Coherence Analysis
# Computes topic diversity plus NPMI and C_V coherence for `final_lda`, stores
# them in a JSON file under evaluation_output_dir, and prints a summary.
# Requires `num_workers` and `best_lda_params` from Block 5.

# --- Topic diversity: share of distinct words among all topics' top words ---
diversity_top_n = 15  # top words per topic taken into account
topics_for_diversity = final_lda.show_topics(
    num_topics=final_lda.num_topics,
    num_words=diversity_top_n,
    formatted=False
)
all_topic_words = [word for topic_id, topic in topics_for_diversity for word, prob in topic]
# Divide by the number of words actually returned rather than assuming
# num_topics * diversity_top_n, so the ratio stays correct if a topic yields
# fewer words than requested; an empty list gives 0.0 instead of a ZeroDivisionError.
topic_diversity = (
    len(set(all_topic_words)) / len(all_topic_words) if all_topic_words else 0.0
)


print("Calculating coherence for the final model...")
# Calculate C_NPMI Coherence
coherence_model_npmi = CoherenceModel(
    model=final_lda,
    texts=texts,
    # NOTE(review): c_npmi appears to be estimated from `texts`; `corpus` is kept
    # as passed originally — confirm whether it has any effect for this measure.
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_npmi',
    processes=num_workers
)
lda_coherence_npmi = coherence_model_npmi.get_coherence()

# Calculate C_V Coherence
print("Calculating C_V coherence...")
coherence_model_cv = CoherenceModel(
    model=final_lda,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
    processes=num_workers
)
lda_coherence_cv = coherence_model_cv.get_coherence()


# The integer seed is recorded instead of the model's RandomState object, which
# is not JSON-serializable. 42 is the value passed as random_state in training.
used_random_seed = 42

evaluation_results = {
    'model_type': 'Gensim LDA',
    'coherence_score_npmi': lda_coherence_npmi,
    'coherence_score_cv': lda_coherence_cv,
    'topic_diversity': topic_diversity,
    'n_topics': final_lda.num_topics,
    'parameters': {
        'num_topics': final_lda.num_topics,
        'passes': final_lda.passes,
        'chunksize': final_lda.chunksize,
        # Prior settings as configured (e.g. 'symmetric', 'auto', 0.1 or None),
        # not the learned arrays held by the model, so the record stays small
        # and JSON-serializable.
        'alpha': best_lda_params['alpha'],
        'eta': best_lda_params['eta'],
        'random_state_seed': used_random_seed
    }
}

eval_path = os.path.join(evaluation_output_dir, 'gensim_lda_evaluation.json')
# Serialize before opening the file: if a value turns out not to be
# JSON-serializable, no truncated/partial file is left behind.
try:
    serialized_results = json.dumps(evaluation_results, indent=4)
except TypeError as e:
    print(f"Error saving evaluation results to JSON: {e}")
    print("Evaluation Results Dictionary:", evaluation_results)
else:
    with open(eval_path, 'w', encoding='utf-8') as f:
        f.write(serialized_results)
    print(f"Evaluation results saved to {eval_path}")


print("\nModel Evaluation Results:")
print(f"NPMI Coherence Score: {lda_coherence_npmi:.4f}")
print(f"C_V Coherence Score: {lda_coherence_cv:.4f}")
print(f"Topic Diversity: {topic_diversity:.4f}")


In [None]:
# Block 9: Function to Display Top Words for each Topic

def display_topics(model, n_top_words=15):
    """Print the top words of every topic and return them keyed by topic id.

    Args:
        model: Object exposing ``num_topics`` and
            ``show_topics(num_topics=..., num_words=..., formatted=False)``
            that yields ``(topic_id, [(word, prob), ...])`` pairs.
        n_top_words: Number of words requested per topic.

    Returns:
        dict mapping each topic id (as returned by the model, 0-based) to the
        list of its top words. Printed topic numbers are shifted to 1-based.
    """
    raw_topics = model.show_topics(
        num_topics=model.num_topics,
        num_words=n_top_words,
        formatted=False,
    )
    # Drop the probabilities; only the word order is kept.
    topics_dict = {
        topic_id: [term for term, _weight in weighted_terms]
        for topic_id, weighted_terms in raw_topics
    }

    print("\nGensim LDA Topics:")
    for topic_id, terms in topics_dict.items():
        print(f"Topic {topic_id + 1}: {' '.join(terms)}")
    return topics_dict

# Print the final model's topics and keep the {topic_id: [words]} mapping,
# which the word-cloud cell takes as input.
lda_topics = display_topics(final_lda)


In [None]:
# Block 10: Create Word Clouds for Visualization

def create_wordcloud(topics_dict, model_name):
    """Draw one word cloud per topic on a shared grid and save it as a PNG.

    Args:
        topics_dict: Mapping of 0-based topic index to its list of words. The
            index also selects the grid cell (index + 1).
        model_name: Label used in each subplot title and in the output file name.

    The figure is written to ``viz_output_dir`` as
    ``<model_name>_all_topics_wordclouds.png`` and then closed.
    """
    n_cols = 5  # fixed grid width; rows grow with the number of topics
    num_topics = len(topics_dict)
    n_rows = -(-num_topics // n_cols)  # ceiling division
    fig = plt.figure(figsize=(n_cols * 5, n_rows * 3))

    for topic_idx, words in topics_dict.items():
        # Every word gets the same weight, so the cloud shows membership only.
        cloud = WordCloud(
            width=400, height=200, background_color='white', max_words=100
        )
        cloud_image = cloud.generate_from_frequencies(dict.fromkeys(words, 1))

        # Subplot positions are 1-based.
        ax = fig.add_subplot(n_rows, n_cols, topic_idx + 1)
        ax.imshow(cloud_image, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f'{model_name} Topic {topic_idx + 1}')

    fig.tight_layout(pad=2.0)
    out_path_all = os.path.join(viz_output_dir, f'{model_name}_all_topics_wordclouds.png')
    fig.savefig(out_path_all, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved combined word cloud figure to {out_path_all}")

# Render the word clouds from the 0-indexed dictionary returned by display_topics.
# 'Gensim_LDA' appears in every subplot title and in the saved file name.
create_wordcloud(lda_topics, 'Gensim_LDA')


In [None]:
# Block 11: Assign Topics to Documents

# Progress message shown before the per-document topic inference below.
print("Assigning topics to documents using Gensim LDA model...")

def get_dominant_topic_gensim(doc_topics):
    """Pick the most probable topic from a document's topic distribution.

    Args:
        doc_topics: Sequence of ``(topic_id, probability)`` entries; may be
            empty or None.

    Returns:
        ``(topic_id, probability)`` of the highest-probability entry (the first
        one wins on ties), or ``(-1, 0.0)`` when ``doc_topics`` is empty/falsy.
    """
    if doc_topics:
        top_entry = max(doc_topics, key=lambda entry: entry[1])
        return top_entry[0], top_entry[1]
    # Sentinel: nothing to choose from.
    return -1, 0.0

# Infer the dominant topic (and its probability) for every document in `corpus`.
#
# Documents with an empty bag-of-words (all tokens removed by dictionary
# filtering) are NOT sent through the model: with minimum_probability=0.0 the
# model still returns a full distribution for them, so they would be counted
# under a real topic although they contain no evidence. They are marked as
# unassigned instead.
NO_TOPIC = -1  # same sentinel get_dominant_topic_gensim returns for "no topic"
doc_topic_tuples = [
    get_dominant_topic_gensim(final_lda.get_document_topics(bow, minimum_probability=0.0))
    if bow else (NO_TOPIC, 0.0)
    for bow in corpus
]

dominant_topics, topic_confidences = zip(*doc_topic_tuples)
# Positional mask of unassigned documents (corpus order == all_posts row order).
unassigned = np.array(dominant_topics) == NO_TOPIC

all_posts['lda_dominant_topic'] = dominant_topics
all_posts['lda_topic_confidence'] = topic_confidences

# Convert to 1-based index for analysis/display consistency
all_posts['lda_dominant_topic'] = all_posts['lda_dominant_topic'] + 1

# Use nullable integer type so unassigned documents can be stored as <NA>
all_posts['lda_dominant_topic'] = all_posts['lda_dominant_topic'].astype('Int64')

# Unassigned documents get missing values instead of a pseudo-topic 0, so the
# dropna()/value_counts()/mean() calls in the later cells skip them.
print(f"Number of documents with no assigned topic: {int(unassigned.sum())}")
all_posts.loc[unassigned, 'lda_dominant_topic'] = pd.NA
all_posts.loc[unassigned, 'lda_topic_confidence'] = np.nan

print("Sample of assigned topics (1-based index):")
print(all_posts[['lda_dominant_topic', 'lda_topic_confidence']].head(10))

print("\nData types after assignment:")
print(all_posts[['lda_dominant_topic', 'lda_topic_confidence']].dtypes)


In [None]:
# Block 12: Analyze Topic Distribution by Subreddit
# Builds a subreddit x topic table of row percentages and saves it as a heatmap
# in evaluation_output_dir.
print("Analyzing topic distribution by subreddit...")

# Rows: subreddits; columns: dominant topic; each row sums to 100.
# dropna() excludes rows whose dominant topic is missing.
lda_topic_by_subreddit = 100 * pd.crosstab(
    index=all_posts['subreddit'],
    columns=all_posts['lda_dominant_topic'].dropna(),
    normalize='index',
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(lda_topic_by_subreddit, annot=True, cmap='YlGnBu', fmt='.1f', ax=ax)
ax.set_title('Gensim LDA Topic Distribution by Subreddit (%)')
ax.set_xlabel('Topic (1-based)')
ax.set_ylabel('Subreddit')
fig.savefig(os.path.join(evaluation_output_dir, 'gensim_lda_topic_by_subreddit.png'))
plt.close(fig)

print("Topic distribution analysis by subreddit completed and saved.")


In [None]:
# Block 13: Analyze Overall Topic Distribution
# Plots the share of posts per dominant topic, then summarizes the assignment
# probabilities stored in 'lda_topic_confidence'.
print("\nAnalyzing overall topic distribution...")
# Note: Assignment probability/confidence values are model-specific.

# Percentage of posts per dominant topic.
topic_distribution = all_posts['lda_dominant_topic'].value_counts(normalize=True) * 100

# --- Bar chart, topics in numeric order ---
fig, ax = plt.subplots(figsize=(15, 6))
topic_distribution.sort_index().plot(kind='bar', ax=ax)
ax.set_title('Gensim LDA: Distribution of Topics Across All Posts')
ax.set_xlabel('Topic Number (1-based)')
ax.set_ylabel('Percentage of Posts (%)')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the topic numbers horizontal
ax.grid(True, axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig(os.path.join(evaluation_output_dir, 'gensim_lda_topic_distribution.png'))
plt.close(fig)

print("\nTopic Distribution Statistics (%):")
print(topic_distribution.sort_index())

# --- Assignment probability (confidence) within this Gensim model ---
avg_confidence = all_posts['lda_topic_confidence'].mean()
print(f"\nAverage Gensim LDA Topic Assignment Probability: {avg_confidence:.3f}")

# Mean confidence per topic, ignoring rows where either value is missing.
assigned_posts = all_posts.dropna(subset=['lda_dominant_topic', 'lda_topic_confidence'])
topic_confidence = assigned_posts.groupby('lda_dominant_topic')['lda_topic_confidence'].mean()

print("\nTop 5 Topics with Highest Average Assignment Probability:")
print(topic_confidence.nlargest(5))
print("\nTop 5 Topics with Lowest Average Assignment Probability:")
print(topic_confidence.nsmallest(5))

print("\nRemember: Prioritize coherence, diversity, and interpretability over assignment probability for model comparison.")


In [14]:
# Block 14: Process Each Subreddit Separately and Save Results (Commented Out)
# def process_subreddit(file_path, dictionary, model):
#      with open(file_path, 'r', encoding='utf-8') as f:
#          df = pd.DataFrame(json.load(f))
#      if 'processed_tokens_lda_nmf' not in df.columns: # Check correct column name
#          return None, None
#
#      texts_subreddit = df['processed_tokens_lda_nmf'].tolist()
#      corpus_subreddit = [dictionary.doc2bow(text) for text in texts_subreddit]
#
#      doc_topic_tuples = [get_dominant_topic_gensim(model.get_document_topics(bow, minimum_probability=0.0))
#                          for bow in corpus_subreddit]
#
#      dominant_topics, topic_confidences = zip(*doc_topic_tuples)
#      df['lda_dominant_topic'] = dominant_topics
#      df['lda_topic_confidence'] = topic_confidences
#      df['lda_dominant_topic'] = df['lda_dominant_topic'] + 1 # 1-based index
#
#      # Handle NaNs if necessary
#      df['lda_dominant_topic'] = df['lda_dominant_topic'].astype('Int64')
#
#      return df[['id', 'title', 'lda_dominant_topic', 'lda_topic_confidence']]
#
# # Ensure doc_topics_output_dir is defined and created in Block 2 if uncommenting
# # os.makedirs(doc_topics_output_dir, exist_ok=True)
#
# print("\nProcessing subreddits individually...")
# for fname in os.listdir(processed_dir):
#      if fname.startswith('lda_nmf_r_') and fname.endswith('.json'): # Use correct prefix
#          print(f"Processing {fname}...")
#          file_path = os.path.join(processed_dir, fname)
#          results_df = process_subreddit(file_path, dictionary, final_lda)
#
#          if results_df is not None:
#              out_file = fname.replace('.json', '_gensim_lda_topics.csv') # Use new suffix
#              out_path = os.path.join(doc_topics_output_dir, out_file)
#              results_df.to_csv(out_path, index=False)
#              print(f"  Saved: {out_file}")
#          else:
#              print(f"  Skipped {fname} (missing required columns or empty)")
# print("Finished processing individual subreddits.")


In [None]:
# Hyperparameter search space
def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from the given trial.

    Args:
        trial: Object providing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        dict with the sampled ``learning_rate`` (log scale, 1e-6 to 1e-4),
        ``num_train_epochs`` (2 to 5) and ``per_device_train_batch_size``
        (8 or 16).
    """
    sampled = {}
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    sampled["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    sampled["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [8, 16]
    )
    return sampled

# NOTE(review): `trainer`, `Trainer`, `training_args`, `model_init`,
# `train_dataset`, `val_dataset`, `tokenizer`, `compute_metrics` and
# `EarlyStoppingCallback` are not imported or defined in any cell visible here —
# confirm they are provided elsewhere, otherwise this cell fails on a fresh kernel.

# Hyperparameter search: 5 Optuna trials, maximizing the objective.
search_settings = {
    "direction": "maximize",
    "backend": "optuna",
    "hp_space": optuna_hp_space,
    "n_trials": 5,
}
best_trial = trainer.hyperparameter_search(**search_settings)

# Copy the winning values onto the shared training arguments.
for hp_name, hp_value in best_trial.hyperparameters.items():
    setattr(training_args, hp_name, hp_value)

# Final training run: fresh model from model_init(), tuned arguments, and
# early stopping with a patience of 3. Rebinds `trainer` to the new instance.
final_model = model_init()
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=final_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

trainer.train()
