In [1]:
# Block 1: Import Libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import time  # To time the model fitting process

# BERTopic-specific imports
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Add gensim imports for coherence calculation
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel


In [2]:
# Block 2: Define Paths for Data and Output
project_root = '../../'
base_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit')
bertopic_input_dir = os.path.join(base_data_dir, 'BERTopic_Data')
lda_nmf_input_dir = os.path.join(base_data_dir, 'LDA_NMF_Data')
base_output_dir = os.path.join(project_root, 'outputs', 'topic_modeling', 'bertopic')
model_output_dir = os.path.join(base_output_dir, 'models')
evaluation_output_dir = os.path.join(base_output_dir, 'evaluations')
viz_output_dir = os.path.join(base_output_dir, 'visualizations', 'wordclouds')
# doc_topics_output_dir = os.path.join(base_output_dir, 'document_topics')

os.makedirs(model_output_dir, exist_ok=True)
os.makedirs(evaluation_output_dir, exist_ok=True)
os.makedirs(viz_output_dir, exist_ok=True)
# os.makedirs(doc_topics_output_dir, exist_ok=True)

# Set random seed for reproducibility
np.random.seed(42)


In [3]:
# Block 3: Load and Merge Data from JSON Files
dfs_text_and_meta = [] # List to hold dataframes with text and derived subreddit
dfs_tokens = []        # List to hold dataframes with tokens

# Iterate through files in the BERTopic input directory
# Assumes corresponding files exist in the LDA/NMF directory
for fname in os.listdir(bertopic_input_dir):
    if fname.startswith('bertopic_r_') and fname.endswith('.json'):
        # --- Load BERTopic Text Data ---
        bertopic_file_path = os.path.join(bertopic_input_dir, fname)
        # Extract subreddit name from the bertopic filename
        subreddit_name = fname.replace('bertopic_r_', '').replace('.json', '')

        with open(bertopic_file_path, 'r', encoding='utf-8') as f_text:
            text_data_json = json.load(f_text)
        df_text = pd.DataFrame(text_data_json)
        # Add the derived subreddit column
        df_text['subreddit'] = subreddit_name
        # Append text data ('id', 'processed_text_bertopic') and the derived subreddit
        dfs_text_and_meta.append(df_text[['id', 'subreddit', 'processed_text_bertopic']])


        # --- Construct LDA/NMF filename and path ---
        lda_nmf_fname = fname.replace('bertopic_', 'lda_nmf_')
        lda_nmf_file_path = os.path.join(lda_nmf_input_dir, lda_nmf_fname)

        # --- Load LDA/NMF Token Data ---
        with open(lda_nmf_file_path, 'r', encoding='utf-8') as f_tokens:
            token_data_json = json.load(f_tokens)
        df_tokens = pd.DataFrame(token_data_json)
        # Append only 'id' and 'processed_tokens_lda_nmf'
        dfs_tokens.append(df_tokens[['id', 'processed_tokens_lda_nmf']])


# Concatenate after the loop
all_posts_text_and_meta = pd.concat(dfs_text_and_meta, ignore_index=True)
all_posts_tokens = pd.concat(dfs_tokens, ignore_index=True)

# Merge the text/subreddit DataFrame with the tokens DataFrame
# Use 'inner' merge to keep only posts that have both text and tokens successfully loaded
all_posts = pd.merge(all_posts_text_and_meta, all_posts_tokens, on='id', how='inner')

# Drop rows where the essential text or tokens might be missing
essential_cols = ['processed_text_bertopic', 'processed_tokens_lda_nmf']
all_posts.dropna(subset=essential_cols, inplace=True)

print(f"Total number of posts after merging: {len(all_posts)}")
print("Columns available:", all_posts.columns)
# Note: The final dataframe 'all_posts' will have columns:
# 'id', 'subreddit', 'processed_text_bertopic', 'processed_tokens_lda_nmf'


Total number of posts after merging: 49175
Columns available: Index(['id', 'subreddit', 'processed_text_bertopic',
       'processed_tokens_lda_nmf'],
      dtype='object')


In [4]:
# Block 4: Data Preparation for Topic Modeling

# Assumes 'all_posts' DataFrame from Block 3 contains the necessary columns
docs = all_posts['processed_text_bertopic'].tolist()
original_tokens_for_coherence = all_posts['processed_tokens_lda_nmf'].tolist()

print(f"Number of documents for BERTopic modeling: {len(docs)}")
print(f"Number of token lists for coherence: {len(original_tokens_for_coherence)}")


Number of documents for BERTopic modeling: 49175
Number of token lists for coherence: 49175


In [5]:
# Block 5: Train BERTopic Model
print("Training BERTopic model...")
start_time = time.time()

# BERTopic model configuration
topic_model = BERTopic(
    embedding_model='paraphrase-MiniLM-L3-v2',
    min_topic_size=7,
    nr_topics='auto',
    calculate_probabilities=True,
    verbose=True
)

# Fit the model and transform documents
topics, probs = topic_model.fit_transform(docs)

end_time = time.time()
print(f"BERTopic training completed in {end_time - start_time:.2f} seconds.")

# Get topic information
topic_info = topic_model.get_topic_info()
print(f"\nTotal number of topics found (including outliers): {len(topic_info)}")
print(f"Number of non-outlier topics: {len(topic_info[topic_info['Topic'] != -1])}")

# Print the top 10 largest topics
print("\nTop 10 largest topics:")
print(topic_info.head(10))


2025-04-08 19:36:07,576 - BERTopic - Embedding - Transforming documents to embeddings.


Training BERTopic model...


Batches:   0%|          | 0/1537 [00:00<?, ?it/s]

2025-04-08 19:36:33,476 - BERTopic - Embedding - Completed ✓
2025-04-08 19:36:33,476 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-08 19:37:02,485 - BERTopic - Dimensionality - Completed ✓
2025-04-08 19:37:02,487 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-08 20:09:42,487 - BERTopic - Cluster - Completed ✓
2025-04-08 20:09:42,488 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-08 20:09:48,729 - BERTopic - Representation - Completed ✓
2025-04-08 20:09:48,753 - BERTopic - Topic reduction - Reducing number of topics
2025-04-08 20:09:49,096 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-08 20:09:55,300 - BERTopic - Representation - Completed ✓
2025-04-08 20:09:55,322 - BERTopic - Topic reduction - Reduced number of topics from 687 to 526


BERTopic training completed in 2033.03 seconds.

Total number of topics found (including outliers): 526
Number of non-outlier topics: 525

Top 10 largest topics:
   Topic  Count                                     Name  \
0     -1  19088                     -1_you_crypto_not_my   
1      0   1551                0_bear_bull_crypto_market   
2      1   1022  1_recession_stocks_recommendation_crash   
3      2    786                   2_tesla_musk_tsla_elon   
4      3    518              3_coinbase_coin_coins_email   
5      4    503             4_bitcoin_gold_miners_mining   
6      5    473          5_apple_iphone_facebook_billion   
7      6    467            6_blockchain_game_games_block   
8      7    465      7_buffett_berkshire_warren_hathaway   
9      8    437           8_options_trade_trading_option   

                                      Representation  \
0  [you, crypto, not, my, it, do, and, that, have...   
1  [bear, bull, crypto, market, we, you, run, not...   
2  [reces

In [6]:
# Block 6: Save Trained BERTopic Model
print("Saving BERTopic model...")
model_path = os.path.join(model_output_dir, 'bertopic_model')
topic_model.save(model_path)
print(f"BERTopic model saved to {model_path}")




Saving BERTopic model...
BERTopic model saved to ../../outputs\topic_modeling\bertopic\models\bertopic_model


In [7]:
# Block 7: Model Evaluation and Coherence Analysis
# This block evaluates the BERTopic model's performance using NPMI coherence
# and saves the results for comparison with other topic models.

# Setup for coherence evaluation using original tokens
# Note: Using original tokens (before BERTopic's specific preprocessing) for coherence
# is often recommended for interpretability.
texts = original_tokens_for_coherence # Assumes this variable holds the list of token lists from before BERTopic processing
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts] # Keep BoW corpus for NPMI

# Define number of workers for coherence calculation
num_workers = os.cpu_count() - 1 if os.cpu_count() and os.cpu_count() > 1 else 1

# Get topics from the trained BERTopic model
def get_bertopic_topics(model, n_top_words=15):
    """Extract top words for each topic from BERTopic model, excluding outlier topic -1."""
    topics_list = []
    # Get all topic IDs, filter out the outlier topic (-1)
    valid_topic_ids = sorted([topic_id for topic_id in model.get_topics().keys() if topic_id != -1])
    for topic_id in valid_topic_ids:
        # Get the top N words for the current topic
        # BERTopic model's get_topic returns (word, probability) pairs
        words_probs = model.get_topic(topic_id)
        if words_probs: # Check if the topic is not empty
             words, _ = zip(*words_probs[:n_top_words])
             topics_list.append(list(words))
        # else: Handle potentially empty topics if necessary, though BERTopic usually prunes them.
    # Return the list of lists of words, and the count of valid (non-empty, non-outlier) topics found
    return topics_list, len(valid_topic_ids)

# Extract topics and count
# Assumes topic_model is your trained BERTopic model object from a previous block
bertopic_topics_list, num_valid_topics = get_bertopic_topics(topic_model, 15)

print("Calculating coherence for the BERTopic model...")
# Calculate NPMI Coherence (uses BoW corpus)
coherence_model_npmi = CoherenceModel(
    topics=bertopic_topics_list,
    texts=texts,
    corpus=corpus, # *** NPMI uses BoW corpus ***
    dictionary=dictionary,
    coherence='c_npmi',
    processes=num_workers
)
bertopic_coherence_npmi = coherence_model_npmi.get_coherence()
print(f"  NPMI Coherence calculated: {bertopic_coherence_npmi:.4f}")

# Calculate C_V Coherence (uses texts)
print("Calculating C_V coherence...")
coherence_model_cv = CoherenceModel(
    topics=bertopic_topics_list,
    texts=texts, # *** C_V uses original texts ***
    dictionary=dictionary,
    coherence='c_v',
    processes=num_workers
)
bertopic_coherence_cv = coherence_model_cv.get_coherence()
print(f"  C_V Coherence calculated: {bertopic_coherence_cv:.4f}")

# Calculate additional metrics: Topic Diversity and Average Topic Length
all_topic_words = [word for topic in bertopic_topics_list for word in topic]
# Calculate diversity based on the number of unique words across the top words of all valid topics
# Using num_valid_topics * 15 as the denominator, assuming 15 words per topic were requested.
topic_diversity = len(set(all_topic_words)) / (num_valid_topics * 15) if num_valid_topics > 0 else 0

# Prepare evaluation results dictionary
evaluation_results = {
    'model_type': 'BERTopic',
    'coherence_score_npmi': bertopic_coherence_npmi, # Renamed for clarity
    'coherence_score_cv': bertopic_coherence_cv,     # Added C_V score
    'n_topics': num_valid_topics, # Store the count of actual topics found (excluding -1)
    'topic_diversity': topic_diversity,
    'parameters': {
        # Ensure these parameter values match what was used in Block 5 or model training
        'embedding_model': 'paraphrase-MiniLM-L3-v2',
        'min_topic_size': 7,
        'nr_topics': 'auto'
    }
}

# Save evaluation results to a JSON file
eval_path = os.path.join(evaluation_output_dir, 'bertopic_evaluation.json')
with open(eval_path, 'w') as f:
    json.dump(evaluation_results, f, indent=4)

# Print evaluation results
print("\nModel Evaluation Results:")
print(f"NPMI Coherence Score: {bertopic_coherence_npmi:.4f}") # Updated variable name
print(f"C_V Coherence Score: {bertopic_coherence_cv:.4f}")   # Added C_V output
print(f"Topic Diversity: {topic_diversity:.4f}")


Calculating coherence for the BERTopic model...
  NPMI Coherence calculated: 0.0512
Calculating C_V coherence...
  C_V Coherence calculated: 0.5279

Model Evaluation Results:
NPMI Coherence Score: 0.0512
C_V Coherence Score: 0.5279
Topic Diversity: 0.4236


In [None]:
# Block 8: Analyze and Visualize Topics
# Helper function to display topics
def display_topics(topic_model, n_topics=20, n_words=15):
    topics_dict = {}
    top_topics = topic_info[topic_info['Topic'] != -1].sort_values('Count', ascending=False).head(n_topics)['Topic'].tolist()

    for topic_id in top_topics:
        words, weights = zip(*topic_model.get_topic(topic_id))
        topics_dict[topic_id] = list(words)
        words_str = ' '.join(words[:n_words])
        print(f"Topic {topic_id}: {words_str}")

    return topics_dict

# Display topics
print("\nBERTopic Topics:")
bertopic_topics = display_topics(topic_model)


In [None]:
# Block 9: Create Visualizations

# 1. Topic-term barchart
top_n_topics = topic_info[topic_info['Topic'] != -1].sort_values('Count', ascending=False).head(15)['Topic'].tolist()
topic_model.visualize_barchart(top_n_topics=len(top_n_topics), topics=top_n_topics, n_words=8, height=600).write_html(
    os.path.join(viz_output_dir, 'bertopic_barchart.html')
)

# 2. Topic similarity map
# Note: This might fail if there are too few topics
topic_model.visualize_topics(top_n_topics=len(top_n_topics), topics=top_n_topics).write_html(
    os.path.join(viz_output_dir, 'bertopic_topic_similarity.html')
)

# 3. Topic hierarchy dendrogram
# Note: This might fail if there are too few topics
if len(top_n_topics) > 5:
    # Assumes 'docs' list exists from Block 4
    hierarchy = topic_model.hierarchical_topics(docs)
    topic_model.visualize_hierarchy(top_n_topics=len(top_n_topics)).write_html( # Removed hierarchy kwarg, kept top_n_topics
        os.path.join(viz_output_dir, 'bertopic_hierarchy.html')
    )

# 4. Create Word Clouds for top topics
def create_wordclouds(topic_model, topics_dict):
    # Assumes viz_output_dir is defined
    for topic_id, words in topics_dict.items():
        words_weights = dict(topic_model.get_topic(topic_id)[:50])

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=100
        ).generate_from_frequencies(words_weights)

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'BERTopic Topic {topic_id}')

        out_path = os.path.join(viz_output_dir, f'bertopic_topic_{topic_id}_wordcloud.png')
        plt.savefig(out_path, bbox_inches='tight')
        plt.close()
        print(f"Saved word cloud for BERTopic Topic {topic_id}")

# Create word clouds using the dictionary from Block 7
create_wordclouds(topic_model, bertopic_topics)


In [None]:
# Block 10: Assign Topics to Documents
# Get the topic distribution for each document
# The topics and probabilities ('probs') were obtained during fit_transform in Block 5.

# Find the dominant topic for each document
all_posts['bertopic_topic'] = topics
all_posts['bertopic_topic_confidence'] = [max(prob) for prob in probs]

# Add one to make topics 1-indexed instead of 0-indexed for better readability
# (This step is usually omitted for BERTopic due to the meaning of topic ID -1)

# Sample of posts with their assigned topics
print(all_posts[['title', 'bertopic_topic', 'bertopic_topic_confidence']].head(10))


In [None]:
# Block 11: Analyze Topic Distribution by Subreddit
# Create a cross-tabulation of subreddits and topics
# Applying crosstab directly to all_posts, including outlier topic (-1)
bertopic_by_subreddit = pd.crosstab(
    all_posts['subreddit'],
    all_posts['bertopic_topic'],
    normalize='index'
) * 100

# Plot the distribution
plt.figure(figsize=(12, 8)) # Match LDA figure size
sns.heatmap(bertopic_by_subreddit, annot=True, cmap='YlGnBu', fmt='.1f')
plt.title('BERTopic Topic Distribution by Subreddit (%)') # Match title format
plt.xlabel('Topic')
plt.ylabel('Subreddit')
plt.savefig(os.path.join(evaluation_output_dir, 'bertopic_topic_by_subreddit.png'))
plt.close()

print("Topic distribution analysis by subreddit completed and saved")


In [None]:
# Block 12: Analyze Overall Topic Distribution
# Note: Confidence values are model-specific and should NOT be used to compare between different models (e.g., LDA vs BERTopic).
# These values are useful for: 1) filtering unreliable topic assignments, 2) identifying which topics have more
# consistent assignments within this specific model, and 3) potentially flagging uncertain classifications in applications.

# Calculate topic distribution across all posts
# Using bertopic_topic which includes outlier topic -1
topic_distribution = all_posts['bertopic_topic'].value_counts(normalize=True) * 100

# Create a bar plot of topic distribution (for all topics)
plt.figure(figsize=(15, 6)) # Match LDA figure size
topic_distribution.sort_index().plot(kind='bar')
plt.title('Distribution of Topics Across All Posts (BERTopic)') # Match LDA title format
plt.xlabel('Topic Number')
plt.ylabel('Percentage of Posts')
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Save the plot
# Assumes evaluation_output_dir is defined
plt.savefig(os.path.join(evaluation_output_dir, 'bertopic_topic_distribution.png')) # Match LDA filename format
plt.close()

# Print topic distribution statistics (including outlier topic -1)
print("\nTopic Distribution Statistics:")
print(topic_distribution.sort_index())

# The following confidence analysis helps understand the relative strength of topic assignments WITHIN this model only.
# Higher values indicate more distinctive topic assignments, while lower values suggest documents with mixed topics.
# Calculate average topic confidence (across all assigned topics, including -1 if it has confidence)
# Assumes 'bertopic_topic_confidence' column exists and was calculated in Block 10
avg_confidence = all_posts['bertopic_topic_confidence'].mean()
print(f"\nAverage BERTopic Topic Assignment Strength: {avg_confidence:.3f}") # Match LDA print format

# Find most and least confident topics (including outlier topic -1)
# Assumes 'bertopic_topic' column exists
topic_confidence = all_posts.groupby('bertopic_topic')['bertopic_topic_confidence'].mean()
print("\nTop 5 Topics with Highest Assignment Strength:")
print(topic_confidence.nlargest(5))
print("\nTop 5 Topics with Lowest Assignment Strength:")
# Filter out NaN confidence values before finding smallest, in case outliers had NaN confidence assigned
print(topic_confidence.dropna().nsmallest(5))

# When selecting between models (LDA, NMF, BERTopic, etc.), prioritize coherence scores, topic diversity,
# and qualitative evaluation of topic interpretability rather than these confidence values.


In [None]:
# Block 13: Topic Similarity Analysis

non_outlier_topics = [t for t in bertopic_topics.keys() if t != -1]

topic_similarity = topic_model.calculate_topic_similarity(non_outlier_topics)

plt.figure(figsize=(12, 10))
sns.heatmap(topic_similarity, annot=True, cmap='coolwarm', fmt='.2f',
            xticklabels=non_outlier_topics, yticklabels=non_outlier_topics)
plt.title('Topic Similarity Matrix')
# Assumes evaluation_output_dir is defined
plt.savefig(os.path.join(evaluation_output_dir, 'bertopic_similarity_matrix.png'))
plt.close()

print("Topic similarity analysis completed and saved")

similarity_df = pd.DataFrame(topic_similarity, index=non_outlier_topics, columns=non_outlier_topics)
np.fill_diagonal(similarity_df.values, 0)

max_similarities = []
for i in range(len(non_outlier_topics)):
    for j in range(i + 1, len(non_outlier_topics)):
        t1 = non_outlier_topics[i]
        t2 = non_outlier_topics[j]
        sim = similarity_df.loc[t1, t2]
        max_similarities.append((t1, t2, sim))

max_similarities.sort(key=lambda x: x[2], reverse=True)

print("\nTop 5 most similar topic pairs:")
# Assumes bertopic_topics dictionary exists
for t1, t2, sim in max_similarities[:5]:
    print(f"Topic {t1} and Topic {t2}: {sim:.3f}")
    print(f"  Topic {t1}: {' '.join(bertopic_topics.get(t1, ['N/A'])[:7])}")
    print(f"  Topic {t2}: {' '.join(bertopic_topics.get(t2, ['N/A'])[:7])}")
    print()


In [None]:
# # Block 14: Process Each Subreddit Separately and Save Results

# for fname in os.listdir(finbert_posts_dir):
#     if fname.startswith('finbert_r_') and fname.endswith('.json'):
#         file_path = os.path.join(finbert_posts_dir, fname)
#         with open(file_path, 'r', encoding='utf-8') as f:
#             df = pd.DataFrame(json.load(f))

#         # Prepare text for topic modeling (directly use the column)
#         subreddit_docs = df['processed_text_finbert'].tolist()

#         # Get topic distribution for each document using BERTopic model
#         subreddit_topics, subreddit_probs = topic_model.transform(subreddit_docs)

#         # Find the dominant topic and confidence for each document
#         df['bertopic_topic'] = subreddit_topics
#         df['bertopic_topic_confidence'] = [max(prob) for prob in subreddit_probs]

#         # Save results for this subreddit
#         out_file = fname.replace('finbert_r_', 'processed_r_').replace('.json', '_bertopic_topics.csv')
#         out_path = os.path.join(doc_topics_output_dir, out_file)
#         df[['id', 'bertopic_topic', 'bertopic_topic_confidence']].to_csv(out_path, index=False)
#         print("Saved:", out_file)
