In [1]:
# Tweet Topic Modeling with BERTopic
# This notebook performs topic modeling on a large database of tweets using BERTopic.
# Since the database is large, we use efficient loading techniques and batch processing.
# We use pysentimiento for specialized tweet preprocessing before applying BERTopic.
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
import hdbscan
from tqdm.notebook import tqdm
import re
import nltk
from nltk.corpus import stopwords
from pysentimiento.preprocessing import preprocess_tweet
import warnings
import logging
warnings.filterwarnings("ignore")
logging.getLogger('gensim').setLevel(logging.ERROR)
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Wedge
from sentence_transformers import SentenceTransformer
from datetime import datetime
import os
import yaml
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import plotly.io as pio
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from math import ceil
from matplotlib.patches import Wedge, Rectangle
from itertools import combinations
# from openai import OpenAI

# client = OpenAI() 

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shayher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/shayher/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def split_and_train_multiple_models(tweets, num_chunks=5):
    print("Splitting data into multiple chunks and training BERTopic models...")
    models = []
    config = load_bertopic_config()
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    stop_words = set(stopwords.words('english'))
    additional_stopwords = {
        'rt', 'amp', 'twitter', 'tweet', 'tweets', 'like', 'just', 'still', 'got', 'one',
        'he', 'she', 'they', 'you', 'i', 'we', 'im', 'hes', 'shes', 'its', 're', 'theyre',
        'dont', 'doesnt', 'would', 'could', 'know', 'say', 'said', 'look', 'looks',
        'think', 'oh', 'yeah', 'is', 'are', 'and', 'guy', 'gonna', 'thing', 'someone',
        'people', 'get', 'thats', 'really', 'even', 'want', 'need', 'needs', 'your',
        'youre', 'not', 'this', 'that', 'it', 'url', 'emoji'
    }
    stop_words.update(additional_stopwords)
    vectorizer_model = TfidfVectorizer(stop_words=list(stop_words))

    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

    # Calculating the size of each chunk
    num_tweets = len(tweets)
    chunk_size = ceil(num_tweets / num_chunks)

    for i in range(0, num_tweets, chunk_size):
        chunk_number = i // chunk_size + 1
        print(f"Training sub-model on chunk {chunk_number}...")
        chunk = tweets[i:i + chunk_size]
        sub_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer_model,
            umap_model=umap_model,
            nr_topics=config.get("nr_topics"),
            min_topic_size=config.get("min_topic_size"),
            calculate_probabilities=config.get("calculate_probabilities"),
            verbose=config.get("verbose")
        )
        sub_model.fit(chunk)
        models.append(sub_model)

    print(f"Merging {len(models)} topic models into one final model...")
    merged_model = BERTopic.merge_models(models)
    return merged_model


In [3]:
# Output Directory 
OUTPUT_DIR = "bertopic_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
def save_figure(fig, name):
    path = os.path.join(OUTPUT_DIR, f"{name}.png")
    if hasattr(fig, "write_image"):
        fig.write_image(path)
    elif hasattr(fig, "savefig"):
        fig.savefig(path)
        plt.close(fig)
    else:
        print(f"[Warning] Could not save figure {name}: unsupported type")

In [5]:
# Load Configuration
def load_bertopic_config(config_path='bertopic_config.yaml'):
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config

In [6]:
# Database connection function
def connect_to_db(db_path):
    """Connect to the SQLite database."""
    try:
        conn = sqlite3.connect(db_path)
        print(f"Successfully connected to database: {db_path}")
        return conn
    except sqlite3.Error as e:
        print(f"Error connecting to database: {e}")
        return None        

In [7]:
# Data Preprocessing
def preprocess_tweets(text):
    """Clean and preprocess tweet text using pysentimiento and additional cleaning."""
    if not isinstance(text, str):
        return ""
    # Use pysentimiento's specialized tweet preprocessor first
    # This handles mentions, URLs, emojis, and other Twitter-specific elements
    processed_text = preprocess_tweet(text)
    # Additional cleaning
    # Remove any remaining special characters and numbers
    processed_text = re.sub(r'[^\w\s]', '', processed_text)
    processed_text = re.sub(r'\d+', '', processed_text)
    # Convert to lowercase
    processed_text = processed_text.lower()

    stop_words = set(stopwords.words('english'))
    additional_stopwords = {'rt', 'amp', 'twitter', 'tweet', 'tweets', 'like', 'just', 'still', 'got', 'one', 'he', 'she', 'they', 'you', 'i', 'we', 'im', 'hes', 'shes', 'its', 'rt', 'amp','re',  'hes', 'shes', 'theyre', 'im', 'dont', 'doesnt', 'would', 'could', 'like', 
    'know', 'say', 'said', 'one', 'look', 'looks', 'think', 'oh', 'yeah', 'is', 'are', 'and'
    'guy', 'gonna', 'thing', 'someone', 'people', 'get', 'got', 'thats', 'just',
    'really', 'even', 'still', 'want', 'need', 'needs','you','your', 'it','this','that','your','youre','not'}
    stop_words = stop_words.union(additional_stopwords)
    tokens = processed_text.split()
    tokens = [word for word in tokens if word not in stop_words]
    # Remove extra whitespace
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    return ' '.join(tokens)


In [8]:
# Efficient Data Loading
def load_tweets_in_batches(conn, batch_size=10000, max_tweets=None):
    """
    Load tweets in batches to handle large datasets efficiently.
    Args:
        conn: Database connection
        batch_size: Number of tweets to load in each batch
        max_tweets: Maximum number of tweets to load (None for all)
        
    Returns:
        List of preprocessed tweet texts
    """
    cursor = conn.cursor()
    # Get total count if needed
    if max_tweets is None:
        cursor.execute("SELECT COUNT(*) FROM posts")
        max_tweets = cursor.fetchone()[0]
    
    # Initialize variables
    all_tweets = []
    all_dates = []
    # offset = 0
    last_rowid = 0
    total_loaded = 0
    print(f"Loading up to {max_tweets:,} tweets in batches of {batch_size:,}")
    # Load in batches
    pbar = tqdm(total=min(max_tweets, max_tweets))
    while total_loaded < max_tweets:
        query = f"""
            SELECT ROWID, content, date
            FROM posts
            WHERE ROWID > {last_rowid}
            ORDER BY ROWID ASC
            LIMIT {batch_size}
        """
        cursor.execute(query)
        batch = cursor.fetchall()

        # Check if batch is empty
        if not batch:
            break

        # Process batch
        for rowid, content, date in batch:
            if content is None or date is None:
                continue
            cleaned = preprocess_tweets(content)
            if len(cleaned) > 20:
                all_tweets.append(cleaned)
                if "Jerusalem Daylight Time" in date:
                    cleaned_date = date.split("Jerusalem Daylight Time")[0].strip()
                else:
                    cleaned_date = date
                all_dates.append(pd.to_datetime(cleaned_date, errors='coerce'))
                total_loaded += 1
                last_rowid = rowid  # Update the last ROWID processed

        pbar.update(len(batch))
    pbar.close()
    print(f"Loaded {len(all_tweets):,} tweets after preprocessing and filtering")
    return all_tweets, all_dates

In [9]:
def save_topic_mapping(topic_model, tweets, output_path='bertopic_output/tweet_topic_mapping.csv'):
    """
    Save tweet-topic mapping to CSV, using updated topics from the model.
    """
    topic_info = topic_model.get_topic_info()
    topic_mapping = {}

    for _, row in topic_info.iterrows():
        topic_id = row['Topic']
        if topic_id != -1:  # מתעלמים מאאוטליירים
            topic_mapping[topic_id] = row['Name']

    topics, _ = topic_model.transform(tweets)

    mapping_df = pd.DataFrame(list(zip(tweets, topics)), columns=['Document', 'Topic'])

    mapping_df['Topic'] = mapping_df['Topic'].map(topic_mapping).fillna('Outlier')

    mapping_df.to_csv(output_path, index=False)
    print(f'Tweet-topic mapping saved to {output_path}')


In [10]:
# Topic Modeling with BERTopic
def create_topic_model(tweets):
    """
    Create and train a BERTopic model.
    Args:
        tweets: List of preprocessed tweet texts
        nr_topics: Number of topics to extract ("auto" to determine automatically)
        
    Returns:
        Trained BERTopic model
    """
    print("Setting up the topic modeling pipeline...")
    # Create sentence transformer model for embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Smaller model for efficiency

    stop_words = set(stopwords.words('english'))
    additional_stopwords = {'rt', 'amp', 'twitter', 'tweet', 'tweets', 'like', 'just', 'still', 'got', 'one', 'he', 'she', 'they', 'you', 'i', 'we', 'im', 'hes', 'shes', 'its', 'rt', 'amp','re',  'hes', 'shes', 'theyre', 'im', 'dont', 'doesnt', 'would', 'could', 'like', 
    'know', 'say', 'said', 'one', 'look', 'looks', 'think', 'oh', 'yeah', 'is', 'are', 'and'
    'guy', 'gonna', 'thing', 'someone', 'people', 'get', 'got', 'thats', 'just',
    'really', 'even', 'still', 'want', 'need', 'needs','you','your', 'it','this','that','your','youre','not'}
    stop_words = stop_words.union(additional_stopwords)
    vectorizer_model = TfidfVectorizer(stop_words=list(stop_words))
    config = load_bertopic_config()
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        nr_topics=config.get("nr_topics"),
        min_topic_size=config.get("min_topic_size"),
        calculate_probabilities=config.get("calculate_probabilities"),
        verbose=config.get("verbose")
    )
    
    print("Training BERTopic model...")
    topics, probs = topic_model.fit_transform(tweets)
    topic_model.update_topics(
        tweets,
        topics=topics,
        vectorizer_model=vectorizer_model
    )
    print(f"Model training complete. Found {len(topic_model.get_topic_info())-1} topics.")
    return topic_model, topics

In [11]:
# Evaluation and Logging 
def evaluate_model(topic_model, tweets):
    tokenized_texts = [preprocess_tweets(doc).split() for doc in tweets]
    dictionary = Dictionary(tokenized_texts)
    topics = [[term for term, _ in topic_model.get_topic(topic_id)[:10]] 
              for topic_id in topic_model.get_topics() if topic_id != -1]
    coherence_model = CoherenceModel(
        topics=topics,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_score = coherence_model.get_coherence()
    all_topic_words = [word for topic in topics for word in topic]
    unique_topic_words = set(all_topic_words)
    diversity_score = len(unique_topic_words) / len(all_topic_words)
    result_text = f"Coherence Score: {coherence_score:.4f}\nDiversity Score: {diversity_score:.4f}"
    print(result_text)
    with open(os.path.join(OUTPUT_DIR, "evaluation_scores.txt"), "w") as f:
        f.write(result_text)
    return coherence_score, diversity_score

In [12]:
def plot_coherence_vs_min_topic_size(tweets):
    min_topic_sizes = list(range(70, 101, 5))
    coherence_scores = []
    texts = [tweet.split() for tweet in tweets]  # Pre-cleaned texts
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    for min_size in min_topic_sizes:
        print(f"Training model with min_topic_size={min_size}...")
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        stop_words = set(stopwords.words('english'))
        additional_stopwords = {'rt', 'amp', 'twitter', 'tweet', 'tweets', 'like', 'just', 'still', 'got', 'one'}
        stop_words.update(additional_stopwords)
        vectorizer_model = CountVectorizer(stop_words=list(stop_words))

        topic_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer_model,
            nr_topics="auto",
            min_topic_size=min_size
        )
        topics, _ = topic_model.fit_transform(tweets)
        new_topics = topic_model.reduce_outliers(tweets, topics, strategy="embeddings")
        topic_model.update_topics(tweets, topics=new_topics)
        topics, _ = topic_model.transform(tweets)

        topic_words = [[word for word, _ in topic_model.get_topic(topic)] for topic in topic_model.get_topics() if topic != -1]
        coherence_model = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        coherence_scores.append(coherence_score)

    fig = plt.figure(figsize=(10, 6))
    plt.plot(min_topic_sizes, coherence_scores, marker='o')
    plt.title("Coherence Score vs. min_topic_size")
    plt.xlabel("min_topic_size")
    plt.ylabel("Coherence Score (c_v)")
    plt.grid(True)
    save_figure(fig, "coherence_vs_min_topic_size")

In [13]:
# Analyzing Topics
# Get topic information
def analyze_and_display_topics(topic_model):
    topic_info = topic_model.get_topic_info()
    output_path = os.path.join(OUTPUT_DIR, "topic_distribution.png")
    
    fig = plt.figure(figsize=(12, 6))
    sizes = topic_info[topic_info['Topic'] != -1]['Count'].values
    plt.hist(sizes, bins=30)
    plt.xlabel('Topic Size (Number of Tweets)')
    plt.ylabel('Number of Topics')
    plt.title('Distribution of Topic Sizes')
    plt.tight_layout()
    save_figure(fig, "topic_distribution")
    
    # Get the top topics (excluding the -1 outlier topic)
    top_topics = topic_info[topic_info['Topic'] != -1].head(10)['Topic'].values
    
    output_text_path = os.path.join(OUTPUT_DIR, "top_terms.txt")
    with open(output_text_path, "w") as f:
        for topic in top_topics:
            f.write(f"\nTopic {topic}:\n")
            for term, weight in topic_model.get_topic(topic)[:10]:
                f.write(f"  {term}: {weight:.4f}\n")
    
    output_similarity_path = os.path.join(OUTPUT_DIR, "topic_similarity.html")
    try:
        fig = topic_model.visualize_topics()
        fig.write_html(output_similarity_path)
    except Exception as e:
        print(f"Could not generate interactive topic similarity: {e}")
        
    fig, axes = plt.subplots(1, 5, figsize=(20, 4))
    for i, topic in enumerate(top_topics[:5]):
        try:
            word_freq = dict(topic_model.get_topic(topic)[:30])
            wc = WordCloud(width=400, height=400, background_color='white').generate_from_frequencies(word_freq)
            axes[i].imshow(wc, interpolation='bilinear')
            axes[i].set_title(f"Topic {topic}", fontsize=12)
            axes[i].axis('off')
        except Exception as e:
            print(f"Could not generate wordcloud for topic {topic}: {e}")
    plt.tight_layout()
    output_wordcloud_path = os.path.join(OUTPUT_DIR, "wordclouds_top5_topics.png")
    plt.savefig(output_wordcloud_path, bbox_inches='tight')
    plt.close()
    print(f"Saved combined word cloud image to {output_wordcloud_path}")

    print("\nGenerating topic hierarchy visualization...")
    try:
        fig = topic_model.visualize_hierarchy()
        save_figure(fig, "topic_hierarchy")
    except Exception as e:
        print(f"Could not generate topic hierarchy visualization: {e}")

In [14]:
def visualize_topics_over_time(topic_model, tweets, topics, timestamps):
    print("\nGenerating Topics Over Time visualization...")
    if isinstance(topics, np.ndarray):
        topics = topics.tolist()
    timestamps = pd.to_datetime(timestamps, errors='coerce')  # Already done earlier
    valid_mask = ~pd.isnull(timestamps)
    tweets = [tweet for i, tweet in enumerate(tweets) if valid_mask[i]]
    topics = [topic for i, topic in enumerate(topics) if valid_mask[i]]
    timestamps = timestamps[valid_mask]

    topics_over_time = topic_model.topics_over_time(
        docs=tweets,
        topics=topics,
        timestamps=timestamps,
        nr_bins=20,
        global_tuning=False
    )
    fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=None)
    output_path = os.path.join(OUTPUT_DIR, "topics_over_time.html")
    fig.write_html(output_path)

In [15]:
def plot_stacked_topic_trends(tweets, timestamps, topics, num_total_topics, time_freq='M'):
    if num_total_topics is None:
        raise ValueError("num_total_topics must be provided.")

    df = pd.DataFrame({
        'timestamp': pd.to_datetime(timestamps, errors='coerce'),
        'topic': topics
    }).dropna()

    if time_freq == 'M':
        df['time_period'] = df['timestamp'].dt.to_period('M').dt.to_timestamp()
    elif time_freq == 'Y':
        df['time_period'] = df['timestamp'].dt.to_period('Y').dt.to_timestamp()
    elif time_freq == 'W':
        df['time_period'] = df['timestamp'].dt.to_period('W').dt.to_timestamp()
    elif time_freq == 'Q':
        df['time_period'] = df['timestamp'].dt.to_period('Q').dt.to_timestamp()
    else:
        raise ValueError(f"Unsupported time_freq: {time_freq}")

    df['topic'] = df['topic'].astype(int)
    topic_dummies = pd.get_dummies(df['topic'], prefix='Topic')

    all_topic_cols = [f'Topic_{i}' for i in range(num_total_topics)]
    for col in all_topic_cols:
        if col not in topic_dummies:
            topic_dummies[col] = 0
    topic_dummies = topic_dummies[all_topic_cols]

    grouped = pd.concat([df['time_period'], topic_dummies], axis=1).groupby('time_period').sum()

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(14, 8))

    try:
        colors = plt.cm.get_cmap('tab20', len(grouped.columns))(np.linspace(0, 1, len(grouped.columns)))
        grouped.plot(kind='area', stacked=True, ax=ax, alpha=0.85, color=colors, linewidth=0.5)
    except Exception as e:
        print(f"Could not generate colormap: {e}. Using default.")
        grouped.plot(kind='area', stacked=True, ax=ax, alpha=0.85, linewidth=0.5)

    ax.set_title("Topic Trends Over Time (Stacked Area)", fontsize=16)
    ax.set_xlabel(f"Time Period ({time_freq})", fontsize=14)
    ax.set_ylabel("Number of Tweets per Topic", fontsize=14)

    # 8. הוספת מקרא בתיבה חיצונית
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(
        handles,
        labels,
        title='Topics',
        bbox_to_anchor=(1.02, 1),
        loc='upper left',
        fontsize=10
    )
    plt.tight_layout(rect=[0, 0, 0.88, 1])

    plt.grid(True, which='major', linestyle='--', linewidth=0.5, axis='y')
    plt.grid(False, which='major', linestyle='--', linewidth=0.5, axis='x')
    save_figure(fig, f"stacked_area_topics_over_time_{time_freq}")
    print(f"Stacked area chart saved to: stacked_area_topics_over_time_{time_freq}.png")

In [16]:
# Export Results
# Save model
def export_topic_model_results(topic_model):
    topic_info = topic_model.get_topic_info()
    topic_model.save(os.path.join(OUTPUT_DIR, "tweet_bertopic_model"))
    
    # Export topic information to CSV
    topic_info.to_csv(os.path.join(OUTPUT_DIR, "tweet_topics_info.csv"), index=False)
    
    # Export top terms for all topics
    all_topics_terms = {}
    for topic in topic_info['Topic'].values:
        if topic != -1:  # Skip outlier topic
            all_topics_terms[topic] = topic_model.get_topic(topic)
    
    # Convert to DataFrame and save
    topics_df = pd.DataFrame({
        'Topic': [topic for topic in all_topics_terms.keys() for _ in range(10)],
        'Term': [term for terms in all_topics_terms.values() for term, _ in terms[:10]],
        'Weight': [weight for terms in all_topics_terms.values() for _, weight in terms[:10]]
    })
    topics_df.to_csv(os.path.join(OUTPUT_DIR, "tweet_topic_terms.csv"), index=False)

In [17]:
def clean_params(params_dict, keys_to_keep=None):
    if keys_to_keep is None:
        keys_to_keep = ['nr_topics', 'min_topic_size', 'diversity']
    return {k: v for k, v in params_dict.items() if k in keys_to_keep}

def log_bertopic_run(model, dataset_name, coherence_score, diversity_score, log_path='topic_modeling_log.csv'):
    """
    Log the BERTopic run into a CSV file, including coherence and diversity metrics.
    
    Args:
        model (BERTopic): The BERTopic model instance.
        dataset_name (str): Name of the dataset (for identification purposes).
        coherence_score (float): c_v coherence score.
        diversity_score (float): Word diversity score.
        log_path (str): Path to the CSV log file.
    """
    n_topics = len(set(model.topics_)) - (1 if -1 in model.topics_ else 0)
    top_topics = model.get_topic_info()['Name'].head(5).tolist()
    params_dict = model.get_params()
    params_dict = model.get_params()
    cleaned_params = clean_params(params_dict)
    run_data = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'dataset_name': dataset_name,
        'model_name': 'BERTopic',
        'parameters': str(cleaned_params),
        'n_topics': n_topics,
        'top_5_topics': ' | '.join(top_topics),
        'coherence': round(coherence_score, 4),
        'diversity': round(diversity_score, 4)
    }

    df_run = pd.DataFrame([run_data])
    full_log_path = os.path.join(OUTPUT_DIR, log_path)
    if os.path.exists(full_log_path):
        df_run.to_csv(full_log_path, mode='a', header=False, index=False)
    else:
        df_run.to_csv(full_log_path, mode='w', header=True, index=False)

In [18]:
def plot_topic_distribution_by_label(db_path, column_name, topics_file='bertopic_output/tweet_topic_mapping.csv', output_path='bertopic_output/topics_label_distribution.png'):
    """
    Creates a bar plot showing topic distribution per label (normalized by each label's total tweets).
    This answers: "How does each group (label) distribute its tweets across topics?"
    """
    # Connect to the database
    conn = sqlite3.connect(db_path)

    # Retrieve data from the authors and posts tables
    query = """
    SELECT p.author, a.label, p.content
    FROM posts p
    JOIN authors a ON p.author = a.author_screen_name
    """
    posts_df = pd.read_sql_query(query, conn)

    # Clean the content field to ensure consistency
    posts_df['content'] = posts_df['content'].str.strip().str.lower()

    # Read full tweet-topic mapping from CSV file
    topics_df = pd.read_csv(topics_file)
    topics_df['Document'] = topics_df['Document'].str.strip().str.lower()

    # Merge the tweet-topic mapping with author data
    merged_df = pd.merge(posts_df, topics_df, left_on='content', right_on='Document', how='inner')

    # Group by label and topic to count tweets
    grouped_df = merged_df.groupby(['label', column_name]).size().reset_index(name='Count')

    # Pivot and normalize by each label's total tweets
    group_counts_by_label = grouped_df.pivot(index='label', columns=column_name, values='Count').fillna(0)
    normalized_counts = group_counts_by_label.div(group_counts_by_label.sum(axis=1), axis=0)

    # Sort topics by total count across all labels
    topic_totals = group_counts_by_label.sum().sort_values(ascending=False)
    sorted_columns = topic_totals.index
    normalized_counts = normalized_counts[sorted_columns]

    # Plot
    plt.figure(figsize=(18, 10))
    colors = plt.cm.viridis(np.linspace(0, 0.9, len(normalized_counts.index)))
    ax = normalized_counts.T.plot(kind='bar', figsize=(18, 10), color=colors, width=0.8)

    plt.xlabel("Topics", fontsize=14, fontweight='bold')
    plt.ylabel("Proportion of Tweets within Label", fontsize=14, fontweight='bold')
    plt.title("Distribution of Tweets by Topic (Normalized by Label)", fontsize=16, fontweight='bold')
    plt.legend(title="Labels", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Save and show
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"Normalized label-topic plot saved to: {output_path}")

In [19]:
def plot_sunburst_for_top_topics_bertopic(
    db_path,
    topic_model,
    column_name,
    topics_file='bertopic_output/tweet_topic_mapping.csv',
    topic_info_file='bertopic_output/tweet_topics_info.csv',
    output_path='bertopic_output/sunburst_chart.png',
    top_n=6
):
    def compute_participation_by_authors(df, topic_id, label1, label2):
        g1_total = df[df['label'] == label1]['author'].nunique()
        g2_total = df[df['label'] == label2]['author'].nunique()
        
        g1_yes = df[(df['label'] == label1) & (df[column_name] == topic_id)]['author'].nunique()
        g2_yes = df[(df['label'] == label2) & (df[column_name] == topic_id)]['author'].nunique()
        
        g1_no = g1_total - g1_yes
        g2_no = g2_total - g2_yes
        return g1_yes, g1_no, g2_yes, g2_no


    def draw_sunburst_donut(ax, g1_yes, g1_no, g2_yes, g2_no, top_words, title, g1_name, g2_name):
        total_g1 = g1_yes + g1_no
        total_g2 = g2_yes + g2_no
        g1_yes_angle = g1_yes / total_g1 * 360 if total_g1 > 0 else 0
        g1_no_angle  = 360 - g1_yes_angle
        g2_yes_angle = g2_yes / total_g2 * 360 if total_g2 > 0 else 0
        g2_no_angle  = 360 - g2_yes_angle

        start = 0
        for angle, color in zip([g1_yes_angle, g1_no_angle], ['#1f77b4', '#2ca02c']):
            ax.add_patch(Wedge((0, 0), 1.0, start, start + angle, width=0.3, color=color))
            start += angle

        start = 0
        for angle, color in zip([g2_yes_angle, g2_no_angle], ['#9467bd', '#d62728']):
            ax.add_patch(Wedge((0, 0), 1.3, start, start + angle, width=0.3, color=color))
            start += angle

        wc = WordCloud(width=300, height=300, background_color='white').generate(top_words)
        ax.imshow(wc, extent=[-0.5, 0.5, -0.5, 0.5], zorder=10)
        ax.text(0, 1.4, title, ha='center', va='bottom', fontsize=10)
        ax.set_xlim(-1.5, 1.5)
        ax.set_ylim(-1.5, 1.5)
        ax.set_aspect('equal')
        ax.axis('off')

    # Load and merge data
    conn = sqlite3.connect(db_path)
    posts_df = pd.read_sql_query("""
        SELECT p.author, a.label, p.content
        FROM posts p
        JOIN authors a ON p.author = a.author_screen_name
    """, conn)
    conn.close()

    topics_df = pd.read_csv(topics_file)
    print(f"✅ posts_df shape: {posts_df.shape}")
    print(f"✅ topics_df shape: {topics_df.shape}")
    print(f"✅ sample posts_df:\n{posts_df.head(2)}")
    print(f"✅ sample topics_df:\n{topics_df.head(2)}")

    merged_df = pd.merge(posts_df, topics_df, left_on='content', right_on='Document', how='inner')
    print(f"🧩 merged_df shape: {merged_df.shape}")
    print(f"🧩 sample merged_df:\n{merged_df[['content','Document',column_name]].head(2)}")
    merged_df['label'] = merged_df['label'].astype(str).str.strip()
    merged_df = merged_df[(merged_df['label'] != '') & (merged_df['label'].str.lower() != 'none')]
    merged_df[column_name] = merged_df[column_name].astype(str).str.extract(r'^(\d+)')[0].astype('Int64')
    merged_df = merged_df[merged_df[column_name].notna()]
    print(f"🔍 cleaned merged_df shape: {merged_df.shape}")
    print(f"🔍 label counts:\n{merged_df['label'].value_counts()}")
    print(f"🔍 topic counts (after cleaning):\n{merged_df[column_name].value_counts().head()}")


    top_labels = merged_df['label'].value_counts().nlargest(2).index.tolist()
    print(f"🏷️ top labels: {top_labels}")
    if len(top_labels) < 2:
        print("Not enough distinct labels to compare.")
        return
    label1, label2 = top_labels

    # חישוב נושאים עם השונות הכי גדולה
    topic_scores = []
    for topic_id in merged_df[column_name].unique():
        g1_yes, g1_no, g2_yes, g2_no = compute_participation_by_authors(merged_df, topic_id, label1, label2)
        p1 = g1_yes / (g1_yes + g1_no + 1)
        p2 = g2_yes / (g2_yes + g2_no + 1)
        score = abs(p1 - p2)
        topic_scores.append((topic_id, score))

    top_topics = sorted(topic_scores, key=lambda x: x[1], reverse=True)[:top_n]
    top_topic_ids = [t[0] for t in top_topics]

    # ציור
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()
    for i, topic_id in enumerate(top_topic_ids):
        g1_yes, g1_no, g2_yes, g2_no = compute_participation_by_authors(merged_df, topic_id, label1, label2)

        print(f"Topic {topic_id}:")
        print(f"  {label1}: YES={g1_yes}, NO={g1_no}")
        print(f"  {label2}: YES={g2_yes}, NO={g2_no}")

        try:
            topic_words = topic_model.get_topic(topic_id)
            top_words = ', '.join([word for word, _ in topic_words[:10]]) if topic_words else "No valid words"
        except Exception:
            top_words = "Error loading words"

        draw_sunburst_donut(axes[i], g1_yes, g1_no, g2_yes, g2_no, top_words, f"Topic {topic_id}", label1, label2)

    # מקרא
    legend_ax = fig.add_axes([0.1, 0.05, 0.8, 0.05])
    legend_ax.set_axis_off()
    items = [
        (0.1, '#1f77b4', f'{label1} Participated'),
        (0.3, '#2ca02c', f'{label1} Not Participated'),
        (0.5, '#9467bd', f'{label2} Participated'),
        (0.7, '#d62728', f'{label2} Not Participated'),
    ]
    for pos, color, label in items:
        legend_ax.add_patch(Rectangle((pos, 0.2), 0.03, 0.6, color=color))
        legend_ax.text(pos + 0.04, 0.5, label, va='center', fontsize=10)

    plt.tight_layout(rect=[0, 0.1, 1, 1])
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path, bbox_inches='tight')
    plt.close()
    print(f"\n✅ Sunburst donut chart saved to: {output_path}")

In [20]:
def should_merge_topics(topic1, topic2):
    system_prompt = """
You are an expert in topic modeling and keyword clustering.

Your task is to decide whether the following two topic names refer to the **same core topic** or represent **closely related concepts**.

Please respond with **"yes"** if the two topics should be merged — for example, if they are:
- synonyms,
- strongly related,
- overlapping in meaning,
- or commonly used to describe the same type of content.

Respond with **"no"** if they describe clearly **different topics** or represent **distinct themes**.

Your answer should be **only** "yes" or "no" — no explanations or extra text.
"""
    user_prompt = f"""Topic 1: {topic1}
Topic 2: {topic2}

Do these two topics represent the same topic and should be merged?"""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            top_p=1.0
        )
        answer = response.choices[0].message.content.strip().lower()
        return answer == "yes"
    except Exception as e:
        print(f"Error with pair ({topic1}, {topic2}): {e}")
        return False

def generate_merge_topics(topic_pairs):
    merged_groups = []
    
    for topic1, topic2 in tqdm(topic_pairs, desc="Checking topic pairs"):
        if should_merge_topics(topic1, topic2):
            # Merge into existing group if present
            found = False
            for group in merged_groups:
                if topic1 in group or topic2 in group:
                    group.update([topic1, topic2])
                    found = True
                    break
            if not found:
                merged_groups.append(set([topic1, topic2]))
    return merged_groups

def generate_group_name_from_topics(topics):
    user_prompt = (
        "You are an expert in topic modeling.\n"
        "Given the following topic names:\n" +
        '\n'.join(f"- {t}" for t in topics) +
        "\nSuggest a concise 2–3 word name summarizing the shared theme.\n"
        "Output ONLY the name — no punctuation or explanation."
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a topic naming assistant."},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.3,
        top_p=1.0
    )

    return response.choices[0].message.content.strip()

def extract_topic_pairs_from_csv(file_path, topic_name_column='Name'):
    """
    Reads a CSV file and generates all unique topic pairs from a column containing topic names.
    Returns a list of (topic1, topic2) tuples.
    """
    df = pd.read_csv(file_path)
    
    if topic_name_column not in df.columns:
        raise ValueError(f"Column '{topic_name_column}' not found in CSV.")

    df['clean_topic_name'] = df[topic_name_column].astype(str).str.replace(r'^\d+_', '', regex=True)
    
    topic_names = df['clean_topic_name'].dropna().unique().tolist()
    topic_pairs = list(combinations(topic_names, 2))

    return topic_pairs

def assign_merged_topic_names(merged_groups):
    """
    Assigns LLM-generated names to merged topic groups.
    Returns:
        - topic_to_group_name: {original_topic_name: descriptive_name}
    """
    topic_to_group_name = {}

    for idx, group in enumerate(merged_groups):
        group_id = f"Group_{idx+1}"
        group = sorted(group)
        try:
            name = generate_group_name_from_topics(group)
        except Exception as e:
            print(f"Error naming group {group_id}: {e}")
            name = group_id

        for topic in group:
            topic_to_group_name[topic] = name

    return topic_to_group_name

def apply_merged_names_to_tweets(mapping_file_path, topic_to_group_name, output_path):
    """
    Loads a tweet-topic mapping file, replaces the topic names with merged group names, and saves to new CSV.
    """
    df = pd.read_csv(mapping_file_path)
    df['clean_topic'] = df['Topic'].astype(str).str.replace(r'^\d+_', '', regex=True)
    df['merged_topic'] = df['clean_topic'].map(topic_to_group_name)
    df.to_csv(output_path, index=False)
    print(f"Saved updated tweet-topic mapping with merged names to: {output_path}")

def merge_topics(file_path):
    topic_pairs = extract_topic_pairs_from_csv(file_path)
    merged_groups = generate_merge_topics(topic_pairs)
    topic_to_group_name = assign_merged_topic_names(merged_groups)
    apply_merged_names_to_tweets('bertopic_output/tweet_topic_mapping.csv', topic_to_group_name, 'bertopic_output/tweet_merged_topic_mapping.csv')

In [21]:
def main():
    config = load_bertopic_config()
    conn = connect_to_db(config.get("db_path"))
    tweets, timestamps = load_tweets_in_batches(conn, batch_size=10000)
    # , max_tweets=100000

    if len(tweets) > 200000:
        print("Large dataset detected. Splitting into batches and training multiple BERTopic models...")
        topic_model = split_and_train_multiple_models(tweets)
        topics, _ = topic_model.transform(tweets)
        new_topics = topic_model.reduce_outliers(tweets, topics, strategy="embeddings")
        topic_model.update_topics(tweets, topics=new_topics)
        topics, _ = topic_model.transform(tweets)
        # save_topic_mapping(tweets, topics)
        save_topic_mapping(topic_model, tweets)
    else:
        topic_model, topics = create_topic_model(tweets)
        new_topics = topic_model.reduce_outliers(tweets, topics, strategy="embeddings")
        topic_model.update_topics(tweets, topics=new_topics)
        topics, _ = topic_model.transform(tweets)
        # save_topic_mapping(tweets, topics)
        save_topic_mapping(topic_model, tweets)
    print(type(topics), len(topics))
    print(topics[:5])

    #-------------------------------------------------#
    # topic_model, topics = create_topic_model(tweets)
    # new_topics = topic_model.reduce_outliers(tweets, topics, strategy="embeddings")
    # topic_model.update_topics(tweets, topics=new_topics)
    # topics, _ = topic_model.transform(tweets)
    #-------------------------------------------------#

    analyze_and_display_topics(topic_model)
    visualize_topics_over_time(topic_model, tweets, topics, timestamps)
    export_topic_model_results(topic_model)
    coherence_score, diversity_score = evaluate_model(topic_model, tweets)    
    # plot_coherence_vs_min_topic_size(tweets)
    log_bertopic_run(
        model=topic_model,
        dataset_name=config.get("db_path"),
        coherence_score=coherence_score,
        diversity_score=diversity_score
    )
    n_topics = len(set(topic_model.topics_)) - (1 if -1 in topic_model.topics_ else 0)
    plot_stacked_topic_trends(tweets, timestamps, topics, n_topics)
    merge_topics("bertopic_output/tweet_topics_info.csv")
    plot_topic_distribution_by_label(db_path=config.get("db_path"), column_name='Topic', topics_file='bertopic_output/tweet_topic_mapping.csv', output_path='bertopic_output/topics_label_distribution.png')
    plot_sunburst_for_top_topics_bertopic(
        db_path=config.get("db_path"),
        topic_model=topic_model,
        column_name='Topic',
        topics_file='bertopic_output/tweet_topic_mapping.csv',
        topic_info_file='bertopic_output/tweet_topics_info.csv',
        output_path='bertopic_output/sunburst_chart.png'
    )
    plot_topic_distribution_by_label(db_path=config.get("db_path"), column_name='merged_topic', topics_file='bertopic_output/tweet_merged_topic_mapping.csv', output_path='bertopic_output/merged_topics_label_distribution.png')
    plot_sunburst_for_top_topics_bertopic(
        db_path=config.get("db_path"),
        topic_model=topic_model,
        column_name='merged_topic',
        topics_file='bertopic_output/tweet_merged_topic_mapping.csv',
        topic_info_file='bertopic_output/tweet_topics_info.csv',
        output_path='bertopic_output/sunburst_chart_merged.png'
    )
    #-------------------------------------------------#

    conn.close()


In [22]:
if __name__ == "__main__":
    main()

Successfully connected to database: tweets_db.db
Loading up to 356,985 tweets in batches of 10,000


  0%|          | 0/356985 [00:00<?, ?it/s]

Loaded 250,450 tweets after preprocessing and filtering
Large dataset detected. Splitting into batches and training multiple BERTopic models...
Splitting data into multiple chunks and training BERTopic models...


2025-05-22 13:07:08,471 - BERTopic - Embedding - Transforming documents to embeddings.


Training sub-model on chunk 1...


Batches:   0%|          | 0/1566 [00:00<?, ?it/s]

2025-05-22 13:07:19,744 - BERTopic - Embedding - Completed ✓
2025-05-22 13:07:19,745 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 13:09:51,381 - BERTopic - Dimensionality - Completed ✓
2025-05-22 13:09:51,384 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 13:09:55,203 - BERTopic - Cluster - Completed ✓
2025-05-22 13:09:55,204 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-22 13:09:55,927 - BERTopic - Representation - Completed ✓
2025-05-22 13:09:55,929 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 13:09:55,962 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 13:09:56,512 - BERTopic - Representation - Completed ✓
2025-05-22 13:09:56,519 - BERTopic - Topic reduction - Reduced number of topics from 75 to 44
2025-05-22 13:09:56,803 - BERTopic - Embedding - Transforming documents to embeddings.


Training sub-model on chunk 3...


Batches:   0%|          | 0/1566 [00:00<?, ?it/s]

2025-05-22 13:10:08,081 - BERTopic - Embedding - Completed ✓
2025-05-22 13:10:08,082 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 13:11:01,448 - BERTopic - Dimensionality - Completed ✓
2025-05-22 13:11:01,450 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 13:11:05,447 - BERTopic - Cluster - Completed ✓
2025-05-22 13:11:05,448 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-22 13:11:06,010 - BERTopic - Representation - Completed ✓
2025-05-22 13:11:06,012 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 13:11:06,032 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 13:11:06,537 - BERTopic - Representation - Completed ✓
2025-05-22 13:11:06,543 - BERTopic - Topic reduction - Reduced number of topics from 4 to 4
2025-05-22 13:11:06,601 - BERTopic - Embedding - Transforming documents to embeddings.


Training sub-model on chunk 4...


Batches:   0%|          | 0/1566 [00:00<?, ?it/s]

2025-05-22 13:11:17,048 - BERTopic - Embedding - Completed ✓
2025-05-22 13:11:17,049 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 13:12:12,585 - BERTopic - Dimensionality - Completed ✓
2025-05-22 13:12:12,586 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 13:12:16,573 - BERTopic - Cluster - Completed ✓
2025-05-22 13:12:16,573 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-22 13:12:17,138 - BERTopic - Representation - Completed ✓
2025-05-22 13:12:17,140 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 13:12:17,159 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 13:12:17,670 - BERTopic - Representation - Completed ✓
2025-05-22 13:12:17,676 - BERTopic - Topic reduction - Reduced number of topics from 6 to 6
2025-05-22 13:12:17,743 - BERTopic - Embedding - Transforming documents to embeddings.


Training sub-model on chunk 5...


Batches:   0%|          | 0/1566 [00:00<?, ?it/s]

2025-05-22 13:12:28,211 - BERTopic - Embedding - Completed ✓
2025-05-22 13:12:28,212 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-22 13:13:22,254 - BERTopic - Dimensionality - Completed ✓
2025-05-22 13:13:22,256 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-22 13:13:26,428 - BERTopic - Cluster - Completed ✓
2025-05-22 13:13:26,429 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-22 13:13:27,109 - BERTopic - Representation - Completed ✓
2025-05-22 13:13:27,111 - BERTopic - Topic reduction - Reducing number of topics
2025-05-22 13:13:27,147 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-22 13:13:27,695 - BERTopic - Representation - Completed ✓
2025-05-22 13:13:27,702 - BERTopic - Topic reduction - Reduced number of topics from 88 to 40


Merging 5 topic models into one final model...


Batches:   0%|          | 0/7827 [00:00<?, ?it/s]

2025-05-22 13:14:21,433 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/7827 [00:00<?, ?it/s]

2025-05-22 13:15:18,188 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/7827 [00:00<?, ?it/s]

2025-05-22 13:16:12,358 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Tweet-topic mapping saved to bertopic_output/tweet_topic_mapping.csv
<class 'numpy.ndarray'> 250450
[37 52 48  4  4]
Saved combined word cloud image to bertopic_output/wordclouds_top5_topics.png

Generating topic hierarchy visualization...

Generating Topics Over Time visualization...


20it [00:19,  1.04it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to 

Coherence Score: 0.6554
Diversity Score: 0.8881
Stacked area chart saved to: stacked_area_topics_over_time_M.png


Checking topic pairs:   0%|          | 0/2211 [00:00<?, ?it/s]

Error with pair (children_kids_child_parents, pio_training_chief_public): Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error with pair (children_kids_child_parents, uzakehir_cihal_alya_cihan): Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error with pair (children_kids_child_parents, et_de_le_les): Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: 

TypeError: no numeric data to plot

<Figure size 1800x1000 with 0 Axes>

<Figure size 1800x1000 with 0 Axes>