In [1]:
# Topic Modeling with LDA - Tweet Analysis Notebook
# !pip install gensim pyLDAvis wordcloud tqdm pysentimiento nltk pandas numpy matplotlib seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
import logging
from wordcloud import WordCloud
import os
import matplotlib.dates as mdates
import matplotlib as mpl
import seaborn as sns
import matplotlib.cm as cm
from matplotlib.cm import get_cmap
import matplotlib.dates as mdates
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import plotly.graph_objs as go
import plotly.io as pio

output_dir = "lda_output"
os.makedirs(output_dir, exist_ok=True)

try:
    from pysentimiento.preprocessing import preprocess_tweet
except ImportError:
    print("pysentimiento not installed. Using basic tweet preprocessing.")
    def preprocess_tweet(text):
        """Basic tweet preprocessing if pysentimiento is not available"""
        if not isinstance(text, str):
            return ""
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)
        return text

# Try importing gensim and pyLDAvis, provide error messages if not available
try:
    from gensim import corpora, models
    from gensim.models import CoherenceModel
except ImportError:
    print("ERROR: gensim is not installed. Please install it with 'pip install gensim'")
    print("Topic modeling functionality will not work without gensim.")

try:
    import pyLDAvis
    import pyLDAvis.gensim_models
except ImportError:
    print("WARNING: pyLDAvis is not installed. Visualization will not be available.")
    print("Install it with 'pip install pyLDAvis'")


# Download necessary NLTK data
import nltk
try:
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    print("NLTK resources downloaded successfully.")
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")
    print("If you encounter NLTK resource errors, run these commands in a separate cell:")
    print("import nltk")
    print("nltk.download('punkt')")
    print("nltk.download('stopwords')")
    print("nltk.download('wordnet')")


# Set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger('gensim').setLevel(logging.ERROR)

NLTK resources downloaded successfully.


[nltk_data] Downloading package punkt to /home/shayher/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shayher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/shayher/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
!uname -a

Linux dt-3090-01 5.14.0-503.15.1.el9_5.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Nov 26 17:24:29 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux


In [3]:
import yaml

def load_config(config_path="LDA_config.yaml"):
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config

In [4]:
# Connect to the database and load data
def load_data(db_path, table_name='posts', content_column='content', timestamp_column='date', limit=None):
    """
    Load data from SQLite database
    
    Parameters:
    -----------
    db_path : str
        Path to the SQLite database
    table_name : str
        Name of the table to query
    content_column : str
        Name of the column containing text data
    limit : int, optional
        Limit the number of rows to fetch
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing the data
    """
    conn = sqlite3.connect(db_path)
    
    if limit:
        query = f"SELECT * FROM {table_name} LIMIT {limit}"
    else:
        query = f"SELECT * FROM {table_name}"
    
    df = pd.read_sql_query(query, conn)
    conn.close()
    
    # Check if the content column exists
    if content_column not in df.columns:
        raise ValueError(f"Column '{content_column}' not found in the data")
    
    # Remove rows with empty content
    df = df[df[content_column].notna()]

    if timestamp_column in df.columns:
        df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce', infer_datetime_format=True)

    
    print(f"Loaded {len(df)} records from {table_name}")
    
    return df

In [5]:
# Text preprocessing functions
def clean_text(text, is_tweet=True):
    """
    Clean and preprocess text data
    
    Parameters:
    -----------
    text : str
        Text to preprocess
    is_tweet : bool
        Whether the text is a tweet (uses pysentimiento preprocessing)
    
    Returns:
    --------
    str
        Preprocessed text
    """
    if not isinstance(text, str):
        return ""
    
    if is_tweet:
        text = preprocess_tweet(text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    return text

In [6]:
def tokenize(text):
    """
    Tokenize text into words
    
    Parameters:
    -----------
    text : str
        Text to tokenize
    
    Returns:
    --------
    list
        List of tokens
    """
    try:
        return word_tokenize(text)
    except LookupError:
        
        # Simple split by spaces as fallback
        return text.split()

In [7]:
def remove_stopwords(tokens, custom_stopwords=None):
    """
    Remove stopwords from a list of tokens
    
    Parameters:
    -----------
    tokens : list
        List of tokens
    custom_stopwords : list, optional
        List of additional stopwords to remove
    
    Returns:
    --------
    list
        List of tokens with stopwords removed
    """
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        # Fallback if NLTK stopwords are not available
        print("NLTK stopwords not available. Using a basic stopword list.")
        stop_words = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
                      'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 
                      'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 
                      'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
                      'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
                      'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
                      'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 
                      'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
                      'with', 'about', 'against', 'between', 'into', 'through', 'during', 
                      'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 
                      'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once'
                      'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 
                        'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 
                        'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don', 
                        'should', 'now', 'd', 'll', 'm', 'o', 're', 's', 't', 've', 'y', 'ain', 
                        'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 
                        'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn',
                        'rt', 'via', 'https', 'http', 'www', 'com', 'co', 'amp', 'u', 'ur',
                        'get', 'like', 'got', 'one', 'im', 'yeah', 'oh', 'lol', 'hey', 'ok', 'okay', 
                        'hi', 'ha', 'haha', 'yes', 'nope', 'thanks', 'thank', 'pls', 'please'}
    
    if custom_stopwords:
        stop_words.update(custom_stopwords)
    
    return [word for word in tokens if word not in stop_words and len(word) > 2]


In [8]:
def lemmatize(tokens):
    """
    Lemmatize tokens
    
    Parameters:
    -----------
    tokens : list
        List of tokens
    
    Returns:
    --------
    list
        List of lemmatized tokens
    """
    try:
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in tokens]
    except LookupError:        
        return tokens                                                    

In [9]:
def preprocess_documents(documents, is_tweet=True, custom_stopwords=None):
    """
    Preprocess a list of documents
    
    Parameters:
    -----------
    documents : list
        List of text documents
    is_tweet : bool
        Whether the documents are tweets
    custom_stopwords : list, optional
        List of additional stopwords to remove
    
    Returns:
    --------
    list
        List of preprocessed documents as lists of tokens
    """
    processed_docs = []
    
    # Disable tqdm completely to avoid errors
    print(f"Processing {len(documents)} documents...")
    
    for i, doc in enumerate(documents):
        # Print progress periodically
        if i % 1000 == 0 and i > 0:
            print(f"Processed {i}/{len(documents)} documents")
            
        clean_doc = clean_text(doc, is_tweet=is_tweet)
        tokens = tokenize(clean_doc)
        tokens = remove_stopwords(tokens, custom_stopwords)
        tokens = lemmatize(tokens)
        
        # Only add documents that have tokens left after preprocessing
        if tokens and len(tokens) > 3:
            processed_docs.append(tokens)
    
    print(f"Finished processing. {len(processed_docs)} documents retained after preprocessing.")
    return processed_docs

In [10]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

def add_bigrams_to_documents(processed_docs, min_count=5, threshold=100):
    """
    Add bigrams to preprocessed documents using gensim's Phrases.

    Parameters:
    -----------
    processed_docs : list of list of str
        Tokenized and cleaned documents
    min_count : int
        Minimum count of word pairs to be considered as bigram (e.g., 5)
    threshold : int
        Threshold for forming the phrases (higher = fewer phrases)

    Returns:
    --------
    list of list of str
        Documents with bigrams added (e.g., ['donald_trump', 'white_house'])
    """
    # Detect common bigrams across the corpus
    bigram_model = Phrases(processed_docs, min_count=min_count, threshold=threshold)
    bigram_phraser = Phraser(bigram_model)

    # Apply the bigram model to all documents
    bigrammed_docs = [bigram_phraser[doc] for doc in processed_docs]
    return bigrammed_docs

In [11]:
# Topic modeling functions
def create_dictionary_and_corpus(processed_docs, no_below=5, no_above=0.5):
    """
    Create a dictionary and corpus from preprocessed documents
    
    Parameters:
    -----------
    processed_docs : list
        List of preprocessed documents (lists of tokens)
    no_below : int
        Keep tokens that appear in at least this many documents
    no_above : float
        Keep tokens that appear in at most this fraction of documents
    
    Returns:
    --------
    tuple
        (dictionary, corpus)
    """
    # Create dictionary
    dictionary = corpora.Dictionary(processed_docs)
    
    # Filter extremes
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    
    # Create corpus
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    
    print(f"Dictionary size: {len(dictionary)}")
    print(f"Corpus size: {len(corpus)}")
    
    return dictionary, corpus

In [12]:
def train_lda_model(corpus, dictionary, num_topics=10, passes=25, iterations=50, alpha='auto', eta='auto'):
    """
    Trains a gensim Latent Dirichlet Allocation (LDA) model.

    Args:
        corpus (list): Document-term matrix in BoW format.
        dictionary (gensim.corpora.Dictionary): Gensim word-to-ID mapping.
        num_topics (int): Number of topics to find. Defaults to 10.
        passes (int): Number of training passes. Defaults to 15.
        iterations (int): Iterations per pass. Defaults to 50.
        alpha (str or float): Document-topic prior. Defaults to 'auto'.
        eta (str or float): Topic-word prior. Defaults to 'auto'.

    Returns:
        gensim.models.LdaModel: The trained LDA model object.
    """
    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        alpha=alpha,
        eta=eta,
        random_state=42,
        iterations=iterations
    )
    
    return lda_model

In [13]:
def compute_coherence_score(model, corpus, dictionary, processed_docs, coherence='c_v'):
    """
    Compute coherence score for an LDA model
    
    Parameters:
    -----------
    model : gensim.models.LdaModel
        Trained LDA model
    corpus : list
        Document-term matrix
    dictionary : gensim.corpora.Dictionary
        Dictionary mapping words to indices
    processed_docs : list
        List of preprocessed documents (lists of tokens)
    coherence : str
        Coherence measure to use
    
    Returns:
    --------
    float
        Coherence score
    """
    coherence_model = CoherenceModel(
        model=model,
        texts=processed_docs,
        dictionary=dictionary,
        coherence=coherence
    )
    
    return coherence_model.get_coherence()

In [14]:
def evaluate_topic_models(corpus, dictionary, processed_docs, min_topics=70, max_topics=100, step=5, iterations=50, alpha='auto', eta='auto'):
    """
    Evaluate LDA models with different numbers of topics
    
    Parameters:
    -----------
    corpus : list
        Document-term matrix
    dictionary : gensim.corpora.Dictionary
        Dictionary mapping words to indices
    processed_docs : list
        List of preprocessed documents (lists of tokens)
    min_topics : int
        Minimum number of topics to evaluate
    max_topics : int
        Maximum number of topics to evaluate
    step : int
        Step size for number of topics
    
    Returns:
    --------
    tuple
        (coherence_scores, perplexity_scores)
    """
    coherence_scores = []
    perplexity_scores = []
    
    for num_topics in range(min_topics, max_topics + 1, step):
        print(f"Evaluating model with {num_topics} topics...")
        
        lda_model = train_lda_model(corpus, dictionary, num_topics=num_topics, iterations=iterations, alpha=alpha, eta=eta)
        
        # Compute coherence score
        coherence_score = compute_coherence_score(lda_model, corpus, dictionary, processed_docs)
        coherence_scores.append(coherence_score)
        
        # Compute perplexity
        perplexity = lda_model.log_perplexity(corpus)
        perplexity_scores.append(perplexity)
        
        print(f"Coherence score: {coherence_score:.4f}")
        print(f"Perplexity: {perplexity:.4f}")
        print("-" * 50)
    
    return coherence_scores, perplexity_scores

In [15]:
def plot_topic_evaluation(topic_range, coherence_scores, perplexity_scores):
    """
    Plot coherence scores and perplexity scores for different numbers of topics
    
    Parameters:
    -----------
    topic_range : list
        List of numbers of topics
    coherence_scores : list
        List of coherence scores
    perplexity_scores : list
        List of perplexity scores
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot coherence scores
    ax1.plot(topic_range, coherence_scores, 'o-')
    ax1.set_xlabel('Number of Topics')
    ax1.set_ylabel('Coherence Score')
    ax1.set_title('Coherence Score by Number of Topics')
    ax1.grid(True)
    
    # Plot perplexity scores
    ax2.plot(topic_range, perplexity_scores, 'o-')
    ax2.set_xlabel('Number of Topics')
    ax2.set_ylabel('Perplexity Score')
    ax2.set_title('Perplexity Score by Number of Topics')
    ax2.grid(True)
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "topic_evaluation.png"), bbox_inches='tight')
    plt.close()

In [16]:
def create_topic_wordclouds(lda_model, num_topics):
    """
    Create word clouds for each topic
    
    Parameters:
    -----------
    lda_model : gensim.models.LdaModel
        Trained LDA model
    num_topics : int
        Number of topics
    """
    # Create subplots
    cols = 3
    rows = (num_topics + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(18, 4 * rows))
    
    # Flatten axes array if needed
    if rows > 1:
        axes = axes.flatten()
    elif cols > 1:
        axes = [axes]
    
    # Create word cloud for each topic
    for i, (topic_num, ax) in enumerate(zip(range(num_topics), axes)):
        topic_words = dict(lda_model.show_topic(topic_num, 30))
        
        # Create word cloud
        wordcloud = WordCloud(
            width=400,
            height=400,
            background_color='white',
            max_words=50,
            prefer_horizontal=1.0
        ).generate_from_frequencies(topic_words)
        
        # Display word cloud
        ax.imshow(wordcloud)
        ax.set_title(f'Topic {topic_num + 1}')
        ax.axis('off')
    
    # Hide any unused subplots
    for j in range(num_topics, len(axes)):
        fig.delaxes(axes[j])
    
    output_path = "lda_output/topic_wordclouds.png"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    plt.close()

In [17]:
def visualize_topics_pyldavis(lda_model, corpus, dictionary):
    """
    Visualize topics using pyLDAvis
    
    Parameters:
    -----------
    lda_model : gensim.models.LdaModel
        Trained LDA model
    corpus : list
        Document-term matrix
    dictionary : gensim.corpora.Dictionary
        Dictionary mapping words to indices
    """
    # Prepare visualization
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary, mds='mmds')
    return vis


In [18]:
def get_dominant_topic_per_document(lda_model, corpus):
    """
    Get the dominant topic for each document
    
    Parameters:
    -----------
    lda_model : gensim.models.LdaModel
        Trained LDA model
    corpus : list
        Document-term matrix
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame with dominant topic information
    """
    # Get topic distribution for each document
    topic_distributions = [lda_model.get_document_topics(bow) for bow in corpus]
    
    # Get dominant topics
    dominant_topics = []
    for i, dist in enumerate(topic_distributions):
        # Sort by topic probability
        sorted_topics = sorted(dist, key=lambda x: x[1], reverse=True)
        
        # Get top topic
        if sorted_topics:
            top_topic_id = sorted_topics[0][0]
            top_topic_prob = sorted_topics[0][1]
            
            # Get top words for this topic
            top_words = [word for word, _ in lda_model.show_topic(top_topic_id, 5)]
            top_words_str = ", ".join(top_words)
            
            dominant_topics.append({
                'document_id': i,
                'dominant_topic': top_topic_id,
                'topic_probability': top_topic_prob,
                'top_words': top_words_str
            })
        else:
            dominant_topics.append({
                'document_id': i,
                'dominant_topic': -1,
                'topic_probability': 0.0,
                'top_words': ""
            })
    
    return pd.DataFrame(dominant_topics)

In [19]:
def plot_stacked_topic_trends(
    original_df_with_timestamps,
    dominant_topics_info_df,
    timestamp_col='date',
    topic_assignment_col='dominant_topic',
    num_total_topics=None, 
    time_freq='M',
    figure_size=(14, 8),
    colormap_name='tab20' 
):
    """
    Generates and displays a stacked area chart of topic trends over time.

    Args:
        original_df_with_timestamps (pd.DataFrame): DataFrame containing the original data,
                                                    including the timestamp column.
        dominant_topics_info_df (pd.DataFrame): DataFrame with topic assignments.
                                                Must have an index that aligns with
                                                original_df_with_timestamps and a column
                                                specified by topic_assignment_col.
        timestamp_col (str): Name of the column in original_df_with_timestamps
                             that contains the timestamps.
        topic_assignment_col (str): Name of the column in dominant_topics_info_df
                                    that contains the dominant topic ID.
        num_total_topics (int): The total number of topics generated by the LDA model.
                                This is crucial for correct one-hot encoding.
        time_freq (str): Time frequency for grouping (e.g., 'M' for month,
                         'Y' for year).
        figure_size (tuple): Size of the matplotlib figure (width, height).
        colormap_name (str): Name of the matplotlib colormap to use for topics.
    """
    if num_total_topics is None:
        raise ValueError("num_total_topics must be provided to generate the stacked area chart correctly.")

    # Merge topic assignments with timestamps
    # Ensure indices align for a successful join.
    temp_df = original_df_with_timestamps.copy()
    # Join only the topic assignment column to avoid duplicate columns if original_df already has it
    temp_df = temp_df.join(dominant_topics_info_df[[topic_assignment_col]])

    # Prepare timestamp column
    if timestamp_col not in temp_df.columns:
        raise ValueError(
            f"Timestamp column '{timestamp_col}' not found. "
            f"Available columns: {temp_df.columns.tolist()}"
        )
    try:
        temp_df[timestamp_col] = pd.to_datetime(temp_df[timestamp_col], errors='coerce')
    except Exception as e:
        print(f"Error converting column '{timestamp_col}' to datetime: {e}. Attempting with errors='coerce'.")
        temp_df[timestamp_col] = pd.to_datetime(temp_df[timestamp_col], errors='coerce')

    # Drop rows where date conversion failed or topic assignment is missing
    temp_df.dropna(subset=[timestamp_col, topic_assignment_col], inplace=True)
    if temp_df.empty:
        print("DataFrame is empty after handling NaNs in timestamp or topic column. Skipping stacked area plot.")
        return

    # Extract time period for grouping
    if time_freq == 'M':
        temp_df['time_period'] = temp_df[timestamp_col].dt.to_period('M').dt.to_timestamp()
    elif time_freq == 'Y':
        temp_df['time_period'] = temp_df[timestamp_col].dt.to_period('Y').dt.to_timestamp()
    elif time_freq == 'W':
        temp_df['time_period'] = temp_df[timestamp_col].dt.to_period('W').dt.to_timestamp()
    elif time_freq == 'Q':
        temp_df['time_period'] = temp_df[timestamp_col].dt.to_period('Q').dt.to_timestamp()
    else:
        raise ValueError(f"Unsupported time_freq: {time_freq}. Use 'M', 'Y', 'W', or 'Q'.")

    # One-hot encode the topic assignments
    # Ensure topic_assignment_col is integer for get_dummies
    temp_df[topic_assignment_col] = temp_df[topic_assignment_col].astype(int)
    topic_dummies = pd.get_dummies(temp_df[topic_assignment_col], prefix='Topic')

    # Ensure all possible topic columns exist (from 0 to num_total_topics-1)
    all_topic_cols = [f'Topic_{i}' for i in range(num_total_topics)]
    for col in all_topic_cols:
        if col not in topic_dummies.columns:
            topic_dummies[col] = 0 # Add missing topic columns with 0s
    topic_dummies = topic_dummies[all_topic_cols] # Ensure correct order and selection

    # Concatenate topic dummies with the time_period column for grouping
    temp_df_for_grouping = pd.concat([temp_df['time_period'], topic_dummies], axis=1)
    
    if temp_df_for_grouping.empty or 'time_period' not in temp_df_for_grouping.columns:
        print("DataFrame for grouping is empty or missing 'time_period'. Skipping stacked area plot.")
        return

    # Group by time period and sum topic counts
    monthly_topic_counts = temp_df_for_grouping.groupby('time_period')[all_topic_cols].sum()
    
    if monthly_topic_counts.empty:
        print(f"No data to plot after grouping by 'time_period' with frequency '{time_freq}'. Skipping stacked area plot.")
        return

    # Plot stacked area chart
    print("\nGenerating stacked area chart for topic trends...")
    plt.style.use('seaborn-v0_8-whitegrid') # Using a seaborn style
    fig, ax = plt.subplots(figsize=figure_size)

    try:
        # Dynamically generate colors
        num_plot_topics = monthly_topic_counts.shape[1]
        if num_plot_topics == 0:
            print("No topic columns to plot. Skipping stacked area chart.")
            return
            
        # Attempt to get colormap; provide fallback for very few topics
        if num_plot_topics > 1:
            colors = cm.get_cmap(colormap_name, num_plot_topics)(np.linspace(0, 1, num_plot_topics))
        elif num_plot_topics == 1: # Single topic, simple color
            colors = [cm.get_cmap(colormap_name)(0.5)] # Pick a color from the map
        else: # No topics, should have been caught earlier
            colors = []

        # Plot each topic individually to control labels and colors
        for i, col in enumerate(monthly_topic_counts.columns):
            ax.fill_between(
                monthly_topic_counts.index,
                monthly_topic_counts[col].cumsum() if i == 0 else monthly_topic_counts.iloc[:, :i+1].sum(axis=1),
                monthly_topic_counts.iloc[:, :i].sum(axis=1) if i > 0 else 0,
                color=colors[i],
                label=col,
                linewidth=0
            )
    except Exception as e:
        print(f"Could not generate plot with colormap '{colormap_name}'. Error: {e}. Plotting with default colors.")
        monthly_topic_counts.plot(kind='area', stacked=True, ax=ax, alpha=0.85, linewidth=0.5)


    ax.set_title("Topic Trends Over Time (Stacked Area)", fontsize=16)
    ax.set_xlabel(f"Time Period ({time_freq})", fontsize=14)
    ax.set_ylabel("Number of Documents per Topic", fontsize=14)

    # Show legend clearly with colors matching the plot
    ax.legend(title='Topics', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=10, title_fontsize=12)
    plt.tight_layout(rect=[0, 0, 0.88, 1])  # adjust for legend
    
    # # Improve legend if there are many topics
    # if num_plot_topics > 15: # Heuristic for "many" topics
    #     ax.legend().set_visible(False) # Hide legend if too cluttered
    #     print(f"Legend hidden due to large number of topics ({num_plot_topics}).")
    # elif num_plot_topics > 0 :
    #     ax.legend(title='Topics', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=10)
    #     plt.tight_layout(rect=[0, 0, 0.88, 1]) # Adjust layout for legend
    # else:
    #     plt.tight_layout()


    plt.grid(True, which='major', linestyle='--', linewidth=0.5, axis='y')
    plt.grid(False, which='major', linestyle='--', linewidth=0.5, axis='x') # Optional: remove vertical grid lines
    # Save the figure
    os.makedirs("lda_output", exist_ok=True)
    output_path = f"lda_output/stacked_area_topics_over_time_{time_freq}.png"
    plt.savefig(output_path, bbox_inches='tight')
    print(f"Stacked area chart saved to: {output_path}")
    plt.close()

In [20]:
def run_topic_modeling(
    df,
    content_column='content',
    is_tweet=True,
    custom_stopwords=None,
    min_topics=70,
    max_topics=100,
    step=5,
    optimal_num_topics=None, 
    no_below=5,
    no_above=0.5,
    alpha='auto',
    eta='auto',
    iterations=50,
    enable_time_series_plot=True,
    timestamp_column_for_plot='date',
    time_freq_for_plot='M',
    normalize_time_plot=True,
    enable_stacked_area_plot=True,
    timestamp_col_for_stacked_plot='date',
    topic_col_name_for_stacked_plot='dominant_topic',
    time_freq_for_stacked_plot='M',
    colormap_for_stacked_plot='tab20'
):
    """
    Run complete topic modeling workflow, including multiple time series plots.

    Args:
        df (pd.DataFrame): DataFrame containing text data AND the timestamp column.
        content_column (str): Name of the column containing text data.
        is_tweet (bool): Whether the data consists of tweets for preprocessing.
        custom_stopwords (list, optional): List of additional stopwords.
        min_topics (int): Minimum number of topics for evaluation.
        max_topics (int): Maximum number of topics for evaluation.
        step (int): Step size for number of topics in evaluation.
        optimal_num_topics (int, optional): If provided, skips evaluation and uses this.
                                            If None, evaluation is performed.
        no_below (int): Min document frequency for tokens.
        no_above (float): Max document frequency (fraction) for tokens.
        iterations (int): LDA model training iterations.
        enable_time_series_plot (bool): Enable line plot for topics over time.
        timestamp_column_for_plot (str): Timestamp column for line plot.
        time_freq_for_plot (str): Time frequency for line plot.
        normalize_time_plot (bool): Normalize line plot y-axis.
        enable_stacked_area_plot (bool): Enable stacked area plot.
        timestamp_col_for_stacked_plot (str): Timestamp col for stacked plot.
        topic_col_name_for_stacked_plot (str): Topic ID col name for stacked plot.
        time_freq_for_stacked_plot (str): Time frequency for stacked plot.
        colormap_for_stacked_plot (str): Colormap for stacked plot.

    Returns:
        dict: Dictionary containing model, corpus, dictionary, dominant topics, etc.
    """
    # Initialize variables that will be defined in the workflow
    final_lda_model = None
    corpus = None
    dictionary = None
    processed_docs = None
    vis = None 

    # Preprocessing documents 
    print("Preprocessing documents...")
    documents = df[content_column].tolist()
    try:
        processed_docs = preprocess_documents(documents, is_tweet=is_tweet, custom_stopwords=custom_stopwords)

    except Exception as e:
        print(f"ERROR during preprocessing_documents: {e}")
        # Depending on severity, you might want to return or raise here
        return {"error": "Preprocessing failed"}

    # Creating dictionary and corpus 
    print("\nCreating dictionary and corpus...")
    try:
        dictionary, corpus = create_dictionary_and_corpus(processed_docs, no_below=no_below, no_above=no_above)
        if not dictionary or not corpus:
            print("ERROR: Dictionary or corpus creation failed (returned empty).")
            return {"error": "Dictionary/Corpus creation failed"}
    except Exception as e:
        print(f"ERROR during create_dictionary_and_corpus: {e}")
        return {"error": "Dictionary/Corpus creation failed"}

    # Evaluating different numbers of topics 
    # This local_optimal_num_topics will be used for training.
    # It starts as the input optimal_num_topics.
    local_optimal_num_topics = optimal_num_topics

    if local_optimal_num_topics is None:
        topic_range = list(range(min_topics, max_topics + 1, step))
        if not topic_range:
            print(f"ERROR: Topic range for evaluation is empty (min_topics={min_topics}, max_topics={max_topics}, step={step}). Using default min_topics.")
            local_optimal_num_topics = min_topics
        else:
            try:
                coherence_scores, perplexity_scores = evaluate_topic_models(
                    corpus=corpus,
                    dictionary=dictionary,
                    processed_docs=processed_docs,
                    min_topics=min_topics,
                    max_topics=max_topics,
                    step=step,
                    iterations=iterations,
                    alpha=alpha,
                    eta=eta,
                )
                plot_topic_evaluation(topic_range, coherence_scores, perplexity_scores)
                if coherence_scores:
                    optimal_index = coherence_scores.index(max(coherence_scores))
                    local_optimal_num_topics = topic_range[optimal_index]
                    print(f"Optimal number of topics based on coherence: {local_optimal_num_topics}")
                else:
                    print("Warning: Coherence scores list is empty. Defaulting to min_topics.")
                    local_optimal_num_topics = min_topics
            except Exception as e:
                print(f"ERROR during topic model evaluation: {e}. Defaulting to min_topics.")
                local_optimal_num_topics = min_topics
    else:
        print(f"\nSkipping topic evaluation. Using pre-defined optimal_num_topics: {local_optimal_num_topics}")

    # Validate local_optimal_num_topics before training
    if not isinstance(local_optimal_num_topics, int) or local_optimal_num_topics <= 0:
        print(f"ERROR: Invalid optimal_num_topics ({local_optimal_num_topics}). Must be a positive integer. Defaulting to min_topics: {min_topics}")
        local_optimal_num_topics = min_topics


    # Training final LDA model
    print(f"\nTraining final LDA model with {local_optimal_num_topics} topics...")
    try:
        final_lda_model = train_lda_model(
            corpus=corpus,
            dictionary=dictionary,
            num_topics=local_optimal_num_topics,
            iterations=iterations,
            alpha=alpha,
            eta=eta
        )
        if final_lda_model is None:
            print("ERROR: train_lda_model returned None. Model training failed.")
            return {"error": "LDA model training failed"}
    except Exception as e:
        print(f"ERROR during final LDA model training: {e}")
        return {"error": "LDA model training failed"}

    # Creating word clouds for topics ---
    print("\nCreating word clouds for topics...")
    try:
        create_topic_wordclouds(final_lda_model, local_optimal_num_topics)
    except Exception as e:
        print(f"ERROR creating word clouds: {e}")
        # This might not be a critical failure, so we can continue

    # Getting dominant topic for each document
    print("\nGetting dominant topic for each document...")
    dominant_topics_df = pd.DataFrame() # Initialize to prevent NameError if get_dominant_topic fails
    try:
        dominant_topics_df = get_dominant_topic_per_document(final_lda_model, corpus)
        if not dominant_topics_df.empty:
            # Align index if necessary (important for joins)
            if len(dominant_topics_df) == len(df):
                dominant_topics_df = dominant_topics_df.set_index(df.index)
            else:
                print(f"Warning: Length of dominant_topics_df ({len(dominant_topics_df)}) does not match original df ({len(df)}). Index alignment might be incorrect for joins.")
                # Attempt to align with the beginning of the original df's index
                dominant_topics_df = dominant_topics_df.set_index(df.index[:len(dominant_topics_df)])
        else:
            print("Warning: get_dominant_topic_per_document returned an empty DataFrame.")
    except Exception as e:
        print(f"ERROR getting dominant topics: {e}")
        # Continue if possible, plots might be skipped

    # Visualizing topics with pyLDAvis 
    print("\nVisualizing topics with pyLDAvis...")
    try:
        print("\nGenerating pyLDAvis visualization...")
        vis = gensimvis.prepare(final_lda_model, corpus, dictionary)
        vis_path = os.path.join(output_dir, "pyldavis_intertopic_map.html")
        pyLDAvis.save_html(vis, vis_path)
        print(f"Saved pyLDAvis HTML visualization to {vis_path}")
    except Exception as e:
        print(f"ERROR generating pyLDAvis HTML: {e}")

    # Plotting topics over time (LINE PLOT) 
    if enable_time_series_plot:
        print("\n Plotting topics over time (Line Plot)...")
        if timestamp_column_for_plot not in df.columns:
            print(f"Warning: Timestamp column '{timestamp_column_for_plot}' not found in df. Skipping line plot.")
        elif dominant_topics_df.empty:
            print("Warning: dominant_topics_df is empty. Skipping line plot.")
        else:
            try:
                plot_topics_over_time(
                    original_df=df,
                    topics_df=dominant_topics_df,
                    timestamp_col=timestamp_column_for_plot,
                    topic_col_name='dominant_topic', # Assuming this col name from get_dominant_topic_per_document
                    time_freq=time_freq_for_plot,
                    use_normalization=normalize_time_plot
                )
            except Exception as e:
                print(f"ERROR during plot_topics_over_time (line plot): {e}")
    else:
        print("\n Plotting topics over time (Line Plot) skipped as per configuration.")

    # Plotting STACKED AREA topic trends 
    if enable_stacked_area_plot:
        print("\nPlotting topics over time (Stacked Area Plot)...")
        if timestamp_col_for_stacked_plot not in df.columns:
            print(f"Warning: Timestamp column '{timestamp_col_for_stacked_plot}' not found in df. Skipping stacked area plot.")
        elif dominant_topics_df.empty:
            print("Warning: dominant_topics_df is empty. Skipping stacked area plot.")
        else:
            try:
                # Use local_optimal_num_topics which is guaranteed to be set
                num_topics_for_stacked_plot = local_optimal_num_topics
                plot_stacked_topic_trends(
                    original_df_with_timestamps=df,
                    dominant_topics_info_df=dominant_topics_df,
                    timestamp_col=timestamp_col_for_stacked_plot,
                    topic_assignment_col=topic_col_name_for_stacked_plot, # Defaulted in func signature
                    num_total_topics=num_topics_for_stacked_plot,
                    time_freq=time_freq_for_stacked_plot,
                    colormap_name=colormap_for_stacked_plot
                )
            except Exception as e:
                print(f"ERROR during plot_stacked_topic_trends: {e}")
    else:
        print("\nPlotting topics over time (Stacked Area Plot) skipped as per configuration.")

    # Return results 
    results_dict = {
        'lda_model': final_lda_model,
        'corpus': corpus,
        'dictionary': dictionary,
        'processed_docs': processed_docs,
        'dominant_topics_df': dominant_topics_df,
        'vis': vis,
        'optimal_num_topics': local_optimal_num_topics # Return the actually used number of topics
    }
    return results_dict


In [21]:
def create_bigram_wordcloud(processed_docs, output_path="lda_output/bigram_wordcloud.png"):
    """
    Create a word cloud from bigrams.

    Parameters:
    -----------
    processed_docs : list of list of str
        Tokenized documents with bigrams
    output_path : str
        Path to save the word cloud image
    """
    from collections import Counter

    bigram_freq = Counter()
    for doc in processed_docs:
        bigrams = [token for token in doc if '_' in token]
        bigram_freq.update(bigrams)

    if not bigram_freq:
        print("No bigrams found for word cloud.")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white'
    ).generate_from_frequencies(bigram_freq)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("Bigram WordCloud")
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"Saved bigram word cloud to {output_path}")


In [22]:
# Merge topic data with timestamps
def plot_manual_topics_over_time(df, dominant_topics_df, timestamp_column='date', time_freq='M'):
    if df.index.name != dominant_topics_df.index.name:
         print(f"Info: Index name of df is '{df.index.name}' and for dominant_topics_df is '{dominant_topics_df.index.name}'.")
         print("Joining based on index position. Ensure this is the desired behavior.")
         
    # Add topic information to the original dataframe using the correct column name
    df_merged = df.join(dominant_topics_df[['dominant_topic']])
    
    # Prepare the timestamp column
    # Check if the timestamp column exists
    if timestamp_column not in df_merged.columns:
        raise ValueError(f"Timestamp column '{timestamp_column}' not found in the DataFrame 'df_merged'. Please check the 'timestamp_column' variable and ensure it's present in your original 'df'.")
    
    # Convert timestamp column to datetime objects (handle potential errors)
    try:
        df_merged[timestamp_column] = pd.to_datetime(df_merged[timestamp_column])
    except Exception as e:
        print(f"Error converting column '{timestamp_column}' to datetime: {e}")
        print("Attempting conversion with error coercion (invalid dates become NaT)...")
        df_merged[timestamp_column] = pd.to_datetime(df_merged[timestamp_column], errors='coerce')
    
    # Drop rows where date conversion failed or where the dominant topic is missing
    df_merged.dropna(subset=[timestamp_column, 'dominant_topic'], inplace=True)
    
    # Ensure Dominant_Topic is integer or suitable for grouping
    df_merged['dominant_topic'] = df_merged['dominant_topic'].astype(int) # Corrected column name
    
    
    # Group by time and topic
    # Set timestamp as index for resampling
    df_merged.set_index(timestamp_column, inplace=True)
    
    # Group by the chosen frequency and dominant topic, then count occurrences
    
    topics_over_time = df_merged.groupby([pd.Grouper(freq=time_freq), 'dominant_topic']).size().unstack(fill_value=0)
    
    # Plotting using Plotly
    fig = go.Figure()

    for topic in topics_over_time.columns:
        fig.add_trace(go.Scatter(
            x=topics_over_time.index,
            y=topics_over_time[topic],
            mode='lines+markers',
            name=f'Topic {topic}'
        ))

    fig.update_layout(
        title=f'Topics Over Time ({time_freq} Frequency)',
        xaxis_title='Time Period',
        yaxis_title='Number of Documents',
        legend_title='Topics',
        template='plotly_white',
        hovermode='x'
    )

    # Improve x-axis formatting for dates
    if time_freq == 'M':
        fig.update_xaxes(tickformat='%Y-%m')
    elif time_freq == 'Y':
        fig.update_xaxes(tickformat='%Y')
    elif time_freq == 'W':
        fig.update_xaxes(tickformat='%Y-%W')
    elif time_freq == 'Q':
        fig.update_xaxes(tickformat='%Y-Q%q')

    # Save as interactive HTML
    output_path = f"lda_output/manual_topics_over_time_{time_freq}.html"
    pio.write_html(fig, file=output_path, auto_open=True)

In [23]:
def export_lda_model_results(lda_model, dictionary, dominant_topics_df, output_dir="lda_output"):
    os.makedirs(output_dir, exist_ok=True)
    # Save LDA model
    model_path = os.path.join(output_dir, "lda_model")
    lda_model.save(model_path)
    print(f"Saved LDA model to {model_path}")

    # Save dictionary as a structured CSV
    dict_tokens = [{"ID": token_id, "Word": word, "Frequency": dictionary.dfs.get(token_id, 0)}
                   for token_id, word in dictionary.iteritems()]
    dict_df = pd.DataFrame(dict_tokens)
    dict_df.sort_values(by="Frequency", ascending=False, inplace=True)
    
    dict_path = os.path.join(output_dir, "lda_dictionary.csv")
    dict_df.to_csv(dict_path, index=False)
    print(f"Saved structured dictionary to {dict_path}")

    # Save dominant topics assignment (if available)
    if not dominant_topics_df.empty:

        # Save top topic terms
        top_terms = []
        for topic_id in range(lda_model.num_topics):
            for term, weight in lda_model.show_topic(topic_id, topn=10):
                top_terms.append({
                    'Topic': topic_id,
                    'Term': term,
                    'Weight': round(weight, 4)
                })
    
        terms_df = pd.DataFrame(top_terms)
        terms_path = os.path.join(output_dir, "lda_topic_terms.csv")
        terms_df.to_csv(terms_path, index=False)
        print(f"Saved top topic terms to {terms_path}")
    
        # Save dominant topic assignment
        dom_path = os.path.join(output_dir, "lda_dominant_topics.csv")
        dominant_topics_df.to_csv(dom_path, index=False)
        print(f"Saved dominant topic assignments to {dom_path}")

In [24]:
def export_lda_summary(lda_model, dominant_topics_df, content_column='content', output_path='lda_topic_info.csv',output_dir="lda_output"):
    """
    Create a summary CSV file of LDA topics with topic ID, count, top terms, and a representative document.

    Args:
        lda_model (gensim.models.LdaModel): Trained LDA model.
        dominant_topics_df (pd.DataFrame): DataFrame with dominant topic assignments.
        content_column (str): Name of the column with original text (must be present in dominant_topics_df).
        output_path (str): Path to save the CSV file.
    """
    os.makedirs(output_dir, exist_ok=True)
    if 'dominant_topic' not in dominant_topics_df.columns:
        print("Error: dominant_topic column not found in dominant_topics_df")
        return

    topic_counts = dominant_topics_df['dominant_topic'].value_counts().sort_index()

    topic_info = []
    for topic_id in topic_counts.index:
        # Name: use underscore-separated top 3 words
        top_words = [word for word, _ in lda_model.show_topic(topic_id, topn=3)]
        topic_name = "_".join(top_words)

        # Full representation: top 10 words
        full_representation = [word for word, _ in lda_model.show_topic(topic_id, topn=10)]

        topic_info.append({
            "Topic": topic_id,
            "Count": topic_counts[topic_id],
            "Name": topic_name,
            "Representation": full_representation,
        })

    topic_info_df = pd.DataFrame(topic_info)
    
    terms_path = os.path.join(output_dir, output_path)
    topic_info_df.to_csv(terms_path, index=False)
    
    print(f"LDA topic summary exported to {output_path}")

In [25]:
from datetime import datetime

def log_lda_run(model, dataset_name, optimal_num_topics, coherence_score, log_path='lda_modeling_log.csv'):
    """
    Log the LDA run into a CSV file, including topic count and coherence score.

    Args:
        model (LdaModel): The trained LDA model.
        dataset_name (str): Identifier of the dataset.
        optimal_num_topics (int): The number of topics used.
        coherence_score (float): Coherence score.
        log_path (str): Path to save the CSV log.
    """
    run_data = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'dataset_name': dataset_name,
        'model_name': 'LDA (gensim)',
        'parameters': f"min_topics={config['min_topics']}, max_topics={config['max_topics']}, "
                      f"step={config['step']}, no_below={config['no_below']}, no_above={config['no_above']}, "
                      f"optimal_num_topics={config['optimal_num_topics']}, iterations={config['iterations']}, "
                      f"alpha={config['alpha']}, eta={config['eta']}",
        'n_topics': optimal_num_topics,
        'coherence': round(coherence_score, 4)
    }

    df_run = pd.DataFrame([run_data])
    full_log_path = os.path.join(output_dir, log_path)
    if os.path.exists(full_log_path):
        df_run.to_csv(full_log_path, mode='a', header=False, index=False)
    else:
        df_run.to_csv(full_log_path, mode='w', header=True, index=False)


In [26]:
def save_topic_assignment_with_labels(df, dominant_topics_df, content_column='content', output_path='lda_output/tweet_topic_label_mapping.csv'):

    # Check required columns
    if content_column not in df.columns:
        raise ValueError(f"'{content_column}' column not found in the DataFrame.")
    
    if dominant_topics_df.empty:
        raise ValueError("dominant_topics_df is empty.")
    
    # Align indices if needed
    if not df.index.equals(dominant_topics_df.index):
        print("Aligning indices between df and dominant_topics_df...")
        dominant_topics_df = dominant_topics_df.set_index(df.index[:len(dominant_topics_df)])

    # Merge and save
    merged_df = df[[content_column]].copy()
    merged_df = merged_df.join(dominant_topics_df[['dominant_topic']])
    merged_df.to_csv(output_path, index=False)
    print(f"Tweet-topic-label mapping saved to: {output_path}")

In [27]:

def plot_topic_distribution_from_db_and_csv(
    db_path,
    topics_file,
    topic_info_file='lda_output/lda_dominant_topics.csv',
    output_path='lda_output/topic_distribution_by_label.png',
    top_n=13
):
    """
    Plot distribution of topics by label using tweet-topic mapping, DB labels, and topic names from CSV.

    Args:
        db_path (str): Path to the SQLite database.
        topics_file (str): Path to the CSV file with tweet-to-topic mapping (must include 'content' and 'dominant_topic').
        topic_info_file (str): CSV file that maps dominant_topic IDs to top_words.
        output_path (str): File path to save the plot.
        top_n (int): Number of top topics to display based on total proportions.
    """
    import sqlite3
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import os

    # Load topic-to-top_words mapping
    topic_info_df = pd.read_csv(topic_info_file)
    topic_name_map = {
        row['dominant_topic']: row['top_words']
        for _, row in topic_info_df.iterrows()
    }

    # Connect to the database
    conn = sqlite3.connect(db_path)
    query = """
    SELECT p.author, a.label, p.content
    FROM posts p
    JOIN authors a ON p.author = a.author_screen_name;
    """
    posts_df = pd.read_sql_query(query, conn)
    conn.close()

    # Load tweet-topic mapping
    topics_df = pd.read_csv(topics_file)

    # Merge on content
    merged_df = pd.merge(posts_df, topics_df, left_on='content', right_on='content', how='inner')

    # Group by label and topic
    grouped_df = merged_df.groupby(['label', 'dominant_topic']).size().reset_index(name='Count')
    grouped_df['Proportion'] = grouped_df.groupby('label')['Count'].transform(lambda x: x / x.sum())

    # Pivot for plotting
    pivot_df = grouped_df.pivot(index='dominant_topic', columns='label', values='Proportion').fillna(0)
    pivot_df['total'] = pivot_df.sum(axis=1)
    pivot_df = pivot_df.sort_values(by='total', ascending=False).head(top_n)
    pivot_df.drop(columns='total', inplace=True)

    # Get topic names from topic_name_map
    topic_labels = [topic_name_map.get(topic_id, f"Topic {topic_id}") for topic_id in pivot_df.index]

    # Plot
    ax = pivot_df.plot(kind='bar', figsize=(18, 8), width=0.8)
    ax.set_title("Distribution of Tweets by Topic Category", fontsize=16, weight='bold')
    ax.set_xlabel("Topics", fontsize=14)
    ax.set_ylabel("Proportion of Tweets", fontsize=14)
    ax.set_xticklabels(topic_labels, rotation=45, ha='right')
    ax.legend(title="Categories")
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    
    # Adding percentage values above each bar
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f%%', padding=3, fontsize=10,
                     labels=[f"{val*100:.1f}%" for val in container.datavalues])

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"Plot saved to: {output_path}")

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Wedge, Rectangle
from wordcloud import WordCloud
import sqlite3
import numpy as np
import os

def plot_sunburst_for_top_topics(
    db_path,
    topics_file,
    topic_info_file,
    output_path='lda_output/sunburst_chart.png',
    top_n=6
):
    def draw_sunburst_donut(ax, group1_yes, group1_no, group2_yes, group2_no, top_words, title, colors):
        group1_total = group1_yes + group1_no
        group2_total = group2_yes + group2_no

        # Inner ring (group1)
        group1_yes_angle = group1_yes / group1_total * 360 if group1_total else 0
        inner_wedges = [
            (0, group1_yes_angle, colors['group1_yes']),
            (group1_yes_angle, 360, colors['group1_no'])
        ]
        for start, end, color in inner_wedges:
            ax.add_patch(Wedge((0, 0), 1.0, start, end, width=0.3, color=color))

        # Outer ring (group2)
        group2_yes_angle = group2_yes / group2_total * 360 if group2_total else 0
        outer_wedges = [
            (0, group2_yes_angle, colors['group2_yes']),
            (group2_yes_angle, 360, colors['group2_no'])
        ]
        for start, end, color in outer_wedges:
            ax.add_patch(Wedge((0, 0), 1.3, start, end, width=0.3, color=color))

        # Word cloud
        wc = WordCloud(width=300, height=300, background_color='white').generate(top_words)
        ax.imshow(wc, extent=[-0.5, 0.5, -0.5, 0.5], zorder=10)

        # Title
        ax.text(0, 1.4, title, ha='center', va='bottom', fontsize=10, wrap=True)
        ax.set_xlim(-1.5, 1.5)
        ax.set_ylim(-1.5, 1.5)
        ax.set_aspect('equal')
        ax.axis('off')

    # Load data
    conn = sqlite3.connect(db_path)
    posts_df = pd.read_sql_query("""
        SELECT p.content, a.label, p.author
        FROM posts p
        JOIN authors a ON p.author = a.author_screen_name;
    """, conn)
    conn.close()

    topics_df = pd.read_csv(topics_file)
    topic_info_df = pd.read_csv(topic_info_file)
    topic_words_map = topic_info_df.set_index('dominant_topic')['top_words'].to_dict()

    merged_df = pd.merge(posts_df, topics_df, on='content', how='inner')
    top_topics = merged_df['dominant_topic'].value_counts().nlargest(top_n).index.tolist()

    # Extract unique labels
    unique_labels = merged_df['label'].unique()
    unique_labels = [x for x in unique_labels if x not in [None, "None"]]
    if len(unique_labels) != 2:
        raise ValueError(f"Expected exactly 2 unique labels, got: {unique_labels}")
    group1_label, group2_label = unique_labels

    # Define colors
    colors = {
        'group1_yes': '#1f77b4',   # blue
        'group1_no': '#2ca02c',    # green
        'group2_yes': '#9467bd',   # purple
        'group2_no': '#d62728'     # red
    }

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()

    for i, topic_id in enumerate(top_topics):
        df = merged_df.copy()
        df['participated'] = df['dominant_topic'] == topic_id

        unique_authors = df.drop_duplicates(subset='author')

        group1_yes = len(unique_authors[(unique_authors['label'] == group1_label) & (unique_authors['participated'])])
        group1_no  = len(unique_authors[(unique_authors['label'] == group1_label) & (~unique_authors['participated'])])
        group2_yes = len(unique_authors[(unique_authors['label'] == group2_label) & (unique_authors['participated'])])
        group2_no  = len(unique_authors[(unique_authors['label'] == group2_label) & (~unique_authors['participated'])])

        top_words = topic_words_map.get(topic_id, '')
        draw_sunburst_donut(
            axes[i], group1_yes, group1_no, group2_yes, group2_no, top_words, top_words, colors
        )

    # Legend
    legend_ax = fig.add_axes([0.1, 0.05, 0.8, 0.05])
    legend_ax.set_axis_off()
    items = [
        (0.1, colors['group1_yes'], f'{group1_label} Participated'),
        (0.3, colors['group1_no'], f'{group1_label} Not Participated'),
        (0.5, colors['group2_yes'], f'{group2_label} Participated'),
        (0.7, colors['group2_no'], f'{group2_label} Not Participated'),
    ]
    for pos, color, label in items:
        legend_ax.add_patch(Rectangle((pos, 0.2), 0.03, 0.6, color=color))
        legend_ax.text(pos + 0.04, 0.5, label, va='center', fontsize=10)

    plt.tight_layout(rect=[0, 0.1, 1, 1])
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path, bbox_inches='tight')
    plt.close()
    print(f"Sunburst donut chart saved to: {output_path}")

In [29]:
def should_merge_topics(topic1, topic2):
    system_prompt = """
You are an expert in topic modeling and keyword clustering.

Your task is to decide whether the following two topic names refer to the **same core topic** or represent **closely related concepts**.

Please respond with **"yes"** if the two topics should be merged — for example, if they are:
- synonyms,
- strongly related,
- overlapping in meaning,
- or commonly used to describe the same type of content.

Respond with **"no"** if they describe clearly **different topics** or represent **distinct themes**.

Your answer should be **only** "yes" or "no" — no explanations or extra text.
"""
    user_prompt = f"""Topic 1: {topic1}
Topic 2: {topic2}

Do these two topics represent the same topic and should be merged?"""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            top_p=1.0
        )
        answer = response.choices[0].message.content.strip().lower()
        return answer == "yes"
    except Exception as e:
        print(f"Error with pair ({topic1}, {topic2}): {e}")
        return False

def generate_merge_topics(topic_pairs):
    merged_groups = []
    
    for topic1, topic2 in tqdm(topic_pairs, desc="Checking topic pairs"):
        if should_merge_topics(topic1, topic2):
            # Merge into existing group if present
            found = False
            for group in merged_groups:
                if topic1 in group or topic2 in group:
                    group.update([topic1, topic2])
                    found = True
                    break
            if not found:
                merged_groups.append(set([topic1, topic2]))
    return merged_groups

def generate_group_name_from_topics(topics):
    user_prompt = (
        "You are an expert in topic modeling.\n"
        "Given the following topic names:\n" +
        '\n'.join(f"- {t}" for t in topics) +
        "\nSuggest a concise 2–3 word name summarizing the shared theme.\n"
        "Output ONLY the name — no punctuation or explanation."
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a topic naming assistant."},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.3,
        top_p=1.0
    )

    return response.choices[0].message.content.strip()

def extract_topic_pairs_from_csv(file_path, topic_name_column='Name'):
    """
    Reads a CSV file and generates all unique topic pairs from a column containing topic names.
    Returns a list of (topic1, topic2) tuples.
    """
    df = pd.read_csv(file_path)
    
    if topic_name_column not in df.columns:
        raise ValueError(f"Column '{topic_name_column}' not found in CSV.")

    df['clean_topic_name'] = df[topic_name_column].astype(str).str.replace(r'^\d+_', '', regex=True)
    
    topic_names = df['clean_topic_name'].dropna().unique().tolist()
    topic_pairs = list(combinations(topic_names, 2))

    return topic_pairs

def assign_merged_topic_names(merged_groups):
    """
    Assigns LLM-generated names to merged topic groups.
    Returns:
        - topic_to_group_name: {original_topic_name: descriptive_name}
    """
    topic_to_group_name = {}

    for idx, group in enumerate(merged_groups):
        group_id = f"Group_{idx+1}"
        group = sorted(group)
        try:
            name = generate_group_name_from_topics(group)
        except Exception as e:
            print(f"Error naming group {group_id}: {e}")
            name = group_id

        for topic in group:
            topic_to_group_name[topic] = name

    return topic_to_group_name

def apply_merged_names_to_tweets(mapping_file_path, topic_to_group_name, output_path):
    """
    Loads a tweet-topic mapping file, replaces the topic names with merged group names, and saves to new CSV.
    """
    df = pd.read_csv(mapping_file_path)
    df['clean_topic'] = df['dominant_topic'].astype(str).str.replace(r'^\d+_', '', regex=True)
    df['merged_topic'] = df['clean_topic'].map(topic_to_group_name)
    df.to_csv(output_path, index=False)
    print(f"Saved updated tweet-topic mapping with merged names to: {output_path}")

def merge_topics(file_path):
    topic_pairs = extract_topic_pairs_from_csv(file_path)
    merged_groups = generate_merge_topics(topic_pairs)
    topic_to_group_name = assign_merged_topic_names(merged_groups)
    apply_merged_names_to_tweets('lda_output/tweet_topic_label_mapping.csv', topic_to_group_name, 'lda_output/tweet_merged_topic_mapping.csv')

In [30]:
if __name__ == "__main__":
    config = load_config()
    df = load_data(
        db_path=config['database_path'],
        table_name='posts',
        content_column='content',
        limit=config['limit']
    )
    custom_stopwords = config.get('custom_stopwords', [])

    # Parameters for the LINE PLOT topics over time
    enable_line_plot = config.get('enable_time_series_plot', True)
    ts_column_line = config.get('timestamp_column_name', 'date') # Assuming 'date' from previous
    line_plot_freq = config.get('plot_time_frequency', 'M')
    line_plot_norm = config.get('plot_normalize_time_series', True)

    # Parameters for the STACKED AREA topics over time plot
    enable_stack_plot = config.get('enable_stacked_area_plot', True) # Default to True
    # Can reuse timestamp col if same, or define a new one in config
    ts_column_stack = config.get('timestamp_column_for_stacked_plot', ts_column_line)
    stack_plot_freq = config.get('stacked_plot_time_frequency', 'M')
    stack_plot_colormap = config.get('stacked_plot_colormap', 'tab20')


    # Run topic modeling
    results = run_topic_modeling(
        df,
        content_column='content',
        is_tweet=True,
        custom_stopwords=custom_stopwords,
        min_topics=config['min_topics'],
        max_topics=config['max_topics'],
        step=config['step'],
        optimal_num_topics=config.get('optimal_num_topics', None),
        no_below=config['no_below'],
        no_above=config['no_above'],
        iterations=config.get('iterations', 50),
        # Line plot params
        enable_time_series_plot=enable_line_plot,
        timestamp_column_for_plot=ts_column_line,
        time_freq_for_plot=line_plot_freq,
        normalize_time_plot=line_plot_norm,
        # Stacked area plot params
        enable_stacked_area_plot=enable_stack_plot,
        timestamp_col_for_stacked_plot=ts_column_stack,
        # topic_col_name_for_stacked_plot is defaulted in run_topic_modeling to 'dominant_topic'
        time_freq_for_stacked_plot=stack_plot_freq,
        colormap_for_stacked_plot=stack_plot_colormap
    )

    # Access results
    lda_model = results['lda_model']
    dominant_topics_df = results['dominant_topics_df']
    optimal_num_topics = results.get('optimal_num_topics', 'N/A')

    print(f"\n--- Analysis Complete ---")
    print(f"Optimal number of topics found/used: {optimal_num_topics}")

    # Show the distribution of dominant topics
    if not dominant_topics_df.empty:
        topic_distribution = dominant_topics_df['dominant_topic'].value_counts().sort_values(ascending=False)
        topic_distribution = topic_distribution.head(10)

        # Get topic names (e.g., top 3 words per topic)
        topic_names = {}
        for topic_id in topic_distribution.index:
            words = [word for word, _ in lda_model.show_topic(topic_id, topn=3)]
            topic_names[topic_id] = ', '.join(words)
    
        # Prepare data for plotting
        topic_ids = list(topic_distribution.index)
        topic_counts = topic_distribution.values
        topic_labels = [topic_names[tid] for tid in topic_ids]
        topic_labels = [topic_names[tid].replace(' ', '\n') for tid in topic_ids]

        plt.figure(figsize=(14, 7))
        sns.barplot(x=topic_labels, y=topic_counts)
        plt.title('Distribution of Dominant Topics')
        plt.xlabel('Topic Name')
        plt.ylabel('Number of Documents')
        output_path = os.path.join(output_dir, "dominant_topic_distribution.png")
        plt.tight_layout()
        plt.savefig(output_path)
        plt.close()
        plot_manual_topics_over_time(df, dominant_topics_df, timestamp_column=ts_column_line, time_freq=line_plot_freq)
        export_lda_model_results(lda_model, dictionary=results['dictionary'], dominant_topics_df=dominant_topics_df)
        export_lda_summary(lda_model, dominant_topics_df, content_column='content', output_path="lda_topic_info.csv")
        lda_coherence = compute_coherence_score(
            model=lda_model,
            corpus=results['corpus'],
            dictionary=results['dictionary'],
            processed_docs=results['processed_docs']
        )

        save_topic_assignment_with_labels(
            df=df,
            dominant_topics_df=results['dominant_topics_df'],
            content_column='content',  
            output_path='lda_output/tweet_topic_label_mapping.csv'
        )


        plot_topic_distribution_from_db_and_csv(
            db_path=config['database_path'], 
            topics_file='lda_output/tweet_topic_label_mapping.csv',
            topic_info_file='lda_output/lda_dominant_topics.csv',
            output_path='lda_output/topic_distribution_by_label.png'
        )

        plot_sunburst_for_top_topics(
            db_path=config['database_path'], 
            topics_file='lda_output/tweet_topic_label_mapping.csv',
            topic_info_file='lda_output/lda_dominant_topics.csv',
            output_path='lda_output/sunburst_chart.png'
        )
       
        merge_topics("lda_output/lda_topic_info.csv")
        plot_topic_distribution_from_db_and_csv(
            db_path=config['database_path'], 
            topics_file='lda_output/tweet_merged_topic_mapping.csv',
            topic_info_file='lda_output/lda_dominant_topics.csv',
            output_path='lda_output/topic_distribution_by_label_after_merge.png'
        )

        plot_sunburst_for_top_topics(
            db_path=config['database_path'], 
            topics_file='lda_output/tweet_merged_topic_mapping.csv',
            topic_info_file='lda_output/lda_dominant_topics.csv',
            output_path='lda_output/sunburst_chart_after_merge.png'
        )

        log_lda_run(
            model=lda_model,
            dataset_name=config['database_path'],
            optimal_num_topics=optimal_num_topics,
            coherence_score=lda_coherence
        )


    else:
        print("Dominant topics DataFrame is empty, skipping distribution plot.")  

  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce', infer_datetime_format=True)


Loaded 10000 records from posts
Preprocessing documents...
Processing 10000 documents...
Processed 1000/10000 documents
Processed 2000/10000 documents
Processed 3000/10000 documents
Processed 4000/10000 documents
Processed 5000/10000 documents
Processed 6000/10000 documents
Processed 7000/10000 documents
Processed 8000/10000 documents
Processed 9000/10000 documents
Finished processing. 7315 documents retained after preprocessing.

Creating dictionary and corpus...
Dictionary size: 3592
Corpus size: 7315
Evaluating model with 10 topics...
Coherence score: 0.4626
Perplexity: -6.5440
--------------------------------------------------
Evaluating model with 15 topics...
Coherence score: 0.5017
Perplexity: -8.1166
--------------------------------------------------
Evaluating model with 20 topics...
Coherence score: 0.4797
Perplexity: -9.2390
--------------------------------------------------
Evaluating model with 25 topics...
Coherence score: 0.5007
Perplexity: -9.9365
----------------------

  temp_df['time_period'] = temp_df[timestamp_col].dt.to_period('M').dt.to_timestamp()
  plt.grid(False, which='major', linestyle='--', linewidth=0.5, axis='x') # Optional: remove vertical grid lines


Stacked area chart saved to: lda_output/stacked_area_topics_over_time_M.png

--- Analysis Complete ---
Optimal number of topics found/used: 60


  topics_over_time = df_merged.groupby([pd.Grouper(freq=time_freq), 'dominant_topic']).size().unstack(fill_value=0)


Saved LDA model to lda_output/lda_model
Saved structured dictionary to lda_output/lda_dictionary.csv
Saved top topic terms to lda_output/lda_topic_terms.csv
Saved dominant topic assignments to lda_output/lda_dominant_topics.csv
LDA topic summary exported to lda_topic_info.csv


/usr/bin/xdg-open: line 881: x-www-browser: command not found
/usr/bin/xdg-open: line 881: firefox: command not found
/usr/bin/xdg-open: line 881: iceweasel: command not found
/usr/bin/xdg-open: line 881: seamonkey: command not found
/usr/bin/xdg-open: line 881: mozilla: command not found
/usr/bin/xdg-open: line 881: epiphany: command not found
/usr/bin/xdg-open: line 881: konqueror: command not found
/usr/bin/xdg-open: line 881: chromium: command not found
/usr/bin/xdg-open: line 881: chromium-browser: command not found
/usr/bin/xdg-open: line 881: google-chrome: command not found
/usr/bin/xdg-open: line 881: www-browser: command not found
/usr/bin/xdg-open: line 881: links2: command not found
/usr/bin/xdg-open: line 881: elinks: command not found
/usr/bin/xdg-open: line 881: links: command not found
/usr/bin/xdg-open: line 881: lynx: command not found
/usr/bin/xdg-open: line 881: w3m: command not found
xdg-open: no method available for opening 'file:///sise/4-year-ise-proj/Ise4thYear

Aligning indices between df and dominant_topics_df...
Tweet-topic-label mapping saved to: lda_output/tweet_topic_label_mapping.csv
Plot saved to: lda_output/topic_distribution_by_label.png



This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.



Sunburst donut chart saved to: lda_output/sunburst_chart.png


NameError: name 'combinations' is not defined