In [27]:
#IMPORTS
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
import os

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
def run_lda_pipeline(filepath, name, num_topics=10):
    print(f"Loading {name} data...")
    df = pd.read_csv(filepath)
    
    # 1. Tokenization (Gensim needs a list of lists of words)
    # Our 'lda_text' is space-separated strings, so we split them
    docs = df['lda_text'].astype(str).str.split().tolist()
    
    # 2. Create Dictionary & Corpus (Bag-of-Words)
    dictionary = corpora.Dictionary(docs)
    # Filter extremes: ignore words in < 20 docs or > 50% of docs
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    
    print(f"Training LDA for {name} (K={num_topics}, alpha/eta=auto)...")
    # Using LdaModel to support alpha='auto'
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha='auto',      # Learned from data
        eta='auto',        # Learned from data
        passes=10,         # Standard for convergence
        random_state=42
    )
    
    # 3. Output Topics
    print(f"\n{name} TOPICS:")
    for idx, topic in lda.print_topics(-1):
        print(f"Topic {idx}: {topic}")
        
    return lda, corpus, dictionary

In [15]:
# Run for Trump
lda_trump, corpus_t, dict_t = run_lda_pipeline('../data/processed/trump_lda_ready.csv', 'TRUMP')

# Run for Biden
lda_biden, corpus_b, dict_b = run_lda_pipeline('../data/processed/biden_lda_ready.csv', 'BIDEN')

Loading TRUMP data...
Training LDA for TRUMP (K=10, alpha/eta=auto)...

TRUMP TOPICS:
Topic 0: 0.046*"good" + 0.035*"white" + 0.032*"house" + 0.028*"look" + 0.024*"support" + 0.021*"right" + 0.021*"thank" + 0.021*"voter" + 0.019*"work" + 0.016*"wait"
Topic 1: 0.066*"loser" + 0.037*"whitehouse" + 0.026*"winner" + 0.017*"india" + 0.014*"pardon" + 0.011*"racism" + 0.011*"brexit" + 0.010*"move" + 0.010*"mask" + 0.009*"boy"
Topic 2: 0.095*"president" + 0.061*"win" + 0.045*"lose" + 0.040*"usa" + 0.031*"donald" + 0.025*"joe" + 0.021*"leave" + 0.020*"new" + 0.018*"elect" + 0.018*"love"
Topic 3: 0.038*"think" + 0.033*"fraud" + 0.030*"let" + 0.029*"come" + 0.018*"concede" + 0.018*"big" + 0.017*"hope" + 0.017*"fire" + 0.016*"fight" + 0.016*"defeat"
Topic 4: 0.027*"know" + 0.023*"get" + 0.023*"say" + 0.022*"supporter" + 0.021*"want" + 0.021*"need" + 0.012*"tell" + 0.012*"watch" + 0.012*"call" + 0.011*"lie"
Topic 5: 0.038*"end" + 0.033*"count" + 0.027*"covid" + 0.022*"electionday" + 0.019*"job" + 0

In [16]:
# 1. Visualize Trump Topics
# Note: 'lda_trump', 'corpus_t', and 'dict_t' are the outputs from your previous step
vis_data_trump = gensimvis.prepare(lda_trump, corpus_t, dict_t)
pyLDAvis.display(vis_data_trump)

In [17]:
# 2. Visualize Biden Topics
vis_data_biden = gensimvis.prepare(lda_biden, corpus_b, dict_b)
pyLDAvis.display(vis_data_biden)

In [23]:
def inspect_representative_docs(filepath, lda_model, corpus, num_samples=3):
    """Reads file and prints top documents for each topic using the 'tweet' column."""
    df = pd.read_csv(filepath)
    
    # Map each document to its dominant topic and probability
    topic_results = []
    for doc_bow in lda_model[corpus]:
        # Get the highest probability topic for this document
        topics = sorted(lda_model.get_document_topics(doc_bow), key=lambda x: x[1], reverse=True)
        topic_results.append(topics[0] if topics else (None, 0))

    df['dominant_topic'] = [res[0] for res in topic_results]
    df['topic_prob'] = [res[1] for res in topic_results]

    print(f"\n--- REPRESENTATIVE DOCUMENTS FOR {filepath.split('/')[-1]} ---")
    for t in range(lda_model.num_topics):
        print(f"\nTOPIC {t}:")
        
        # Filter for the specific topic and sort by confidence
        top_docs = df[df['dominant_topic'] == t].sort_values('topic_prob', ascending=False).head(num_samples)
        
        if top_docs.empty:
            print("No documents found for this topic.")
            continue
            
        for _, row in top_docs.iterrows():
            # Using 'tweet' column as requested
            print(f"[{row['topic_prob']:.3f}] {str(row['tweet'])[:200]}...")

In [24]:
# Execution
inspect_representative_docs('../data/processed/trump_lda_ready.csv', lda_trump, corpus_t)
inspect_representative_docs('../data/processed/biden_lda_ready.csv', lda_biden, corpus_b)


--- REPRESENTATIVE DOCUMENTS FOR trump_lda_ready.csv ---

TOPIC 0:
No documents found for this topic.

TOPIC 1:
No documents found for this topic.

TOPIC 2:
No documents found for this topic.

TOPIC 3:
No documents found for this topic.

TOPIC 4:
[0.235] LIES

LIES

LIES

LIES

LIES 

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

LIES

L...
[0.235] #Debates2020 #Trump #Biden #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drink #drin...
[0.234] @realDonaldTrump #lies #lies #lies #lies #lies #lies #lies #lies #lies #lies #lies #lies #lies #lies #lies #Trump #TrumpIsALoser #TrumpIsPathetic #lies #lies #lies #lies #lies #lies #lies #lies #lies ...

TOPIC 5:
No documents found for this topic.

TOPIC 6:
No documents found for this topic.

TOPIC 7:
No documents fou

In [28]:
def run_bertopic_pipeline(docs, name):
    print(f"Starting BERTopic pipeline for {name}...")

    # 1. Embedding (Semantic Context)
    # 'all-mpnet-base-v2' is the high-quality standard for ideology/sentiment
    embedding_model = SentenceTransformer("all-mpnet-base-v2")

    # 2. Dimensionality Reduction (Structure)
    # n_neighbors=15 balances local (specific) vs global (broad) themes
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

    # 3. Clustering (Grouping)
    # min_cluster_size=15 prevents micro-clusters; 'eom' finds stable topics
    hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    # 4. Tokenization (Filtering)
    # Removing common words here prevents the "LIES LIES LIES" issue from LDA
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

    # 5. Weighting (Labeling)
    # c-TF-IDF finds keywords that make this cluster distinct from others
    ctfidf_model = ClassTfidfTransformer()

    # Initialize BERTopic with the modular components
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        verbose=True
    )

    # Fit the model
    topics, probs = topic_model.fit_transform(docs)
    
    return topic_model, topics, probs

In [29]:
# Execute for Trump
# Note: Use 'tweet' (raw) or 'lda_text' (preprocessed). For BERT, raw is often better.
trump_docs = pd.read_csv('../data/processed/trump_lda_ready.csv')['tweet'].astype(str).tolist()
topic_model_trump, topics_t, probs_t = run_bertopic_pipeline(trump_docs, "TRUMP")
print(topic_model_trump.get_topic_info().head(10))

# Execute for Biden
# Note: BERTopic performs better on 'tweet' (raw text) than 'lda_text' (processed)
biden_docs = pd.read_csv('../data/processed/biden_lda_ready.csv')['tweet'].astype(str).tolist()
topic_model_biden, topics_b, probs_b = run_bertopic_pipeline(biden_docs, "BIDEN")
print(topic_model_biden.get_topic_info().head(10))

Starting BERTopic pipeline for TRUMP...


2026-01-05 00:57:37,324 - BERTopic - Embedding - Transforming documents to embeddings.
Batches:   3%|â–Ž         | 440/16779 [03:03<1:53:19,  2.40it/s]


KeyboardInterrupt: 