## Imports

In [1]:
import torch
# Report once, at start-up, whether PyTorch can see a CUDA device.
print("GPU is available" if torch.cuda.is_available() else "GPU is not available")

GPU is available


In [2]:
from bertopic import BERTopic
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, OpenAI
from bertopic.dimensionality import BaseDimensionalityReduction


from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wanjanss\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## OpenAI API access & prompt templates

In [3]:
import openai
# Shared OpenAI client; the functions and representation models below reuse it.
client = openai.OpenAI()

# Smoke test: send one tiny chat request to confirm that API access works.
smoke_test_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about recursion in programming."},
]
completion = client.chat.completions.create(model="gpt-4o-mini", messages=smoke_test_messages)

print(completion.choices[0].message)

ChatCompletionMessage(content='Function calls itself,  \nInfinite depths intertwine,  \nLogic’s dance unfolds.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None, annotations=[])


In [4]:
# System message sent with every topic-merging chat request.
# The literal must open with exactly three quotes: a fourth quote is not part of
# the delimiter and would end up as the first character of the prompt text.
system_prompt = """
You are a helpful assistant tasked with identifying overlapping topics.
"""

# Prompt template for the real request when topics are described by their topic words.
# merge_topics_llm replaces the [TOPICS] placeholder with one "<index>: <words>" line per topic.
instruction_prompt = """
Below I will provide a numbered list of topics. Each line represents the topic words of a topic. Return the index of the two most similar topics in python list format.
[TOPICS]

Return only the python list and nothing more!
"""

# Worked example (few-shot demonstration) that merge_topics_llm sends as a user turn
# before the real request; topics are numbered from 0.
demonstration_prompt = """
Below I will provide a numbered list of topic words. Each line represents the topic words of a topic. Return the index of the two most similar topics in python list format.

0: astronaut, rocket, planet, galaxy, NASA, telescope, satellite, mission, orbit, cosmos
1: apple, fruit, red, sweet, tree, orchard, juicy, vitamin, snack, fresh
2: computer, laptop, technology, device, screen, keyboard, processor, portable, battery, software
3: orange, fruit, citrus, vitamin, juicy, tropical, snack, fresh, tree, peel
4: car, vehicle, engine, road, drive, wheels, speed, travel, gasoline, transportation
5: book, library, read, pages, author, fiction, genre, chapter, knowledge, cover

Return only the python list and nothing more!
"""

# Assistant turn paired with demonstration_prompt: the two fruit topics (1 and 3).
demonstration_answer = """
[1, 3]
"""

In [5]:
# Prompt template for the real request when every topic is described by a single label.
# merge_topics_llm replaces the [TOPICS] placeholder with one "<index>: <label>" line per topic.
instruction_prompt_labeled = """
Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
[TOPICS]

Return only the python list and nothing more!
"""

# Worked example (few-shot demonstration) sent as a user turn before the real request.
# Topics are numbered from 0, exactly like the list merge_topics_llm builds, because the
# indices in the reply are used directly as topic ids; a 1-based example would invite
# off-by-one answers.
demonstration_prompt_labeled = """
Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.

0: Space Exploration
1: Artificial Intelligence in Industry
2: Ocean Conservation
3: Medieval History
4: AI applied in Industry

Return only the python list and nothing more!
"""

# Assistant turn paired with demonstration_prompt_labeled: the two AI-in-industry topics.
demonstration_answer_labeled = """
[1, 4]
"""

## Functions

In [6]:
def get_topic_words(topic_model, representation = "Representation"):
    """Return the chosen representation of every real topic, in topic-info order.

    Args:
        topic_model: object whose ``get_topic_info()`` returns a DataFrame with one
            row per topic (a fitted BERTopic model in this notebook).
        representation: name of the column to read, e.g. "Representation",
            "KeyBERT" or "OpenAI".

    Returns:
        A list with one entry per topic (whatever the column holds, typically a
        list of words or a one-element list with a label). The outlier topic -1
        is excluded.
    """
    topic_info = topic_model.get_topic_info()

    if "Topic" in topic_info.columns:
        # Drop the outlier row by its id rather than by position: when the model has
        # no outlier topic, the first row is a real topic and must be kept.
        topic_info = topic_info[topic_info["Topic"] != -1]
        return topic_info[representation].tolist()

    # No topic-id column to filter on: keep the previous positional behaviour,
    # which assumes the first row is topic -1.
    return topic_info[representation].tolist()[1:]

In [7]:
def clean_topic_words(topic_words):
    """Keep only the topics that are non-empty and contain no empty (falsy) word."""
    kept_topics = []
    for topic in topic_words:
        if not topic:
            # Empty topic (or None): nothing to evaluate.
            continue
        if not all(topic):
            # At least one word is empty/falsy.
            continue
        kept_topics.append(topic)
    return kept_topics

In [8]:
def evaluate_bertopic(topic_words, tokenized_docs):
    """Score a set of topics on coherence and diversity.

    Args:
        topic_words: list of topics, each a list of words; cleaned with
            clean_topic_words before scoring.
        tokenized_docs: tokenized corpus handed to the Coherence metric as ``texts``.

    Returns:
        The mean of the coherence score and the diversity score, each rounded to
        4 decimals before averaging. Both scores use the top 10 words per topic.
    """
    print(f"Length topic words before cleaning: {len(topic_words)}")
    cleaned_topics = clean_topic_words(topic_words)
    print(f"Length topic words after cleaning: {len(cleaned_topics)}")

    print(cleaned_topics)
    # The metric objects expect the topics under the 'topics' key.
    model_output = {'topics': cleaned_topics}
    print(model_output)

    coherence_metric = Coherence(topk=10, texts=tokenized_docs)
    diversity_metric = TopicDiversity(topk=10)
    topic_coherence_score = round(coherence_metric.score(model_output), 4)
    topic_diversity_score = round(diversity_metric.score(model_output), 4)

    print(f"Topic coherence score (NPMI) {topic_coherence_score}")
    print(f"Topic diversity score {topic_diversity_score}")

    combined = topic_coherence_score + topic_diversity_score
    return combined/2

In [9]:
def merge_topics_llm(topic_model, reduced_nr, representation="Representation", documents=None):
    """Merge topics pairwise, guided by an LLM, until only `reduced_nr` topics remain.

    Each round lists the current topics as "<index>: <description>" lines, asks the
    chat model (with one few-shot demonstration) for the two most similar topics,
    and merges that pair with ``topic_model.merge_topics``.

    Args:
        topic_model: fitted topic model exposing ``topic_sizes_`` and ``merge_topics``.
        reduced_nr: target number of topics (the outlier topic -1 is not counted).
        representation: column of the topic info used to describe each topic.
            "OpenAI" selects the label-style prompts; any other value selects the
            topic-word prompts.
        documents: documents the model was fitted on. Defaults to the notebook-level
            ``docs`` variable, which is what this function used before the
            parameter existed.

    Raises:
        ValueError: if the model's reply does not contain a list of at least two
            distinct, in-range integer topic indices.
        RuntimeError: if a merge does not reduce the number of topics (guards
            against an endless loop of paid API calls).
    """
    # Local imports: the notebook's import cell does not provide these modules.
    import ast
    import re

    if documents is None:
        documents = docs  # notebook-level document list (backward-compatible default)

    # Pick the prompt set once; it does not change between rounds.
    if representation == "OpenAI":
        prompt_template = instruction_prompt_labeled
        demo_question = demonstration_prompt_labeled
        demo_answer = demonstration_answer_labeled
    else:
        prompt_template = instruction_prompt
        demo_question = demonstration_prompt
        demo_answer = demonstration_answer

    def _count_topics():
        # Count real topics by id instead of assuming an outlier topic always exists.
        return sum(1 for topic_id in topic_model.topic_sizes_ if topic_id != -1)

    nr_of_topics = _count_topics()

    while nr_of_topics > reduced_nr:
        topic_words = get_topic_words(topic_model, representation)
        to_replace = "".join(
            f"{index}: {', '.join(words)}\n" for index, words in enumerate(topic_words)
        )
        real_instruction_prompt = prompt_template.replace('[TOPICS]', to_replace)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": demo_question},
            {"role": "assistant", "content": demo_answer},
            {"role": "user", "content": real_instruction_prompt},
        ]

        print(real_instruction_prompt)

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages
        )

        response_string = response.choices[0].message.content
        print(response_string)

        # The reply is untrusted text: never eval() it. Extract the first flat
        # bracketed list (this also tolerates markdown code fences around it) and
        # parse it as a plain Python literal.
        list_match = re.search(r"\[[^\[\]]*\]", response_string or "")
        if list_match is None:
            raise ValueError(f"No list found in model response: {response_string!r}")
        try:
            merge_list = ast.literal_eval(list_match.group(0))
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"Could not parse model response: {response_string!r}") from err

        indices_ok = all(
            isinstance(index, int)
            and not isinstance(index, bool)
            and 0 <= index < len(topic_words)
            for index in merge_list
        )
        if not indices_ok or len(set(merge_list)) < 2:
            raise ValueError(
                f"Expected at least two distinct topic indices in [0, {len(topic_words) - 1}], "
                f"got: {merge_list!r}"
            )

        topic_model.merge_topics(documents, list(merge_list))

        previous_nr = nr_of_topics
        nr_of_topics = _count_topics()
        if nr_of_topics >= previous_nr:
            raise RuntimeError(
                f"Merging {merge_list!r} did not reduce the number of topics ({nr_of_topics})."
            )


In [20]:
def merge_label_embeddings(topic_model, embedding_model, docs, nr_topics, representation, use_ctfidf=False):
    """Reduce topics using embeddings of their textual descriptions.

    Every row of the topic info (including the outlier row, if present) is turned
    into exactly one string, the strings are embedded with `embedding_model`, the
    result replaces ``topic_model.topic_embeddings_``, and ``reduce_topics`` is
    called.

    Args:
        topic_model: fitted topic model exposing ``get_topic_info`` and ``reduce_topics``.
        embedding_model: object with an ``encode(list_of_str)`` method.
        docs: documents the model was fitted on.
        nr_topics: target number of topics.
        representation: topic-info column to embed, e.g. "OpenAI" (labels) or
            "Representation" (top words).
        use_ctfidf: forwarded to ``reduce_topics``; leave False so the label
            embeddings assigned here are presumably the ones used (confirm against
            the installed BERTopic version).
    """
    topic_labels = topic_model.get_topic_info()[representation]
    print(topic_labels)

    # One string per topic. A one-element label list yields the label itself; a word
    # list is joined into a single text. Embedding the raw lists would not give one
    # sentence per topic, and flattening them would produce more texts than topics
    # whenever a list holds several items.
    label_texts = [
        label if isinstance(label, str) else ", ".join(str(part) for part in label if part)
        for label in topic_labels
    ]
    print(len(label_texts))

    topic_model.topic_embeddings_ = embedding_model.encode(label_texts)

    # Keyword arguments on purpose: in current BERTopic releases the third positional
    # parameter of reduce_topics is `images`, so a positional `use_ctfidf` would be
    # bound to the wrong parameter (verify against the installed version).
    topic_model.reduce_topics(docs, nr_topics=nr_topics, use_ctfidf=use_ctfidf)

## Read in data

In [10]:
# Load the tweet table from disk and preview its first rows.
basetable_path = "./data/realdonaldtrump.csv"
basetable = pd.read_csv(filepath_or_buffer=basetable_path)
basetable.head(n=5)

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945,,


In [11]:
%%time
# Use the tweet text (the 'content' column) of every row as the document collection
docs = basetable.loc[:, 'content'].tolist()

print(len(docs))

# Split the docs into tokens; this is necessary for the coherence and diversity evaluation later
tokenizer = RegexpTokenizer(r'\w+')
STOPWORDS = stopwords.words('english')
# Membership tests against a set are O(1); STOPWORDS itself stays a list for any other use.
stopword_set = set(STOPWORDS)

tokenized_docs = []
for doc in tqdm(docs, desc="Tokenizing Documents"):
    # Lowercase and split into word tokens
    tokens = tokenizer.tokenize(doc.lower())

    # Drop stopwords, purely numeric tokens and single-character tokens
    tokenized_docs.append([
        token for token in tokens
        if len(token) > 1 and token not in stopword_set and not token.isnumeric()
    ])

43352


Tokenizing Documents: 100%|██████████| 43352/43352 [00:02<00:00, 16807.64it/s]

CPU times: total: 2.41 s
Wall time: 2.59 s





In [12]:
# Descriptive statistics of the corpus: mean document length in characters and in
# whitespace-separated words.
texts = docs

# Totals over all texts
total_characters = sum(map(len, texts))
total_words = sum(len(text.split()) for text in texts)

# Per-text averages
average_characters = total_characters / len(texts)
average_words = total_words / len(texts)

# Report both averages
print(f"Average length of all texts: {average_characters:.2f} characters")
print(f"Average number of words in all texts: {average_words:.2f} words")

Average length of all texts: 131.53 characters
Average number of words in all texts: 20.76 words


## Embed and reduce documents, define modules, and train initial BERTopic model

In [13]:
# Embed the docs once and reduce the embeddings with UMAP up front, so that training the
# BERTopic model repeatedly does not have to redo either step.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

umap_reducer = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    verbose=True,
    random_state=42,  # fixed seed for a reproducible projection
)
reduced_embeddings = umap_reducer.fit_transform(embeddings)

Batches:   0%|          | 0/1355 [00:00<?, ?it/s]

UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, n_jobs=1, random_state=42, verbose=True)
Mon Jul 28 14:47:34 2025 Construct fuzzy simplicial set
Mon Jul 28 14:47:35 2025 Finding Nearest Neighbors
Mon Jul 28 14:47:35 2025 Building RP forest with 15 trees
Mon Jul 28 14:47:42 2025 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
	 6  /  15
	 7  /  15
	Stopping threshold met -- exiting after 7 iterations
Mon Jul 28 14:48:03 2025 Finished Nearest Neighbor Search
Mon Jul 28 14:48:06 2025 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Mon Jul 28 14:48:35 2025 Finished embedding


In [14]:
%%time
# Building blocks that are handed to BERTopic every time the model is (re)trained.

# Empty dimensionality reduction model, for faster re-training
empty_dimensionality_model = BaseDimensionalityReduction()

hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=1,
    ngram_range=(1, 1),
)

# Additional topic representations, keyed by the column name they get in the topic info.
representation_model = {
    "KeyBERT": KeyBERTInspired(),
    "OpenAI": OpenAI(client, model="gpt-4o-mini", chat=True),
}

CPU times: total: 0 ns
Wall time: 0 ns


In [22]:
%%time
# Train the initial BERTopic model on the pre-computed, already reduced embeddings.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=empty_dimensionality_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
    # nr_topics='auto'
)

# The reduced embeddings are supplied directly, so the documents are not embedded again.
topics, probs = topic_model.fit_transform(docs, embeddings=reduced_embeddings)

2025-07-28 14:54:19,615 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-28 14:54:19,617 - BERTopic - Dimensionality - Completed ✓
2025-07-28 14:54:19,620 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-28 14:54:21,962 - BERTopic - Cluster - Completed ✓
2025-07-28 14:54:21,977 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 49/49 [00:43<00:00,  1.13it/s]
2025-07-28 14:55:08,493 - BERTopic - Representation - Completed ✓


CPU times: total: 6.52 s
Wall time: 49.3 s


In [16]:
#scoring before
topic_words = get_topic_words(topic_model, "Representation")
score = evaluate_bertopic(topic_words, tokenized_docs)
print(f"Overall score: {score}")

Length topic words before cleaning: 48
Length topic words after cleaning: 48
[['realdonaldtrump', 'run', 'president', 'trump', 'donald', 'mr', 'trump2016', 'need', '2016', 'vote'], ['hillary', 'clinton', 'democrats', 'crooked', 'hunt', 'witch', 'fbi', 'collusion', 'russia', 'mueller'], ['thanks', 'true', 'thank', 'luck', 'good', 'great', 'yes', 'nice', 'hi', 'billmaher'], ['entrepreneurs', 'think', 'success', 'focus', 'big', 'work', 'don', 'champion', 'like', 'midas'], ['com', 'twitter', 'thank', 'makeamericagreatagain', 'www', 'pic', 'https', 'crowd', 'join', 'donaldjtrump'], ['fake', 'news', 'media', 'cnn', 'failing', 'ratings', 'nytimes', 'dishonest', 'story', 'bad'], ['golf', 'course', 'doral', 'scotland', 'trump', 'turnberry', 'club', 'http', 'trumpdoral', 'national'], ['border', 'wall', 'mexico', 'immigration', 'security', 'southern', 'democrats', 'illegal', 'country', 'laws'], ['hotel', 'chicago', 'tower', 'trump', 'trumpchicago', 'building', 'http', 'sign', 'realdonaldtrump', '

## Topic reduction

In [17]:
# Reduce the number of topics in the BERTopic model with agglomerative clustering.
#   use_ctfidf=True  -> cluster on the c-TF-IDF topic representation
#   use_ctfidf=False -> cluster on the average SBERT topic embedding

topic_model.reduce_topics(docs, nr_topics=26, use_ctfidf=True)

2025-07-28 14:50:57,548 - BERTopic - Topic reduction - Reducing number of topics
100%|██████████| 26/26 [00:24<00:00,  1.08it/s]
2025-07-28 14:51:23,710 - BERTopic - Topic reduction - Reduced number of topics from 49 to 26


<bertopic._bertopic.BERTopic at 0x186edad5810>

In [21]:
# Reduce the number of topics with agglomerative clustering on the SBERT embeddings of
# the topic descriptions.
#   representation="Representation" -> embed the top-10 words of each topic
#   representation="OpenAI"         -> embed the LLM-generated topic labels
# use_ctfidf should be left as False.

merge_label_embeddings(topic_model, embedding_model, docs, 26, "OpenAI", use_ctfidf=False)

2025-07-28 14:53:31,217 - BERTopic - Topic reduction - Reducing number of topics


0                   [Trump Rally and Campaign Messages]
1                      [Support for Trump as President]
2     [Trump-Russia Investigation and Accusations Ag...
3                       [Thanks and Good Luck Messages]
4          [Empowering Young Entrepreneurs for Success]
5     [Trump Rally Announcements and Voting Encourag...
6                      [Fake News and Media Corruption]
7                       [Best Golf Courses in Scotland]
8     [U.S. Southern Border Security and Immigration...
9            [Trump Hotels Overview in Chicago and NYC]
10        [Support for Veterans and Emergency Response]
11          [China Trade Deals and Tariff Negotiations]
12       [Twitter Praise and Thanks for Public Figures]
13                       [Sports and Teams Discussions]
14            [Obamacare Repeal and Replacement Debate]
15       [Political Endorsements for Strong Candidates]
16    [Positive Economic Performance and Stock Marke...
17                        [CelebApprentice Discu

100%|██████████| 26/26 [00:20<00:00,  1.27it/s]
2025-07-28 14:53:53,605 - BERTopic - Topic reduction - Reduced number of topics from 49 to 26


In [23]:
# Reduce the number of topics with the LLM-prompting method: the chat model is asked
# repeatedly for the two most similar topics, which are then merged.
#   representation="Representation" -> describe each topic by its top-10 words
#   representation="OpenAI"         -> describe each topic by its LLM-generated label

merge_topics_llm(topic_model, 25, "OpenAI")


Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Support for Trump as President
1: Trump-Russia Investigation and Democratic Allegations
2: Gratitude and Good Luck Wishes
3: Empowering Young Entrepreneurs for Success
4: Trump Rally Announcements
5: Fake News and Media Criticism
6: Best Golf Courses in Scotland and Trump International Golf Links
7: Border Security and Immigration Reform
8: Luxury Hotels and Restaurants in Chicago and NYC
9: Support for Veterans and Disaster Response
10: China Trade and Tariffs Negotiations
11: Twitter Celebrations and Honorary Mentions
12: Sports and Events Discussion
13: Repeal and Replace Obamacare
14: Political Endorsements and Support for Candidates
15: Economic Growth and Stock Market Success
16: Celeb Apprentice Discussion
17: Twitter Image Links
18: Criticism of Barack Obama and His Presidency
19: Book Recommendations and Authors
20: Donald Tru

100%|██████████| 48/48 [00:35<00:00,  1.36it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Presidential Campaign and Hillary Clinton's Defeat
1: Good Luck Wishes
2: Entrepreneurial Mindset for Success
3: Trump Rally Announcements and Thank Yous
4: Fake News and Media Criticism
5: Trump International Golf Courses and Awards in Scotland
6: Border Security and Immigration Policy
7: Trump Hotels and Luxury Accommodations
8: Support for Disaster Response and Victims
9: US-China Trade Relations and Negotiations
10: Trump Support and Gratitude on Twitter
11: Yankees and Derek Jeter in Baseball
12: Repeal and Replace Obamacare Debate
13: Political Endorsements for Strong Candidates
14: Economic Optimism and Stock Market Performance
15: Celeb Apprentice Discussion
16: Twitter Images and Links
17: Criticism of Barack Obama and His Administration
18: Books by Donald Trump and Authors on America
19: Trump Signature Collection

100%|██████████| 47/47 [00:35<00:00,  1.33it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump-Russia Controversy and 2016 Election Dynamics
1: Gratitude and Good Wishes
2: Entrepreneurial Mindset and Success Principles
3: Trump Campaign Rallies
4: Fake News and Media Criticism
5: Trump International Golf Courses in Scotland
6: Border Security and Immigration Policy Debate
7: Trump Hotel Properties and Luxury Experiences
8: Disaster Response and Support
9: China-U.S. Trade Negotiations and Tariffs
10: Praise for Charlie Kirk and Team Trump
11: Yankees and Derek Jeter's performance
12: Obamacare Repeal and Replacement Debate
13: Political Endorsements for Strong Candidates
14: Economic Performance and Stock Market Growth
15: CelebApprentice Boardroom Reactions
16: Twitter Images
17: Obama's Loyalty and Campaigning Issues
18: Books by Influential Authors and Trump
19: Trump Signature Collection at Macy's
20: Trump Polling Le

100%|██████████| 46/46 [00:33<00:00,  1.36it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump-Russia Investigation and 2016 Election
1: Expressions of Gratitude and Well Wishes
2: Entrepreneurial Mindset for Success
3: Trump Rally Announcements and Thank Yous
4: Fake News Media Criticism
5: Trump International Golf Courses in Scotland
6: Border Security and Immigration Policy Debate
7: Trump Hotels and Luxury Accommodations
8: Trump Signature Collection at Macy's
9: Condolences, Honor, and Support for Victims and Veterans
10: US-China Trade Relations and Tariffs
11: Yankees Baseball
12: Obamacare Repeal and Healthcare Costs
13: Political Endorsements and Strong Support for Candidates
14: Positive Economic Trends and Stock Market Growth
15: Celeb Apprentice Discussion
16: Twitter Image Links
17: Criticism of Barack Obama and his presidency
18: books and authors
19: 2016 GOP Polls and Trump’s Lead
20: Fox News Interviews
21

100%|██████████| 45/45 [00:40<00:00,  1.11it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Election Campaign
1: Thanks and Good Luck Messages
2: Keys to Entrepreneurial Success
3: Media Criticism and "Fake News" Claims
4: Best Golf Courses and Rankings
5: Southern Border Security and Wall Construction
6: Trump Hotels and Luxury Accommodations in Chicago and Toronto
7: Trump Signature Collection at Macy's
8: Support for First Responders and Victims of Disasters
9: China Trade Deal and Tariff Negotiations
10: Sports Discussions on NFL and MLB
11: Obamacare Repeal and Premium Concerns
12: Political Endorsements for Congressional Candidates
13: Economic Growth and Market Records
14: Celeb Apprentice Discussions
15: Twitter Image Links
16: Criticism of Barack Obama's Presidency and Campaigning Habits
17: Books by Trump and Kessler
18: Trump Poll Numbers and Leads in 2015 GOP Race
19: Interview Appearances on Fox News
2

100%|██████████| 44/44 [00:39<00:00,  1.12it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Campaign and Controversies
1: Appreciation and Good Wishes
2: Trump Merchandise and Hotel Information
3: Entrepreneurial Mindset and Success Principles
4: Fake News and Media Criticism
5: Trump International Golf Courses in Scotland and Doral
6: Border Security and Immigration Policy Debate
7: Tribute to First Responders and Hurricane Preparedness
8: US-China Trade and Tariffs Negotiations
9: New York Yankees and Sports Highlights
10: Obamacare Premiums and Healthcare Debate
11: Political Endorsements for Strong Candidates
12: Economic Growth and Stock Market Records
13: Celeb Apprentice Discussions
14: Twitter Images and Links
15: Obama Criticism and Campaigning
16: Books by Donald Trump and Recommendations
17: Trump Polling Dominance 2015
18: Interviews on Fox News Tonight
19: Wind Turbines and Environmental Impact in Scot

100%|██████████| 43/43 [00:35<00:00,  1.23it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Presidential Campaign Support
1: Appreciation and Well Wishes
2: Entrepreneurial Mindset for Success
3: Fake News and Media Criticism
4: Golf Courses in Scotland and Trump Properties
5: Border Security and Immigration Reform
6: Hurricane Response and Support
7: China Trade Negotiations and Tariff Implications
8: Yankees and NFL Discussions
9: Obamacare Repeal and Healthcare Reform
10: Political Endorsements for Strong Candidates Supporting Veterans and the Second Amendment
11: Economic Growth and Stock Market Performance
12: Celeb Apprentice Discussion
13: Twitter Images & Links
14: Criticism of Barack Obama and Campaign Tactics
15: Books and Author Promotions
16: Trump Polling Surge 2015
17: Fox News Interviews
18: Opposition to Wind Turbines in Scotland
19: Iran Nuclear Deal and Relations
20: US foreign policy in Syria and

100%|██████████| 42/42 [00:29<00:00,  1.44it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Campaign Support
1: Gratitude and Well Wishes
2: Entrepreneurial Mindset and Success Strategies
3: Fake News Media Criticism
4: Immigration Reform and Border Security
5: Support for First Responders and Victims in Crisis Events
6: China Trade and Tariffs Discussion
7: Yankees and NFL Connections
8: Obamacare Repeal and Healthcare Reform
9: Endorsements for Strong Candidates Supporting the Second Amendment and Military
10: Positive Economic Indicators and Record Stock Market Growth
11: Celeb Apprentice Season 13 Discussion
12: Twitter Images
13: Obama Presidency Criticism
14: Books by Notable Authors
15: Trump Polling Leads in 2015 Presidential Race
16: Interview Schedule on Fox News
17: Opposition to Wind Turbines in Scotland
18: Iran Nuclear Deal and Relations
19: Syria and ISIS Strategy during the Trump Presidency
20: Inte

100%|██████████| 41/41 [00:28<00:00,  1.42it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Support and Appreciation
1: Appreciation and Good Wishes
2: Entrepreneurs' Mindset for Success
3: Fake News and Media Criticism
4: Immigration Reform and Border Security
5: Support for First Responders and Victims of Disasters
6: China Trade and Tariffs Negotiations
7: Yankees and NFL Highlights
8: Obamacare Repeal and Healthcare Plans
9: Political Endorsements for Strong Candidates on Crime, Borders, and Military Support
10: Economic Growth and Employment Success
11: Celeb Apprentice Season 13 Discussion
12: Twitter Images and Links
13: Criticism of President Obama
14: Books by Influential Authors
15: Interviews on Fox News
16: Opposition to Wind Turbines in Scotland
17: Iran Nuclear Deal and Sanctions Debate
18: U.S. Military Involvement in Syria and ISIS
19: Media Interviews on Political Campaigns and Issues
20: Internati

100%|██████████| 40/40 [00:29<00:00,  1.38it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Support for Donald Trump 2016 Presidential Campaign
1: Appreciation and Good Wishes
2: Success Mindset for Entrepreneurs
3: Fake News Media and its Impact on Public Perception
4: Immigration and Border Security Debate
5: Support for First Responders and Victims of Tragedies
6: China Trade and Tariffs Discussions
7: Celebrity Apprentice Season 13 Preview
8: Yankees and Derek Jeter's Impact
9: Obamacare and Healthcare Reform Debate
10: Congressional Endorsements for MAGA Supporters
11: Economic Growth and Job Market Optimism
12: Twitter Posts and Media Links
13: Barack Obama Campaigns and Vacations
14: Books and Authors
15: Media Interviews on Fox News
16: Opposition to Wind Turbines in Scotland
17: Iran Nuclear Deal and Sanctions Debate
18: U.S. Military Actions Against ISIS in Syria
19: Mitt Romney Interviews on 2012 Election and Barac

100%|██████████| 39/39 [00:29<00:00,  1.33it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Presidential Campaign Support
1: Expressions of Gratitude and Good Wishes
2: Entrepreneurial Mindset and Success Strategies
3: Fake News Media Criticism
4: Immigration Reform and Border Security Debate
5: Honor and Support for First Responders and Victims
6: China Trade Relations and Tariffs
7: Celebrity Apprentice Season 13 Premiere
8: Yankees and Sports Heroes
9: Obamacare Repeal and Premium Impact
10: Economic Growth and Job Market Optimism
11: Twitter Image Links
12: Obama Criticism and Campaigning
13: Book Recommendations and Author Praise
14: Fox News Interviews
15: wind turbines in Scotland and environmental concerns
16: Iran Nuclear Policy and Diplomatic Relations
17: US Involvement in Syria and ISIS Counterterrorism
18: Political Interviews and Debates 2012
19: International Meetings with World Leaders
20: Rubio vs.

100%|██████████| 38/38 [00:29<00:00,  1.30it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump 2016 Presidential Campaign Support
1: Positive Well Wishes
2: Entrepreneurial Mindset and Success Strategies
3: Criticism of the Fake News Media
4: Immigration and Border Security Reform
5: Celeb Apprentice Season 13 Premiere
6: prayers and support for victims and responders
7: U.S.-China Trade Negotiations and Tariffs
8: Yankees Baseball and Players
9: Obamacare Controversy and Premium Increases
10: Economic Growth and Record Stock Market Performance
11: Twitter Media Links
12: Criticism of Barack Obama
13: Trump Books and Readings
14: Fox News Interviews
15: Opposition to Wind Turbines in Scotland
16: Iran Nuclear Deal and Sanctions Debate
17: US involvement in Syria and the fight against ISIS
18: Interviews on Political Campaigns and Debates in 2012
19: Welcome Meetings with World Leaders
20: North Korea Denuclearization and I

100%|██████████| 37/37 [00:30<00:00,  1.20it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Support for Trump 2016 Presidential Run
1: Gratitude and Good Wishes
2: Success Mindset for Entrepreneurs
3: Fake News Media Criticism
4: Immigration Reform and Border Security
5: Celebrity Apprentice Episodes and Schedule
6: Hurricane Response and Support
7: China-US Trade Negotiations and Tariffs
8: Yankees and NFL Discussion
9: Obamacare Premiums and Repeal Debate
10: Economic Growth and Job Market Optimism
11: Twitter Images Sharing
12: Obama Campaign Controversies
13: books and authors
14: Fox News Interviews Tonight
15: Wind Turbines in Scotland and Their Environmental Impact
16: Iran Nuclear Deal Controversy
17: US involvement in Syria and ISIS dynamics
18: Interviews on Political Debates and Opinions on Barack Obama
19: Prime Minister Meetings and Welcomes
20: North Korea Relations and Denuclearization Efforts
21: Rubio vs. Cru

100%|██████████| 36/36 [00:32<00:00,  1.09it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump Support and 2016 Campaign
1: Appreciation and Well Wishes
2: Entrepreneurial Success Mindset
3: Border Wall and Immigration Security
4: Celebrity Apprentice Episode Promotion
5: Support for First Responders and Victims during Hurricane Events
6: Trade Relations and Tariffs with China
7: Yankees and NFL Players Discussion
8: Obamacare Debate and Healthcare Reform
9: Economic Growth and Stock Market Success
10: Twitter Images
11: Criticism of Barack Obama's Presidency
12: Books by Donald Trump and Related Authors
13: Interview Schedule on Fox News
14: Wind Turbines in Scotland and Their Environmental Impact
15: Iran's Nuclear Program and International Relations
16: U.S. Involvement in Syria and the Fight Against ISIS
17: Interviews on the 2012 Election and Economic Policy
18: International Meetings with Prime Ministers
19: North Ko

100%|██████████| 35/35 [00:28<00:00,  1.23it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Donald Trump 2016 Campaign and Support
1: Gratitude and Well Wishes
2: Entrepreneurial Success Mindset
3: Border Security and Wall Construction
4: Celebrity Apprentice Episode Reminder
5: Appreciation for First Responders and Veterans during Hurricanes
6: Trade Relations and Tariffs with China
7: Yankees and NFL Discussion
8: ObamaCare Repeal and Healthcare Criticism
9: Positive Economic Growth and Stock Market Performance
10: Twitter Images Sharing
11: Criticism of Barack Obama as President
12: Interviews on Fox News
13: Wind Turbines and Scotland's Environmental Concerns
14: Iran Nuclear Weapons and Sanctions Debate
15: US Military Involvement in Syria and ISIS Operations
16: GOP Endorsement and Presidential Election Interviews
17: Welcome Messages to World Leaders at the White House
18: US-North Korea Summit and Denuclearization Eff

100%|██████████| 34/34 [00:28<00:00,  1.21it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Donald Trump 2016 Campaign Support
1: Expressions of Gratitude and Good Luck
2: Entrepreneurial Mindset and Success Principles
3: Immigration and Border Security Debate
4: Celebrity Apprentice Promotion
5: Support for Families and First Responders During Crisis
6: U.S.-China Trade Relations and Tariffs
7: Yankees and Derek Jeter Performance
8: ObamaCare Repeal and Premium Concerns
9: Economic Growth and Job Market Performance
10: Twitter Images
11: Criticism of Barack Obama as President
12: Fox News Interviews Tonight
13: Wind Turbines Impact on Scotland's Beauty and Wildlife
14: Iran Nuclear Weapons and Sanctions Debate
15: US Military Involvement in Syria and ISIS Policy
16: Mitt Romney Interviews on Political Campaigns and Debates
17: International Prime Minister Meetings at the White House
18: Rubio vs. Cruz Controversy
19: North K

100%|██████████| 33/33 [00:25<00:00,  1.27it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Donald Trump Support and Polling in 2016 Presidential Campaign
1: Gratitude and Good Luck Wishes
2: Entrepreneurial Success Mindset
3: Immigration and Border Security Debate
4: Honoring Heroes and Supporting Families
5: China-U.S. Trade Relations and Tariffs
6: Yankees and NFL Discussions
7: Obamacare Repeal and Healthcare Costs
8: Economic Growth and Stock Market Records
9: Twitter Images and Links
10: Criticism of President Obama
11: Fox News Interviews Tonight
12: Opposition to Wind Turbines in Scotland
13: Iran's Nuclear Threat and Sanctions Debate
14: U.S. Military Withdrawal and ISIS Defeat in Syria
15: Interviews on Politics and Elections
16: Welcoming World Leaders at the White House
17: Ted Cruz and Marco Rubio's Rivalry
18: North Korea Summit and Economic Potential
19: OPEC and Gas Prices Debate
20: Mitt Romney's Presidential

100%|██████████| 32/32 [00:23<00:00,  1.36it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Donald Trump 2016 Presidential Campaign Support
1: Gratitude and Well Wishes
2: Empowering Entrepreneurs for Success
3: Border Security and Immigration Reform
4: Honor and Remembrance of Heroes and Victims
5: China-U.S. Trade Tariffs and Economic Impact
6: Yankees and NFL Discussions
7: Repeal and Replace Obamacare
8: Economic Growth and Stock Market Success
9: Twitter Image Links
10: Obama Criticism and Political Discontent
11: Fox News Interviews Tonight
12: Opposition to Wind Turbines in Scotland
13: Iran Nuclear Weapons and Sanctions Debate
14: U.S. Military Strategy and ISIS in Syria
15: Interviews on 2012 Election and Political Debates
16: Welcome Prime Ministers at the White House
17: Cruz vs. Rubio: Accusations and Rivalry
18: North Korea Summit and Economic Potential
19: Oil Prices and the Keystone XL Pipeline Debate
20: Mitt 

100%|██████████| 31/31 [00:27<00:00,  1.13it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Donald Trump and 2016 Election Reactions
1: Expressions of Gratitude and Good Luck
2: Entrepreneurial Mindset for Success
3: Border Security and Immigration Policy
4: Honor and Support for Heroes and Victims
5: US-China Trade and Tariff Relations
6: Yankees and NFL Enthusiasm
7: Obamacare Repeal and Healthcare Costs
8: Stock Market Dynamics and Economic Growth
9: Twitter Images and Links
10: Media Interviews Tonight and Tomorrow
11: Wind Turbines Controversy in Scotland
12: Iran Nuclear Weapons and Sanctions Debate
13: U.S. Military Involvement in Syria and ISIS
14: Interviews on Election Strategies and Political Commentary
15: Global Leaders at the White House
16: North Korea-U.S. Relations and Denuclearization Efforts
17: Rubio vs. Cruz: Accusations and Controversies
18: OPEC and Oil Prices Impact on Gasoline Costs
19: Mitt Romney's 

100%|██████████| 30/30 [00:24<00:00,  1.22it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump Support and Birthday Wishes
1: Good Luck Wishes
2: Entrepreneurial Mindset and Success
3: Border Security and Immigration Reform
4: Honor and Remembrance for Heroes and Victims
5: US-China Trade Dynamics and Tariffs
6: Yankees and Football Connections
7: Repeal and Replace ObamaCare
8: Stock Market and Economic Growth
9: Twitter Image Links and Keywords
10: Interviews on Fox News and Sean Hannity
11: Opposition to Wind Turbines in Scotland
12: Iran Nuclear Deal and Sanctions Debate
13: U.S. Involvement in Syria and ISIS
14: Welcome of World Leaders at the White House
15: Rubio vs. Cruz: Accusations of Dishonesty and Political Rivalry
16: North Korea Relations and Denuclearization Efforts
17: Oil Prices and Energy Policies
18: Mitt Romney's Presidential Campaign and Debate Performance
19: Miss Universe Pageant 2012
20: Coronavirus

100%|██████████| 29/29 [00:24<00:00,  1.21it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Donald Trump Birthday and Support Activities
1: Entrepreneurial Success and Big Thinking
2: Border Security and Immigration Policy
3: Hurricane Response and Heroes
4: China-US Trade Relations and Tariffs
5: Yankees and Sports Commentary
6: Obamacare Premiums and Repeal Debate
7: Economic Growth and Record Stock Market Performance
8: Twitter Image Links
9: Interview Schedule on Fox News
10: Opposition to Wind Turbines in Scotland
11: Iran Nuclear Weapons and Sanctions Debate
12: Syria and ISIS Strategy
13: Welcome Messages to Prime Ministers at the White House
14: Rubio vs. Cruz Controversies
15: Korea Summit and Denuclearization Efforts
16: Keystone Pipeline and Gas Prices
17: Mitt Romney's Political Landscape and Election Performance
18: Miss Universe Pageant 2012
19: COVID-19 Response and Testing Efforts
20: Obama National Debt Conce

100%|██████████| 28/28 [00:21<00:00,  1.30it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump Support and Hillary Criticism
1: Business Success and Entrepreneurial Mindset
2: Border Security and Immigration Policy
3: Support for First Responders and Victims During Hurricanes
4: U.S.-China Trade Relations and Tariffs
5: Yankees and Derek Jeter's Performance
6: Obamacare Premiums and Repeal Efforts
7: Positive Economic Outlook and Stock Market Growth
8: Twitter Images Sharing
9: Fox News Interviews Tonight
10: Opposition to Wind Turbines in Scotland
11: Iran's Nuclear Threat and Diplomatic Tensions
12: Syria and ISIS Military Policy
13: Prime Minister Welcome Messages
14: North Korea-US Relations and Denuclearization Efforts
15: Keystone Pipeline and Oil Prices
16: Mitt Romney's Presidential Campaign
17: Miss Universe Pageant 2012
18: Coronavirus Response and Testing
19: National Debt and Barack Obama
20: Tax Cuts and Refor

100%|██████████| 27/27 [00:20<00:00,  1.30it/s]



Below I will provide a numbered list of topics. Each line represents a topic. Return the index of the two most similar topics in python list format.
0: Trump Support and Impeachment Discussion
1: Border Security and Immigration Policy Debate
2: Support for First Responders and Veterans During Disasters
3: U.S.-China Trade Relations and Tariffs
4: Yankees and NFL Discussion
5: Repeal and Replace ObamaCare
6: Economic Growth and Stock Market Records
7: Twitter Media Links
8: Fox News Interviews Tonight
9: Opposition to Alex Salmond's Wind Turbines in Scotland
10: Iran Nuclear Weapons and Sanctions Debate
11: U.S. Military Role in Syria and ISIS Defeat
12: International Prime Minister Meetings
13: North Korea Diplomatic Relations and Denuclearization Efforts
14: Oil Prices and the Keystone XL Pipeline Debate
15: Mitt Romney's Presidential Campaign and Debate Performance
16: Miss Universe Pageant 2012
17: Federal Government Response to Coronavirus Pandemic
18: National Debt Under Barack O

100%|██████████| 26/26 [00:18<00:00,  1.41it/s]


In [24]:
# Scoring after: re-run the evaluation on the model's current topic words.
# "Representation" is the representation name handed to get_topic_words
# (presumably the default keyword representation -- confirm against get_topic_words).
topic_words = get_topic_words(topic_model, "Representation")
# evaluate_bertopic receives the topic words and the tokenized corpus; assumed to
# wrap the OCTIS coherence/diversity metrics imported at the top -- TODO confirm.
score = evaluate_bertopic(topic_words, tokenized_docs)
print(f"Overall score: {score}")

Length topic words before cleaning: 25
Length topic words after cleaning: 25
[['realdonaldtrump', 'trump', 'com', 'great', 'thanks', 'twitter', 'thank', 'president', 'https', 'http'], ['border', 'wall', 'mexico', 'immigration', 'security', 'democrats', 'southern', 'illegal', 'country', 'laws'], ['honor', 'twitter', 'pic', 'prayers', 'today', 'com', 'hurricane', 'great', 'families', 'fema'], ['china', 'trade', 'tariffs', 'chinese', 'deal', 'farmers', 'http', 'countries', 'currency', 'dollars'], ['yankees', 'nfl', 'rod', 'nflcommish', 'game', 'derek', 'great', 'bills', 'players', 'anthem'], ['economy', 'jobs', 'market', 'unemployment', 'stock', 'numbers', 'record', 'high', '000', 'economic'], ['pic', 'twitter', 'com', 'z0i7wbsgtp', 'gizwksbuus', 'p5imhmjqs1', 'dnfxkc8uug', '9jgdue5bf8', 'jds4zuxxjg', 'knvqf6jdil'], ['interviewed', '00', 'enjoy', 'tonight', 'foxnews', 'interview', '10', 'seanhannity', 'morning', 'foxandfriends'], ['wind', 'turbines', 'alexsalmond', 'scotland', 'ugly', 'bi

## Visualization

In [25]:
# Set LLM labels as topic labels.
# The labels are passed as a {topic_id: label} mapping instead of a positional
# list: a list must be ordered from the lowest to the highest topic ID, which
# would silently depend on the iteration order of the dict from get_topics().
openai_representations = topic_model.get_topics(full=True)["OpenAI"]
llm_labels_by_topic = {
    # keep only the first line of the first entry's text for each topic
    topic_id: representation[0][0].split("\n")[0]
    for topic_id, representation in openai_representations.items()
}
# Still exposed as a plain list for any other cell that reads `llm_labels`.
llm_labels = list(llm_labels_by_topic.values())
topic_model.set_topic_labels(llm_labels_by_topic)

In [29]:
# Project the embeddings down to two dimensions so documents can be plotted.
# random_state is fixed so the layout is reproducible between runs.
reduced_embeddings_2d = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    verbose=True,
    random_state=42,
).fit_transform(embeddings)

UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_jobs=1, random_state=42, verbose=True)
Mon Jul 28 15:13:47 2025 Construct fuzzy simplicial set
Mon Jul 28 15:13:47 2025 Finding Nearest Neighbors
Mon Jul 28 15:13:47 2025 Building RP forest with 15 trees
Mon Jul 28 15:13:48 2025 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
	 6  /  15
	 7  /  15
	Stopping threshold met -- exiting after 7 iterations
Mon Jul 28 15:13:55 2025 Finished Nearest Neighbor Search
Mon Jul 28 15:13:56 2025 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Mon Jul 28 15:14:20 2025 Finished embedding


In [27]:
# Visualize the topic hierarchy.
# custom_labels=True asks BERTopic to show the custom labels set on the model
# (via set_topic_labels) instead of the default topic names.
fig_hierarchy = topic_model.visualize_hierarchy(custom_labels=True)

# Save the interactive plot to an HTML file (export is disabled; uncomment to write it)
# fig_hierarchy.write_html("data/output_mergeGPT/topic_hierarchy_trump_beforeMergeGPT.html")

# Optional: Display the figure (if running in Jupyter or a script)
fig_hierarchy.show()

In [32]:
# Visualize the documents and topics on the 2-D UMAP projection.
# `reduced_embeddings_2d` is the projection computed specifically for plotting;
# visualize_documents draws the first two columns of whatever it is given, so
# it must receive the 2-D array rather than a higher-dimensional reduction.
# NOTE(review): BERTopic documents `sample` as a fraction between 0 and 1;
# confirm that 5 is intended (e.g. 0.05 for 5% of each topic's documents).
fig_docviz = topic_model.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings_2d,
    custom_labels=True,
    hide_annotations=True,
    sample=5,
)

# Save the interactive plot to an HTML file (export is disabled; uncomment to write it)
# fig_docviz.write_html("data/output_mergeGPT/docviz_trump_beforeMergeGPT.html")

# Optional: Display the figure (if running in Jupyter or a script)
fig_docviz.show()