In [1]:
%%capture
!pip install bertopic
!pip install datasets
!pip install openai

# Load the ML-ArXiv-Papers dataset

In [2]:
from datasets import load_dataset

# Train split of the CShorten/ML-ArXiv-Papers dataset.
dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]

# The abstracts are the documents the topic model is trained on;
# the titles are pulled out alongside them.
abstracts, titles = dataset["abstract"], dataset["title"]

### HDBSCAN: control the number of topics through the clustering model

In [3]:
from hdbscan import HDBSCAN

# Clustering model that is passed to BERTopic in the next cell.
# min_cluster_size is the setting used here to control how many topics come out
# (presumably a larger value gives fewer, larger topics; see the HDBSCAN docs).
hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

### Run the model (the commented-out arguments are the other options that could be passed to BERTopic)

In [4]:
from bertopic import BERTopic

# Train on everything except the first 300 abstracts.
# The slice is taken from `dataset` rather than from `abstracts` itself so that
# re-running this cell always yields the same documents; slicing `abstracts` in
# place would drop a further 300 on every execution.
# NOTE(review): the reason for skipping the first 300 is not recorded here.
abstracts = dataset["abstract"][300:]

topic_model = BERTopic(

  # Pipeline models (commented-out entries are optional components that are
  # not supplied in this run)
  # embedding_model=embedding_model,
  # umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  # vectorizer_model=vectorizer_model,
  # representation_model=representation_model,
  calculate_probabilities=True,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# topics: one topic assignment per document; probs: one row of topic
# probabilities per document (requested via calculate_probabilities=True).
topics, probs = topic_model.fit_transform(abstracts)

2025-04-15 19:40:58,019 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3656 [00:00<?, ?it/s]

2025-04-15 19:42:33,471 - BERTopic - Embedding - Completed ✓
2025-04-15 19:42:33,472 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-15 19:47:08,631 - BERTopic - Dimensionality - Completed ✓
2025-04-15 19:47:08,635 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [7]:
# Results: topic-probability vector of the first training document
# (one entry per topic found by the model fitted above).
probs[0]

array([0.00196831, 0.00206855, 0.00450759, 0.00381628, 0.00283973,
       0.00224487, 0.00287846, 0.00304864, 0.00484699, 0.00222548,
       0.00488951, 0.00322324, 0.00351174, 0.00240059, 0.00270235,
       0.00234652, 0.00338877, 0.00148895, 0.00304523, 0.00239978,
       0.00396434, 0.0022615 , 0.00259373, 0.00798656, 0.0023975 ,
       0.00256669, 0.00334759, 0.00290527, 0.00370954, 0.00274368,
       0.0020723 , 0.00276964, 0.00560759, 0.00440518, 0.00277987,
       0.00302243, 0.00238361, 0.00363097, 0.00253186, 0.00382322,
       0.00237475, 0.00343043, 0.00298541, 0.00256968, 0.40938045,
       0.00206569, 0.00579228, 0.00262943, 0.00221904, 0.00487611,
       0.0033906 , 0.01555394, 0.00231693, 0.00322902, 0.0055321 ,
       0.00496293, 0.00241121, 0.00440944, 0.00244957, 0.00238844,
       0.00339513, 0.00586865, 0.00608024, 0.00333491, 0.00239148,
       0.00385091, 0.00619197, 0.00340019, 0.00353656, 0.00282032,
       0.0072855 , 0.00382569, 0.00359348, 0.00281676, 0.00241

In [14]:
# Summary table of the discovered topics: id, document count, name,
# representative words and representative documents.
topic_model.get_topic_info()

# Optional: per-document view (left disabled).
#topic_model.get_document_info(abstracts)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,38202,-1_the_of_and_to,"[the, of, and, to, in, we, is, for, that, on]",[ Classification of high dimensional data fin...
1,0,5665,0_policy_reinforcement_rl_learning,"[policy, reinforcement, rl, learning, reward, ...",[ Policy gradient methods are among the most ...
2,1,3447,1_privacy_federated_fl_private,"[privacy, federated, fl, private, clients, dat...",[ Federated learning is a technique that enab...
3,2,3326,2_3d_object_video_objects,"[3d, object, video, objects, segmentation, ima...",[ Our goal in this work is to generate realis...
4,3,2303,3_graph_node_graphs_gnns,"[graph, node, graphs, gnns, nodes, gnn, embedd...",[ Graph neural networks have recently achieve...
...,...,...,...,...,...
154,153,110,153_flows_normalizing_density_flow,"[flows, normalizing, density, flow, invertible...",[ Normalizing flows have received a great dea...
155,154,108,154_ai_ethical_systems_ethics,"[ai, ethical, systems, ethics, intelligence, a...","[ Recently, the use of sound measures and met..."
156,155,101,155_process_event_logs_business,"[process, event, logs, business, mining, log, ...",[ Process Discovery is concerned with the aut...
157,156,101,156_skin_lesion_melanoma_lesions,"[skin, lesion, melanoma, lesions, cancer, imag...",[ Skin cancer is one of the most threatening ...


# MIND dataset
Upload the news.tsv file and keep only the entries with 150+ words

In [17]:
import csv

# Column positions read from each news.tsv row.
# Assumed to be the article title and abstract -- TODO confirm against the MIND schema.
TITLE_COL = 3
ABSTRACT_COL = 4
# Only entries whose combined text has at least this many words are kept.
MIN_WORDS = 150

mind_data = []           # full TSV rows of the retained entries
mind_data_abstract = []  # combined text of the retained entries (model input)

# Read as UTF-8 explicitly so decoding does not depend on the platform locale.
# NOTE(review): csv.reader applies its default double-quote handling; if
# news.tsv is raw, unquoted TSV, quoting=csv.QUOTE_NONE may be needed -- confirm.
with open('news.tsv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        # Blank or truncated lines do not have both columns; skip them
        # instead of failing with IndexError.
        if len(row) <= ABSTRACT_COL:
            continue
        # Join with a space so the last word of the first field and the first
        # word of the second are not fused into one token.
        info = " ".join(part for part in (row[TITLE_COL], row[ABSTRACT_COL]) if part)
        if len(info.split()) >= MIN_WORDS:
            mind_data_abstract.append(info)
            mind_data.append(row)

#### Use the default BERTopic model

In [22]:
# Fit a BERTopic model with default components on the filtered MIND texts.
from bertopic import BERTopic

topic_model_mind = BERTopic(
    calculate_probabilities=True,
)

# pred_mind: topic assignment per document; prob_mind: per-document topic probabilities.
pred_mind, prob_mind = topic_model_mind.fit_transform(mind_data_abstract)

In [23]:
# Topic-probability vector of the first MIND document.
prob_mind[0]

array([3.05514295e-309, 1.00000000e+000])

In [24]:
# Summary table of the topics found by the default BERTopic model on MIND.
topic_model_mind.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,92,0_the_of_to_and,"[the, of, to, and, in, on, fire, was, were, as]",[Firefighters Battle Gospers Mountain Blaze Ra...
1,1,41,1_the_to_and_of,"[the, to, and, of, in, that, was, for, with, her]","[Sex, Drugs and Ashton Kutcher: 10 Revelations..."


#### Use an OpenAI model to label the topics

In [26]:
import os
from getpass import getpass

import openai
from bertopic.representation import OpenAI

# Fine-tune topic representations with GPT.
# The API key is taken from the OPENAI_API_KEY environment variable, with an
# interactive prompt as fallback, so that no credential is stored in the notebook.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = openai.Client(api_key=api_key)

# Prompt sent for each topic; [DOCUMENTS] is the placeholder for the topic's documents.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
representation_model = OpenAI(client, model="gpt-4o-mini", chat=True, prompt=prompt)
topic_model_gpt = BERTopic(representation_model=representation_model, calculate_probabilities=True)

# NOTE: this rebinds `topics` and `probs`, which previously held the results of
# the arXiv model; from here on they refer to the MIND / GPT-labelled run.
topics, probs = topic_model_gpt.fit_transform(mind_data_abstract)

In [27]:
# Summary table of the MIND topics, with names produced by the OpenAI representation model.
topic_model_gpt.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,93,0_Wildfire Battles and Evacuations,[Wildfire Battles and Evacuations],['We Have 3 Sheep Left': California Rancher Su...
1,1,40,1_Hollywood Celebrities and Personal Struggles,[Hollywood Celebrities and Personal Struggles],"[Sex, Drugs and Ashton Kutcher: 10 Revelations..."


In [28]:
# Topic-probability vector of the first MIND document from the GPT-labelled model
# (`probs` was rebound by the fit_transform call on topic_model_gpt).
probs[0]

array([2.67226897e-309, 1.00000000e+000])