In [None]:
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
import os
import pandas as pd
import pickle
from pprint import pprint
import re
from sentence_transformers import SentenceTransformer

In [None]:
# Input: CSV file holding the speeches (placeholder, set before running).
speeches_path = 'IN_PATH'
# Output: directory that receives the model, its parameters and the topic labels
# (placeholder, set before running).
topic_model_path = 'OUT_PATH'

In [None]:
# Load the speeches table from CSV, using '\n' as the row terminator,
# then report its (rows, columns) shape and preview the first rows.
df = pd.read_csv(filepath_or_buffer=speeches_path, lineterminator='\n')
print(df.shape)

df.head(n=5)

Dataframe columns: `speech_id`, `date`, `speaker`, `party`, `text`. Only the `text` column is used as input for the topic model below.

In [None]:
# Extract the speech texts as a plain Python list (the documents to be modelled).
speeches = df.loc[:, 'text'].tolist()
print('Number of speeches: {}'.format(len(speeches)))

In [None]:
# Output files, all located inside topic_model_path:
#   tm.tm            -> the saved BERTopic model
#   tm_params.txt    -> the hyper parameters used for this run
#   tm_labels.pickle -> the per-speech topic assignments
bertopic_model_path = os.path.join(topic_model_path, 'tm.tm')
bertopic_model_params_path = os.path.join(topic_model_path, 'tm_params.txt')
bertopic_model_topic_labels_path = os.path.join(topic_model_path, 'tm_labels.pickle')

# Create the output directory up front: otherwise the save cells fail with
# FileNotFoundError only after the (expensive) training run has finished.
# An empty path means "current directory", which needs no creation.
if topic_model_path:
    os.makedirs(topic_model_path, exist_ok=True)

print(bertopic_model_path)

In [None]:
# define hyper parameters
embedding_type = 'sentence' # 'sentence', 'twitter_bert'
clustering_algorithm = 'hdbscan' # 'hdbscan', 'kmeans'

n_clusters = 10 # n topics for kmeans

# Minimum cluster size passed to HDBSCAN.
# NOTE(review): per the BERTopic docs, min_topic_size is not used when a custom
# hdbscan_model is supplied, so this value (not min_topic_size) governs the
# HDBSCAN clusters. Confirm which of the two values was intended.
hdbscan_min_cluster_size = 15

bm25_weighting = True
reduce_frequent_words = True

top_n_words = 10
min_topic_size = 10

diversity = 0.2

diversify = True

# Optional representation model; with diversify=False none is used.
if diversify:
    representation_model = MaximalMarginalRelevance(diversity=diversity)
else:
    representation_model = None

######################

# embedding model
# Unsupported values raise instead of falling through: a silent fall-through
# would leave embedding_model undefined (or stale from an earlier run of this
# cell) while params would still record the new setting.
if embedding_type == 'sentence':
    embedding_model = SentenceTransformer('all-mpnet-base-v2')
elif embedding_type == 'twitter_bert':
    raise NotImplementedError(
        "embedding_type 'twitter_bert' is not implemented yet; use 'sentence'"
    )
else:
    raise ValueError('Unknown embedding_type: {!r}'.format(embedding_type))

# clustering algorithm
if clustering_algorithm == 'hdbscan':
    cluster_model = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size,
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)
elif clustering_algorithm == 'kmeans':
    # KMeans is not among the notebook's top-level imports; import it here so
    # that this branch no longer fails with a NameError.
    from sklearn.cluster import KMeans
    cluster_model = KMeans(n_clusters=n_clusters)
else:
    raise ValueError('Unknown clustering_algorithm: {!r}'.format(clustering_algorithm))

# c-tf-idf
ctfidf_model = ClassTfidfTransformer(bm25_weighting=bm25_weighting, reduce_frequent_words=reduce_frequent_words)

#######################
# Record of the settings of this run. Every value is stored as a string so the
# dictionary can be written to a text file by plain string concatenation.
params = {}

params['embedding_type'] = embedding_type
params['clustering_algorithm'] = clustering_algorithm
if clustering_algorithm == 'kmeans':
    params['n_clusters'] = str(n_clusters)
else:
    params['hdbscan_min_cluster_size'] = str(hdbscan_min_cluster_size)
params['bm25_weighting'] = str(bm25_weighting)
params['reduce_frequent_words'] = str(reduce_frequent_words)
params['top_n_words'] = str(top_n_words)
params['min_topic_size'] = str(min_topic_size)
if diversify:
    params['diversification_score'] = str(diversity)
else:
    params['diversification_score'] = str(None)

pprint(params)

In [None]:
# initialize topic model
# All components are the objects configured in the hyper-parameter cell.
# NOTE(review): per the BERTopic docs, min_topic_size is not used when a
# custom hdbscan_model is supplied; confirm the intended cluster size.
topic_model = BERTopic(
    top_n_words=top_n_words,
    min_topic_size=min_topic_size,
    embedding_model=embedding_model,
    hdbscan_model=cluster_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    verbose=True,
)

In [None]:
# train topic model
# Fits the model on all speeches and returns the two results unpacked below
# as `topics` and `probs`.
topics, probs = topic_model.fit_transform(documents=speeches)

In [None]:
# save model
# Persist the fitted topic model to the 'tm.tm' file in the output directory.
topic_model.save(path=bertopic_model_path)

In [None]:
# save hyper parameters
# Writes one "key:<TAB>value" line per hyper parameter.
with open(bertopic_model_params_path, mode='w', encoding='utf-8') as f_out:
    for key, value in params.items():
        # str() guards against non-string values (e.g. an integer cluster
        # count): concatenating str and int would raise a TypeError.
        f_out.write(key + ':' + '\t' + str(value) + '\n')

In [None]:
# save topic labels
# Serialise the `topics` result of fit_transform with pickle so it can be
# reloaded later without refitting the model.
with open(bertopic_model_topic_labels_path, mode='wb') as labels_file:
    pickle.dump(topics, labels_file)