In [1]:
import os
import time
from collections import Counter
from itertools import chain

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

from bertopic import BERTopic
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from umap import UMAP
from hdbscan import HDBSCAN

In [2]:
# Load the podcast table; later cells read its 'Passages' column.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
processed_pickle = 'Podcast/podcast_data_processed.pkl'
podcast_df = pd.read_pickle(processed_pickle)

In [3]:
# Flatten the per-row passage collections into one list, keeping row order,
# so that every individual passage becomes a separate document.
flattened_passages = [
    passage
    for row_passages in podcast_df['Passages']
    for passage in row_passages
]

In [4]:
# Total number of passages across all rows; each one is a document passed to
# the topic model below.
len(flattened_passages)

656040

In [5]:
# Dimensionality-reduction stage handed to BERTopic; random_state is fixed so
# the projection is repeatable.
# NOTE(review): n_components and metric are not passed, so UMAP's own library
# defaults apply here -- confirm that is intended.
umap_model = UMAP(
    n_neighbors=3,
    min_dist=0.25,
    random_state=42,
)

# Clustering stage handed to BERTopic. prediction_data=True is requested so the
# fitted clusterer keeps the extra data it needs for predictions on new points.
hdbscan_model = HDBSCAN(
    min_cluster_size=125,
    gen_min_span_tree=True,
    prediction_data=True,
)

In [None]:
# The embedding step uses HuggingFace tokenizers and a later stage forks worker
# processes. With TOKENIZERS_PARALLELISM unset, the library prints a
# "The current process just got forked..." warning on every fork (see the
# captured output of the previous run). Setting it explicitly before fitting
# silences that; setdefault keeps any value already exported by the user.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Topic model using the custom UMAP / HDBSCAN stages configured above, with
# uni- to tri-gram topic terms and 10 words per topic.
topic_model = BERTopic(
    n_gram_range=(1, 3),
    top_n_words=10,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# Expensive step: the recorded run logged roughly 33 minutes between
# "Transformed documents to Embeddings" and "Reduced dimensionality".
# `topics` holds one topic id per entry of flattened_passages, in the same order.
topics, probs = topic_model.fit_transform(flattened_passages)

# Persist the fitted model so the fit does not have to be repeated.
topic_model.save("Podcast/bertopic_model")

Batches:   0%|          | 0/20502 [00:00<?, ?it/s]

2023-11-09 00:54:28,624 - BERTopic - Transformed documents to Embeddings
2023-11-09 01:27:59,559 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokeni

In [None]:
# Number of passages held by each row of podcast_df, in row order. These counts
# are used to cut the flat topic list back into per-row groups.
podcast_passage_splits = list(map(len, podcast_df['Passages']))

In [None]:
# Regroup the flat, per-passage topic ids into one slice per podcast row:
#   row position -> topic ids of that row's passages (same order as the passages).
passage_mapping = {}
offset = 0  # index into `topics` where the current row's passages start
for row_pos, passages_count in enumerate(podcast_passage_splits):
    passage_mapping[row_pos] = topics[offset:offset + passages_count]
    offset += passages_count

# Slicing never raises on a short list, so a `topics` that does not belong to
# this exact podcast_df (e.g. stale kernel state) would otherwise produce
# silently truncated or shifted cluster ids. Fail loudly instead.
assert offset == len(topics), (
    f"passage counts sum to {offset} but there are {len(topics)} topic ids; "
    "topics and podcast_df['Passages'] are out of sync"
)

In [None]:
# Attach each row's topic ids as a new column. The dict was filled in row order,
# so its values line up positionally with the rows; materialise the view into a
# concrete list before assigning.
podcast_df['cluster_ids'] = list(passage_mapping.values())

In [None]:
# Persist the table, now including the 'cluster_ids' column.
cluster_ids_pickle = 'Podcast/podcast_cluster_ids.pkl'
podcast_df.to_pickle(cluster_ids_pickle)

In [None]:
# Display the fitted model's topic summary table as the cell output.
topic_model.get_topic_info()

In [None]:
# Save the topic summary table for later use outside this notebook.
topic_info = topic_model.get_topic_info()
topic_info.to_pickle('Podcast/podcast-bert-clust.pkl')

In [None]:
# Render the topic hierarchy figure and export it as a standalone HTML file.
hierarchy_fig = topic_model.visualize_hierarchy()
hierarchy_fig.write_html('Podcast/podcast-bert-clust.html')