In [1]:
# Imports
import pandas as pd
import numpy as np

from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

In [2]:
# Read the gzip-compressed transcript sample, then show its dimensions and first two rows
transcripts = pd.read_csv(
    'transcripts_sample.csv.gz',
    compression='gzip',
)
print(transcripts.shape)
transcripts.head(2)

(15000, 16)


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,char_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans,category,pubdate,word_count
0,show_74R2UD42MRDtmeCGCpXNHA,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director of active chicks ...,0.850038,11270,Inspire By Dani - The Podcast,"Real and raw conversations on mental health, f...",Danielle Williams,['en'],The Best Advice My Mum Ever Gave Me,Today’s Episode I chat about what my mother sa...,13.96255,show_74R2UD42MRDtmeCGCpXNHA,Health & Fitness,,2259
1,show_4NNO0yIIxzSsZTXR0XnaP7,3gaoEuBYb51UoX7zeqv9yr,We recording KP now. We are recording guys pro...,0.830722,26855,PROJECT MINDSET,"PROJECT MINDSET was designed to UPLIFT, INSPIR...",PROJECT MINDSET,['en'],"From A.D.D. to GOAT, Selling over a billion in...","From A.D.D. to GOAT, Selling over a billion in...",29.49965,show_4NNO0yIIxzSsZTXR0XnaP7,Business,2019-02-21,5276


In [3]:
# Collect the transcript column into a list to serve as the BERTopic input documents
docs = transcripts['transcript'].tolist()
print(len(docs))

15000


## BERT v1

In [4]:
# Define the submodels handed to BERTopic: sentence embedding, dimensionality
# reduction, clustering and topic-word weighting.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# UMAP is stochastic: without a fixed random_state every run yields different
# reduced embeddings and therefore different topics. Pinning the seed makes the
# run reproducible (this may slow down fitting somewhat).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42)

hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [5]:
# Assemble BERTopic from the submodels defined above and fit it on the transcripts
topic_model = BERTopic(
    verbose=True,  # turns on progress output while fitting
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
)

# Fit the model and keep the topic assignments and probabilities it returns
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

2023-03-07 11:13:38,485 - BERTopic - Transformed documents to Embeddings
2023-03-07 11:13:48,761 - BERTopic - Reduced dimensionality
2023-03-07 11:13:49,171 - BERTopic - Clustered reduced embeddings


In [6]:
# Overview of all topics: id, document count and generated name (topic -1 holds the outliers)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,6320,-1_things_ve_oh_people
1,0,2042,0_anchor_she_podcast_her
2,1,631,1_fucking_shit_song_oh
3,2,446,2_god_jesus_lord_church
4,3,366,3_police_his_murder_crime
...,...,...,...
105,104,16,104_marriage_husband_married_partner
106,105,15,105_haunted_voodoo_her_marie
107,106,15,106_neighbors_horror_bridge_ozzy
108,107,15,107_trading_indicators_trade_risk


In [None]:
# Per-document topic info, ordered from the most to the least confident assignment
topic_doc = (
    topic_model
    .get_document_info(docs)
    .sort_values('Probability', ascending=False)
)

# Filter away the outliers (Topic = -1) for display
topic_doc.loc[topic_doc['Topic'] > -1]

In [None]:
# Inspect the parameters the v1 topic model is configured with
topic_model.get_params()

In [None]:
# Plot the v1 topics with BERTopic's built-in topic visualization
topic_model.visualize_topics()

In [None]:
# Plot BERTopic's heatmap view of the v1 topics
topic_model.visualize_heatmap()

In [None]:
# Plot BERTopic's hierarchy view of the v1 topics
topic_model.visualize_hierarchy()

In [7]:
# Save model
# Persists the fitted v1 topic model under the name "BERT_v1" so it can be reloaded later
topic_model.save("BERT_v1")

  self._set_arrayXarray(i, j, x)


## BERTopic v2
**Change: smaller documents, same parameter configuration.**

In [2]:
# Load the 25-sentence transcript chunks; only the columns at positions 1-3 are read
sentence_25 = pd.read_csv(
    'sentences_chunkssize_25.csv.gz',
    compression='gzip',
    usecols=[1, 2, 3],
)
print(sentence_25.shape)
sentence_25.head(2)

(215064, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director of active chicks ...,0 - 25
1,7tYqM5F5SKtt7lFgcimgAh,"And when I say, you know, I'm making decisions...",25 - 50


In [5]:
# Collect the 25-sentence chunk texts into a list to serve as the BERTopic input documents
docs_sentences = sentence_25['transcript_subset'].tolist()
len(docs_sentences)

215064

In [3]:
# Define the submodels handed to BERTopic: sentence embedding, dimensionality
# reduction, clustering and topic-word weighting.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# UMAP is stochastic: without a fixed random_state every run yields different
# reduced embeddings and therefore different topics. Pinning the seed makes the
# run reproducible (this may slow down fitting somewhat).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42)

hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [6]:
# Assemble the v2 BERTopic model from the submodels and fit it on the sentence chunks
bert_v2 = BERTopic(
    verbose=True,  # turns on progress output while fitting
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
)

# Fit the model and keep the topic assignments and probabilities it returns
topics, probs = bert_v2.fit_transform(docs_sentences)

Batches:   0%|          | 0/6721 [00:00<?, ?it/s]

In [None]:
# Persist the fitted v2 topic model under the name 'BERT_v2'
bert_v2.save('BERT_v2')

In [None]:
# Overview table of the topics found by the v2 model
bert_v2.get_topic_info()

In [None]:
# Reload the saved v2 model so the plots below can be produced without re-fitting.
# BERTopic is already imported in the notebook's first cell, so it is not re-imported here.
bert_v2 = BERTopic.load('BERT_v2')

In [None]:
# Plot BERTopic's heatmap view of the v2 topics
bert_v2.visualize_heatmap()

In [None]:
# Plot BERTopic's hierarchy view of the v2 topics
bert_v2.visualize_hierarchy()

In [None]:
# Plot the v2 topics with BERTopic's built-in topic visualization
bert_v2.visualize_topics()

## BERT v3
**Change: larger documents than v2, 40 sentences per doc, ~620 words. Same parameter configuration.**

In [None]:
# TODO: 'sentences_chunkssize_40.csv.gz' must be created before the cells below can run

In [None]:
# Load the 40-sentence transcript chunks; only the columns at positions 1-3 are read
sentence_40 = pd.read_csv(
    'sentences_chunkssize_40.csv.gz',
    compression='gzip',
    usecols=[1, 2, 3],
)
print(sentence_40.shape)
sentence_40.head(2)

In [None]:
# Collect the 40-sentence chunk texts into a list to serve as the BERTopic input documents
docs_sentences = sentence_40['transcript_subset'].tolist()
len(docs_sentences)

In [None]:
# Define the submodels handed to BERTopic: sentence embedding, dimensionality
# reduction, clustering and topic-word weighting.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# UMAP is stochastic: without a fixed random_state every run yields different
# reduced embeddings and therefore different topics. Pinning the seed makes the
# run reproducible (this may slow down fitting somewhat).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42)

hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Assemble the v3 BERTopic model from the submodels and fit it on the sentence chunks
bert_v3 = BERTopic(
    verbose=True,  # turns on progress output while fitting
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
)

# Fit the model and keep the topic assignments and probabilities it returns
topics, probs = bert_v3.fit_transform(docs_sentences)

In [None]:
# Persist the fitted v3 topic model under the name 'BERT_v3'
bert_v3.save('BERT_v3')