In [14]:
# Imports
import pandas as pd
import numpy as np


from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer



In [15]:
# Read the transcript sample into a DataFrame
# (pandas infers gzip decompression from the ".gz" suffix)
transcripts = pd.read_csv('transcripts_sample.csv.gz')

# Report the (rows, columns) of the loaded frame
print(transcripts.shape)

# Preview the first five rows
transcripts.head()

(15000, 14)


Unnamed: 0.1,Unnamed: 0,show_id,episode_id,transcript,avg_confidence,word_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans
0,54684,show_4BuXlpcana6xU2ctfZ3qgZ,5QBYQlCXZiWI8zQ1cFp0xa,He's here Spotify Originals presents boss mask...,0.767245,6086,Bhaskar Bose,Introducing the newest detective town - Bhaska...,Spotify Studios,['en'],Shaitaan - Part 2,"After getting lost in the maze of dead bodies,...",15.625583,show_4BuXlpcana6xU2ctfZ3qgZ
1,31278,show_0t9kmV5SO6fLH1ukh07XEb,43W5aqMdemVtnabXJMsVvH,"Hello, this is your everyday positivity. My na...",0.815956,1708,Everyday Positivity,Everyday Positivity hosted by Kate Cocker is a...,Everyday Positivity,['en-GB'],JOURNAL: What Does 2020 Mean To You?,Volley.FM - Click here for more great shows!,1.82005,show_0t9kmV5SO6fLH1ukh07XEb
2,71847,show_5sFnygApOpfjPONAFo19Ox,4ptzwT5mduauldxrijwzRh,Hey guys are coming to L here. I just wanted t...,0.858005,5855,The spiritual GOD,I found myself in a place where I longed for t...,Tony Anderson Sr.,['en'],Everything Happens for a reason.,"In this episode, we will try and discover if e...",6.697017,show_5sFnygApOpfjPONAFo19Ox
3,94839,show_6lzZTmaejvWJzU7QybhNKj,5vY7W3X0UTQQ7zhEh1xb8R,"Yo, what's kind of welcome to another episode ...",0.812965,43674,Undefeated Podcast,The only Boxing Podcast with 2 funny undefeate...,Kg tha Comedian & Travis Jay,['en-US'],Episode 20 - Made in Chelsea,Ep 20: Made in Chelsea The Wadi Camacho figh...,59.122067,show_6lzZTmaejvWJzU7QybhNKj
4,36570,show_0Rkp1fmyxfkBe1gppcM4rr,10kBETrymwsDIwmdHq8kHt,"Hey friends, if you haven't heard about anchor...",0.841438,75859,A Reseller’s Passion,My name is Leslie Tucker. I am a full time res...,A Reseller’s Passion,['en'],Season 2. Episode 18. Denali from Elducothrift,I had an amazing conversation with Denali. She...,82.317067,show_0Rkp1fmyxfkBe1gppcM4rr


In [16]:
# BERTopic takes a plain Python list of documents, so pull the
# transcript column out of the DataFrame one entry at a time.
docs = [text for text in transcripts['transcript']]

# Sanity check: number of documents collected
print(len(docs))

15000


In [17]:
# Define submodels
#
# Each stage handed to BERTopic is configured explicitly here so its settings
# are visible and can be tuned in one place.

# UMAP is stochastic: without a fixed seed the reduced embeddings, and hence the
# clusters and topics, differ on every run. Note that fixing the seed makes
# UMAP run single-threaded, so this step may be slower.
RANDOM_SEED = 42

# Sentence embedding model used to vectorize each transcript
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Dimensionality reduction of the embeddings (down to 5 components) before clustering
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,
)

# Density-based clustering of the reduced embeddings; clusters need at least 15 documents
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Class-based TF-IDF weighting for topic words, with frequent words down-weighted
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)



In [18]:
# Build the BERTopic pipeline from the explicitly configured submodels
topic_model = BERTopic(
    ctfidf_model=ctfidf_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=sentence_model,
)

# Fit on the transcripts; keeps the two values returned by fit_transform
topics, probs = topic_model.fit_transform(docs)

In [20]:
# Summarize the fitted topics: one row per topic with its id, document count, and generated name
topic_model.get_topic_info()

     Topic  Count                                     Name
0       -1   7432                    -1_oh_okay_thing_kind
1        0    530                  0_god_jesus_lord_church
2        1    372           1_anxiety_mental_self_yourself
3        2    350             2_players_league_season_game
4        3    312           3_weight_diet_training_fitness
..     ...    ...                                      ...
110    109     16        109_frequency_guru_baba_spiritual
111    110     16              110_asmr_breed_finian_relax
112    111     15             111_trans_bisexual_queer_gay
113    112     15     112_jewelry_premier_jewelers_jeweler
114    113     15  113_aquarium_botanical_fishes_aquariums

[115 rows x 3 columns]
