## File for modeling

In [1]:
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the gzip-compressed transcript sample, print its (rows, columns)
# shape and preview the first two rows
transcripts_file = 'transcripts_sample.csv.gz'
transcripts = pd.read_csv(transcripts_file, compression='gzip')
print(transcripts.shape)
transcripts.head(2)

(15000, 16)


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,char_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans,category,pubdate,word_count
0,show_74R2UD42MRDtmeCGCpXNHA,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director of active chicks ...,0.850038,11270,Inspire By Dani - The Podcast,"Real and raw conversations on mental health, f...",Danielle Williams,['en'],The Best Advice My Mum Ever Gave Me,Today’s Episode I chat about what my mother sa...,13.96255,show_74R2UD42MRDtmeCGCpXNHA,Health & Fitness,,2259
1,show_4NNO0yIIxzSsZTXR0XnaP7,3gaoEuBYb51UoX7zeqv9yr,We recording KP now. We are recording guys pro...,0.830722,26855,PROJECT MINDSET,"PROJECT MINDSET was designed to UPLIFT, INSPIR...",PROJECT MINDSET,['en'],"From A.D.D. to GOAT, Selling over a billion in...","From A.D.D. to GOAT, Selling over a billion in...",29.49965,show_4NNO0yIIxzSsZTXR0XnaP7,Business,2019-02-21,5276


In [3]:
# Create list of documents as input for embeddings.
# The episode text is stored in the `transcript` column of `transcripts`
# (this frame has no `transcript_subset` column).

# Without additional sampling
docs = transcripts['transcript'].to_list()
print(len(docs))

# With additional sampling
# NOTE: the sample size must not exceed len(transcripts) unless replace=True.
# sample_docs = transcripts['transcript'].sample(1000000, random_state=42).to_list()
# print(len(sample_docs))

15000


## Google Colab setup

In [None]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)  # This will prompt for authorization

In [None]:
# Report the runtime's total memory and whether it reaches the 20 GB
# threshold used here to call a runtime "high-RAM"
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

In [None]:
# Check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

## BERT v1
**Modelling data on batch level. Input instance size = max_sequence_length of embedding model**
all-MiniLM-L6-v2 max_sequence_length: 256


In [4]:
# Define submodels
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# UMAP is stochastic; a fixed seed makes the reduced embeddings (and thus
# the resulting topics) reproducible across Restart & Run All.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42)

hdbscan_model = HDBSCAN(
    # Each cluster must hold at least 2.5% of the documents, which caps the
    # number of clusters at 40. HDBSCAN expects an integer here, so the
    # float product is truncated (with a floor of 2 for tiny samples).
    min_cluster_size=max(2, int(len(docs) * 0.025)),
    metric='euclidean', # same as cosine for normalised data
    cluster_selection_method='eom',
    prediction_data=False)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Ignore terms that appear in fewer than 10 documents
vectorizer_model = CountVectorizer(min_df=10)

In [5]:
# Assemble the BERTopic pipeline from the submodels defined above and fit it

topic_model = BERTopic(
    verbose=True,  # progress bar
    calculate_probabilities=False,
    low_memory=True,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=sentence_model,
)

# Fit on the transcript documents and keep the per-document assignments
topics, probs = topic_model.fit_transform(docs)

# Persist the fitted model in the working directory
topic_model.save("BERT_v1")

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

2023-03-07 11:13:38,485 - BERTopic - Transformed documents to Embeddings
2023-03-07 11:13:48,761 - BERTopic - Reduced dimensionality
2023-03-07 11:13:49,171 - BERTopic - Clustered reduced embeddings


In [None]:
# Store a copy of the fitted v1 model under the given Drive path
drive_model_path = '/content/drive/MyDrive/BERT_v1'
topic_model.save(drive_model_path)

## BERTopic v2
**Modelling data on sentence level. Input instances size = 1 sentence**

In [2]:
# Load the gzip-compressed sentence-chunk file, keeping only the CSV
# columns at positions 1-3, then print its shape and preview two rows
sentence_25 = pd.read_csv(
    'sentences_chunkssize_25.csv.gz',
    compression='gzip',
    usecols=[1, 2, 3],
)
print(sentence_25.shape)
sentence_25.head(2)

(215064, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director of active chicks ...,0 - 25
1,7tYqM5F5SKtt7lFgcimgAh,"And when I say, you know, I'm making decisions...",25 - 50


In [5]:
# Create list of documents as input for BERTopic.
# The text of each chunk is stored in the `transcript_subset` column of
# `sentence_25` (this frame has no `transcript` column).
docs_sentences = sentence_25['transcript_subset'].to_list()
len(docs_sentences)

215064

In [3]:
# Define submodels
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# UMAP is stochastic; a fixed seed makes the reduced embeddings (and thus
# the resulting topics) reproducible across Restart & Run All.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42)

hdbscan_model = HDBSCAN(
    # Size the clusters relative to the corpus this model is fitted on
    # (`docs_sentences`, not the v1 `docs` list): at least 2.5% of the
    # documents per cluster, which caps the number of clusters at 40.
    # HDBSCAN expects an integer, so the float product is truncated
    # (with a floor of 2 for tiny samples).
    min_cluster_size=max(2, int(len(docs_sentences) * 0.025)),
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [6]:
# Assemble the v2 BERTopic pipeline from the submodels defined above and fit it

bert_v2 = BERTopic(
    verbose=True,
    ctfidf_model=ctfidf_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=sentence_model,
)

# Fit on the sentence-level documents and keep the per-document assignments
topics, probs = bert_v2.fit_transform(docs_sentences)

# Persist the fitted model in the working directory
bert_v2.save('BERT_v2')

Batches:   0%|          | 0/6721 [00:00<?, ?it/s]