## File for modeling

In [13]:
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, KernelPCA

In [8]:
# Read the gzipped CSV of transcript chunks, then show its dimensions
# and first two rows as a quick sanity check
transcripts = pd.read_csv(
    'word_transcript_256.csv.gz',
    compression='gzip',
)
print(transcripts.shape)
transcripts.head(2)

(343928, 4)


Unnamed: 0.1,Unnamed: 0,episode_id,transcript_subset,words_enumerated
0,0,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director of active chicks ...,0 - 256
1,1,7tYqM5F5SKtt7lFgcimgAh,"you and I'm okay, but in 2010 when my partner ...",256 - 512


In [9]:
# Build the list of documents that serves as input for the embeddings
# (full corpus, no additional sampling)
docs = transcripts['transcript_subset'].tolist()
print(len(docs))

# Alternative: work on a random sample instead of the full corpus.
# NOTE: the sample size must not exceed the number of rows unless replace=True.
# sample_docs = transcripts.transcript_subset.sample(1000000, random_state =42).to_list()
# print(len(sample_docs))

343928


## Google Colab setup

In [None]:
# Mount Google Drive into the Colab filesystem (Colab-only cell)
from google.colab import drive

# Mounting triggers an interactive authorization prompt
drive.mount('/content/drive')

In [None]:
# Report the runtime's total RAM and whether it counts as a high-RAM runtime
from psutil import virtual_memory

# virtual_memory().total is in bytes; convert to (decimal) gigabytes
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# Anything below 20 GB is reported as a standard (non high-RAM) runtime
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

In [None]:
# Check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

## BERT v1
**Modelling data on batch level. Input instance size = max_sequence_length of embedding model**
all-MiniLM-L6-v2 max_sequence_length: 256


In [10]:
# Load precomputed document embeddings from disk.
# NOTE(review): presumably one all-MiniLM-L6-v2 vector per entry in `docs`, in the same order — confirm.
embeddings = np.load('embeddings_256.npy')

In [14]:
# Define submodels
# Sentence embedding model (all-MiniLM-L6-v2)
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

def rescale(x, inplace=False):
    """Rescale an embedding so optimization will not have convergence issues.

    Every value is divided by 10000 times the standard deviation of the
    first column.

    Parameters
    ----------
    x : array-like, 2-D
        Embedding to rescale.
    inplace : bool, default False
        When False, a copy of ``x`` is rescaled and ``x`` is left untouched.
        When True, ``x`` itself is modified and returned.

    Returns
    -------
    The rescaled embedding (a new array unless ``inplace`` is True).
    """
    scaled = x if inplace else np.array(x, copy=True)

    # Scale factor is derived from the spread of the first component only
    divisor = np.std(scaled[:, 0]) * 10000
    scaled /= divisor

    return scaled


# Initialize and rescale PCA embeddings, used as the starting layout for UMAP.
# Plain PCA replaces KernelPCA here: with KernelPCA's default linear kernel the
# projection is the same up to sign, but KernelPCA builds an
# n_samples x n_samples kernel matrix, which does not fit in memory for a
# corpus of this size.
pca_embeddings = rescale(PCA(n_components=5, random_state=42).fit_transform(embeddings))

umap_model = UMAP(
    # 1.25% of the corpus; cast to int because the neighbour count must be a whole number
    n_neighbors=int(len(docs) * 0.0125),
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    init=pca_embeddings)

hdbscan_model = HDBSCAN(
    # Each cluster must hold at least 2.5% of the corpus, i.e. at most 40 clusters.
    # HDBSCAN rejects a non-integer min_cluster_size, hence the int() cast.
    # NOTE(review): the original comment said "Limit at 400 clusters", which
    # would need a fraction of 0.0025 — confirm the intended limit.
    min_cluster_size=int(len(docs) * 0.025),
    # NOTE(review): original comment claimed "same as cosine for normalised data" — confirm
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=False)

# Topic representation: uni- to tri-grams, English stop words removed,
# terms must occur in at least 10 documents
vectorizer_model = CountVectorizer(min_df=10, stop_words='english', ngram_range=(1,3))

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


: 

: 

In [None]:
# Initialize BERTopic and run

topic_model = BERTopic(
    # embedding_model is left unset: precomputed embeddings are passed to fit_transform below
    # embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,  # the missing comma here was a SyntaxError
    ctfidf_model=ctfidf_model,
    low_memory=True,
    calculate_probabilities=False,
    verbose=True,  # progress bar
)

# Fit on the documents using the precomputed embeddings
topics, probs = topic_model.fit_transform(docs, embeddings)

# Save model
topic_model.save("BERT_v1")

## BERTopic v2
**Modelling data on sentence level. Input instances size = 1 sentence**

In [None]:
# Read the gzipped sentence-level CSV, keeping only columns 1-3 (by position),
# then show its dimensions and first two rows
sentence_25 = pd.read_csv(
    'sentences_chunkssize_25.csv.gz',
    usecols=[1, 2, 3],
    compression='gzip',
)
print(sentence_25.shape)
sentence_25.head(2)

In [None]:
# Collect the sentence-level documents that BERTopic will be fitted on
docs_sentences = sentence_25['transcript'].tolist()
len(docs_sentences)

In [None]:
# Define submodels for the sentence-level model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine')

hdbscan_model = HDBSCAN(
    # Each cluster must hold at least 2.5% of the corpus, i.e. at most 40 clusters.
    # Sized from docs_sentences (the corpus this model is fitted on) rather than
    # the v1 `docs` list, and cast to int because HDBSCAN rejects a non-integer
    # min_cluster_size.
    # NOTE(review): the original comment said "Limit at 400 clusters", which
    # would need a fraction of 0.0025 — confirm the intended limit.
    min_cluster_size=int(len(docs_sentences) * 0.025),
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Assemble the sentence-level BERTopic pipeline and fit it

bert_v2 = BERTopic(
    verbose=True,
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
)

# Only the documents are passed here (no precomputed embeddings)
topics, probs = bert_v2.fit_transform(docs_sentences)

# Persist the fitted model
bert_v2.save('BERT_v2')