### Make a small BERT to use in pipeline design 

In [19]:
import numpy as np
import pandas as pd
from openTSNE import TSNE as otsne
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.decomposition import PCA

In [3]:
# Load and sample data
plain_text = pd.read_csv('sports_word_256.csv.gz', compression='gzip')
word_embeddings = np.load('sports_embeddings_256.npy')

# sample data
sample_index = np.random.choice(word_embeddings.shape[0], 5000, replace=False)
plain_text = plain_text.drop(columns='Unnamed: 0')
plain_text_sample = plain_text.iloc[list(sample_index),:]
documents = plain_text_sample.transcript_subset.to_list()
word_embeddings_sample = word_embeddings[sample_index, :]
print(len(word_embeddings))
print(len(plain_text_sample))
print(len(word_embeddings_sample))

368835
5000
5000


In [4]:
# PCA dim reduce
PCA = PCA(n_components = 50)
PCA_data = PCA.fit_transform(word_embeddings_sample)

In [5]:
# t-SNE dim reduce
t_SNE = otsne(n_components=3, verbose=True)
tsne_data = t_SNE.fit(PCA_data)

--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_components=3, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 3.31 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.24 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.03 seconds
===> Running optimization with exaggeration=12.00, lr=416.67 for 250 iterations...
Iteration   50, KL divergence 4.6271, 50 iterations in 3.5357 sec
Iteration  100, KL divergence 4.6798, 50 iterations in 2.8652 sec
Iteration  150, KL divergence 4.6788, 50 iterations in 2.6308 sec
Iteration  200, KL divergence 4.6746, 50 iterations in 3.1134 sec
Iteration  250, KL divergence 4.6739, 50 iterations in 2.9601 sec
   --> Time elapsed: 15.11 seconds
===> Running optimization with exaggeration=1.00, lr=500

In [30]:
type(tsne_data)
np.save("t_sne_data.npy", tsne_data)

In [8]:
# fit ous data 
test_data = word_embeddings[368700:368835]
test_documents = plain_text[368700:368835]

pca_testdata = PCA.fit_transform(test_data)
tsne_test = tsne_data.transform(pca_testdata)


===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.02 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 1280.6640, 50 iterations in 0.5554 sec
Iteration  100, KL divergence 1226.4040, 50 iterations in 0.6304 sec
Iteration  150, KL divergence 1204.4282, 50 iterations in 0.7517 sec
Iteration  200, KL divergence 1197.7814, 50 iterations in 0.7124 sec
Iteration  250, KL divergence 1191.3140, 50 iterations in 0.5299 sec
   --> Time elapsed: 3.18 seconds


In [22]:
# pipeline_BERT submodel definition
dim_model = BaseDimensionalityReduction()

hdbscan_model = HDBSCAN(
    min_cluster_size = 15,
    metric='euclidean', # same as cosine for normalised data
    cluster_selection_method='eom', 
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

vectorizer_model = CountVectorizer(stop_words='english')

In [26]:
# pipeline_BERT initialise and run

pipeline_BERT = BERTopic(
    umap_model=dim_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
    vectorizer_model = vectorizer_model,
    calculate_probabilities=True,
    low_memory=True, 
    verbose=True  # progress bar
    ) 

topics, probs = pipeline_BERT.fit_transform(documents, tsne_data)

pipeline_BERT.save('pipeline_BERT')

2023-03-28 17:49:45,097 - BERTopic - The dimensionality reduction algorithm did not contain the `y` parameter and therefore the `y` parameter was not used
2023-03-28 17:49:45,108 - BERTopic - Reduced dimensionality
2023-03-28 17:49:45,557 - BERTopic - Clustered reduced embeddings


In [27]:
len(pipeline_BERT.get_topics())

3

In [28]:
test_topics, test_proba = pipeline_BERT.transform(test_documents, tsne_test)
test_proba

2023-03-28 17:49:51,171 - BERTopic - Reduced dimensionality
2023-03-28 17:49:51,253 - BERTopic - Calculated probabilities with HDBSCAN
2023-03-28 17:49:51,254 - BERTopic - Predicted clusters


array([[9.99999952e-001, 2.98079151e-036],
       [9.99999793e-001, 1.56079517e-007],
       [9.60737606e-001, 3.92623416e-002],
       [9.99999935e-001, 2.71438083e-031],
       [9.99956448e-001, 4.34978870e-005],
       [9.99999930e-001, 1.36638543e-070],
       [9.99996288e-001, 3.66851214e-006],
       [9.99999931e-001, 2.71243864e-224],
       [9.99999931e-001, 7.97931364e-037],
       [9.95499501e-001, 4.50045387e-003],
       [9.99999947e-001, 2.51917670e-026],
       [9.99999931e-001, 2.73928375e-224],
       [9.84017481e-001, 1.59824751e-002],
       [9.99556231e-001, 4.43696618e-004],
       [9.99999937e-001, 6.95785569e-255],
       [9.99999939e-001, 1.56690818e-019],
       [9.99999942e-001, 4.32061695e-011],
       [9.99999946e-001, 1.01681324e-051],
       [9.59780381e-001, 1.22179534e-020],
       [9.99999952e-001, 2.92898336e-284],
       [9.99999953e-001, 0.00000000e+000],
       [9.99999963e-001, 0.00000000e+000],
       [9.99999946e-001, 8.40225869e-177],
       [9.9