### Build a small BERTopic model to use in pipeline design

In [23]:
import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [20]:
# Load the transcript text and the matching precomputed embeddings
plain_text = pd.read_csv('sports_word_256.csv.gz', compression='gzip')
word_embeddings = np.load('sports_embeddings_256.npy')

# Drop the leftover index column from the CSV
plain_text = plain_text.drop(columns='Unnamed: 0')

# Rows of the text frame and the embedding matrix are paired by position below,
# so they must have the same length for the sample to stay aligned.
assert len(plain_text) == word_embeddings.shape[0], (
    "text rows and embedding rows are not aligned"
)

# Draw a reproducible random sample of row positions.
# np.random.choice() has no `random_state` argument (passing it raises TypeError),
# so a seeded Generator is used instead.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42
rng = np.random.default_rng(SAMPLE_SEED)
sample_index = rng.choice(
    word_embeddings.shape[0],
    size=min(SAMPLE_SIZE, word_embeddings.shape[0]),  # never ask for more rows than exist
    replace=False,
)

# Apply the same positions to both the text and the embeddings
plain_text_sample = plain_text.iloc[sample_index, :]
documents = plain_text_sample['transcript_subset'].to_list()
word_embeddings_sample = word_embeddings[sample_index, :]

# Sanity check: full embedding count, then the two sample sizes
print(len(word_embeddings))
print(len(plain_text_sample))
print(len(word_embeddings_sample))

368835
5000
5000


In [24]:
# PCA dim reduce: project the sampled embeddings down to 50 components.
# The estimator gets its own name so the imported `PCA` class is not overwritten;
# rebinding `PCA` to an instance makes this cell fail when it is run a second time.
# `random_state` pins the result whenever scikit-learn selects a randomized SVD solver.
pca_model = PCA(n_components=50, random_state=42)
PCA_data = pca_model.fit_transform(word_embeddings_sample)

In [25]:
# t-SNE dim reduce: embed the PCA output into 3 dimensions.
# t-SNE is stochastic, so the seed is fixed to make the coordinates (and the
# clusters derived from them) reproducible across runs.
t_SNE = TSNE(n_components=3, random_state=42)
tsne_data = t_SNE.fit_transform(PCA_data)

In [26]:
# Sub-models used to assemble pipeline_BERT

# Reducer placed in BERTopic's UMAP slot; the embeddings are already reduced
# with PCA + t-SNE before fitting (presumably this base class is a pass-through — confirm).
dim_model = BaseDimensionalityReduction()

# Bag-of-words tokeniser with English stop words removed
vectorizer_model = CountVectorizer(stop_words='english')

# Class-based TF-IDF weighting with frequent-word reduction switched on
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Density-based clusterer.
# Original note on the metric: euclidean is the same as cosine for normalised data.
# NOTE(review): the clusterer is fed t-SNE coordinates in this notebook, so
# confirm that the normalisation assumption still applies.
hdbscan_model = HDBSCAN(
    metric='euclidean',
    min_cluster_size=15,
    cluster_selection_method='eom',
    prediction_data=True,
)

In [29]:
# Assemble pipeline_BERT from the sub-models, fit it on the sample, and save it

pipeline_BERT = BERTopic(
    umap_model=dim_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    low_memory=True,
    verbose=True,  # emit progress messages while fitting
)

# The t-SNE coordinates are supplied as the precomputed embeddings for the documents
topics, probs = pipeline_BERT.fit_transform(documents, embeddings=tsne_data)

# Persist the fitted model under the name 'pipeline_BERT'
pipeline_BERT.save('pipeline_BERT')

2023-03-24 14:58:01,658 - BERTopic - The dimensionality reduction algorithm did not contain the `y` parameter and therefore the `y` parameter was not used
2023-03-24 14:58:01,661 - BERTopic - Reduced dimensionality
2023-03-24 14:58:02,017 - BERTopic - Clustered reduced embeddings


In [33]:
# Number of entries in the fitted model's topic mapping
# (may include an outlier topic — TODO confirm against the model's topic ids)
topic_map = pipeline_BERT.get_topics()
len(topic_map)

3