## Install rapidsai and *cuml*

In [None]:
# This gets the RAPIDS-Colab install files and checks your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

In [None]:
# This will update the Colab environment and restart the kernel.  Don't run the next cell until you see the session crash.
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
os._exit(00)

In [None]:
# Install conda into this runtime via condacolab.  This restarts the kernel one last time.
# Run this cell by itself, and only run the next cell once you see the session crash.
import condacolab
condacolab.install()

In [None]:
# Verify the condacolab installation after the restart.
# You can now run the rest of the cells as normal.
import condacolab
condacolab.check()

In [None]:
# Installing RAPIDS is now 'python rapidsai-csp-utils/colab/install_rapids.py <release> <packages>'
# The <release> options are 'stable' and 'nightly'.  Leaving it blank or adding any other words will default to stable.
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'
!pip uninstall cupy -y

In [None]:
!conda install_rapids cuml


CommandNotFoundError: No command 'conda install_rapids'.
Did you mean 'conda install'?



In [None]:
# Import cuML and its UMAP class.
# NOTE(review): the modelling section later runs `from umap import UMAP`, which
# rebinds the name `UMAP` and hides this cuML class from every cell after it.
import cuml
from cuml.manifold import UMAP

## Google Colab setup

In [2]:
# Connect to Google Drive; the data files and saved artifacts used below live
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')  # This will prompt for authorization

Mounted at /content/drive


In [3]:
# Report the runtime's RAM and classify it as high-RAM (>= 20 GB) or not.
# Note: the figure printed is the *total* memory reported by psutil.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Print the GPU name, driver version and total GPU memory as CSV.
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

## File for modeling

In [None]:
!pip3 install bertopic
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Read only the third CSV column (index 2) of the gzipped transcript file,
# then show the frame's shape and its first two rows.
transcripts = pd.read_csv(
    '/content/drive/MyDrive/sports_word_256.csv.gz',
    usecols=[2],
    compression='gzip',
)
print(transcripts.shape)
transcripts.head(2)

(368835, 1)


Unnamed: 0,transcript_subset
0,Hello and welcome to the law review podcast. M...
1,"as well as several other changes, so we'll get..."


In [7]:
# Create the list of documents used as input for the embeddings
docs = transcripts['transcript_subset'].tolist()
print(len(docs))

368835


In [None]:
# Load the previously saved embeddings from Google Drive.
# They are the input of the Truncated SVD / t-SNE reduction further down.
embeddings_256 = np.load('/content/drive/MyDrive/embeddings_256.npy')

In [None]:
# Load the t-SNE dimensionality-reduced embeddings and display their shape.
# These are passed to BERTopic's fit_transform in the "BERT v1" section.
tsne_embeddings = np.load('/content/drive/MyDrive/tsne_dimred_embeddings.npy')
tsne_embeddings.shape

(343928, 3)

In [11]:
# Embed every document in `docs` with the all-MiniLM-L6-v2 sentence transformer
# and store the resulting array on Google Drive (np.save adds the .npy suffix).
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
sports_embeddings_256 = sentence_model.encode(
    docs,
    convert_to_numpy=True,
)
np.save(
    '/content/drive/MyDrive/sports_embeddings_256',
    sports_embeddings_256,
)

In [17]:
# Load the sentence-level data set (third CSV column only).
# NOTE(review): this rebinds `transcripts` and `docs`, which until here referred
# to the 256-word data set; every later cell that reads `docs` sees these
# sentences instead.
transcripts = pd.read_csv(
    '/content/drive/MyDrive/sport_sent_1.csv.gz',
    usecols=[2],
    compression='gzip',
)
print(transcripts.shape)
print(transcripts.head(2))

docs = transcripts['transcript_subset'].tolist()
print(len(docs))

# Sentence embeddings: encode with the already-loaded model and save to Drive
sports_embeddings_sentence_1 = sentence_model.encode(
    docs,
    convert_to_numpy=True,
)
np.save(
    '/content/drive/MyDrive/sports_embeddings_sentence_1',
    sports_embeddings_sentence_1,
)

(500000, 1)
  transcript_subset
0          Awesome.
1      I like that.
500000


In [None]:
# Reduce `embeddings_256` in two stages and save the result to Google Drive:
#   1. Truncated SVD down to 50 components,
#   2. t-SNE on the SVD output down to 3 components.
# The two estimators are imported here because no other cell imports them;
# with the imports commented out this cell raises NameError on a fresh kernel.
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

T_SNE = TSNE(n_components=3)
tsvd_embeddings = TruncatedSVD(n_components=50).fit_transform(embeddings_256)
tsne_dimreduced_embeddings = T_SNE.fit_transform(X=tsvd_embeddings)
np.save('/content/drive/MyDrive/tsne_dimred_embeddings', tsne_dimreduced_embeddings)

## BERTopic v1
**Modelling data on batch level. Input instance size = max_sequence_length of embedding model**
all-MiniLM-L6-v2 max_sequence_length: 256


In [None]:
# Define the sub-models handed to BERTopic in the next cell.
from bertopic.dimensionality import BaseDimensionalityReduction

# Stand-in for the dimensionality-reduction step: it is passed as `umap_model`
# to BERTopic, which is fitted on embeddings already reduced with t-SNE.
dim_model = BaseDimensionalityReduction()

# Clustering model.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',  # same as cosine for normalised data
    cluster_selection_method='eom',
    prediction_data=True,
)

# Topic representation: class-based TF-IDF on top of a bag-of-words vectorizer
# with English stop words removed.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words='english')

In [None]:
# Build the BERTopic model from the sub-models defined above and fit it.
# No embedding model is configured because precomputed embeddings are passed
# straight to fit_transform.
topic_model = BERTopic(
    umap_model=dim_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    low_memory=True,
    verbose=True,  # progress bar
)

# NOTE(review): the recorded outputs show tsne_embeddings with 343928 rows while
# `docs` has 368835 (later 500000) entries - confirm that `docs` matches the
# embeddings row for row before running this cell.
topics, probs = topic_model.fit_transform(docs, tsne_embeddings)

2023-03-15 10:02:25,179 - BERTopic - The dimensionality reduction algorithm did not contain the `y` parameter and therefore the `y` parameter was not used
2023-03-15 10:02:25,181 - BERTopic - Reduced dimensionality


In [None]:
# Persist the fitted model to Google Drive.
# NOTE(review): the file name says "v2" although this cell sits in the
# "BERT v1" section - confirm the intended name.
topic_model.save('/content/drive/MyDrive/BERT_v2_proba')

In [1]:
# Number of entries returned by the fitted model's get_topics().
# NOTE(review): needs `topic_model` from the fit cell above. The recorded
# NameError carries execution count 1, so it was presumably produced in a fresh
# kernel before that cell had been run - confirm and re-run in order.
len(topic_model.get_topics())

NameError: name 'topic_model' is not defined

## BERTopic v2
**Modelling data at sentence level. Input instance size = 1 sentence**

In [None]:
# Load CSV columns 1-3 of the gzipped sentence-chunk file (path is relative to
# the working directory), then show the frame's shape and first two rows.
sentence_25 = pd.read_csv(
    'sentences_chunkssize_25.csv.gz',
    compression='gzip',
    usecols=[1, 2, 3],
)
print(sentence_25.shape)
sentence_25.head(2)

(215064, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director of active chicks ...,0 - 25
1,7tYqM5F5SKtt7lFgcimgAh,"And when I say, you know, I'm making decisions...",25 - 50


In [None]:
# Create the list of documents used as input for BERTopic.
# The text lives in the `transcript_subset` column (see the preview of
# sentence_25); the frame has no `transcript` column, so attribute access with
# that name would fail.
docs_sentences = sentence_25['transcript_subset'].to_list()
len(docs_sentences)

215064

In [None]:
# Define submodels
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine')

# Minimum cluster size = 2.5% of the corpus this model is fitted on.
# - It is derived from `docs_sentences` (the documents passed to fit_transform
#   below), not from `docs`, which belongs to the earlier sections.
# - It is truncated to an int because HDBSCAN only accepts integer sizes, and
#   floored at 2 so a very small corpus still yields a valid value.
# NOTE(review): a 2.5% minimum allows at most 1 / 0.025 = 40 clusters, while the
# original comment said "Limit at 400 clusters" (which would need 0.0025) -
# confirm the intended fraction.
hdbscan_model = HDBSCAN(
    min_cluster_size=max(2, int(len(docs_sentences) * 0.025)),
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Assemble the sentence-level BERTopic model from the sub-models defined above.
bert_v2 = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
    verbose=True,
)

# Fit on the raw sentence documents; embeddings are computed by the model itself.
topics, probs = bert_v2.fit_transform(docs_sentences)

# Save the fitted model (path is relative to the working directory).
bert_v2.save('BERT_v2')

Batches:   0%|          | 0/6721 [00:00<?, ?it/s]

## Embeddings