## Install rapidsai and *cuml*

In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

In [None]:
# This will update the Colab environment and restart the kernel.  Don't run the next cell until you see the session crash.
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
os._exit(00)

In [None]:
# This will install CondaColab.  This will restart your kernel one last time.  Run this cell by itself and only run the next cell once you see the session crash.
import condacolab
condacolab.install()

In [None]:
# you can now run the rest of the cells as normal
import condacolab
condacolab.check()

In [None]:
# Installing RAPIDS is now 'python rapidsai-csp-utils/colab/install_rapids.py <release> <packages>'
# The <release> options are 'stable' and 'nightly'.  Leaving it blank or adding any other words will default to stable.
!python rapidsai-csp-utils/colab/install_rapids.py stable
import os
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ['CONDA_PREFIX'] = '/usr/local'
!pip uninstall cupy -y

In [None]:
!conda install_rapids cuml

In [None]:
import cuml
from cuml.manifold import UMAP

## Google colab stuff

In [None]:
# Connect to google drive
from google.colab import drive
drive.mount('/content/drive')  # This will prompt for authorization

In [None]:
# Check memory
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

In [None]:
# Check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

## File for modeling

In [2]:
# !pip3 install bertopic
# !pip3 install openTSNE
import pickle
import numpy as np
import pandas as pd
import hdbscan
from hdbscan import HDBSCAN
from openTSNE import TSNE as oTSNE
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load data and check properties
transcripts = pd.read_csv('/content/drive/MyDrive/sports_word_256.csv.gz', compression='gzip', usecols=[2])
print(transcripts.shape)
transcripts.head(2)

In [None]:
# Create list of documents as input for enbeddings
docs = transcripts.transcript_subset.to_list()
print(len(docs))

In [None]:
# load embeddings
embeddings_256 = np.load('/content/drive/MyDrive/embeddings_256.npy')

In [None]:
# load t-SNE dim reduced embeddings
tsne_embeddings = np.load('/content/drive/MyDrive/tsne_dimred_embeddings.npy')
tsne_embeddings.shape

## Embeddings

In [None]:
# Word embeddings
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

sports_embeddings_256 = sentence_model.encode(docs, convert_to_numpy=True)

np.save('/content/drive/MyDrive/sports_embeddings_256', sports_embeddings_256)

In [None]:
# load sentence data set 
transcripts = pd.read_csv('/content/drive/MyDrive/sport_sent_1.csv.gz', compression='gzip', usecols=[2])
print(transcripts.shape)
print(transcripts.head(2))

docs = transcripts.transcript_subset.to_list()
print(len(docs))

# Sentence embeddings
sports_embeddings_sentence_1 = sentence_model.encode(docs, convert_to_numpy=True)

np.save('/content/drive/MyDrive/sports_embeddings_sentence_1', sports_embeddings_sentence_1)

In [None]:
# Initialize and rescale PCA embeddings and dim reduce using t-SNE, save dim reduced embeddings and embedding class object
PCA = PCA(n_components=50)
TSNE = oTSNE(n_components=3, verbose=True)

PCA_embeddings = PCA.fit_transform(embeddings_256)

# dim reduce and save t-SNE class object for prediciton
with open('tsne_data.pkl', 'wb') as outp:  # OBS create pkl file in dir
    tsne_dimreduced_embeddings = TSNE.fit(PCA_embeddings)
    pickle.dump(tsne_dimreduced_embeddings, outp, pickle.HIGHEST_PROTOCOL)

# save embeddings
np.save('/content/drive/MyDrive/tsne_dimred_embeddings', tsne_dimreduced_embeddings)

## HDBSCAN

In [3]:
# Load data and embeddings 
modelling_data = pd.read_csv('sports_word_256.csv.gz', compression='gzip', usecols=[2])
modelling_documents = modelling_data.transcript_subset.to_list()

modelling_tsne_data = np.load('tsne_dimred_embeddings.npy')

In [4]:
hdbscan_model = HDBSCAN(
    min_cluster_size = 50,
    metric='euclidean', # same as cosine for normalised data
    cluster_selection_method='eom', 
    algorithm='boruvka_kdtree',
    prediction_data=True)

# save hdbscan as pickle for downstream transfer 
with open('hdbscan_model.pkl', 'wb') as outp:  # OBS create pkl file in dir
    hdbscan_model.fit(modelling_tsne_data)
    pickle.dump(hdbscan_model, outp, pickle.HIGHEST_PROTOCOL)

### Tokeniser and Weighting scheme

In [5]:
# we need to treat all documents of each cluster as a single document and get the BoW representations for each cluster
# Create df with cluster labels and coresponding docs 

cluster_df = pd.DataFrame({'document':modelling_documents, 'cluster_label': hdbscan_model.labels_})
cluster_df_no_outliers = cluster_df[cluster_df.cluster_label != -1]

# create concatenated documents for each cluster
cluster_labels = cluster_df_no_outliers.cluster_label.unique()

docu_ls = []
for label in cluster_labels:
    temp_df = cluster_df_no_outliers[cluster_df_no_outliers.cluster_label == label]
    document = ''.join(temp_df.document.to_list())
    docu_ls.append(document)

cluster_document_df = pd.DataFrame({'cluster': cluster_labels, 'cluster_document': docu_ls})

print(cluster_document_df.shape)
cluster_document_df.head(2)

(461, 2)


Unnamed: 0,cluster,cluster_document
0,74,"as well as several other changes, so we'll get..."
1,18,"if you know what pitch is coming shocker now, ..."


In [43]:
# TfidfVectorizer combines count vectoriser and tfidf transformer 
vectorizer = TfidfVectorizer(
    max_df=0.75,
    min_df=5,
    stop_words="english",
    norm = 'l1',
    ngram_range = (1, 2),
    use_idf=True
)
X_tfidf = vectorizer.fit_transform(cluster_document_df.cluster_document.to_list())

print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")
print(f"non-zero entries in sparse matrix: {X_tfidf.nnz / np.prod(X_tfidf.shape):.1%}")

n_samples: 461, n_features: 420322
non-zero entries in sparse matrix: 3.7%


In [44]:
# extract index of the top n words with the highest tf-idf score for each topic
top_n = 20
X_tfidf_np = X_tfidf.toarray().transpose()
topic_representation_indexes = np.argpartition(X_tfidf_np, top_n*-1, axis=0)[top_n*-1:].transpose()
topic_representation_indexes_df = pd.DataFrame(topic_representation_indexes)
topic_representation_indexes_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,168250,384664,267915,384665,322065,267925,222846,160468,93671,20333,72105,297252,409417,238042,195134,267807,403769,411718,334367,26400
1,377917,159208,168250,267915,322065,267709,234440,409417,86622,167129,72105,160469,411718,334367,20333,168257,40884,43662,267925,26400
2,23594,370583,350860,403975,411816,77943,251228,287512,78525,376754,403944,251641,251607,308498,296184,296086,174653,208533,246058,376648
3,173299,296086,6956,74446,391079,91625,74445,287512,366248,27997,305288,143419,143412,6968,77943,258531,305226,251228,6940,27729
4,15891,373605,291438,347973,411816,247533,261118,59588,364326,77943,359883,397381,180395,163102,222571,53787,251228,80482,80481,180398
5,39118,53787,40809,92354,251228,391079,291438,347973,319177,317752,311458,309405,290518,77943,259289,258531,247533,246034,5417,106202
6,223506,336807,118390,95821,91507,275259,336811,17304,296683,209595,265860,17307,86133,86135,239337,236371,223593,15538,15527,15531
7,59179,246108,279951,49808,280564,91625,366106,189648,6927,280589,246152,26141,258182,246197,347973,246058,246284,242652,118385,411543
8,187186,182234,196182,354949,236257,58156,58195,20398,259514,154051,20498,284557,27260,279764,157803,419686,336013,57824,259493,298077
9,347973,54358,300670,113904,25492,59599,376754,300652,80428,287512,246058,15795,15777,167121,59588,167119,225992,287790,221671,172071


In [45]:
# build an indexer for looking up the word of the index in previous dataframe
word_indexer = pd.DataFrame({'position': list(vectorizer.vocabulary_.values()), 
                             'word':list(vectorizer.vocabulary_.keys())}).set_index('position').sort_index()
word_indexer.tail()

Unnamed: 0_level_0,word
position,Unnamed: 1_level_1
420317,zup
420318,zuri
420319,zurich
420320,zz
420321,özil


In [47]:
topic_representations = []
for _, row in topic_representation_indexes_df.iterrows():
    topic_representation = []
    for i in list(row):
        topic_representation.append(word_indexer.word[i])
    topic_representations.append(topic_representation)

topic_reps = pd.DataFrame(topic_representations)
print(topic_reps.shape)
topic_reps.head(10)

(461, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,inning,umpire,pitcher,umpires,series,pitching,major league,home run,dugout,astros,cubs,red sox,world series,mlb,league baseball,pitch,white sox,yankees,sox,baseball
1,trade,hitter,inning,pitcher,series,pirates,mets,world series,dodgers,indians,cubs,home runs,yankees,sox,astros,innings,brewers,bullpen,pitching,baseball
2,backs,tight end,tackle,wide receiver,yards,defense,offense,quarterback,defensive,touchdowns,wide,offensive line,offensive,running backs,receivers,receiver,jones,linebacker,nfl,touchdown
3,jimmy graham,receiver,aaron jones,davante adams,vikings,draft,davante,quarterback,think packers,bears,rogers,green bay,green,aaron rodgers,defense,packers,rodgers,offense,aaron,bay
4,andy reid,titans,ravens,super bowl,yards,niners,patrick mahomes,colts,think chiefs,defense,texans,watson,kansas,houston,mahomes,chiefs,offense,deshaun watson,deshaun,kansas city
5,bowl,chiefs,brees,drew brees,offense,vikings,ravens,super bowl,seattle,seahawks,saints,russell wilson,rams,defense,panthers,packers,niners,nfc,49ers,falcons
6,make money,spotify,free anchor,edit podcast,download,podcast minimum,spotify apple,app,record edit,listenership,phone computer,app anchor,distribute,distribute podcast,money podcast,minimum listenership,make podcast,anchor dot,anchor,anchor app
7,collective bargaining,nfl europe,practices,cba,preseason,draft,think nfl,know nfl,aaf,preseason games,nfl like,bargaining agreement,owners,nfl players,super bowl,nfl,nflpa,nba,free agency,xfl
8,know coach,kids know,learning,teach,mindset,coached,coaches,athlete,parents,head coach,athletes,program,basketball,practice,high school,youth,sport,coach know,parent,relationship
9,super bowl,chris ballard,retiring,flacco,ballard,colts fans,touchdowns,retirement,deserved better,quarterback,nfl,andrew luck,andrew,indianapolis colts,colts,indianapolis,manning,quarterbacks,luck,jacoby


### Representation tuning

In [59]:
import openai
import os

openai.api_key = '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'

topic_rep = topic_reps.iloc[320].to_list()

gpt_prompt =f"""
I have a topic that is described by the following keywords:{topic_rep}.

Based on the information above, describe the topic in 4 words.
"""

response = openai.Completion.create(
  engine="text-davinci-003",
  prompt=gpt_prompt,
  temperature=0.5,
  max_tokens=256,
  top_p=1.0,
  frequency_penalty=0.0,
  presence_penalty=0.0
)

print(response['choices'][0]['text'])


Basketball Rules Violations.


## Topic model investigation

In [50]:
hdbscan_model.labels_.max()

460

In [54]:
# HDBSCAN results dataframe
hdbscan_df = pd.DataFrame({'transcript': modelling_documents, 
                           'cluster_label':hdbscan_model.labels_})

hdbscan_df_noout = hdbscan_df[hdbscan_df.cluster_label != -1]
hdbscan_df_noout.head(20)

Unnamed: 0,transcript,cluster_label
1,"as well as several other changes, so we'll get...",74
2,"if you know what pitch is coming shocker now, ...",18
3,system down in Houston. So Nate I ask you do y...,18
4,be a breast of these sort of issues that are h...,18
5,that's the only way the league was realistical...,18
7,last year. Does that help them? Does that hind...,18
8,they put the military-grade telescope out in t...,74
9,seeing the MLB come down on them as hard as th...,320
10,replay systems to work with it being so close ...,320
12,it just as much as I did very much. So as a Be...,65


In [57]:
group_df = hdbscan_df.groupby('cluster_label')['transcript'].count().sort_values(ascending=False)
group_df.head(10)

cluster_label
-1      188810
 320     16136
 18       9435
 100      6928
 68       5988
 28       5573
 65       5129
 26       3544
 11       3525
 416      2943
Name: transcript, dtype: int64

## Deprecated code


In [None]:
# Define submodels
#sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

dim_model = BaseDimensionalityReduction()

hdbscan_model = HDBSCAN(
    min_cluster_size = 15,
    metric='euclidean', # same as cosine for normalised data
    cluster_selection_method='eom', 
    algorithm='boruvka_kdtree',
    prediction_data=True)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

vectorizer_model = CountVectorizer(stop_words='english')

In [None]:
# Initialize BERTopic and run

from bertopic.dimensionality import BaseCluster

empty_cluster_model = BaseCluster()

topic_model = BERTopic(
    #embedding_model=sentence_model,
    umap_model=dim_model,
    hdbscan_model=empty_cluster_model,
    ctfidf_model=ctfidf_model,
    vectorizer_model = vectorizer_model,
    verbose=True  # progress bar
    )

topics = topic_model.fit_transform(modelling_documents, y = hdbscan_df.cluster_labels)

In [None]:
topic_model.save('BERT_v1') 

In [None]:
len(topic_model.get_topics())