In [1]:
# Install the third-party packages imported further down (hdbscan, scikit-learn).
# sentence_transformers is left disabled: the notebook loads embeddings from a .npy file
# instead of computing them here.
# ! pip install sentence_transformers
! pip install hdbscan
! pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdbscan
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 5.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone
  Created wheel for hdbscan: filename=hdbscan-0.8.28-cp37-cp37m-linux_x86_64.whl size=2340230 sha256=5af5ef9150031dbecb5ed8e6bf9dfd1689890e11bee8fe5e3cc88fdf5cd83836
  Stored in directory: /root/.cache/pip/wheels/6e/7a/5e/259ccc841c085fc41b99ef4a71e896b62f5161f2bc8a14c97a
Successfully built hdbscan
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.28
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Mount Google Drive at /content/drive; the load cells below read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Embed Corpus
### Using Sentence Transformers

In [5]:
import json
import os
import random
import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
import pickle
from sklearn.cluster import MiniBatchKMeans as KMeans
from sklearn.manifold import TSNE
import hdbscan

## Load Data

In [6]:
# Transcript table read from Drive; later cells use its `episode` and `text` columns.
df = pd.read_csv('/content/drive/MyDrive/full_transcripts_cleaned.csv')

In [7]:
# Embedding matrix read from Drive (presumably one row per transcript row -- verified below).
embeddings = np.load('/content/drive/MyDrive/embeddings_cleaned.npy')
# Later cells pair row i of `embeddings` with row i of `df` (cluster labels fitted on
# `embeddings` are written straight into `df`), so fail fast if the two are out of step.
assert len(embeddings) == len(df), (
    f'embeddings has {len(embeddings)} rows but df has {len(df)} rows'
)

In [None]:
# Restrict the analysis to a random subset of (at most) 50 distinct episodes.
episodes = df.episode.unique()
# Seeded and drawn without replacement: the default draw is with replacement (so it can
# repeat episodes and select fewer than 50) and gives a different subset on every run.
selection = np.random.default_rng(42).choice(
    episodes, size=min(50, len(episodes)), replace=False
)
keep = df.episode.isin(selection).to_numpy()
# Apply the same row mask to the embeddings so they stay row-aligned with df; later
# cells assign per-embedding results (labels, t-SNE coordinates) directly into df.
embeddings = embeddings[keep]
df = df[keep]

In [None]:
# Plain Python list of the `text` column, in df row order.
docs = df['text'].values.tolist()
# Preview only the first few entries; echoing the whole list floods the cell output.
docs[:5]

## Embed Docs



- Load Embedding Model
- Process Batches (May Take a While)

In [None]:
# Store each row's embedding vector (converted to a Python list) next to its transcript row.
df['embeddings'] = embeddings.tolist()
# to_csv does not create missing parent folders, and the relative `data/` directory
# is not guaranteed to exist on a fresh runtime.
os.makedirs('data', exist_ok=True)
df.to_csv('data/doc_embeds.csv')

## Define Output Functions

In [8]:
def sample_clusters(labels, n_clusters = 10, n_samples = 10, column = 'labels', data = None):
    """Store cluster labels on a DataFrame and print a random sample of texts per cluster.

    Parameters
    ----------
    labels : array-like
        One cluster label per row of the target frame; written into ``column``.
    n_clusters : int
        Clusters ``0 .. n_clusters - 1`` are reported, in order.
    n_samples : int
        Number of texts printed per non-empty cluster. Sampling is done with
        replacement, so small clusters can show the same text more than once.
    column : str
        Name of the column that receives ``labels``.
    data : pandas.DataFrame, optional
        Frame to label and sample from; it must have a ``text`` column. When
        omitted, the notebook-level ``df`` is used.

    Returns
    -------
    None
        The report is printed. As a side effect the target frame gains (or has
        overwritten) the ``column`` column.
    """
    frame = df if data is None else data
    frame[column] = labels
    total = len(frame)
    for i in range(n_clusters):
        # Filter once per cluster and reuse the result for both the count and the sample.
        members = frame.loc[frame[column] == i, 'text']
        print(f'Cluster {i}: {len(members)} / {total}')
        # Sampling from an empty selection raises ValueError, so an empty cluster
        # only gets its header line.
        if len(members) > 0:
            for x in members.sample(n_samples, replace=True):
                # str() keeps missing / non-string text (e.g. NaN) from breaking the slice.
                print('\t- ' + str(x)[:100])
        print('\n')
        print('==='*30)
        print('\n')

## 5 Clusters

In [9]:
# Settings for this section: number of clusters to fit, and texts to print per cluster.
n_clusters, n_samples = 5, 10

In [11]:
# Fit the clusterer (MiniBatchKMeans, imported as KMeans) on the embeddings, then
# print a sample of texts for each cluster; labels are stored in `cluster_5`.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings)
sample_clusters(
    kmeans.labels_,
    n_clusters=n_clusters,
    n_samples=n_samples,
    column='cluster_5',
)

Cluster 0: 112025 / 1387497
	-  Yeah! Oh. Really? I get plus two to stealth?
	-  Oh, 25 to hit? I'm sorry, yes, that hits.
	-  You know, maybe one. You take one, you give one, I'm not sure.
	-  Yeah. I got a 15 and then a 20, plus my Bless.
	-  Five points of damage, and then my last bonus action attack with advantage. Same roll, 13 on both. 
	-  No, I'm casting it as a fifth level again. So it's another 9d-whatever, yeah.
	-  Oh, even better. 28. 
	-  13.
	-  God! That's so many dice.
	-  19?




Cluster 1: 296365 / 1387497
	-  Sure! I'm going to pull out my sword. 
	-  No. I stay behind.
	-  Does anyone work on this ship but me?
	-  Let's go? 
	-  Well, we didn't realize this room was already taken, so we'll just go. 
	-  I think it's a good idea to see him.
	-  I can.
	-   Hi everybody, it’s me, Travis McElroy. Thank you so much for listening to this finale of Ethersea.
	-  So you're in Dream World?
	-  When the tincture is ready.




Cluster 2: 348824 / 1387497
	-  “There are, yeah

## 10 Clusters

In [12]:
# Settings for this section: number of clusters to fit, and texts to print per cluster.
n_clusters, n_samples = 10, 10

In [13]:
# Fit the clusterer (MiniBatchKMeans, imported as KMeans) on the embeddings, then
# print a sample of texts for each cluster; labels are stored in `cluster_10`.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings)
sample_clusters(
    kmeans.labels_,
    n_clusters=n_clusters,
    n_samples=n_samples,
    column='cluster_10',
)

Cluster 0: 67360 / 1387497
	-  17? 
	-    So nat 20.   
	-    Oh yeah, you're right! So that‘s only three.   
	-  9.
	-  Natural 20. 
	-  22.
	-  20. 
	-  Four plus– There's no bonus. So it was four plus– then I got an eight and a two, right? It was an e
	-  Then the second one hits as well, 17 plus 12.
	-  13.




Cluster 1: 184907 / 1387497
	-  I'm sorry. I'm buying it.
	-  No. You can take a stab. See how it goes.
	-  Let me just do- Roll for that.
	-  Between Grog and Scanlan, we can solve any problem, I think, so beef him up, so to speak.
	-  I think you should.
	-  Let's try to get a letter. 
	-  He said either end is fine,so it's ultimately up to how the staff's feeling that day.
	-  I think they're staying.
	-  Please. 
	-  Nobody can get here quickly. 




Cluster 2: 164879 / 1387497
	-  Down there.
	-  Pretty sure that's true.
	-  The other was a cold to the touch vial of something.
	-  What do you want to know?
	-  Was it from a dead friend? Was it from one of your other dea

## 20 Clusters

In [14]:
# Settings for this section: number of clusters to fit, and texts to print per cluster.
n_clusters, n_samples = 20, 10

In [15]:
# Fit the clusterer (MiniBatchKMeans, imported as KMeans) on the embeddings, then
# print a sample of texts for each cluster; labels are stored in `cluster_20`.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings)
sample_clusters(
    kmeans.labels_,
    n_clusters=n_clusters,
    n_samples=n_samples,
    column='cluster_20',
)

Cluster 0: 39803 / 1387497
	-  Yeah. 
	-  Yeah. 
	-  Yep.
	-  Yeah.
	-  Yeah. 
	-  Yeah. 
	-  Yeah. 
	-  Yeah. 
	-  Right.
	-  Yes. 




Cluster 1: 106800 / 1387497
	-  Two years and I never get used to it. 
	-  I would have probably, but I was very distracted by Warfare Whitney, so. 
	-   Dooo n‘t.  
	-  You stupid little devil.
	-  Do I feel anything?
	-  You do have to. 
	-  You're just dangling.
	-  I did. Yeah, I did.
	-  Nothing fun.
	-  All I did wrong was play some poker. Just saying. You know it in your heart of hearts, Obby. All I 




Cluster 2: 31220 / 1387497
	-  What'd you roll?
	-  So I roll three of these, right?
	-  It's effortless. No roll needed. Another applause starts. He says,
	-  All righty, what save is that? That is-- 
	-  I can also grab him out next turn. I don't know.
	-  Rolled terribly again! That is going to be a whopping six.
	-  I need you all to roll initiative.
	-  Also I didn't need to roll.
	-  You did it at advantage?
	-  Please roll good.




Clus

## 50 Clusters

In [16]:
# Settings for this section: number of clusters to fit, and texts to print per cluster.
n_clusters, n_samples = 50, 10

In [17]:
# Fit the clusterer (MiniBatchKMeans, imported as KMeans) on the embeddings, then
# print a sample of texts for each cluster; labels are stored in `cluster_50`.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings)
sample_clusters(
    kmeans.labels_,
    n_clusters=n_clusters,
    n_samples=n_samples,
    column='cluster_50',
)

Cluster 0: 27741 / 1387497
	-  They might freak out.
	-  Do we recognize them?
	-  They can make a lot of bad things. 
	-  On them, which makes a target indifferent about creatures of your choice–us.
	-  ‘Cause those only go if you're not- If you're not wed- 
	-  It's truly dangerous improvising with them but also the most fun.
	-  We killed two that were near me though.
	-  Don't birds have hollow bones. 
	-  They're going to be helpful.
	-  So they're dead, huh?




Cluster 1: 36515 / 1387497
	-  Give me a-- Can I get a d4? 
	-   Sure.  
	-  I can make us both invisible, so we'll be plenty stealthy.
	-  But I see what you're doing. Can I team up with him, and do Silent Image as well? 
	-   Come on.   
	-  Yeah, do you want Grog?
	-  Gimme that.
	-  With some other people. Yeah, we'll do that.
	-  Sure.
	-  I don't know! Give me 20 minutes!




Cluster 2: 22285 / 1387497
	-    Cool.   
	-  Great.
	-  Cool.
	-  Great.
	-  Okay.
	-  Okay.
	-  Okay.
	-  Cool.
	-  Cool.   
	-  Okay.




C

In [None]:
# Reduce the embeddings to two dimensions for plotting. t-SNE is stochastic, so the
# seed is pinned (same value the KMeans cells use) to make the layout reproducible.
model = TSNE(n_components=2, random_state=42)

dim_reduce = model.fit_transform(embeddings)
# Peek at the first few 2-D points only.
dim_reduce[:5]



array([[ 36.059105 ,   8.477272 ],
       [ 34.30238  ,   7.416399 ],
       [ 35.44172  , -16.45835  ],
       [  1.4548695, -36.095573 ],
       [ 32.732437 ,   3.7089074]], dtype=float32)

In [None]:
# Store the two t-SNE coordinates of every row as plain `x` / `y` columns on df.
df['x'], df['y'] = dim_reduce[:, 0], dim_reduce[:, 1]

In [None]:
# to_csv does not create missing parent folders, and the relative `data/` directory
# is not guaranteed to exist on a fresh runtime.
os.makedirs('data', exist_ok=True)
df.to_csv('data/transcript_subset.csv')