### Import Necessary Libraries

In [1]:
# general
import os

# matrix manipulation
import numpy as np

# dataset handling
import pandas as pd

# sentence clustering model
from sentence_transformers import SentenceTransformer

# sentence clustering utils
from sentence_transformers import util

# model building
import torch

### Load Data

In [2]:
# Load the text dataset (one utterance per row in the `text` column).
# The file's first column is unnamed -- it appears as "Unnamed: 0" when read
# without options and looks like a previously saved row index -- so it is used
# as the DataFrame index instead of being carried along as a data column.
data = pd.read_csv('../data/DC.csv', index_col=0)
data

Unnamed: 0,text
0,hi how are you thank you for calling chase ban...
1,that's really nice i can definitely help you w...
2,missus miss smith can i also have your nationa...
3,now how much are you looking to donate to your...
4,so that amount will be subject to taxes if you...
...,...
84626,no <filler> that <filler> <babble> i'm not hap...
84627,okay <filler> what time will they be open
84628,nine to one okay i will definitely go tomorrow...
84629,i don't need anything further today thanks i'm...


In [32]:
# Remove the transcription markers '<filler>' and '<babble>' from every text.
# regex=False forces literal matching: the default of `regex` differs between
# pandas 1.x and 2.x, and these markers are plain strings, not patterns.
# Note: only the markers are removed, so the whitespace around them remains.
data['text'] = (
  data['text']
  .str.replace('<filler>', '', regex=False)
  .str.replace('<babble>', '', regex=False)
)

In [33]:
# Plain Python list of the cleaned texts, in row order; list positions are
# what the cluster member ids index into further down.
sentences = data['text'].tolist()

In [34]:
# Preview only the first few sentences; displaying the whole list would dump
# every row of the dataset into the notebook output.
sentences[:10]

['hi how are you thank you for calling chase bank how can i help you',
 "that's really nice i can definitely help you with that can i have your full name",
 'missus miss smith can i also have your national insurance number',
 'now how much are you looking to donate to your daughter',
 "so that amount will be subject to taxes if you don't want to pay taxes which i'm assuming you wouldn't  a tax-free gift has to be  no higher than thirty thousand dollars",
 'great so now  do you know approximately when your daugh your daughter plans  it was your daughter yeah plans to purchase her home the home',
 'now  is there any anything that i could advise you about during this time',
 "of course no i understand so  fortunately because of covid and everything they are doing video calls we  i can if you'd like i can schedule a video call instead",
 'me see if that works yeah thursday is definitely open how is two p m',
 "me see  yep they have ten a m open so i'm just gonna repeat the details to you  

### Load Model

In [35]:
# Instantiate the sentence-embedding model from the 'all-MiniLM-L6-v2'
# checkpoint name.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

### Clustering

Getting Sentence Embeddings.

In [36]:
# Encode every sentence in large batches; convert_to_tensor=True requests a
# tensor result, which is what the clustering step below is given as input.
sentence_embeddings = model.encode(
  sentences,
  convert_to_tensor=True,
  show_progress_bar=True,
  batch_size=2048,
)

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Cluster the sentences by the similarity of their embeddings.

In [37]:
# Group the embeddings into communities. `threshold` is the similarity cut-off
# and `min_community_size` the smallest group that is kept.
# Each resulting cluster is a collection of positions into `sentences`.
clusters = util.community_detection(
  sentence_embeddings,
  threshold=0.75,
  min_community_size=10,
)

Let's see the clusters.

In [21]:
# Exploration aid (left disabled): uncomment to print the documentation of
# the `util` module imported from sentence_transformers.
# help(util)

In [22]:
# Exploration aid (left disabled): uncomment to show the IPython help for
# util.community_detection.
# ?util.community_detection

In [40]:
# How many clusters (taken from the front of `clusters`) to print below.
num_print_clusters = 10

In [46]:
# Print every member sentence of the first `num_print_clusters` clusters,
# each cluster under a numbered header and closed by a separator line.
clusters_to_show = clusters[:num_print_clusters]

for i, member_ids in enumerate(clusters_to_show):
  print(f'*** Cluster {i+1} ***')

  # A blank line after each sentence keeps long utterances visually apart.
  for idx in member_ids:
    print(sentences[idx], end='\n\n')

  print('---------------------')

*** Cluster 1 ***
it's gonna be six one five two three two nine four six eight

i've got that right here it's one three two six six five two four eight

four five eight seven four three six nine one one

okay it is five nine nine six seven three two

six zero three nine

seven six two four

okay it's seven nine five six one five six one six six nine one nine

sure  it is five one six six one seven nine nine three zero

sure that's two six zero one three one two eight two one

is eight five two one one two zero nine one five

yes it is eight three one four zero zero four

what i have is  two three one seven three one zero three four two

number is nine seven three four seven three three zero one six

yes ma'am that would be two four six eight one zero one two one three five seven nine zero zero one

it is  one six seven one three four eight one two three five four six seven

 so i have four seven one seven  eight nine zero zero one five eight nine  one two five

here it is okay  let's s

In [49]:
# Number of sentences in the first cluster returned by community_detection.
# NOTE(review): presumably this is the largest cluster, since communities are
# documented as being returned in decreasing size order -- confirm.
len(clusters[0])

1231

In [50]:
# Total membership count summed over all clusters (compare with the number of
# rows in the dataset to see how many sentences were left unclustered).
sum(map(len, clusters))

45856