### Import Necessary Libraries

In [2]:
# general
import os

# matrix manipulation
import numpy as np

# dataset handling
import pandas as pd

# sentence clustering model
from sentence_transformers import SentenceTransformer

# sentence clustering utils
from sentence_transformers import util

# model building
import torch

### Load Data

In [6]:
data = pd.read_csv('../data/Sentiment_AIData.csv')
data = data.iloc[:,:2]
data

Unnamed: 0,Transcription,FinalOutput
0,no no i'm good wish you all the best good day,Positive
1,don't have a capability to add or remove on th...,Negative
2,i need to complain about receiving poor custom...,Negative
3,yes i was hoping to upgrade my data plan and j...,Positive
4,i'd like to complain about an incorrect amount...,Negative
...,...,...
124255,hi i would like to inquire about an issue i ha...,Neutral
124256,thank you very much for calling all state and ...,Positive
124257,sil know i'm i'm sorry i'm sorry since this is...,Negative
124258,regents bank right i mean regions bank right t...,Positive


In [32]:
# data.text = data.text.str.replace('<filler>', '').str.replace('<babble>', '')

In [10]:
sentences = data.Transcription.to_list()

In [12]:
sentences = sentences[:10000]

In [13]:
sentences

["no no i'm good wish you all the best good day",
 "don't have a capability to add or remove on the account unless otherwise the information on your employer's side will be updated and they are the one who's going to coordinate with us to have that removed on the account",
 'i need to complain about receiving poor customer service',
 'yes i was hoping to upgrade my data plan and just wanted some more information on what goes into that',
 "i'd like to complain about an incorrect amount that was charged in my last order",
 "i'm interested in a family package",
 "i understand it's definitely not disgusting to me i actually work at aah at home healthcare for a very long time so i dealt with a lot of patients that had diabetes so i understand that",
 'thank you for helping me understand yeah',
 'yes it is si you can trust the application because all the data that we put there and our encryption has been detailed so no worries on that one for you',
 "my bags checked to even go to the that's 

### Load Model

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')

### Clustering

Getting Sentence Embeddings.

In [15]:
sentence_embeddings = model.encode(sentences, batch_size=2048, show_progress_bar=True, convert_to_tensor=True)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Clustering the sentences based on embeddings.

In [179]:
clusters = util.community_detection(
  sentence_embeddings,
  min_community_size=1,
  threshold=.6
)

Let's see the clusters.

In [180]:
# help(util)

In [181]:
# ?util.community_detection

In [182]:
num_print_clusters = 10

In [183]:
len(clusters[9])

99

In [184]:
for i, cluster in enumerate(clusters[:num_print_clusters]):
  
  print(f'*** Cluster {i+1} ***')
  
  for sentence_id in cluster:
    print(sentences[sentence_id], end='\n\n')
  
  print('---------------------')

*** Cluster 1 ***
i need to complain about receiving poor customer service

the customer service was terrible and i'd like to file a complaint

i'd like to complain about the quality of customer service i received

that representative gave me bad customer service in the past

my experience with customer service is terrible

i have an issue with customer service

hi i would like to complain about a poor customer service interaction i just had

i would like to file a complaint about customer service

the customer service i received was awful

your customer service is very poor i received very poor customer service this afternoon

your company has offered very poor customer service

i need to place a complaint about an employee

can you help me get a complaint filed for my past customer service call

can i speak to someone about the bad quality of service i received today

i want to complain about a recent customer service call can you please transfer me to your supervisor i want to compl

In [190]:
[sentences[sentence_id] for sentence_id in clusters[-2002]]

["well i'm interested in law charity"]

In [186]:
len(clusters[-3000])

5

In [175]:
sum([len(cluster) for cluster in clusters])

10000

In [176]:
len(clusters)

8081

In [191]:
sum([int(len(cluster) > 5) for cluster in clusters])

221

In [193]:
sum([len(cluster) for cluster in clusters if len(cluster) >= 3])

6869

In [81]:
from bertopic import BERTopic

In [82]:
# ?BERTopic

In [83]:
# docs = [sentences[sentence_id] for sentence_id in clusters[0]]
docs = sentences

In [84]:
topic_model = BERTopic(verbose=True)

In [85]:
# ?BERTopic

In [86]:
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2023-06-29 03:47:12,777 - BERTopic - Transformed documents to Embeddings
2023-06-29 03:47:16,639 - BERTopic - Reduced dimensionality
2023-06-29 03:47:16,800 - BERTopic - Clustered reduced embeddings


In [87]:
max(topics)

234

In [88]:
topic_model.visualize_barchart()

In [96]:
sum([int(topic == 119) for topic in topics])

27

In [89]:
sum([int(topic == -1) for topic in topics])

2004

In [90]:
topic_model.visualize_documents(docs)

In [91]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|████████████████████████████████████████| 234/234 [00:00<00:00, 333.36it/s]


In [92]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [125]:
topic_model.get_topic(10)

[('bye', 0.09173009629307641),
 ('day', 0.08812298919241952),
 ('good', 0.044643648168267096),
 ('hope', 0.03934112534479766),
 ('nice', 0.035471614871627526),
 ('too', 0.03257884102156031),
 ('have', 0.031348933958564196),
 ('great', 0.028860016202974563),
 ('wonderful', 0.022858028102775617),
 ('thank', 0.022464632494816825)]

In [97]:
from top2vec import Top2Vec

In [98]:
# ?Top2Vec

In [100]:
model = Top2Vec(
  documents=sentences,
  embedding_model='all-MiniLM-L6-v2',
  speed="deep-learn", 
  workers=8, 
  embedding_batch_size=1024
)

2023-06-29 14:29:56,396 - top2vec - INFO - Pre-processing documents for training

The parameter 'token_pattern' will not be used since 'tokenizer' is not None'

2023-06-29 14:29:56,690 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2023-06-29 14:29:56,914 - top2vec - INFO - Creating joint document/word embedding
2023-06-29 14:30:23,826 - top2vec - INFO - Creating lower dimension embedding of documents
2023-06-29 14:30:27,762 - top2vec - INFO - Finding dense areas of documents
2023-06-29 14:30:27,901 - top2vec - INFO - Finding topics


In [105]:
model.get_num_topics()

146

In [106]:
topic_sizes, topic_nums = model.get_topic_sizes()

In [108]:
pd.DataFrame({
  'Topic Num': topic_nums,
  'Topic Size': topic_sizes
})

Unnamed: 0,Topic Num,Topic Size
0,0,303
1,1,295
2,2,278
3,3,248
4,4,235
...,...,...
141,141,19
142,142,19
143,143,18
144,144,18


In [117]:
topic_words, word_scores, topic_nums = model.get_topics()

In [122]:
topic_nums

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145])

In [121]:
pd.DataFrame(word_scores)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.584137,0.563421,0.413126,0.409819,0.404647,0.398360,0.382957,0.366475,0.360670,0.360616,...,0.271989,0.269543,0.268098,0.262623,0.262501,0.262329,0.259765,0.256213,0.255363,0.253665
1,0.573836,0.420447,0.340782,0.331361,0.325470,0.321978,0.321198,0.320937,0.315834,0.314814,...,0.247649,0.247452,0.246631,0.243098,0.242395,0.242155,0.240529,0.240451,0.240153,0.239404
2,0.673591,0.552435,0.473040,0.435967,0.415776,0.407501,0.406199,0.372859,0.371894,0.366565,...,0.272539,0.272080,0.271948,0.270974,0.268242,0.267978,0.267256,0.266340,0.266276,0.264636
3,0.633201,0.612929,0.590383,0.538754,0.520037,0.508910,0.492920,0.492630,0.489033,0.482046,...,0.358590,0.357976,0.357205,0.355164,0.352999,0.348324,0.346971,0.344854,0.344698,0.343356
4,0.484119,0.410197,0.383094,0.350483,0.343309,0.343208,0.341540,0.335717,0.330139,0.323284,...,0.258861,0.252131,0.251538,0.251469,0.250425,0.247389,0.246831,0.245452,0.244052,0.243168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,0.275678,0.265476,0.250943,0.240496,0.232147,0.215405,0.206159,0.205516,0.205066,0.204836,...,0.174529,0.173972,0.173901,0.173260,0.172740,0.172580,0.171893,0.170316,0.169843,0.169636
142,0.544411,0.409645,0.399287,0.272900,0.267560,0.243007,0.242738,0.235735,0.232722,0.228009,...,0.118276,0.116209,0.115786,0.115267,0.114983,0.114963,0.112732,0.111125,0.110613,0.109990
143,0.630472,0.419055,0.396100,0.373826,0.357307,0.356776,0.331188,0.328781,0.309963,0.295928,...,0.212066,0.211547,0.210591,0.209352,0.208349,0.208136,0.205509,0.204876,0.204276,0.201763
144,0.575552,0.277115,0.264263,0.236977,0.213905,0.193710,0.189718,0.186246,0.182658,0.176540,...,0.112768,0.111024,0.106937,0.106707,0.105521,0.105335,0.103852,0.103815,0.103649,0.103248


In [119]:
pd.DataFrame(np.array(topic_words))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,bye,goodbye,fine,appreciate,happy,alright,end,day,okay,welcome,...,wish,going,over,open,take,wanted,great,today,last,may
1,survey,satisfaction,customer,call,alright,regarding,calling,questions,service,rejected,...,on,number,do,after,will,send,got,request,happy,did
2,email,mail,send,received,alright,address,receive,okay,request,online,...,able,sorry,great,opened,help,order,billing,will,already,open
3,alright,okay,ok,great,fine,sure,yeah,awesome,perfect,that,...,about,interested,any,right,now,need,hey,only,available,got
4,assistance,alright,appreciate,able,sure,helpful,assist,help,worries,fine,...,doing,things,receive,repair,on,yes,other,great,your,trying
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,ma,appreciate,customer,call,calling,reception,afternoon,received,calls,us,...,mail,may,full,sorry,bye,gonna,delivery,survey,awesome,great
142,billing,address,paperless,bank,payments,mail,payment,account,receipt,transaction,...,on,complaint,thousand,customer,internet,well,purchased,delivery,first,store
143,hacked,account,password,stolen,store,purchased,locked,purchase,customer,transaction,...,open,possible,didn,problem,can,use,new,after,getting,complaint
144,insurance,coverage,policy,overseas,billing,deductible,country,foreign,plans,discount,...,price,worries,will,service,email,may,worry,experience,transfer,hope


In [126]:
topic_words[0]

array(['bye', 'goodbye', 'fine', 'appreciate', 'happy', 'alright', 'end',
       'day', 'okay', 'welcome', 'you', 'hey', 'thanks', 'all', 'hello',
       'received', 'go', 'ok', 'stay', 'sure', 'outstanding', 'opened',
       'rest', 'thank', 'afternoon', 'morning', 'hi', 'full', 'sorry',
       'your', 'care', 'sent', 'done', 'apologize', 'wonderful', 'able',
       'glad', 'talk', 'doing', 'worries', 'wish', 'going', 'over',
       'open', 'take', 'wanted', 'great', 'today', 'last', 'may'],
      dtype='<U14')

In [127]:
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)

ValueError: Hierarchical topic reduction has not been performed.

In [132]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=['complaint'], num_topics=5)

In [134]:
topic_nums

array([107,  12,  36,  95,  59])