# Install Dependencies

In [1]:
!pip install -U sentence-transformers
!pip install sklearn
!pip install plotly
!pip install nbformat>=4.2.0




[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import Dependencies

In [5]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


# Data to list

In [3]:
# Read the sample dataset and pull its `comment` column out as a plain Python list.
df1 = pd.read_csv("./Data/Sample_Dataset_Full.csv")
comments_list = df1["comment"].tolist()

# Model to create embeddings

In [6]:
# Sentence encoder that maps each comment to an embedding vector.
# 'distilbert-base-nli-stsb-quora-ranking' can be swapped in here for different results.
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(comments_list)

# Rescale every row to unit L2 length.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms

# Model to cluster the embeddings

In [7]:
number_of_clusters = 10  # Can be changed as per number of cluster sets required

# n_init is pinned to the value the library currently defaults to, which silences the
# FutureWarning about that default changing. random_state is fixed so that cluster ids
# (and therefore lookups such as clustered_sentences[8]) are reproducible across runs.
clustering_model = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)
clustering_model.fit(embeddings)

# One cluster id per row of `embeddings`, in the same order as `comments_list`.
cluster_assignment = clustering_model.labels_

  super()._check_params_vs_input(X, default_n_init=10)


# View all sentences per Cluster

In [8]:
# Bucket every comment under the cluster id that was assigned to it.
clustered_sentences = {}
for cluster_id, sentence in zip(cluster_assignment, comments_list):
    clustered_sentences.setdefault(cluster_id, []).append(sentence)

clustered_sentences

{5: ['Please move bills from held from publishing  to pending status as the accounts are now in "full billing" status .XNG004876XNG003407RZ6034874XNG003449XNG005972RZ6032589RZ6030906XNG006038RZ6044444XNG004942XNG004454XR4001777XNG004801XNG005113RZ6016871XNG005105RZ6044428RZ3021403Thank you!',
  'Please generate contribution bills for the following accounts below:XNG030103XR7063022Template attached.',
  ' please see attached template to create adjustment bill for 06H717321',
  ' please see attached template to create adjustment bill for RT5070758. This is an updated template from SR# 1210652 as I have included the sponsor fee to the program fee now. Everything should be in good order.',
  ' please see attached template to create adjustment bill for 06H444868 & RT5070758.',
  ' Please generate a termination bill for the following deceased account below:Account number:03X488013DOD: 9/26/21-non business dayTemplate attached.Thank you.',
  'Please move following accounts to full billing and

In [22]:
# Display only the comments that were grouped under cluster id 8.
clustered_sentences[8]

['- the below trades are pending on our platform. Please advise on the trade status of these or if we are good to delete. 31-Dec-21\tBuy (pending)\tFAOIX:US\tMutual Fund\t177.538\t6,173.0031-Dec-21\tBuy (pending)\tFSPGX:US\tMutual Fund\t342.852\t10,217.00Thanks.',
 '- the below trades are pending on our platform. Please advise on the trade status of these or if we are good to delete. Account will be on TH in the meantime.31-Dec-21\tSell (pending)\tICIEX:US\tMutual Fund\t45.8\t1,036.0031-Dec-21\tSell (pending)\tIVOIX:US\tMutual Fund\t13.788\t27731-Dec-21\tSell (pending)\tIYMIX:US\tMutual Fund\t9.382\t39831-Dec-21\tSell (pending)\tIYGIX:US\tMutual Fund\t84.946\t3,160.00Thanks.',
 '- the below trade is showing as pending on our platform again. Please advise on the trade status of this. Account will be on TH in the meantime.3-Jan-22\tSell (pending)\tUAMA:US\tCommon Stock\t6,000\t54Thanks.',
 '- The below trade is pending on our platform. Please advise on the trade status of this. Account w

# Methods to plot the clusters in a pie chart

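Note: The clusters shown in the pie chart are created with a different method (threshold-based community detection on cosine similarity) than the KMeans clustering above. This is why the number of clusters in the pie chart differs from `number_of_clusters`.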

In [20]:
def detect_clusters(embeddings, threshold=0.85, min_community_size=15, init_max_size=1000):
    """Find non-overlapping communities of mutually similar embeddings.

    A row seeds a community when its ``min_community_size``-th highest cosine
    similarity is at least ``threshold``; the community then consists of every
    index whose similarity to that row is at least ``threshold``. Candidate
    communities are visited largest first, and a candidate that shares any
    index with an already accepted community is discarded entirely.

    Args:
        embeddings: Collection with one embedding vector per row, in a form
            accepted by ``util.pytorch_cos_sim``.
        threshold: Minimum cosine similarity for two entries to be grouped.
        min_community_size: Number of entries (the seed row included) that must
            reach ``threshold`` for a row to seed a community.
        init_max_size: How many of the most similar entries are inspected
            before falling back to a scan of the whole row. Values larger than
            the number of embeddings are clamped.

    Returns:
        A list of communities, largest first. Each community is a list of
        integer row indices into ``embeddings``. Empty when there are fewer
        embeddings than ``min_community_size``.
    """
    num_embeddings = len(embeddings)

    # With fewer rows than min_community_size no community can exist, and
    # topk() would raise because k exceeds the row length.
    if num_embeddings == 0 or min_community_size > num_embeddings:
        return []

    # topk() raises when k is larger than the number of columns, so the default
    # of 1000 used to crash on any dataset with fewer than 1000 rows.
    init_max_size = min(init_max_size, num_embeddings)

    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Per row: the min_community_size highest similarities, sorted descending
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    extracted_communities = []
    for i, row_top_k in enumerate(top_k_values):
        # The smallest of the top-k values decides whether enough neighbours
        # reach the threshold for this row to seed a community.
        if row_top_k[-1] < threshold:
            continue

        # Only check top k most similar entries
        top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
        top_idx_large = top_idx_large.tolist()
        top_val_large = top_val_large.tolist()

        new_cluster = []
        if top_val_large[-1] < threshold:
            # Values are sorted descending, so stop at the first one below threshold.
            for idx, val in zip(top_idx_large, top_val_large):
                if val < threshold:
                    break
                new_cluster.append(idx)
        else:
            # Every inspected entry passed, so more may exist: scan the whole row (slow)
            new_cluster = [
                idx for idx, val in enumerate(cos_scores[i].tolist()) if val >= threshold
            ]

        extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities.sort(key=len, reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        # Keep a community only if none of its members was already claimed.
        if extracted_ids.isdisjoint(community):
            unique_communities.append(community)
            extracted_ids.update(community)

    return unique_communities

def plot_clusters(clusters_to_show, num_clusters_to_use=4, max_clusters=20):
    """Draw a donut chart with the relative sizes of the first few clusters.

    Args:
        clusters_to_show: List of clusters, each a sized collection of member
            indices. Only the leading ``num_clusters_to_use`` entries are drawn.
        num_clusters_to_use: How many clusters to include in the chart.
        max_clusters: Upper bound applied to ``num_clusters_to_use``.

    Returns:
        A ``plotly.graph_objects.Figure`` with one slice per displayed cluster,
        labelled ``Topic1``, ``Topic2``, ... in input order. Slice values are
        percentages of the combined size of the displayed clusters.
    """
    num_clusters_to_use = max(0, min(num_clusters_to_use, max_clusters))
    shown_clusters = clusters_to_show[:num_clusters_to_use]

    # Percentages and labels are both derived from the same displayed subset,
    # so the slices add up to 100% and every slice has exactly one label.
    total_size = sum(len(cluster) for cluster in shown_clusters)
    if total_size == 0:
        # Nothing to apportion; avoid dividing by zero.
        percentages = [0.0 for _ in shown_clusters]
    else:
        percentages = [(len(cluster) / total_size) * 100.0 for cluster in shown_clusters]

    labels = [f"Topic{i}" for i in range(1, len(shown_clusters) + 1)]

    fig = go.Figure(data=[go.Pie(labels=labels, values=percentages, hole=.3)])
    fig.update_layout(width=720, height=720)

    return fig

In [24]:
# Detect communities with a strict similarity threshold, then render them as a pie chart.
communities = detect_clusters(embeddings, threshold=0.95, min_community_size=20)
plot_clusters(communities)