# Install Dependencies

In [1]:
!pip install -U sentence-transformers
!pip install sklearn
!pip install plotly
!pip install nbformat>=4.2.0




[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import Dependencies

In [2]:
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


# Load the comments into a list

In [3]:
# Read the sample dataset and pull its "comment" column out as a plain Python list.
df1 = pd.read_csv("./Data/Sample_Dataset_Full.csv")
comments_list = df1["comment"].tolist()

# Model to create embeddings

In [4]:
# Checkpoint used for the embeddings. 'all-MiniLM-L6-v2' is the other checkpoint
# that was tried here; swap the name below to compare results.
model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')

# Encode every comment, then divide each row by its L2 norm so that all
# embedding vectors have unit length.
embeddings = model.encode(comments_list)
embeddings = np.divide(embeddings, np.linalg.norm(embeddings, axis=1, keepdims=True))

# Detect Clusters

In [5]:
def detect_clusters(embeddings, threshold=0.85, min_community_size=15, init_max_size=1000):
    """Group embeddings into non-overlapping communities of similar items.

    The procedure has two stages:

    1. Candidate extraction. Pairwise cosine similarities are computed. A row
       seeds a candidate community when its ``min_community_size``-th highest
       similarity is at least ``threshold``. The candidate then contains every
       index whose similarity to that row is at least ``threshold``: the top
       ``init_max_size`` neighbours are inspected first, and the whole row is
       scanned only when all of those neighbours clear the threshold.
    2. Overlap removal. Candidates are visited largest first, and a candidate
       is kept only if none of its indices is already part of a kept one.

    Args:
        embeddings: Embedding vectors, one row per item, in any form accepted
            by ``util.pytorch_cos_sim`` (expected to be 2-D).
        threshold: Minimum cosine similarity for two items to be considered
            members of the same community.
        min_community_size: Minimum number of items (the seed included) a
            community must have. Values below 1 are treated as 1.
        init_max_size: Number of nearest neighbours inspected before falling
            back to a full row scan. Clamped to the range
            ``[1, number of items]``.

    Returns:
        A list of communities, largest first; each community is a list of
        integer row indices into ``embeddings``. The list is empty when there
        are fewer items than ``min_community_size`` (including empty input).
    """
    n_items = len(embeddings)

    # Guard the top-k sizes: asking topk for k < 1 leaves nothing to index with
    # [-1], and asking for more entries than a row holds raises an error.
    min_community_size = max(1, min_community_size)
    if n_items < min_community_size:
        # No community of the requested size can exist in this data set.
        return []
    init_max_size = max(1, min(init_max_size, n_items))

    # Pairwise cosine similarity between all items.
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # For every row, its min_community_size highest similarities (descending).
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    # Stage 1: collect one candidate community per qualifying row.
    extracted_communities = []
    for i in range(len(top_k_values)):
        # The last of the top-k values is the weakest of the required
        # neighbours; if it misses the threshold the row cannot seed a
        # community of the minimum size.
        if not (top_k_values[i][-1] >= threshold):
            continue

        # Only check the init_max_size most similar entries first.
        top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
        top_idx_large = top_idx_large.tolist()
        top_val_large = top_val_large.tolist()

        new_cluster = []
        if top_val_large[-1] < threshold:
            # The window already reaches below the threshold, so it contains
            # every member; values are sorted, stop at the first miss.
            for idx, val in zip(top_idx_large, top_val_large):
                if val < threshold:
                    break
                new_cluster.append(idx)
        else:
            # The window may be too small: scan the whole row (slow path).
            for idx, val in enumerate(cos_scores[i].tolist()):
                if val >= threshold:
                    new_cluster.append(idx)

        extracted_communities.append(new_cluster)

    # Largest candidate first, so bigger communities win overlap conflicts.
    extracted_communities.sort(key=len, reverse=True)

    # Stage 2: drop every candidate that shares an index with a kept community.
    unique_communities = []
    extracted_ids = set()
    for community in extracted_communities:
        if extracted_ids.isdisjoint(community):
            unique_communities.append(community)
            extracted_ids.update(community)

    return unique_communities

In [6]:
# Stricter than the function defaults: a community needs at least 20 members
# with a cosine similarity of 0.95 or higher.
uniques_comm = detect_clusters(embeddings, threshold=0.95, min_community_size=20)

## Write the sentences in each cluster to a separate file

In [7]:
def write_cluster_sentences_to_file(comments_list, unique_communities, output_dir="./Output"):
    """Write the sentences of each cluster to its own ``cluster_<n>.txt`` file.

    Files are numbered from 1 in the order the clusters are given. Each file
    starts with a ``Cluster:`` header line, followed by one sentence per line
    and a trailing blank line. Existing files with the same name are
    overwritten.

    Args:
        comments_list: Indexable collection of sentences.
        unique_communities: Iterable of clusters; each cluster is an iterable
            of indices into ``comments_list``.
        output_dir: Directory the files are written to. It is created if it
            does not exist. Defaults to ``"./Output"``.

    Returns:
        None.
    """
    # A missing output directory would otherwise make open() fail.
    os.makedirs(output_dir, exist_ok=True)

    for cluster_number, cluster in enumerate(unique_communities, start=1):
        output_file_name = os.path.join(output_dir, f"cluster_{cluster_number}.txt")
        # Explicit UTF-8: the platform default encoding (e.g. a Windows code
        # page) cannot represent every character that may occur in a comment.
        with open(output_file_name, 'w', encoding="utf-8") as file:
            file.write("Cluster:\n")
            for index in cluster:
                # Formatting instead of "+" also copes with non-string
                # entries (e.g. NaN for an empty CSV cell).
                file.write(f"{comments_list[index]}\n")
            file.write('\n')

In [8]:
# Dump every detected cluster to its own text file under ./Output.
write_cluster_sentences_to_file(comments_list, unique_communities=uniques_comm)

# Plot Clusters

In [11]:
def plot_clusters(clusters_to_show):
    """Build a donut chart showing each cluster's share of all clustered items.

    Args:
        clusters_to_show: Sequence of clusters; only the length of each
            cluster is used.

    Returns:
        A 720x720 ``plotly.graph_objects.Figure`` holding one pie trace with a
        0.3 hole. Slice ``i`` (1-based) is labelled ``Cluster<i>`` and its
        value is the cluster's size as a percentage of the summed sizes.
    """
    num_clusters = len(clusters_to_show)
    print(f"Number of clusters to use: {num_clusters}")

    cluster_sizes = [len(cluster) for cluster in clusters_to_show]
    total_items = sum(cluster_sizes)

    # Guard against division by zero when every cluster is empty.
    percentages = [
        (size / total_items) * 100.0 if total_items else 0.0
        for size in cluster_sizes
    ]

    # One label per cluster, numbered from 1 through num_clusters inclusive.
    labels = [f"Cluster{i}" for i in range(1, num_clusters + 1)]

    fig = go.Figure(data=[go.Pie(labels=labels, values=percentages, hole=.3)])
    fig.update_layout(width=720, height=720)

    return fig

In [12]:
# Show the detected clusters as a pie chart; the returned figure is the cell's output.
plot_clusters(clusters_to_show=uniques_comm)

Number of clusters to use: 19
