# Install Dependencies

In [1]:
#%pip install pip --upgrade

Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install -U sentence-transformers
!pip install scikit-learn
!pip install plotly
!pip install nbformat>=4.2.0



# Import Dependencies

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


# Data Cleaning

In [26]:
def data_cleaning(input_file, output_file):
    """Strip greeting boilerplate from a raw CSV export and write a cleaned CSV.

    The raw file is handled as plain text first: every occurrence of each
    fragment in ``list_data`` is deleted, in list order, wherever it appears
    in the file (the replacement is not restricted to a particular column).
    The result is then parsed as CSV, the "First Comment" column is renamed to
    "comments", rows that contain no values at all are dropped, the remaining
    empty cells are filled with the string 'NULL', and the frame is written
    to ``output_file`` without the index.

    Args:
        input_file: Path of the raw CSV file; read as UTF-8.
        output_file: Path of the cleaned CSV. It is first used to hold the
            text with the fragments removed and then overwritten with the
            final cleaned frame.

    Returns:
        None. Prints a confirmation message once the file has been written.
    """
    list_data = ["<br/>","Hi team -","hi team -","Hello-","Hi Team,","Hi team,","Hello,","Hi Team:","Hello Team,","Hi,","Hi team-",
                 "Hi Team-","Hi Team.","Hello,","hello,","Hi Team","hi team","hello team","Hello!","team,","Team","====","HI","Hi team"]

    with open(input_file, 'r', encoding='utf-8') as file:
        data = file.read()

    # Order matters: longer variants ("Hi Team,") are listed before their
    # prefixes ("Hi Team", "Team") so the punctuation is removed with them.
    for l_data in list_data:
        data = data.replace(l_data, "")

    # Persist the stripped text so pandas can parse it as a CSV file
    with open(output_file, 'w', encoding='utf-8-sig') as file:
        file.write(data)

    df = pd.read_csv(output_file)
    # Renaming the column
    df = df.rename(columns={"First Comment": "comments"})
    # Removing blank rows. This has to happen BEFORE the fill below: once every
    # NaN has been replaced by 'NULL' no row is all-NaN and dropna does nothing.
    df = df.dropna(how='all')
    # Fill empty (NaN) cells with 'NULL'
    df = df.fillna('NULL')  # Very important. Don't remove
    df.to_csv(output_file, index=False)
    print(f"Data cleaning complete. Cleaned data is stored in {output_file}")

In [27]:
# Clean the raw export; the cleaned CSV is (re)written to ./Data/Cleaned_Dataset.csv
data_cleaning("./Data/first_comment_cross_channel_small.csv", "./Data/Cleaned_Dataset.csv")

Data cleaning complete. Cleaned data is stored in ./Data/Cleaned_Dataset.csv


In [28]:
# Load the cleaned CSV and extract the comments column as a plain Python list


def _text_or_empty(value):
    """Return ``value`` unchanged when it is a string, otherwise an empty string."""
    return value if isinstance(value, str) else ""


df1 = pd.read_csv("./Data/Cleaned_Dataset.csv")
# Any non-string cell (e.g. a NaN produced while parsing) becomes "" so that
# every entry handed to the encoder is a str.
df1['comments'] = df1['comments'].apply(_text_or_empty)
comments_list = df1['comments'].tolist()

# Model to create embeddings

In [29]:
# Alternative checkpoint, kept for comparison:
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking') # Use for different results

# One embedding row per entry of comments_list
embeddings = model.encode(comments_list)

# Scale every row to unit L2 length (keepdims keeps the norms as a column so
# the division broadcasts row-wise)
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms

In [30]:
# Sanity check: number of embedding rows returned by model.encode
print(len(embeddings))

200


# Detect Clusters

In [31]:
def detect_clusters(embeddings, threshold=0.95, min_community_size=30, init_max_size=1000):
    """Group embeddings into non-overlapping communities of highly similar items.

    How it works:
    1) Pairwise cosine similarities between all embeddings are computed. Row i
       becomes the seed of a candidate community only when its
       ``min_community_size``-th highest similarity is >= ``threshold``, i.e.
       at least that many entries of the row (row i's own entry included)
       reach the threshold.
    2) For every seed, the candidate community is the set of indices whose
       similarity to the seed is >= ``threshold``. The top
       ``min(init_max_size, n)`` entries are inspected first; if the weakest
       of them is already below the threshold, the sorted prefix is
       sufficient, otherwise the whole row is scanned.
    3) Candidates are sorted by size, largest first, and accepted greedily: a
       candidate is kept only if none of its members belongs to an already
       accepted community; otherwise the whole candidate is discarded.

    Args:
        embeddings: Embedding vectors accepted by ``util.pytorch_cos_sim``
            (one row per item).
        threshold: Minimum cosine similarity for two items to be grouped.
        min_community_size: Minimum number of members a community must have.
        init_max_size: Upper bound on the number of most-similar entries
            inspected before falling back to a scan of the full row.

    Returns:
        A list of communities, largest first. Each community is a list of
        integer row indices into ``embeddings``. The list is empty when there
        are fewer items than ``min_community_size``.
    """
    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # With fewer items than the minimum size no community can exist; topk
    # would also raise because k exceeds the row length.
    if min_community_size > cos_scores.size(0):
        return []

    # For each row, its min_community_size highest similarities (descending)
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    # Only check top k most similar entries; k must not exceed the number of embeddings
    k = min(init_max_size, cos_scores.size(1))

    # Step 1) Collect one candidate community per qualifying seed row
    extracted_communities = []
    for i in range(len(top_k_values)):
        # Last column holds the min_community_size-th best score of row i
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            top_val_large, top_idx_large = cos_scores[i].topk(k=k, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()

            if top_val_large[-1] < threshold:
                # Scores are sorted descending, so stop at the first one below threshold
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break

                    new_cluster.append(idx)
            else:
                # Iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)

            extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        # Reject the whole community as soon as one member is already taken
        add_cluster = True
        for idx in community:
            if idx in extracted_ids:
                add_cluster = False
                break

        if add_cluster:
            unique_communities.append(community)
            extracted_ids.update(community)

    return unique_communities

In [32]:
# Smaller minimum community size than the function default (5 instead of 30); threshold stays at 0.95
uniques_comm = detect_clusters(embeddings, min_community_size=5, threshold=0.95)

## Save sentences to file

### Save to separate text files

In [33]:
# def write_cluster_sentences_to_file(comments_list, unique_communities):
#         cluster_count = 1
#         for cluster in unique_communities:
#             output_file_name = f"./Output/cluster_{cluster_count}.txt"
#             cluster_count += 1
#             with open(output_file_name, 'w') as file:
#                 file.write("Cluster:\n")
#                 for index in cluster:
#                     sentence = comments_list[index]
#                     file.write(sentence + '\n')
#                 file.write('\n')

# write_cluster_sentences_to_file(comments_list, uniques_comm)

### Save to csv file

In [34]:
def add_cluster_column_to_df(df, unique_communities):
    """Add an integer 'Cluster' column that labels each row with its community.

    Communities are numbered from 1 in the order given; rows that belong to
    no community keep the label 0. Community members are interpreted as
    positional row numbers (0 = first row), independent of the DataFrame's
    index labels.

    Args:
        df: DataFrame to label. It is modified in place.
        unique_communities: Iterable of communities, each a sequence of
            integer row positions into ``df``.

    Returns:
        The same ``df`` object, now carrying the 'Cluster' column.

    Raises:
        IndexError: If a community refers to a row position outside ``df``.
    """
    # Start with 0 everywhere, meaning "not part of any cluster"
    labels = np.zeros(len(df), dtype="int64")

    # Assign cluster numbers by row position. Label-based access (df.at) would
    # append new rows whenever the index is not the default 0..n-1 range.
    for cluster_id, cluster in enumerate(unique_communities, start=1):
        labels[np.asarray(cluster, dtype=np.intp)] = cluster_id

    # A NumPy array is assigned positionally, regardless of the index labels
    df['Cluster'] = labels

    return df

In [35]:
# The function adds the 'Cluster' column to df1 itself and returns it,
# so final_df and df1 refer to the same DataFrame.
final_df = add_cluster_column_to_df(df1, uniques_comm)

In [39]:
# Export the updated DataFrame to a CSV file
# NOTE(review): assumes the ./Output directory already exists — confirm, or create it before running this cell
final_df.to_csv("./Output/cluster_number_data.csv", index=False)

# Plot Clusters

In [42]:
# Number of comments per cluster label; label 0 marks rows not assigned to any community
print(df1['Cluster'].value_counts())

Cluster
1     45
2     42
0     36
3     17
4     12
5      9
6      8
7      8
8      7
9      6
10     5
11     5
Name: count, dtype: int64


In [51]:
# Tally the comments per cluster label, ordered by label (label 0 included)
cluster_counts = df1['Cluster'].value_counts().sort_index()

# Donut-style pie trace: one slice per cluster label
pie_trace = go.Pie(
    labels=cluster_counts.index,
    values=cluster_counts.values,
    hole=0.4,
)
fig = go.Figure(data=[pie_trace])

# Centered title on an enlarged 700x700 canvas
layout_options = dict(
    title="Cluster Distribution",
    title_x=0.5,
    width=700,
    height=700,
)
fig.update_layout(**layout_options)

# Render the chart
fig.show()