# Install Dependencies

In [1]:
!pip install -U sentence-transformers
!pip install sklearn
!pip install plotly
!pip install nbformat>=4.2.0


[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import Dependencies

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import pickle
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


# Data Cleaning

In [3]:
def data_cleaning(input_file, output_file, encoding="utf-8"):
    """Strip greeting/markup boilerplate from a raw CSV and save a cleaned CSV.

    The raw file is first treated as plain text: every phrase in the removal
    list is deleted wherever it occurs. The result is written to
    ``output_file``, parsed with pandas, the "First Comment" column is renamed
    to "comments", rows in which every column is empty are dropped, and the
    frame is written back to ``output_file``.

    Args:
        input_file: Path of the raw CSV file to read.
        output_file: Path the cleaned CSV is written to (overwritten if it exists).
        encoding: Text encoding used for every read and write. Defaults to
            UTF-8 so the plain-text pass and the pandas pass agree instead of
            depending on the platform's locale encoding.

    Returns:
        None. The cleaned data is written to ``output_file`` and a
        confirmation message is printed.
    """
    # NOTE(review): removal is a case-sensitive plain substring match applied in
    # list order, so short entries such as "Team" and "HI" are also removed when
    # they occur inside longer words. Confirm this is intended.
    list_data = ["<br/>","Hi team -","hi team -","Hello-","Hi Team,","Hi team,","Hello,","Hi Team:","Hello Team,","Hi,","Hi team-",
                 "Hi Team-","Hi Team.","Hello,","hello,","Hi Team","hi team","hello team","Hello!","team,","Team","====","HI","Hi team"]

    with open(input_file, 'r', encoding=encoding) as file:
        data = file.read()

    for l_data in list_data:
        data = data.replace(l_data, "")

    # Write the stripped text out so pandas can parse it as a CSV.
    with open(output_file, 'w', encoding=encoding) as file:
        file.write(data)

    df = pd.read_csv(output_file, encoding=encoding)
    # Rename the comment column and drop rows that are empty in every column.
    df = df.rename(columns={"First Comment": "comments"}).dropna(how='all')
    df.to_csv(output_file, index=False, encoding=encoding)
    print(f"Data cleaning complete. Cleaned data is stored in {output_file}")

In [4]:
# Clean the raw dataset and store the result alongside it in ./Data
raw_dataset_path = "./Data/Raw_Dataset.csv"
cleaned_dataset_path = "./Data/Cleaned_Dataset.csv"
data_cleaning(raw_dataset_path, cleaned_dataset_path)

Data cleaning complete. Cleaned data is stored in ./Data/Cleaned_Dataset.csv


In [5]:
# Attach a sequential 'id' column to the cleaned dataset (extra step)

df_raw = pd.read_csv("./Data/Cleaned_Dataset.csv")

# Number the rows 1..N in their current order
row_count = len(df_raw)
df_raw = df_raw.assign(id=range(1, row_count + 1))

# Overwrite the cleaned CSV with the version that includes the ids
df_raw.to_csv("./Data/Cleaned_Dataset.csv", index=False)

In [6]:
# Load the cleaned dataset and extract its 'comments' column as a Python list

df1 = pd.read_csv("./Data/Cleaned_Dataset.csv")
comments_list = df1["comments"].tolist()

# Model to create, store, and read embeddings

In [7]:
def create_and_store_embeddings(sentences=None, model_name='all-MiniLM-L6-v2') -> np.ndarray:
    """Encode sentences with a SentenceTransformer model and L2-normalize the result.

    Despite its name, this function does not write anything to disk; it only
    returns the embeddings.

    Args:
        sentences: Sentences to encode. When omitted, the notebook-level
            ``comments_list`` is used, which is the original behaviour.
        model_name: Name of the SentenceTransformer model to load. Another
            model (for example 'distilbert-base-nli-stsb-quora-ranking') can be
            passed to obtain different results.

    Returns:
        The embeddings, with each row scaled to unit length.
    """
    if sentences is None:
        # Fall back to the notebook global so existing no-argument calls keep working.
        sentences = comments_list

    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences, show_progress_bar=True)
    # Normalize each row to unit length.
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings


In [8]:
# Encode every entry of comments_list and keep the unit-length embeddings (slow: runs the model over all comments)
embeddings = create_and_store_embeddings()

Batches: 100%|██████████| 270/270 [04:00<00:00,  1.12it/s]


In [9]:
# Persist the sentences together with their embeddings on disk
embeddings_payload = {'sentences': comments_list, 'embeddings': embeddings}
with open("./Data/embeddings.pkl", "wb") as pickle_file:
    pickle.dump(embeddings_payload, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
# Load sentences & embeddings from disk.
# No pre-allocation is needed: the loaded array simply replaces `embeddings`, and
# this cell then also works on a fresh kernel where the embeddings were not recomputed.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("./Data/embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)

stored_sentences = stored_data['sentences']
embeddings = stored_data['embeddings']

# Detect Clusters

In [11]:
def detect_clusters(embeddings, threshold=0.85, min_community_size=15, init_max_size=1000):
    """Group embeddings into non-overlapping communities of similar items.

    Algorithm:
      1. Compute the pairwise cosine similarity between all embeddings.
      2. Row ``i`` becomes a candidate community center when its
         ``min_community_size``-th highest similarity is >= ``threshold``.
      3. For each center, collect every index whose similarity to it is
         >= ``threshold``. Only the ``init_max_size`` most similar entries are
         inspected first; if even the last of those passes the threshold, the
         whole row is scanned instead.
      4. Sort the candidates largest first and keep a candidate only if it
         shares no index with a community that was already kept.

    Args:
        embeddings: The embeddings to cluster, in any form accepted by
            ``util.pytorch_cos_sim`` that also supports ``len()``.
        threshold: Minimum cosine similarity for two items to be grouped.
        min_community_size: Minimum number of members a community must have.
        init_max_size: Number of most-similar entries inspected per center
            before falling back to a full-row scan. Clamped to the number of
            embeddings.

    Returns:
        A list of communities, largest first; each community is a list of
        integer row indices into ``embeddings``.
    """
    num_embeddings = len(embeddings)
    # A community needs min_community_size members, so fewer embeddings than
    # that can never form one (and topk would raise for k > number of columns).
    if num_embeddings == 0 or num_embeddings < min_community_size:
        return []

    # topk cannot return more entries than there are embeddings.
    init_max_size = min(init_max_size, num_embeddings)

    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # The last column holds each row's min_community_size-th highest similarity
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

    # Step 1) Collect a candidate community around every qualifying row
    extracted_communities = []
    for i in range(len(top_k_values)):
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            # Only check top k most similar entries
            top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()

            if top_val_large[-1] < threshold:
                # Values are sorted descending, so stop at the first one below the threshold
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break

                    new_cluster.append(idx)
            else:
                # Even the last inspected entry qualifies: iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)

            extracted_communities.append(new_cluster)

    # Largest cluster first
    extracted_communities = sorted(extracted_communities, key=len, reverse=True)

    # Step 2) Remove overlapping communities
    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        if extracted_ids.isdisjoint(community):
            unique_communities.append(community)
            extracted_ids.update(community)

    return unique_communities

In [12]:
# Keep only tight communities: at least 20 members with cosine similarity >= 0.95
uniques_comm = detect_clusters(embeddings, threshold=0.95, min_community_size=20)

## Save the sentences of each cluster into separate files

In [17]:
def write_cluster_sentences_to_file(comments_list, unique_communities, df_input, output_dir="./Output"):
    """Write one text file per cluster listing the id and sentence of each member.

    Files are named ``cluster_<n>.txt`` with ``n`` starting at 1, in the order
    the clusters appear in ``unique_communities``.

    Args:
        comments_list: Sentences, indexable by the positions stored in the clusters.
        unique_communities: Iterable of clusters; each cluster is an iterable of
            integer positions into ``comments_list`` and ``df_input``.
        df_input: DataFrame with an 'id' column, positionally aligned with
            ``comments_list``.
        output_dir: Directory the files are written to; created if missing.

    Returns:
        None.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    # Create the target directory so the first open() cannot fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    for cluster_count, cluster in enumerate(unique_communities, start=1):
        output_file_name = os.path.join(output_dir, f"cluster_{cluster_count}.txt")
        # Explicit UTF-8 so comments with non-ASCII characters can always be written.
        with open(output_file_name, 'w', encoding="utf-8") as file:
            file.write("Cluster:\n")
            for index in cluster:
                sentence = comments_list[index]
                id_value = df_input.iloc[index]['id']
                file.write(f"ID: {id_value}, Sentence: {sentence}\n")
            file.write('\n')

In [18]:
# Export every detected cluster to its own text file
write_cluster_sentences_to_file(
    comments_list=comments_list,
    unique_communities=uniques_comm,
    df_input=df1,
)

# Plot Clusters

In [15]:
def plot_clusters(clusters_to_show):
    """Build a donut-style pie chart of each cluster's share of all clustered items.

    Args:
        clusters_to_show: Sequence of clusters; each cluster is a sized
            collection of member indices.

    Returns:
        A plotly Figure with one slice per cluster, labelled "Cluster1",
        "Cluster2", ... in input order. Each slice's value is the cluster's
        percentage of the total number of members across all clusters.
    """
    num_clusters = len(clusters_to_show)
    print(f"Number of clusters to use: {num_clusters}")

    total_items = sum(len(cluster) for cluster in clusters_to_show)

    # Guard against division by zero when every cluster is empty.
    percentages = [
        (len(cluster) / total_items) * 100.0 if total_items else 0.0
        for cluster in clusters_to_show
    ]

    # One label per cluster: the range end must be inclusive of num_clusters.
    labels = [f"Cluster{i}" for i in range(1, num_clusters + 1)]

    fig = go.Figure(data=[go.Pie(labels=labels, values=percentages, hole=.3)])
    fig.update_layout(width=720, height=720)

    return fig

In [16]:
# Show each detected cluster's share as a pie chart

plot_clusters(clusters_to_show=uniques_comm)

Number of clusters to use: 34
