In [1]:
# Imports:
import pandas as pd
import numpy as np

# Disable Warnings:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress FutureWarning and ConvergenceWarning messages for the rest of the session:
for muted_category in (FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=muted_category)

In [2]:
# Read the callback notes from disk; the bare `df` on the last line makes the
# frame the cell's displayed output.
callback_csv = 'Combined_SPA_Callback.csv'
df = pd.read_csv(callback_csv)
df

Unnamed: 0,Date,ID,Text
0,26-Jul-17,1.010040e+11,please call daughter Dianne Thomas 07920 07565...
1,28-Jul-17,1.010040e+11,please speak to sister - Alyson Powell on abpv...
2,28-Jul-17,1.010040e+11,KELLY HAS RUNG - HER PARENTS ARE DUE TO HAVE W...
3,31-Jul-17,1.010040e+11,"Mark Hitchings, Scheme Manager Swn Yr Afon con..."
4,28-Jul-17,1.010040e+11,SON BRIAN HASFORD RUN G- HE IS RESIDENT IN AUS...
...,...,...,...
11353,13-Jun-23,1.010000e+11,"Enquiring about a downstairs toilet, states he..."
11354,13-Jun-23,1.010000e+11,Phoning on behalf on her mother she has no fee...
11355,13-Jun-23,1.010000e+11,States on saturday they had to call an ambulan...
11356,13-Jun-23,1.010000e+11,Nadia Obaji daughter of Abdel Obaji contacted ...


In [3]:
from sentence_transformers import SentenceTransformer

# NOTE: slow cell -- takes ~3 mins to run.

# Sentence-embedding model used for every text entry:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pull the "Text" column out as a plain list and encode it in one call:
sentences = df["Text"].tolist()
embeddings = model.encode(sentences)

In [4]:
from sklearn.cluster import AffinityPropagation

from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

# Default AffinityPropagation Run:
embedding_cluster = AffinityPropagation(random_state = 42).fit(embeddings)
no_of_clusters = len(embedding_cluster.cluster_centers_)
no_of_labels = len(embedding_cluster.labels_)
no_of_iterations = embedding_cluster.n_iter_

# Convergence is inferred from the iteration count (ConvergenceWarning is
# silenced earlier in this notebook): a run that stops before the iteration cap
# is treated as converged. The cap is read back from the estimator instead of
# repeating the literal 200, so the check cannot drift from the value actually used.
if no_of_iterations < embedding_cluster.max_iter:
    print("Params: Default, Clusters: {}, Iterations: {}".format(no_of_clusters, no_of_iterations))
    # The three scores are only defined for 2 .. n_samples - 1 clusters and raise
    # ValueError otherwise, so a degenerate result is reported instead of scored.
    if 1 < no_of_clusters < no_of_labels:
        # Metrics:
        chi = calinski_harabasz_score(embeddings, embedding_cluster.labels_)
        dbi = davies_bouldin_score(embeddings, embedding_cluster.labels_)
        sil = silhouette_score(embeddings, embedding_cluster.labels_)
        print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
    else:
        print("Metrics skipped: {} cluster(s) found.".format(no_of_clusters))
else:
    print("Default parameters did not converge.")

Default parameters did not converge.


In [5]:
# Changing damping by 0.05 per iteration:
damping_values = np.linspace(0.5, 1.0, 11)

# Hyper-parameter testing (damping).
# The [1:-1] slice drops both linspace endpoints, so 0.55 .. 0.95 are tried.
for damping in damping_values[1:-1]:
    # The iteration budget scales with damping. The product is rounded before the
    # int conversion so a float such as 113.99999999999999 is not truncated one
    # iteration short; for the values swept here the result is unchanged.
    embedding_cluster = AffinityPropagation(
        damping=damping, max_iter=int(round(200 * damping)), random_state=42
    ).fit(embeddings)
    no_of_iterations = embedding_cluster.n_iter_
    # Stopping before the estimator's own iteration cap is treated as convergence.
    if no_of_iterations < embedding_cluster.max_iter:
        no_of_clusters = len(embedding_cluster.cluster_centers_)
        print("Damping: {}, Clusters: {}, Iterations: {}".format(damping, no_of_clusters, no_of_iterations))
        # The scores are only defined for 2 .. n_samples - 1 clusters and raise
        # ValueError otherwise; skipping them keeps one degenerate result from
        # aborting the rest of the sweep.
        if 1 < no_of_clusters < len(embedding_cluster.labels_):
            # Metrics:
            chi = calinski_harabasz_score(embeddings, embedding_cluster.labels_)
            dbi = davies_bouldin_score(embeddings, embedding_cluster.labels_)
            sil = silhouette_score(embeddings, embedding_cluster.labels_)
            print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
        else:
            print("Metrics skipped: {} cluster(s) found.".format(no_of_clusters))
    else:
        print("Damping: {}, Did not converge.".format(damping))
    print("=======================================================")

Damping: 0.55, Clusters: 642, Iterations: 68
CHI:10.877133011821346
DBI:3.0913426526208716
Sil:0.018443606793880463
Damping: 0.6, Clusters: 642, Iterations: 71
CHI:10.871827745313533
DBI:3.096206568015383
Sil:0.018035894259810448
Damping: 0.65, Clusters: 643, Iterations: 76
CHI:10.869385264451116
DBI:3.0918289299012827
Sil:0.01836998574435711
Damping: 0.7, Clusters: 643, Iterations: 83
CHI:10.872095953723212
DBI:3.0941992058740992
Sil:0.0183031614869833
Damping: 0.75, Clusters: 644, Iterations: 77
CHI:10.858889637094906
DBI:3.092357871119788
Sil:0.018253235146403313
Damping: 0.8, Clusters: 644, Iterations: 89
CHI:10.860354667995615
DBI:3.0914805483871883
Sil:0.018269771710038185
Damping: 0.8500000000000001, Clusters: 644, Iterations: 120
CHI:10.860354667995615
DBI:3.0914805483871883
Sil:0.018269771710038185
Damping: 0.9, Clusters: 644, Iterations: 171
CHI:10.860354667995615
DBI:3.0914805483871883
Sil:0.018269771710038185
Damping: 0.95, Did not converge.


In [6]:
# Coarse sweep: preference goes from -50 up to -10 in steps of 5
# (range() excludes the -5 stop value).
preference_values = range(-50,-5,5)

# Hyper-parameter testing (preference).
# Damping is held at 0.55, the value that converged in the fewest iterations in
# the damping sweep's recorded output.
for preference in preference_values:
    embedding_cluster = AffinityPropagation(
        damping=0.55, max_iter=300, preference=preference, random_state=42
    ).fit(embeddings)
    no_of_iterations = embedding_cluster.n_iter_
    # Stopping before the estimator's own iteration cap is treated as convergence;
    # reading the cap from the estimator keeps it in one place.
    if no_of_iterations < embedding_cluster.max_iter:
        no_of_clusters = len(embedding_cluster.cluster_centers_)
        print("Preference: {}, Clusters: {}, Iterations: {}".format(preference, no_of_clusters, no_of_iterations))
        # The scores are only defined for 2 .. n_samples - 1 clusters and raise
        # ValueError otherwise; skipping them keeps one degenerate result from
        # aborting the rest of the sweep.
        if 1 < no_of_clusters < len(embedding_cluster.labels_):
            # Metrics:
            chi = calinski_harabasz_score(embeddings, embedding_cluster.labels_)
            dbi = davies_bouldin_score(embeddings, embedding_cluster.labels_)
            sil = silhouette_score(embeddings, embedding_cluster.labels_)
            print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
        else:
            print("Metrics skipped: {} cluster(s) found.".format(no_of_clusters))
    else:
        print("Preference: {}, Did not converge.".format(preference))
    print("=======================================================")

Preference: -50, Did not converge.
Preference: -45, Did not converge.
Preference: -40, Did not converge.
Preference: -35, Did not converge.
Preference: -30, Did not converge.
Preference: -25, Did not converge.
Preference: -20, Did not converge.
Preference: -15, Did not converge.
Preference: -10, Clusters: 71, Iterations: 92
CHI:51.609597154363996
DBI:4.262076766353555
Sil:0.015839833766222


In [7]:
# Fine sweep: preference goes from -14 up to -7 in steps of 1
# (range() excludes the -6 stop value).
preference_values = range(-14,-6)

# Hyper-parameter testing (preference), with damping held at 0.55.
for preference in preference_values:
    embedding_cluster = AffinityPropagation(
        damping=0.55, max_iter=300, preference=preference, random_state=42
    ).fit(embeddings)
    no_of_iterations = embedding_cluster.n_iter_
    # Stopping before the estimator's own iteration cap is treated as convergence;
    # reading the cap from the estimator keeps it in one place.
    if no_of_iterations < embedding_cluster.max_iter:
        no_of_clusters = len(embedding_cluster.cluster_centers_)
        print("Preference: {}, Clusters: {}, Iterations: {}".format(preference, no_of_clusters, no_of_iterations))
        # The scores are only defined for 2 .. n_samples - 1 clusters and raise
        # ValueError otherwise; skipping them keeps one degenerate result from
        # aborting the rest of the sweep.
        if 1 < no_of_clusters < len(embedding_cluster.labels_):
            # Metrics:
            chi = calinski_harabasz_score(embeddings, embedding_cluster.labels_)
            dbi = davies_bouldin_score(embeddings, embedding_cluster.labels_)
            sil = silhouette_score(embeddings, embedding_cluster.labels_)
            print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
        else:
            print("Metrics skipped: {} cluster(s) found.".format(no_of_clusters))
    else:
        print("Preference: {}, Did not converge.".format(preference))
    print("=======================================================")

Preference: -14, Clusters: 52, Iterations: 164
CHI:64.44546042853989
DBI:4.31372702182739
Sil:0.0189349502325058
Preference: -13, Clusters: 54, Iterations: 269
CHI:62.91530053487421
DBI:4.278534635901026
Sil:0.018306400626897812
Preference: -12, Clusters: 56, Iterations: 103
CHI:60.68295322245294
DBI:4.3866154753816655
Sil:0.01870231330394745
Preference: -11, Clusters: 62, Iterations: 132
CHI:56.73117770728275
DBI:4.231823827677952
Sil:0.01792249083518982
Preference: -10, Clusters: 71, Iterations: 92
CHI:51.609597154363996
DBI:4.262076766353555
Sil:0.015839833766222
Preference: -9, Clusters: 82, Iterations: 94
CHI:46.65721434547085
DBI:4.167390576875096
Sil:0.016466712579131126
Preference: -8, Clusters: 93, Iterations: 120
CHI:42.524655459422284
DBI:4.097592471331018
Sil:0.016894100233912468
Preference: -7, Clusters: 106, Iterations: 96
CHI:38.514433431107285
DBI:4.062227090591765
Sil:0.016564609482884407
