In [1]:
# Imports:
import pandas as pd
import numpy as np
import re

# Disable Warnings:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence both warning categories so they do not flood the cell outputs.
for _ignored_category in (FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

In [2]:
# Load the callback notes; the bare `df` on the last line renders a preview.
callback_csv = 'Combined_SPA_Callback.csv'
df = pd.read_csv(callback_csv)
df

Unnamed: 0,Date,ID,Text
0,26-Jul-17,1.010040e+11,please call daughter Dianne Thomas 07920 07565...
1,28-Jul-17,1.010040e+11,please speak to sister - Alyson Powell on abpv...
2,28-Jul-17,1.010040e+11,KELLY HAS RUNG - HER PARENTS ARE DUE TO HAVE W...
3,31-Jul-17,1.010040e+11,"Mark Hitchings, Scheme Manager Swn Yr Afon con..."
4,28-Jul-17,1.010040e+11,SON BRIAN HASFORD RUN G- HE IS RESIDENT IN AUS...
...,...,...,...
11353,13-Jun-23,1.010000e+11,"Enquiring about a downstairs toilet, states he..."
11354,13-Jun-23,1.010000e+11,Phoning on behalf on her mother she has no fee...
11355,13-Jun-23,1.010000e+11,States on saturday they had to call an ambulan...
11356,13-Jun-23,1.010000e+11,Nadia Obaji daughter of Abdel Obaji contacted ...


In [3]:
# Checking if a sentence is in caps:
def calculate_capital_percentage(string):
    """Return the percentage (0-100) of ASCII letters in ``string`` that are upper-case.

    Only the characters a-z and A-Z are counted; digits, punctuation,
    whitespace and non-ASCII letters are ignored.

    Args:
        string: The text to inspect.

    Returns:
        A float between 0 and 100. Returns 0.0 when ``string`` contains no
        ASCII letters at all (e.g. an empty note or a bare phone number),
        instead of dividing by zero.
    """
    total_letters = len(re.sub(r'[^a-zA-Z]', '', string))
    # Nothing to measure: avoid ZeroDivisionError on letter-free input.
    if total_letters == 0:
        return 0.0
    capital_letters = len(re.sub(r'[^A-Z]', '', string))
    return (capital_letters / total_letters) * 100

# Function to clean text for NER:
def clean_text_for_NER(text):
    """Normalise one free-text note before it is passed on for NER/embedding.

    Steps, in order:
      1. Missing values (None / NaN) become an empty string; any other
         non-string value is converted with ``str()``.
      2. Leading/trailing whitespace is stripped.
      3. Runs of newline / carriage-return characters become '. '.
      4. Every character except ASCII letters, digits, whitespace and
         ``? , : " ! . '`` is removed.
      5. Runs of two or more whitespace characters collapse to one space.
      6. A single space between two digits is removed, so digit groups
         such as '07920 075' are joined into one token.
      7. If more than 65% of the letters are upper-case, the whole text is
         lower-cased (the note was typed in caps).
      8. The first character is upper-cased; the rest is left untouched.

    Args:
        text: The raw note (normally a ``str``; None/NaN are tolerated).

    Returns:
        The cleaned string ('' for missing or empty input).
    """
    # Missing cells arrive from pandas as None or float NaN; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Remove start/trailing spaces:
    text = str(text).strip()
    # Remove newline characters:
    text = re.sub(r'[\n\r]+', '. ', text)
    # Only keep Alphabets, Digits, Spaces, and Commonly Used Punctuations:
    text = re.sub(r'[^a-zA-Z0-9\s?,:"!.\']', '', text)
    # Remove extra spaces:
    text = re.sub(r'\s{2,}', ' ', text)
    # Join digits together (raw string: '\d' is an invalid escape in a plain literal):
    text = re.sub(r'(?<=\d) (?=\d)', '', text)
    text = text.strip()
    # Only measure the caps ratio when there is at least one letter, so that
    # letter-free notes can never trigger a division by zero in the helper.
    if re.search(r'[a-zA-Z]', text) and calculate_capital_percentage(text) > 65:
        text = text.lower()
    # Capitalize the first character only. str.capitalize() would also
    # lower-case the rest of the text, discarding the casing of names in
    # mixed-case notes and making the caps check above redundant.
    return text[:1].upper() + text[1:]

# Clean every note element-wise and write the result back into the Text column:
cleaned_notes = df["Text"].apply(clean_text_for_NER)
df["Text"] = cleaned_notes

In [4]:
from sentence_transformers import SentenceTransformer

# Note: this cell takes ~3 mins to run.

# Sentence-embedding model used to encode the notes:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pull the cleaned notes out as a plain list and encode them:
sentences = df["Text"].tolist()
embeddings = model.encode(sentences)

In [5]:
from sklearn.cluster import AffinityPropagation

from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

# Baseline: AffinityPropagation with default parameters (fixed seed).
embedding_cluster = AffinityPropagation(random_state = 42).fit(embeddings)
no_of_clusters = len(embedding_cluster.cluster_centers_)
no_of_labels = len(embedding_cluster.labels_)
no_of_iterations = embedding_cluster.n_iter_

# A run that used 200 or more iterations is reported as not converged.
if no_of_iterations >= 200:
    print("Default parameters did not converge.")
else:
    print("Params: Default, Clusters: {}, Iterations: {}".format(no_of_clusters, no_of_iterations))
    # Cluster-quality metrics on the fitted labels:
    cluster_labels = embedding_cluster.labels_
    chi = calinski_harabasz_score(embeddings, cluster_labels)
    dbi = davies_bouldin_score(embeddings, cluster_labels)
    sil = silhouette_score(embeddings, cluster_labels)
    print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))

Default parameters did not converge.


In [6]:
# Damping grid: 0.5 to 1.0 in 11 evenly spaced steps (0.05 apart).
damping_values = np.linspace(0.5, 1.0, 11)

# Hyper-parameter sweep over damping; the two endpoint values are skipped.
for damping in damping_values[1:-1]:
    # The iteration cap is scaled with the damping value.
    iteration_cap = int(200*damping)
    embedding_cluster = AffinityPropagation(damping=damping, max_iter=iteration_cap, random_state=42).fit(embeddings)
    no_of_iterations = embedding_cluster.n_iter_
    if no_of_iterations >= iteration_cap:
        # Hitting the cap is treated as a failed run.
        print("Damping: {}, Did not converge.".format(damping))
    else:
        no_of_clusters = len(embedding_cluster.cluster_centers_)
        # Cluster-quality metrics on the fitted labels:
        cluster_labels = embedding_cluster.labels_
        chi = calinski_harabasz_score(embeddings, cluster_labels)
        dbi = davies_bouldin_score(embeddings, cluster_labels)
        sil = silhouette_score(embeddings, cluster_labels)
        print("Damping: {}, Clusters: {}, Iterations: {}".format(damping, no_of_clusters, no_of_iterations))
        print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
    print("=======================================================")

Damping: 0.55, Did not converge.
Damping: 0.6, Clusters: 634, Iterations: 54
CHI:10.984721448052792
DBI:3.1035332554581356
Sil:0.01760571263730526
Damping: 0.65, Clusters: 635, Iterations: 77
CHI:10.969658806022816
DBI:3.1027433299882143
Sil:0.01775357499718666
Damping: 0.7, Clusters: 636, Iterations: 76
CHI:10.9615484232579
DBI:3.100277218450532
Sil:0.01771925389766693
Damping: 0.75, Clusters: 635, Iterations: 103
CHI:10.96468826605598
DBI:3.1071686014080395
Sil:0.017791269347071648
Damping: 0.8, Clusters: 636, Iterations: 130
CHI:10.96390664863451
DBI:3.090869883125845
Sil:0.018013866618275642
Damping: 0.8500000000000001, Did not converge.
Damping: 0.9, Did not converge.
Damping: 0.95, Did not converge.


In [7]:
# Coarse preference grid: -50 up to -10 in steps of 5.
preference_values = range(-50,-5,5)

# Hyper-parameter sweep over preference (damping fixed at 0.6).
iteration_cap = 300
for preference in preference_values:
    embedding_cluster = AffinityPropagation(damping=0.6, max_iter=iteration_cap, preference=preference, random_state=42).fit(embeddings)
    no_of_iterations = embedding_cluster.n_iter_
    if no_of_iterations >= iteration_cap:
        # Hitting the cap is treated as a failed run.
        print("Preference: {}, Did not converge.".format(preference))
    else:
        no_of_clusters = len(embedding_cluster.cluster_centers_)
        print("Preference: {}, Clusters: {}, Iterations: {}".format(preference, no_of_clusters, no_of_iterations))
        # Cluster-quality metrics on the fitted labels:
        cluster_labels = embedding_cluster.labels_
        chi = calinski_harabasz_score(embeddings, cluster_labels)
        dbi = davies_bouldin_score(embeddings, cluster_labels)
        sil = silhouette_score(embeddings, cluster_labels)
        print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
    print("=======================================================")

Preference: -50, Did not converge.
Preference: -45, Did not converge.
Preference: -40, Did not converge.
Preference: -35, Did not converge.
Preference: -30, Did not converge.
Preference: -25, Did not converge.
Preference: -20, Did not converge.
Preference: -15, Clusters: 48, Iterations: 110
CHI:67.99918744229795
DBI:4.455712008726006
Sil:0.019610514864325523
Preference: -10, Clusters: 73, Iterations: 68
CHI:50.12393258197703
DBI:4.242531542823061
Sil:0.018681667745113373


In [8]:
# Fine preference grid: every integer from -19 up to -12.
preference_values = range(-19,-11)

# Hyper-parameter sweep over preference (damping fixed at 0.6).
iteration_cap = 300
for preference in preference_values:
    embedding_cluster = AffinityPropagation(damping=0.6, max_iter=iteration_cap, preference=preference, random_state=42).fit(embeddings)
    no_of_iterations = embedding_cluster.n_iter_
    converged = no_of_iterations < iteration_cap
    if converged:
        no_of_clusters = len(embedding_cluster.cluster_centers_)
        print("Preference: {}, Clusters: {}, Iterations: {}".format(preference, no_of_clusters, no_of_iterations))
        # Cluster-quality metrics on the fitted labels:
        cluster_labels = embedding_cluster.labels_
        chi = calinski_harabasz_score(embeddings, cluster_labels)
        dbi = davies_bouldin_score(embeddings, cluster_labels)
        sil = silhouette_score(embeddings, cluster_labels)
        print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
    else:
        # Hitting the cap is treated as a failed run.
        print("Preference: {}, Did not converge.".format(preference))
    print("=======================================================")

Preference: -19, Did not converge.
Preference: -18, Did not converge.
Preference: -17, Clusters: 44, Iterations: 180
CHI:72.35313233269525
DBI:4.36957570772843
Sil:0.020304299890995026
Preference: -16, Clusters: 46, Iterations: 79
CHI:69.82039911373474
DBI:4.41580084806237
Sil:0.020509101450443268
Preference: -15, Clusters: 48, Iterations: 110
CHI:67.99918744229795
DBI:4.455712008726006
Sil:0.019610514864325523
Preference: -14, Clusters: 52, Iterations: 64
CHI:64.04815625339049
DBI:4.38304484483649
Sil:0.019502995535731316
Preference: -13, Clusters: 57, Iterations: 75
CHI:60.196423519567816
DBI:4.335697775885274
Sil:0.01833929866552353
Preference: -12, Clusters: 59, Iterations: 78
CHI:58.53194855084772
DBI:4.348757656618058
Sil:0.018105637282133102


In [13]:
# Re-run the single preference value -18 with a larger iteration cap (1000).
preference_values = [-18]

# Hyper-parameter testing (preference), damping fixed at 0.6:
iteration_cap = 1000
for preference in preference_values:
    embedding_cluster = AffinityPropagation(damping=0.6, max_iter=iteration_cap, preference=preference, random_state=42).fit(embeddings)
    no_of_iterations = embedding_cluster.n_iter_
    if no_of_iterations >= iteration_cap:
        # Hitting the cap is treated as a failed run.
        print("Preference: {}, Did not converge.".format(preference))
    else:
        no_of_clusters = len(embedding_cluster.cluster_centers_)
        print("Preference: {}, Clusters: {}, Iterations: {}".format(preference, no_of_clusters, no_of_iterations))
        # Cluster-quality metrics on the fitted labels:
        cluster_labels = embedding_cluster.labels_
        chi = calinski_harabasz_score(embeddings, cluster_labels)
        dbi = davies_bouldin_score(embeddings, cluster_labels)
        sil = silhouette_score(embeddings, cluster_labels)
        print("CHI:{}\nDBI:{}\nSil:{}".format(chi, dbi, sil))
    print("=======================================================")

Preference: -18, Clusters: 40, Iterations: 363
CHI:76.23894322680917
DBI:4.497016831012912
Sil:0.019599011167883873
