In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.metrics.cluster import adjusted_mutual_info_score, completeness_score, fowlkes_mallows_score, \
    adjusted_rand_score, homogeneity_score, v_measure_score
from sklearn.metrics.cluster import contingency_matrix

from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import pdist

In [2]:
# some setting for this notebook to actually show the graphs inline
%matplotlib inline
np.set_printoptions(precision=5, suppress=True)  # suppress scientific float notation

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Experiment 1: Baseline with Original df

In [3]:
df_orig = pd.read_csv('clean.csv')

In [4]:
df_orig.head()

Unnamed: 0,text,category
0,I am still waiting on my card?,card_arrival
1,What can I do if my card still hasn't arrived ...,card_arrival
2,I have been waiting over a week. Is the card s...,card_arrival
3,Can I track my card while it is in the process...,card_arrival
4,"How do I know if I will get my card, or if it ...",card_arrival


In [5]:
# Embeddings from SBERT "all-mpnet-base-v2" + UMAP dim. reduced
X_dims = pd.read_csv("X_dims_e01.06.csv")

In [6]:
X_dims.shape
X_dims.head()

(10003, 12)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,8.263814,1.805695,5.121025,3.746852,4.953388,2.288532,4.528465,2.029929,4.144418,2.861202,9.808524,2.961788
1,8.083271,1.797391,5.236867,3.670772,5.069145,2.417079,4.629939,2.042497,4.054216,2.766853,10.033391,2.847773
2,8.091454,1.733188,4.890882,3.658099,5.059527,2.327522,4.621762,2.016839,4.227369,2.529474,9.923305,2.732634
3,8.020288,1.394174,4.71262,2.887314,4.918532,2.155109,4.558948,1.606846,4.477576,1.369636,10.054302,2.661582
4,8.024574,1.591017,5.324457,3.733921,5.423958,2.504446,4.715224,1.989726,4.000761,3.020289,9.902008,3.058281


In [7]:
#clustering - Original trimmed
clusterer = AgglomerativeClustering(n_clusters=77,
                                    affinity="euclidean",
                                    linkage="ward")
X_cluster_ids = clusterer.fit_predict(X_dims)

ch_score = calinski_harabasz_score(X_dims, X_cluster_ids)
si_score = silhouette_score(X_dims, X_cluster_ids)
db_score = davies_bouldin_score(X_dims, X_cluster_ids)

print("clusterer_si:", si_score)
print("clusterer_ch:", ch_score)
print("clusterer_db:", db_score)

clusterer_si: 0.48934207687418113
clusterer_ch: 4777.957652138638
clusterer_db: 0.7614597053820216


In [8]:
X_topic_ids = X_cluster_ids
y = df_orig['category']

In [9]:
print("\nCalculating metrics on ORIGINAL")

adjusted_rand = adjusted_rand_score(y, X_topic_ids)
adjusted_mutual_info = adjusted_mutual_info_score(y, X_topic_ids)
completeness = completeness_score(y, X_topic_ids)
fowlkes_mallows = fowlkes_mallows_score(y, X_topic_ids)
homogeneity = homogeneity_score(y, X_topic_ids)
v_measure = v_measure_score(y, X_topic_ids)
#contingency_matrix = contingency_matrix(y, X_topic_ids)
score = adjusted_rand

print("ARI:", adjusted_rand)
print("AMI:", adjusted_mutual_info)
print("Completeness:", completeness)
print("fowlkes_mallows:", fowlkes_mallows)
print("homogeneity:", homogeneity)
print("v_measure:", v_measure)
print("contingency_matrix:", contingency_matrix)


Calculating metrics on ORIGINAL
ARI: 0.6344750552388968
AMI: 0.8333488676081081
Completeness: 0.8526865459871235
fowlkes_mallows: 0.6409202745435881
homogeneity: 0.8392331020633447
v_measure: 0.8459063359196911
contingency_matrix: <function contingency_matrix at 0x7f6f41f9b378>


# Experiment 2: Trimmed DF - removed potential labelling errors

In [10]:
df_trimmed = pd.read_csv('Banking77_trimmed_updatedLabels_load.csv')

In [11]:
df_trimmed.head()

Unnamed: 0,id,text,category
0,0,I am still waiting on my card?,card_arrival
1,1,What can I do if my card still hasn't arrived ...,card_arrival
2,3,Can I track my card while it is in the process...,card_arrival
3,4,"How do I know if I will get my card, or if it ...",card_arrival
4,6,Do you have info about the card on delivery?,card_arrival


In [12]:
X_dims['id']=X_dims.index
X_dims.shape
X_dims.head()

(10003, 13)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,id
0,8.263814,1.805695,5.121025,3.746852,4.953388,2.288532,4.528465,2.029929,4.144418,2.861202,9.808524,2.961788,0
1,8.083271,1.797391,5.236867,3.670772,5.069145,2.417079,4.629939,2.042497,4.054216,2.766853,10.033391,2.847773,1
2,8.091454,1.733188,4.890882,3.658099,5.059527,2.327522,4.621762,2.016839,4.227369,2.529474,9.923305,2.732634,2
3,8.020288,1.394174,4.71262,2.887314,4.918532,2.155109,4.558948,1.606846,4.477576,1.369636,10.054302,2.661582,3
4,8.024574,1.591017,5.324457,3.733921,5.423958,2.504446,4.715224,1.989726,4.000761,3.020289,9.902008,3.058281,4


In [13]:
X_dims_trimmed = df_trimmed.merge(X_dims, on='id', how='left')
X_dims_trimmed.shape
X_dims_trimmed.head()

(8575, 15)

Unnamed: 0,id,text,category,0,1,2,3,4,5,6,7,8,9,10,11
0,0,I am still waiting on my card?,card_arrival,8.263814,1.805695,5.121025,3.746852,4.953388,2.288532,4.528465,2.029929,4.144418,2.861202,9.808524,2.961788
1,1,What can I do if my card still hasn't arrived ...,card_arrival,8.083271,1.797391,5.236867,3.670772,5.069145,2.417079,4.629939,2.042497,4.054216,2.766853,10.033391,2.847773
2,3,Can I track my card while it is in the process...,card_arrival,8.020288,1.394174,4.71262,2.887314,4.918532,2.155109,4.558948,1.606846,4.477576,1.369636,10.054302,2.661582
3,4,"How do I know if I will get my card, or if it ...",card_arrival,8.024574,1.591017,5.324457,3.733921,5.423958,2.504446,4.715224,1.989726,4.000761,3.020289,9.902008,3.058281
4,6,Do you have info about the card on delivery?,card_arrival,8.057352,1.702411,4.479055,3.592184,5.052916,2.27484,4.520807,2.025217,4.412294,2.110245,9.914685,2.55736


In [14]:
X_dims_trimmed = X_dims_trimmed.drop(['id','text','category'],axis=1)
X_dims_trimmed.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,8.263814,1.805695,5.121025,3.746852,4.953388,2.288532,4.528465,2.029929,4.144418,2.861202,9.808524,2.961788
1,8.083271,1.797391,5.236867,3.670772,5.069145,2.417079,4.629939,2.042497,4.054216,2.766853,10.033391,2.847773
2,8.020288,1.394174,4.71262,2.887314,4.918532,2.155109,4.558948,1.606846,4.477576,1.369636,10.054302,2.661582


In [18]:
#clustering - Original
clusterer = AgglomerativeClustering(n_clusters=77,
                                    affinity="euclidean",
                                    linkage="ward")
X_cluster_ids = clusterer.fit_predict(X_dims_trimmed)

ch_score = calinski_harabasz_score(X_dims_trimmed, X_cluster_ids)
si_score = silhouette_score(X_dims_trimmed, X_cluster_ids)
db_score = davies_bouldin_score(X_dims_trimmed, X_cluster_ids)

print("clusterer_si:", si_score)
print("clusterer_ch:", ch_score)
print("clusterer_db:", db_score)

clusterer_si: 0.49761317160148655
clusterer_ch: 4185.530208078744
clusterer_db: 0.7704669699676725


In [19]:
X_topic_ids = X_cluster_ids
y = df_trimmed['category']

In [20]:
print("\nCalculating metrics on TRIMMED df")

adjusted_rand = adjusted_rand_score(y, X_topic_ids)
adjusted_mutual_info = adjusted_mutual_info_score(y, X_topic_ids)
completeness = completeness_score(y, X_topic_ids)
fowlkes_mallows = fowlkes_mallows_score(y, X_topic_ids)
homogeneity = homogeneity_score(y, X_topic_ids)
v_measure = v_measure_score(y, X_topic_ids)
#contingency_matrix = contingency_matrix(y, X_topic_ids)
score = adjusted_rand

print("ARI:", adjusted_rand)
print("AMI:", adjusted_mutual_info)
print("Completeness:", completeness)
print("fowlkes_mallows:", fowlkes_mallows)
print("homogeneity:", homogeneity)
print("v_measure:", v_measure)
print("contingency_matrix:", contingency_matrix)


Calculating metrics on TRIMMED df
ARI: 0.6858603716915299
AMI: 0.856482054758372
Completeness: 0.8735422027440631
fowlkes_mallows: 0.6908880334068797
homogeneity: 0.8647738722997915
v_measure: 0.8691359231263109
contingency_matrix: <function contingency_matrix at 0x7f6f41f9b378>


# Experiment 3: Cleaned DF - updated potential labelling errors

In [21]:
df_clean = pd.read_csv('Banking77_cleaned_updatedLabels_load.csv')

In [22]:
df_clean.head()

Unnamed: 0,id,text,category
0,0,I am still waiting on my card?,card_arrival
1,1,What can I do if my card still hasn't arrived ...,card_arrival
2,2,I have been waiting over a week. Is the card s...,card_delivery_estimate
3,3,Can I track my card while it is in the process...,card_arrival
4,4,"How do I know if I will get my card, or if it ...",card_arrival


In [23]:
#clustering - Original
clusterer = AgglomerativeClustering(n_clusters=77,
                                    affinity="euclidean",
                                    linkage="ward")
X_cluster_ids = clusterer.fit_predict(X_dims)

ch_score = calinski_harabasz_score(X_dims, X_cluster_ids)
si_score = silhouette_score(X_dims, X_cluster_ids)
db_score = davies_bouldin_score(X_dims, X_cluster_ids)

print("clusterer_si:", si_score)
print("clusterer_ch:", ch_score)
print("clusterer_db:", db_score)

clusterer_si: 0.48513753883748995
clusterer_ch: 668789.575455519
clusterer_db: 0.5039306329705184


In [24]:
X_topic_ids = X_cluster_ids
y = df_clean['category']

In [25]:
print("\nCalculating metrics on CLEANED df")

adjusted_rand = adjusted_rand_score(y, X_topic_ids)
adjusted_mutual_info = adjusted_mutual_info_score(y, X_topic_ids)
completeness = completeness_score(y, X_topic_ids)
fowlkes_mallows = fowlkes_mallows_score(y, X_topic_ids)
homogeneity = homogeneity_score(y, X_topic_ids)
v_measure = v_measure_score(y, X_topic_ids)
#contingency_matrix = contingency_matrix(y, X_topic_ids)
score = adjusted_rand

print("ARI:", adjusted_rand)
print("AMI:", adjusted_mutual_info)
print("Completeness:", completeness)
print("fowlkes_mallows:", fowlkes_mallows)
print("homogeneity:", homogeneity)
print("v_measure:", v_measure)
print("contingency_matrix:", contingency_matrix)


Calculating metrics on CLEANED df
ARI: 0.4845525115999342
AMI: 0.7407718547320005
Completeness: 0.758432017959713
fowlkes_mallows: 0.49181700578471904
homogeneity: 0.7627928577368576
v_measure: 0.7606061873182468
contingency_matrix: <function contingency_matrix at 0x7f6f41f9b378>
