In [97]:
from sklearn.cluster import KMeans
from graph_filters import graph_filtering
from scipy.io import loadmat
from sklearn.metrics.cluster import adjusted_mutual_info_score as ami
import numpy as np
from sklearn.metrics import adjusted_rand_score as ari
from time import time
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import mutual_info_score, adjusted_rand_score
from sklearn.cluster import AgglomerativeClustering
import matplotlib as plt
import pandas as pd



In [None]:

#  /Users/prachiteechouhan/Documents/CPSC DSP /Topic-based clustering/SentenceTransformer-Smoothing/data/embeddings
metrics_all={}

def smoothing_kmeans():
    runs = 5
    degree = 2
    metrics_sm_kmeans={}
    for method in [
        None,  # no filter
        'sgc',
        's2gc',
        'dgc',
        'appnp'
    ]:
        print(f'{method}:')
        for dataset in ['abstract']: #'classic4', 'dbpedia', 'ohsumed', 'R8', '20ng','ag_news', 'bbc', 'classic3 ]:
            print(f'  {dataset}:')

            data = loadmat(f'/Users/prachiteechouhan/Documents/CPSC DSP /Topic-based clustering/SentenceTransformer-Smoothing/data/embeddings/{dataset}-embedding.mat')
            features = data['x']
            labels = data['y'].reshape(-1)

            n_clusters = len(np.unique(labels))

            t0 = time()
            if method:
                features = graph_filtering(features, method=method)
            pre_time = time() - t0
        
            metrics_sm = {'silhouettescore':[],'db_index':[],'ch_index':[],'ami': [], 'ari': []}

            for run in range(runs):
                t0 = time()
                Z = KMeans(n_clusters, n_init=10).fit_predict(features)
                metrics_sm['silhouettescore'].append(silhouette_score(features, Z, metric='euclidean'))
                metrics_sm['db_index'].append(davies_bouldin_score(features, Z))
                metrics_sm['ch_index'].append(calinski_harabasz_score(features, Z))
                
                metrics_sm['ami'].append(ami(labels, Z))
                metrics_sm['ari'].append(ari(labels, Z))
                #metrics_sm['time'].append(pre_time + time() - t0)
            results = {
                'mean': {k: (np.mean(v)).round(4) for k, v in metrics_sm.items()},
                'std': {k: (np.std(v)).round(4) for k, v in metrics_sm.items()}
            }
            means = results['mean']
            stds = results['std']
            means_std=[means,stds]
            #print(f'\tmeans: ', means['silhouettescore'],means['db_index'],means['ch_index'],means['ami'], means['ari'], sep='&')
            #print(f'\tstds: ', stds['silhouettescore'],stds['db_index'],stds['ch_index'],stds['ami'], stds['ari'], sep='&')
        metrics_sm_kmeans[method]=means_std
    #metrics_all['smoothing_kmeans']=metrics_sm_kmeans
    return metrics_sm_kmeans

In [95]:
def kmeans_clustering():
    for dataset in ['abstract']:
        data = loadmat(f'/Users/prachiteechouhan/Documents/CPSC DSP /Topic-based clustering/SentenceTransformer-Smoothing/data/embeddings/{dataset}-embedding.mat')
        features = data['x']
        labels = data['y'].reshape(-1)

        n_clusters = len(np.unique(labels))

        clusterer = KMeans(n_clusters=n_clusters, random_state=0)
        y_pred_kmeans = clusterer.fit_predict(features)
        # centers = clusterer.cluster_centers_
        silhouettescore = silhouette_score (features, y_pred_kmeans, metric='euclidean')
        
        ari = adjusted_rand_score(labels, y_pred_kmeans)
        mi = ami(labels, y_pred_kmeans) 
        db_index = davies_bouldin_score(features, y_pred_kmeans)
        ch_index = calinski_harabasz_score(features, y_pred_kmeans) 

        # Print the metric scores
        #print(f"K-Means for dataset: {dataset}")
        #print(f"Silhouette Score: {silhouettescore:.2f}")
        #print(f"Davies-Bouldin Index: {db_index:.2f}")
        #print(f"Calinski-Harabasz Index: {ch_index:.2f}")
        #print(f"Mutual Information (MI): {mi:.2f}")
        #print(f"Adjusted Rand Index: {ari:.2f}")
       
    metrics_kmeans={'silhouettescore':silhouettescore,'db_index':db_index,'ch_index':ch_index,'ami':mi,'ari':ari}
    return metrics_kmeans

In [96]:
def agglomerativeClustering():
    for dataset in ['abstract']:
        data = loadmat(f'/Users/prachiteechouhan/Documents/CPSC DSP /Topic-based clustering/SentenceTransformer-Smoothing/data/embeddings/{dataset}-embedding.mat')
        features = data['x']
        labels = data['y'].reshape(-1)

        n_clusters = len(np.unique(labels))

        hierarchical_cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        y_pred_agglo = hierarchical_cluster.fit_predict(features)

        silhouette_AgglomerativeClustering = silhouette_score(features, y_pred_agglo)
        db_index_AgglomerativeClustering = davies_bouldin_score(features, y_pred_agglo)
        ch_index_AgglomerativeClustering = calinski_harabasz_score(features, y_pred_agglo)
        ari_AgglomerativeClustering = adjusted_rand_score(labels, y_pred_agglo)
        mi_AgglomerativeClustering = ami(labels, y_pred_agglo)
 
        # Print the metric scores
        #print(f"Agglomerative Clustering for dataset: {dataset}")
        #print(f"Silhouette Score: {silhouette_AgglomerativeClustering:.2f}")
        #print(f"Davies-Bouldin Index: {db_index_AgglomerativeClustering:.2f}")
        #print(f"Calinski-Harabasz Index: {ch_index_AgglomerativeClustering:.2f}")
        #print(f"Mutual Information (MI): {mi_AgglomerativeClustering:.2f}")
        #print(f"Adjusted Rand Index: {ari_AgglomerativeClustering:.2f}")
        
    metrics_kmeans={'silhouettescore':silhouette_AgglomerativeClustering,'db_index':db_index_AgglomerativeClustering,'ch_index':ch_index_AgglomerativeClustering,'ami':mi_AgglomerativeClustering,'ari':ari_AgglomerativeClustering}
    return metrics_kmeans

Smoothing - Graph Filter

In [24]:
eval_sm= smoothing_kmeans()

None:
  abstract:
	means: &0.081&3.3244&1023.2456&0.8342&0.8652
	stds: &1e-04&0.0008&0.0021&0.0005&0.0009
sgc:
  abstract:
	means: &0.1502&2.2744&2225.3038&0.8349&0.8686
	stds: &0.0&0.0&0.0&0.0&0.0
s2gc:
  abstract:
	means: &0.1138&2.7131&1554.3073&0.8402&0.8716
	stds: &0.0&0.0&0.0004&0.0002&0.0001
dgc:
  abstract:
	means: &0.1901&1.9422&3056.694&0.8296&0.8631
	stds: &0.0&0.0&0.0&0.0&0.0
appnp:
  abstract:
	means: &0.1046&2.8592&1397.8295&0.8404&0.8706
	stds: &0.0&0.0&0.0&0.0&0.0


In [45]:
print(eval_sm)

{None: [{'silhouettescore': 0.081, 'db_index': 3.3244, 'ch_index': 1023.2456, 'ami': 0.8342, 'ari': 0.8652}, {'silhouettescore': 1e-04, 'db_index': 0.0008, 'ch_index': 0.0021, 'ami': 0.0005, 'ari': 0.0009}], 'sgc': [{'silhouettescore': 0.1502, 'db_index': 2.2744, 'ch_index': 2225.3038, 'ami': 0.8349, 'ari': 0.8686}, {'silhouettescore': 0.0, 'db_index': 0.0, 'ch_index': 0.0, 'ami': 0.0, 'ari': 0.0}], 's2gc': [{'silhouettescore': 0.1138, 'db_index': 2.7131, 'ch_index': 1554.3073, 'ami': 0.8402, 'ari': 0.8716}, {'silhouettescore': 0.0, 'db_index': 0.0, 'ch_index': 0.0004, 'ami': 0.0002, 'ari': 0.0001}], 'dgc': [{'silhouettescore': 0.1901, 'db_index': 1.9422, 'ch_index': 3056.694, 'ami': 0.8296, 'ari': 0.8631}, {'silhouettescore': 0.0, 'db_index': 0.0, 'ch_index': 0.0, 'ami': 0.0, 'ari': 0.0}], 'appnp': [{'silhouettescore': 0.1046, 'db_index': 2.8592, 'ch_index': 1397.8295, 'ami': 0.8404, 'ari': 0.8706}, {'silhouettescore': 0.0, 'db_index': 0.0, 'ch_index': 0.0, 'ami': 0.0, 'ari': 0.0}]}


In [46]:

eval_s={}
for group, list_of_dicts in eval_sm.items():
    print(f'{group}: list_of_dicts[0].values()')
    eval_s[group]=list(list_of_dicts[0].values())

None: list_of_dicts[0].values()
sgc: list_of_dicts[0].values()
s2gc: list_of_dicts[0].values()
dgc: list_of_dicts[0].values()
appnp: list_of_dicts[0].values()


In [81]:
new_eval_s_df = pd.DataFrame(eval_s).transpose()
new_eval_s_df.columns=['Silhouette score', 'Davies-Bouldin Index','Calinski-Harabasz Index','Adjusted Mutual Information','Adjusted Rand Index']
new_eval_s_df

Unnamed: 0,Silhouette score,Davies-Bouldin Index,Calinski-Harabasz Index,Adjusted Mutual Information,Adjusted Rand Index
,0.081,3.3244,1023.2456,0.8342,0.8652
sgc,0.1502,2.2744,2225.3038,0.8349,0.8686
s2gc,0.1138,2.7131,1554.3073,0.8402,0.8716
dgc,0.1901,1.9422,3056.694,0.8296,0.8631
appnp,0.1046,2.8592,1397.8295,0.8404,0.8706


K_means

In [88]:
eval_kmeans= kmeans_clustering()

K-Means for dataset: abstract
Silhouette Score: 0.08
Davies-Bouldin Index: 3.32
Calinski-Harabasz Index: 1023.24
Mutual Information (MI): 0.83
Adjusted Rand Index: 0.87


In [89]:

new_eval_kmeans_df= pd.DataFrame(eval_kmeans, index=['K-means Clustering'])
new_eval_kmeans_df.columns=['Silhouette score', 'Davies-Bouldin Index','Calinski-Harabasz Index','Adjusted Mutual Information','Adjusted Rand Index']
new_eval_kmeans_df

Unnamed: 0,Silhouette score,Davies-Bouldin Index,Calinski-Harabasz Index,Adjusted Mutual Information,Adjusted Rand Index
K-means Clustering,0.081064,3.323358,1023.242821,0.834896,0.8664


Agglomerative Clustering

In [79]:
eval_agglo=agglomerativeClustering()


Agglomerative Clustering for dataset: abstract
Silhouette Score: 0.07
Davies-Bouldin Index: 3.56
Calinski-Harabasz Index: 896.46
Mutual Information (MI): 0.71
Adjusted Rand Index: 0.76


In [80]:
import pandas as pd

new_eval_agglo_df= pd.DataFrame(eval_agglo, index=['Agglomerative Clustering'])
new_eval_agglo_df.columns=['Silhouette score', 'Davies-Bouldin Index','Calinski-Harabasz Index','Adjusted Mutual Information','Adjusted Rand Index']
new_eval_agglo_df

Unnamed: 0,Silhouette score,Davies-Bouldin Index,Calinski-Harabasz Index,Adjusted Mutual Information,Adjusted Rand Index
Agglomerative Clustering,0.071277,3.560338,896.458517,0.710783,0.758978


In [92]:
evaluation_metric = pd.concat([new_eval_s_df,new_eval_agglo_df, new_eval_kmeans_df])


In [93]:
evaluation_metric

Unnamed: 0,Silhouette score,Davies-Bouldin Index,Calinski-Harabasz Index,Adjusted Mutual Information,Adjusted Rand Index
,0.081,3.3244,1023.2456,0.8342,0.8652
sgc,0.1502,2.2744,2225.3038,0.8349,0.8686
s2gc,0.1138,2.7131,1554.3073,0.8402,0.8716
dgc,0.1901,1.9422,3056.694,0.8296,0.8631
appnp,0.1046,2.8592,1397.8295,0.8404,0.8706
Agglomerative Clustering,0.071277,3.560338,896.458517,0.710783,0.758978
K-means Clustering,0.081064,3.323358,1023.242821,0.834896,0.8664
