# Common functions

In [52]:
from sklearn.datasets import fetch_openml

import pandas as pd
import numpy as np
from scipy import sparse

import networkx as nx
import community.community_louvain as community_louvain
from cdlib import algorithms

import matplotlib as mpl
import matplotlib.pyplot as plt

import umap
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from sklearn.metrics import DistanceMetric

In [53]:
def reduce_dimensionality(array, n_components=15, random_state=42, metric="cosine"):
    """Project samples into a lower-dimensional space with UMAP.

    Parameters
    ----------
    array : array-like
        Data handed unchanged to ``umap.UMAP.fit_transform`` (one row per sample).
    n_components : int, default 15
        Dimensionality of the embedding.
    random_state : int or None, default 42
        Seed forwarded to UMAP.
    metric : str, default "cosine"
        Distance metric forwarded to UMAP.

    Returns
    -------
    pd.Series
        One entry per sample with a default integer index; every entry is a
        1-D ``np.ndarray`` of length ``n_components``.
    """
    reducer = umap.UMAP(random_state=random_state, n_components=n_components, metric=metric)
    embedding = reducer.fit_transform(array)
    # Iterating a 2-D array yields its rows, so each Series entry stays 1-D even
    # when n_components == 1 (squeezing a (1, 1) slice would give a 0-d scalar
    # that np.stack cannot turn back into a 2-D matrix).
    return pd.Series(list(embedding))

In [54]:
def dbscan_clusteriser(vectors:pd.Series, eps=0.25, min_samples=8):
    """Cluster a Series of equally sized vectors with DBSCAN.

    Parameters
    ----------
    vectors : pd.Series
        One 1-D vector per entry; they are stacked into a single matrix.
    eps : float, default 0.25
        Forwarded to ``DBSCAN(eps=...)``.
    min_samples : int, default 8
        Forwarded to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    pd.Series
        The fitted model's ``labels_`` in input order. The result carries a
        fresh default integer index, not the index of ``vectors``.
    """
    feature_matrix = np.stack(np.array(vectors))
    fitted = DBSCAN(eps=eps, min_samples=min_samples).fit(feature_matrix)
    return pd.Series(fitted.labels_)

In [55]:
def create_graph(vectors:pd.Series, threshold=None, quantile=0.92):
    """Build a weighted, undirected cosine-similarity graph over the vectors.

    Every vector becomes a node numbered by its position. Two nodes are joined
    when their cosine similarity is strictly greater than ``threshold``; the
    edge weight is that similarity.

    Parameters
    ----------
    vectors : pd.Series
        One 1-D vector per entry, all of the same length.
    threshold : float or None, default None
        Similarity cut-off. When ``None`` it is set to the ``quantile`` of all
        entries of the similarity matrix.
    quantile : float, default 0.92
        Quantile used to derive the cut-off when ``threshold`` is ``None``.

    Returns
    -------
    networkx.Graph
        The thresholded similarity graph.

    Notes
    -----
    The diagonal of the similarity matrix is not masked. Self-similarity is 1
    for any non-zero vector (up to rounding), so with a threshold below 1 each
    node carries a self-loop and the printed edge count includes those loops.
    The chosen threshold and the node/edge counts are printed.
    """
    # cosine_similarity accepts a CSR (Compressed Sparse Row) matrix and
    # returns a dense (n, n) array by default.
    row_matrix = sparse.csr_matrix(np.asarray(vectors.tolist()))
    cos_sim = cosine_similarity(row_matrix)

    # Identity check: `== None` is unidiomatic and is evaluated element-wise
    # if an array is ever passed.
    if threshold is None:
        threshold = np.quantile(cos_sim, quantile)
    print(threshold)

    # Keep similarities above the cut-off as edge weights, zero everything else.
    adj_matrix = np.where(cos_sim > threshold, cos_sim, 0.0)

    g = nx.from_numpy_array(adj_matrix)
    print("Number of nodes:", g.number_of_nodes())
    print("Number of edges:", g.number_of_edges())
    return g

In [56]:
def louvain_clusteriser(vectors:pd.Series, threshold=None, random_state=42):
    """Detect communities in the similarity graph with the Louvain method.

    Parameters
    ----------
    vectors : pd.Series
        Vectors handed to ``create_graph``.
    threshold : float or None, default None
        Similarity cut-off handed to ``create_graph``.
    random_state : int, RandomState or None, default 42
        Seed forwarded to ``community_louvain.best_partition``. Louvain visits
        nodes in random order, so a fixed seed makes the result repeatable;
        pass ``None`` for an unseeded run.

    Returns
    -------
    pd.Series
        Community id per node, listed in the graph's node order, with a
        default integer index.
    """
    g = create_graph(vectors, threshold)
    partition = community_louvain.best_partition(g, random_state=random_state)
    # Look every node up explicitly instead of relying on the dict's insertion
    # order happening to match the node order.
    return pd.Series([partition[node] for node in g.nodes()])

In [57]:
def eigenvector_clusteriser(vectors: pd.Series, threshold=None):
    """Detect communities with cdlib's leading-eigenvector algorithm.

    Builds the similarity graph with ``create_graph``, prints its node count,
    edge count and directedness, then runs ``algorithms.eigenvector`` on it.

    Returns
    -------
    pd.Series
        Indexed by node id in ascending order; each value is the position of
        the community (in the algorithm's community list) that contains the
        node. A node listed in several communities keeps the last one; a node
        listed in none is absent from the result.
    """
    graph = create_graph(vectors, threshold)
    print("Number of nodes:", nx.number_of_nodes(graph))
    print("Number of edges:", nx.number_of_edges(graph))
    print("Is directed:", nx.is_directed(graph))

    communities = algorithms.eigenvector(graph).communities

    # Invert "community -> members" into "node -> community id".
    node_to_community = {}
    for community_id, members in enumerate(communities):
        for node in members:
            node_to_community[node] = community_id

    return pd.Series(dict(sorted(node_to_community.items())))

In [58]:
def map_to_most_common(df: pd.DataFrame):
    """Relabel every cluster with the most frequent true class among its members.

    Expects the columns ``"class"`` (true label) and ``"cluster"`` (label from
    the clustering algorithm). The frame is modified in place and also returned:

    * missing ``"cluster"`` values are replaced by ``-1``;
    * a ``"mapped_cluster"`` column is added holding, for each row, the modal
      ``"class"`` of that row's cluster (the smallest value wins on ties,
      because ``Series.mode`` returns its result sorted).
    """
    # Rows the algorithm left unassigned are pooled under the label -1.
    df["cluster"] = df["cluster"].fillna(-1)

    # Series indexed by cluster label whose values are the dominant true class.
    dominant_class = df.groupby("cluster")["class"].apply(lambda labels: labels.mode().iloc[0])
    cluster_to_class = dict(dominant_class.items())

    df["mapped_cluster"] = df["cluster"].map(cluster_to_class)
    return df


# MNIST Experiments

In [59]:
# NOTE(review): this import sits outside the notebook's top import cell.
import tensorflow as tf
# load_data() yields two (images, labels) tuples; only the first is kept,
# the second is discarded via `_`.
(X, y), _ = tf.keras.datasets.mnist.load_data()
# Flatten every image into a single row: one row per sample.
X = X.reshape(X.shape[0], -1)
# Store the labels as uint8.
y = y.astype(np.uint8)

In [60]:
# Work on the first 20,000 samples only; the label frames built further down
# slice y with the same bound.
X_train = X[:20000]

In [62]:
# Display the label array.
y

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

In [11]:
# Embed the subset with UMAP, using the helper's default of 15 components.
X_reduced = reduce_dimensionality(X_train)

## DBSCAN

In [12]:
# Cluster the embedding with DBSCAN; eps is set to 0.24 instead of the helper's default 0.25.
clustered_dbscan_data = dbscan_clusteriser(X_reduced, eps=0.24, min_samples=8)

In [13]:
# Pair each sample's true digit with its DBSCAN label. The label slice is sized
# from X_train so it stays in step with the subset that was embedded and clustered.
dbscan_df = pd.DataFrame(y[:len(X_train)], columns=["class"])
dbscan_df["cluster"] = clustered_dbscan_data

In [14]:
# Add "mapped_cluster": each cluster relabelled with the most frequent true class of its members.
dbscan_df = map_to_most_common(dbscan_df)

In [15]:
# Share of samples whose cluster's majority class equals their true class.
# Any -1 labels are grouped and mapped like an ordinary cluster by map_to_most_common.
print("Accuracy:")
accuracy_score(dbscan_df["class"], dbscan_df["mapped_cluster"])

Accuracy:


0.77995

In [16]:
# NOTE(review): NMI is scored on the majority-mapped labels, so clusters that
# share a majority class are merged before scoring; pass dbscan_df["cluster"]
# instead to score the raw partition.
print("NMI:")
normalized_mutual_info_score(dbscan_df["class"], dbscan_df["mapped_cluster"])

NMI:


0.8333803386579535

In [17]:
# Counts distinct values in the "cluster" column (cluster labels, not true
# classes); a -1 label, if present, is counted as one of them.
print("Number of unique clusters:")
dbscan_df["cluster"].nunique()

Number of unique classes:


14

## Louvain method

In [21]:
# Louvain communities on the similarity graph, with an explicit cut-off instead of the quantile-based default.
clustered_louvain_data = louvain_clusteriser(X_reduced, threshold=0.9991)

0.9991
Number of nodes: 20000
Number of edges: 5592847


In [22]:
# Pair each sample's true digit with its Louvain community. The label slice is
# sized from X_train so it stays in step with the subset that was clustered.
louvain_df = pd.DataFrame(y[:len(X_train)], columns=["class"])
louvain_df["cluster"] = clustered_louvain_data

In [23]:
# Add "mapped_cluster": each community relabelled with the most frequent true class of its members.
louvain_df = map_to_most_common(louvain_df)

In [24]:
# Share of samples whose community's majority class equals their true class.
print("Accuracy:")
accuracy_score(louvain_df["class"], louvain_df["mapped_cluster"])

Accuracy:


0.91965

In [25]:
# NOTE(review): NMI is scored on the majority-mapped labels, so communities
# that share a majority class are merged before scoring; pass
# louvain_df["cluster"] instead to score the raw partition.
print("NMI:")
normalized_mutual_info_score(louvain_df["class"], louvain_df["mapped_cluster"])

NMI:


0.8731660669076261

In [26]:
# Counts distinct values in the "cluster" column (communities, not true classes).
print("Number of unique clusters:")
louvain_df["cluster"].nunique()

Number of unique classes:


12

In [42]:
# Display the frame: true class, Louvain community and majority-mapped label per sample.
louvain_df

Unnamed: 0,class,cluster,mapped_cluster
0,5,10,5
1,0,1,0
2,4,4,4
3,1,3,1
4,9,2,9
...,...,...,...
19995,9,4,4
19996,5,10,5
19997,1,6,1
19998,4,4,4


## Newman's Leading Eigenvector algorithm

In [44]:
# Leading-eigenvector communities, using the same vectors and cut-off as the Louvain run above.
clustered_eigenvector_data = eigenvector_clusteriser(X_reduced, threshold=0.9991)

0.9991
Number of nodes: 20000
Number of edges: 5592847
Number of nodes: 20000
Number of edges: 5592847
Is directed: False


In [45]:
# Pair each sample's true digit with its community. The label slice is sized
# from X_train so it stays in step with the subset that was clustered.
eigenvector_df = pd.DataFrame(y[:len(X_train)], columns=["class"])
# Column assignment aligns on the index (node id); a node missing from the
# result becomes NaN here and is later filled with -1 by map_to_most_common.
eigenvector_df["cluster"] = clustered_eigenvector_data

In [46]:
# Add "mapped_cluster": each community relabelled with the most frequent true class of its members.
eigenvector_df = map_to_most_common(eigenvector_df)

In [47]:
# Share of samples whose community's majority class equals their true class.
print("Accuracy:")
accuracy_score(eigenvector_df["class"], eigenvector_df["mapped_cluster"])

Accuracy:


0.953

In [48]:
# NOTE(review): NMI is scored on the majority-mapped labels, so communities
# that share a majority class are merged before scoring; pass
# eigenvector_df["cluster"] instead to score the raw partition.
print("NMI:")
normalized_mutual_info_score(eigenvector_df["class"], eigenvector_df["mapped_cluster"])

NMI:


0.8939314181130542

In [49]:
# Counts distinct values in the "cluster" column (communities, not true classes).
print("Number of unique clusters:")
eigenvector_df["cluster"].nunique()

Number of unique classes:


12

In [50]:
# Same accuracy, restricted to the first 500 rows of the frame.
# NOTE(review): the reason for the 500-row subset is not stated in the notebook.
print("Accuracy:")
accuracy_score(eigenvector_df["class"][:500], eigenvector_df["mapped_cluster"][:500])

Accuracy:


0.946

In [51]:
# Same NMI (on majority-mapped labels), restricted to the first 500 rows of the frame.
print("NMI:")
normalized_mutual_info_score(eigenvector_df["class"][:500], eigenvector_df["mapped_cluster"][:500])

NMI:


0.8991845352528646