# Common functions

In [1]:
from sklearn.datasets import fetch_openml

import pandas as pd
import numpy as np
from scipy import sparse

import networkx as nx
import community.community_louvain as community_louvain
from cdlib import algorithms

import matplotlib as mpl
import matplotlib.pyplot as plt

import umap
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from sklearn.metrics import DistanceMetric

import random
import cv2

from sklearn.metrics.cluster import adjusted_rand_score

Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer', 'leidenalg', 'graph_tool', 'infomap', 'bayanpy'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'ASLPAw', 'pyclustering'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'infomap', 'wurlitzer', 'leidenalg'}


In [2]:
def reduce_dimensionality(array, n_components=15):
    model = umap.UMAP(random_state=42, n_components=n_components, metric="cosine")
    reduced_dim_vec = model.fit_transform(array)
    vectors_list = np.array_split(reduced_dim_vec, len(reduced_dim_vec))
    return pd.Series(vectors_list).apply(lambda x: x.squeeze())

In [3]:
def dbscan_clusteriser(vectors:pd.Series, eps=0.25, min_samples=8):
    vectors = np.stack(np.array(vectors))
    model = DBSCAN(eps=eps,min_samples=min_samples)
    model.fit(vectors)
    return pd.Series(model.labels_)

In [4]:
def create_graph(vectors:pd.Series, threshold=None):
    vector_list = vectors.tolist()   #Parse pandas.Series to list
    W_sparse = sparse.csr_matrix(np.asarray(vector_list))  #Create Compress Sparse Row matrix
    cos_sim = cosine_similarity(W_sparse)   #Calculate cosine similarity between all nodes
    if threshold == None:
        threshold = np.quantile(cos_sim.flatten(), [0.92])[0]   #Set up a threshold based on percentile
    print(threshold)
    adj_matrix = (cos_sim > threshold).astype(int)   #Create adjacency matrix based on threshold
    adj_matrix = adj_matrix * cos_sim   #Add weights to adjecency matrix
    g = nx.convert_matrix.from_numpy_array(adj_matrix)   #Create a graph
    print("Number of nodes:", g.number_of_nodes())
    print("Number of edges:", g.number_of_edges())
    return g

In [5]:
def louvain_clusteriser(vectors:pd.Series, threshold=None):
    g = create_graph(vectors, threshold)
    partition = community_louvain.best_partition(g)
    return pd.Series(list(partition.values()))

In [6]:
def eigenvector_clusteriser(vectors: pd.Series, threshold=None):
    g = create_graph(vectors, threshold)
    print("Number of nodes:", nx.number_of_nodes(g))
    print("Number of edges:", nx.number_of_edges(g))
    print("Is directed:", nx.is_directed(g))
    comms = algorithms.eigenvector(g)
    comms = comms.communities
    comms = dict(sorted({item:list_id for list_id, l in enumerate(comms) for item in l}.items()))
    return pd.Series(comms)

In [7]:
def map_to_most_common(df: pd.DataFrame):
    df["cluster"] = df["cluster"].fillna(-1)   # Some algorithms return -1 when they don't find a community for a node.

    # Create a pandas.Series where index is a cluster's label provided by algorithm and values are true labels
    mappings_series = df.groupby("cluster")["class"].apply(lambda x: x.mode().iloc[0]).reset_index()
    mappings = dict(zip(mappings_series["cluster"], mappings_series["class"]))

    df["mapped_cluster"] = df["cluster"].map(mappings)
    return df


# Fetal Experiments

In [8]:
# Loading the images
imgs = np.load('cropped_imgs_fetal_regions.npy')
imgs.shape

(180, 224, 224, 3)

In [9]:
with open("manual_annot_CRL_Sagittal.txt", "r") as file:
    # Read the contents of the file
    file_contents = file.read()
lines = file_contents.split('\n')
index = [int(i.split(' ')[0]) for i in lines]
labels = [' '.join(i.split(' ')[1:]) for i in lines]
manual_labels_df = pd.DataFrame()
manual_labels_df['Image ID'] = index
manual_labels_df['label'] = labels

manual_labels_df['label'] = manual_labels_df['label'].apply(lambda x: x.strip())

manual_labels_df = manual_labels_df[manual_labels_df['label'] != 'remove']

num_labels = {'head': 0, 'neck': 1, 'body': 2}

manual_labels_df['encoded_label'] = manual_labels_df['label'].map(num_labels)

y = np.array(manual_labels_df['encoded_label'])

In [10]:
min_angle = -30
max_angle = 30

def rotate_image(image, angle):
    # Get image center coordinates
    center = (image.shape[1] / 2, image.shape[0] / 2)

    # Perform the rotation
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated_image = cv2.warpAffine(image, rotation_matrix, (image.shape[1], image.shape[0]))

    return rotated_image
rotated_images = []
for img in imgs:
    angle = random.uniform(min_angle, max_angle)
    rotated_img = rotate_image(img, angle)
    rotated_images.append(rotated_img)

rotated_images = np.array(rotated_images)

x = np.concatenate((imgs, rotated_images), axis=0)

y =  np.concatenate((y, y), axis=0)

In [11]:
# Gray all the images
x_uint8 = (x * 255).astype(np.uint8)
x_gray = np.zeros((x.shape[0], x.shape[1], x.shape[2]), dtype=np.uint8)
for i in range(len(x)):
    x_gray[i] = cv2.cvtColor(x_uint8[i], cv2.COLOR_RGB2GRAY)

In [12]:
y = y.astype(np.uint8)

In [13]:
print(x_gray.shape)
print(y.shape)

(360, 224, 224)
(360,)


In [14]:
reshaped_array = np.reshape(x_gray, (360, 224, 224))

# Resize each image to 32x32
resized_images = np.empty((360, 32, 32))
for i in range(360):
    resized_images[i] = cv2.resize(reshaped_array[i], (32, 32), interpolation=cv2.INTER_AREA)
x_gray = resized_images 

In [15]:
# Linearize the image (360, 224*224)
x_lin = x_gray.reshape(x_gray.shape[0], x_gray.shape[1]*x_gray.shape[2])
print(x_lin.shape)

(360, 1024)


In [16]:
X_reduced = reduce_dimensionality(x_lin)

## DBSCAN

In [17]:
clustered_dbscan_data = dbscan_clusteriser(X_reduced, eps=0.24, min_samples=1)

In [18]:
dbscan_df = pd.DataFrame(y, columns = ["class"])
dbscan_df["cluster"] = clustered_dbscan_data

In [19]:
dbscan_df = map_to_most_common(dbscan_df)

In [20]:
print("Accuracy:")
accuracy_score(dbscan_df["class"], dbscan_df["mapped_cluster"])

Accuracy:


0.8472222222222222

In [21]:
# Using the rand-score 
print("Adjusted rand-score: ", adjusted_rand_score(list(dbscan_df["class"]), list(dbscan_df["mapped_cluster"])))

Adjusted rand-score:  0.5967901148780411


In [22]:
print("NMI:")
normalized_mutual_info_score(dbscan_df["class"], dbscan_df["mapped_cluster"])

NMI:


0.5149372181196473

In [23]:
print("Number of unique classes:")
dbscan_df["cluster"].nunique()

Number of unique classes:


146

## Louvain method

In [24]:
clustered_louvain_data = louvain_clusteriser(X_reduced, threshold=0.9991)

0.9991
Number of nodes: 360
Number of edges: 5104


In [25]:
louvain_df = pd.DataFrame(y, columns=["class"])
louvain_df["cluster"] = clustered_louvain_data

In [26]:
louvain_df = map_to_most_common(louvain_df)

In [27]:
print("Accuracy:")
accuracy_score(louvain_df["class"], louvain_df["mapped_cluster"])

Accuracy:


0.5638888888888889

In [28]:
# Using the rand-score 
print("Adjusted rand-score: ", adjusted_rand_score(list(louvain_df["class"]), list(louvain_df["mapped_cluster"])))

Adjusted rand-score:  0.12110124086298543


In [29]:
print("NMI:")
normalized_mutual_info_score(louvain_df["class"], louvain_df["mapped_cluster"])

NMI:


0.10160599982016862

In [30]:
print("Number of unique classes:")
louvain_df["cluster"].nunique()

Number of unique classes:


9

In [31]:
louvain_df

Unnamed: 0,class,cluster,mapped_cluster
0,1,0,1
1,1,0,1
2,0,1,0
3,0,1,0
4,2,3,2
...,...,...,...
355,2,2,2
356,0,4,0
357,0,4,0
358,2,8,2


## Newman's Leading Eigenvector algorithm

In [32]:
clustered_eigenvector_data = eigenvector_clusteriser(X_reduced, threshold=0.9991)

0.9991
Number of nodes: 360
Number of edges: 5104
Number of nodes: 360
Number of edges: 5104
Is directed: False


In [33]:
eigenvector_df = pd.DataFrame(y, columns=["class"])
eigenvector_df["cluster"] = clustered_eigenvector_data

In [34]:
eigenvector_df = map_to_most_common(eigenvector_df)

In [35]:
print("Accuracy:")
accuracy_score(eigenvector_df["class"], eigenvector_df["mapped_cluster"])

Accuracy:


0.5777777777777777

In [36]:
# Using the rand-score 
print("Adjusted rand-score: ", adjusted_rand_score(list(eigenvector_df["class"]), list(eigenvector_df["mapped_cluster"])))

Adjusted rand-score:  0.13873114686279606


In [37]:
print("NMI:")
normalized_mutual_info_score(eigenvector_df["class"], eigenvector_df["mapped_cluster"])

NMI:


0.10674900913932957

In [38]:
print("Number of unique classes:")
eigenvector_df["cluster"].nunique()

Number of unique classes:


10

### Tweaking the hyper-parameters

In [39]:
def louvain(t_val):
    acc, nmi, clus = 0, 0, 0
    clustered_louvain_data = louvain_clusteriser(X_reduced, threshold=t_val)
    louvain_df = pd.DataFrame(y, columns=["class"])
    louvain_df["cluster"] = clustered_louvain_data
    louvain_df = map_to_most_common(louvain_df)
    acc = accuracy_score(louvain_df["class"], louvain_df["mapped_cluster"])
    nmi = normalized_mutual_info_score(louvain_df["class"], louvain_df["mapped_cluster"])
    clus = louvain_df["cluster"].nunique()

    return acc, nmi, clus

In [40]:
threshold_values = np.linspace(0.5, 1, 1000)
acc, nmi, clus = [], [], []
for i in threshold_values:
    a, n, c = louvain(i)
    acc.append(round(a*100, 2))
    nmi.append(round(n, 2))
    clus.append(c)
    

0.5
Number of nodes: 360
Number of edges: 64980
0.5005005005005005
Number of nodes: 360
Number of edges: 64980
0.501001001001001
Number of nodes: 360
Number of edges: 64980
0.5015015015015015
Number of nodes: 360
Number of edges: 64980
0.502002002002002
Number of nodes: 360
Number of edges: 64980
0.5025025025025025
Number of nodes: 360
Number of edges: 64980
0.503003003003003
Number of nodes: 360
Number of edges: 64980
0.5035035035035035
Number of nodes: 360
Number of edges: 64980
0.504004004004004
Number of nodes: 360
Number of edges: 64980
0.5045045045045045
Number of nodes: 360
Number of edges: 64980
0.505005005005005
Number of nodes: 360
Number of edges: 64980
0.5055055055055055
Number of nodes: 360
Number of edges: 64980
0.506006006006006
Number of nodes: 360
Number of edges: 64980
0.5065065065065065
Number of nodes: 360
Number of edges: 64980
0.507007007007007
Number of nodes: 360
Number of edges: 64980
0.5075075075075075
Number of nodes: 360
Number of edges: 64980
0.508008008008

In [41]:
print("Accuracies: ", acc)
print("NMI: ", nmi)
print("Clusters: ", clus)

Accuracies:  [50.83, 50.56, 50.56, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.56, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83, 50.83,

In [42]:
def newman(t_val):
    acc, nmi, clus = 0, 0, 0
    clustered_eigenvector_data = eigenvector_clusteriser(X_reduced, threshold=t_val)
    eigenvector_df = pd.DataFrame(y, columns=["class"])
    eigenvector_df["cluster"] = clustered_eigenvector_data
    eigenvector_df = map_to_most_common(eigenvector_df)
    acc = accuracy_score(eigenvector_df["class"], eigenvector_df["mapped_cluster"])
    nmi = normalized_mutual_info_score(eigenvector_df["class"], eigenvector_df["mapped_cluster"])
    clus = eigenvector_df["cluster"].nunique()

    return acc, nmi, clus