## Installation

In [None]:
# Install Scikit Learn if not installed
!pip install -U scikit-learn

In [None]:
# Install pyClustering
!pip install pyclustering

In [None]:
# Learning Vector Quantisation
!pip install sklvq

In [None]:
#Self organising maps
!pip install sklearn-som

In [None]:
#Adaptive resonance theory based clustering
!pip install art-python

In [None]:
#for Spectral clustering
!pip install pyamg

## Load Files

In [None]:
#directory
import os

In [None]:
#data
import numpy as np
import pandas as pd

In [None]:
#plotting
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Input location: a "Data" folder inside the current working directory.
# DATA_DIR keeps its trailing slash so it can be joined to DATA_FILE by plain concatenation.
DATA_FILE = "Data_File.csv"
DATA_DIR = f"{os.getcwd()}/Data/"

In [None]:
# Load the input table; the first line of the CSV provides the column names.
df = pd.read_csv(f"{DATA_DIR}{DATA_FILE}", header=0)

In [None]:
# Everything from the fifth column onward is used as the feature table.
# NOTE(review): presumably the first four columns hold identifiers/metadata; confirm against the CSV.
needed_columns = df.columns.to_numpy()[4:]
data = df.loc[:, needed_columns]

# Row labels as strings (used as axis labels when plotting) and the number of samples.
rows = data.index.to_numpy().astype(str)
nSamples = len(rows)

# Names of the feature columns that were kept.
columns = data.columns.to_numpy()

In [None]:
# Display the names of the feature columns selected from the loaded table.
columns

In [None]:
# Heatmap of the feature table: feature columns along x, sample labels along y.
fig = go.Figure(data=go.Heatmap(x=columns, y=rows, z=data.to_numpy()))

# Fixed-size, tall canvas. This call is the last expression of the cell,
# so the notebook renders whatever it returns.
fig.update_layout(autosize=False, width=600, height=2400)

## Scikit-Learn

### Algorithms used:
#### [1] K-Means [2] Affinity Propagation [3] Mean Shift [4] Spectral Clustering [5] OPTICS

In [None]:
#Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import OPTICS, cluster_optics_dbscan

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import pairwise_distances

In [None]:
# Work on a copy so the loaded table itself is not modified when result columns are added.
df_temp = df.copy()

# Half of the largest pairwise distance between any two samples.
max_radius = pairwise_distances(data.to_numpy()).max() / 2

In [None]:
# Feature matrix passed to the clustering models: every value shifted by a constant 1e-4.
# NOTE(review): the reason for the offset is not stated in the notebook; confirm it is still needed.
sample = np.add(data.to_numpy(), 1e-4)

#### KMeans Instance

In [None]:
# K-Means sweep over the number of clusters. For every setting one CSV is written that
# holds the cluster label of each sample plus three internal validity scores
# (silhouette, Davies-Bouldin, Calinski-Harabasz).

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs("Output/Scikit_Learn/KMeans", exist_ok=True)

features = data.to_numpy()             # loop-invariant; scores are computed on the unshifted table
score_padding = [""] * (nSamples - 1)  # each score is stored once, in the first row only

for nClusters in range(2, 16):

    # Fit K-Means with random centroid initialisation. The fixed seed makes the
    # sweep reproducible across kernel restarts.
    model_instance = KMeans(n_clusters=nClusters, init="random", random_state=5).fit(sample)

    # Cluster label of every sample
    model_clusters = model_instance.labels_
    df_temp["Cluster"] = model_clusters

    # scikit-learn only defines these scores for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise; outside that range 0 is written as a placeholder.
    n_labels = np.unique(model_clusters).size
    if 2 <= n_labels <= nSamples - 1:
        scores = {
            "Silhouette Score": silhouette_score(features, model_clusters, metric="euclidean"),
            "Davis-Bouldin": davies_bouldin_score(features, model_clusters),
            "Calinski-Harbasz": calinski_harabasz_score(features, model_clusters),
        }
    else:
        scores = dict.fromkeys(("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz"), 0)

    # Column names (including their spelling) are kept as-is because they end up in the CSV header.
    for score_name, score_value in scores.items():
        df_temp[score_name] = [score_value] + score_padding

    df_temp.to_csv(f"Output/Scikit_Learn/KMeans/KMeans_nClusters_{nClusters}.csv")

#### Affinity Propagation Instance

In [None]:
# Affinity Propagation sweep over the damping factor. For every setting one CSV is written
# that holds the cluster label of each sample plus three internal validity scores
# (silhouette, Davies-Bouldin, Calinski-Harabasz).

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs("Output/Scikit_Learn/Affinity_Propagation", exist_ok=True)

features = data.to_numpy()             # loop-invariant; scores are computed on the unshifted table
score_padding = [""] * (nSamples - 1)  # each score is stored once, in the first row only

for damping in np.linspace(0.5, 0.99, 20):

    # Fit Affinity Propagation with the current damping factor (seeded for reproducibility)
    model_instance = AffinityPropagation(damping=damping, random_state=5).fit(sample)

    # Cluster label of every sample
    model_clusters = model_instance.labels_
    df_temp["Cluster"] = model_clusters

    # scikit-learn only defines these scores for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise; outside that range 0 is written as a placeholder.
    n_labels = np.unique(model_clusters).size
    if 2 <= n_labels <= nSamples - 1:
        scores = {
            "Silhouette Score": silhouette_score(features, model_clusters, metric="euclidean"),
            "Davis-Bouldin": davies_bouldin_score(features, model_clusters),
            "Calinski-Harbasz": calinski_harabasz_score(features, model_clusters),
        }
    else:
        scores = dict.fromkeys(("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz"), 0)

    # Column names (including their spelling) are kept as-is because they end up in the CSV header.
    for score_name, score_value in scores.items():
        df_temp[score_name] = [score_value] + score_padding

    # The file name encodes the damping factor as a truncated percentage.
    df_temp.to_csv(f"Output/Scikit_Learn/Affinity_Propagation/AffinityPropagation_damping_{int(damping*100)}.csv")

#### Mean Shift Instance

In [None]:
# Mean Shift sweep over the kernel bandwidth, from 0.01 up to max_radius. For every setting
# one CSV is written that holds the cluster label of each sample plus three internal
# validity scores (silhouette, Davies-Bouldin, Calinski-Harabasz).

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs("Output/Scikit_Learn/MeanShift", exist_ok=True)

features = data.to_numpy()             # loop-invariant; scores are computed on the unshifted table
score_padding = [""] * (nSamples - 1)  # each score is stored once, in the first row only

for bandwidth in np.linspace(0.01, max_radius, 20):

    # Fit Mean Shift with the current bandwidth
    model_instance = MeanShift(bandwidth=bandwidth).fit(sample)

    # Cluster label of every sample
    model_clusters = model_instance.labels_
    df_temp["Cluster"] = model_clusters

    # scikit-learn only defines these scores for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise. Both ends matter here: a very small bandwidth can leave
    # every sample in its own cluster, a very large one can merge everything into one.
    # Outside the valid range 0 is written as a placeholder.
    n_labels = np.unique(model_clusters).size
    if 2 <= n_labels <= nSamples - 1:
        scores = {
            "Silhouette Score": silhouette_score(features, model_clusters, metric="euclidean"),
            "Davis-Bouldin": davies_bouldin_score(features, model_clusters),
            "Calinski-Harbasz": calinski_harabasz_score(features, model_clusters),
        }
    else:
        scores = dict.fromkeys(("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz"), 0)

    # Column names (including their spelling) are kept as-is because they end up in the CSV header.
    for score_name, score_value in scores.items():
        df_temp[score_name] = [score_value] + score_padding

    # The file name encodes the bandwidth multiplied by 100 and truncated to an integer.
    df_temp.to_csv(f"Output/Scikit_Learn/MeanShift/MeanShift_bandwidth_{int(bandwidth*100)}.csv")

#### Spectral Clustering Instance

In [None]:
# Spectral Clustering sweep over the number of clusters. For every setting one CSV is
# written that holds the cluster label of each sample plus three internal validity scores
# (silhouette, Davies-Bouldin, Calinski-Harabasz).

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs("Output/Scikit_Learn/Spectral", exist_ok=True)

features = data.to_numpy()             # loop-invariant; scores are computed on the unshifted table
score_padding = [""] * (nSamples - 1)  # each score is stored once, in the first row only

for nClusters in range(2, 20):

    # Fit spectral clustering on a nearest-neighbours affinity graph; labels are
    # assigned with the QR-based method. The seed fixes the random start vector of
    # the ARPACK eigen solver so reruns are reproducible.
    model_instance = SpectralClustering(n_clusters=nClusters,
                                        eigen_solver="arpack",
                                        affinity="nearest_neighbors",
                                        assign_labels="cluster_qr",
                                        random_state=5).fit(sample)

    # Cluster label of every sample
    model_clusters = model_instance.labels_
    df_temp["Cluster"] = model_clusters

    # scikit-learn only defines these scores for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise; outside that range 0 is written as a placeholder.
    n_labels = np.unique(model_clusters).size
    if 2 <= n_labels <= nSamples - 1:
        scores = {
            "Silhouette Score": silhouette_score(features, model_clusters, metric="euclidean"),
            "Davis-Bouldin": davies_bouldin_score(features, model_clusters),
            "Calinski-Harbasz": calinski_harabasz_score(features, model_clusters),
        }
    else:
        scores = dict.fromkeys(("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz"), 0)

    # Column names (including their spelling) are kept as-is because they end up in the CSV header.
    for score_name, score_value in scores.items():
        df_temp[score_name] = [score_value] + score_padding

    df_temp.to_csv(f"Output/Scikit_Learn/Spectral/Spectral_nClusters_{nClusters}.csv")

#### OPTICS Instance

In [None]:
# OPTICS sweep over min_samples. For every setting one CSV is written that holds the
# cluster label of each sample plus three internal validity scores
# (silhouette, Davies-Bouldin, Calinski-Harabasz).

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs("Output/Scikit_Learn/OPTICS", exist_ok=True)

features = data.to_numpy()             # loop-invariant; scores are computed on the unshifted table
score_padding = [""] * (nSamples - 1)  # each score is stored once, in the first row only

for min_samples in range(2, 7):

    # Fit OPTICS; the swept value is the min_samples parameter, not a cluster count.
    model_instance = OPTICS(min_samples=min_samples).fit(sample)

    # Cluster label of every sample
    model_clusters = model_instance.labels_
    df_temp["Cluster"] = model_clusters

    # NOTE(review): OPTICS labels noise samples -1. The scores below treat that label as
    # one more cluster; decide whether noise should be excluded before scoring.

    # scikit-learn only defines these scores for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise; outside that range 0 is written as a placeholder.
    n_labels = np.unique(model_clusters).size
    if 2 <= n_labels <= nSamples - 1:
        scores = {
            "Silhouette Score": silhouette_score(features, model_clusters, metric="euclidean"),
            "Davis-Bouldin": davies_bouldin_score(features, model_clusters),
            "Calinski-Harbasz": calinski_harabasz_score(features, model_clusters),
        }
    else:
        scores = dict.fromkeys(("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz"), 0)

    # Column names (including their spelling) are kept as-is because they end up in the CSV header.
    for score_name, score_value in scores.items():
        df_temp[score_name] = [score_value] + score_padding

    # The "nClusters" token in the file name is kept for compatibility; the number is min_samples.
    df_temp.to_csv(f"Output/Scikit_Learn/OPTICS/OPTICS_nClusters_{min_samples}.csv")