## Installation

In [None]:
# Install scikit-learn, or upgrade it to the latest release if it is already installed
!pip install -U scikit-learn

In [None]:
# Install pyClustering
!pip install pyclustering

In [None]:
# Learning Vector Quantisation
!pip install sklvq

In [None]:
#Self organising maps
!pip install sklearn-som

In [None]:
#Adaptive resonance theory based clustering
!pip install art-python

In [None]:
!pip install pygad

## Load Files

In [None]:
#directory
import os

In [None]:
#data
import numpy as np
import pandas as pd

In [None]:
#plotting
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Input location: file "Data_File.csv" inside a "Data" folder under the
# current working directory.
DATA_FILE = "Data_File.csv"
DATA_DIR = f"{os.getcwd()}/Data/"

In [None]:
# Load the raw table; the first line of the CSV provides the column names.
df = pd.read_csv(f"{DATA_DIR}{DATA_FILE}", header=0)

In [None]:
# Keep only the feature columns: everything from the fifth column onwards.
needed_columns = df.columns.to_numpy()[4:]
data = df[needed_columns]

# Axis labels and sample count reused by the plotting and clustering cells.
columns = data.columns.to_numpy()
rows = data.index.to_numpy().astype(str)
nSamples = rows.shape[0]

In [None]:
# Names of the selected feature columns (displayed as the cell output).
columns

In [None]:
# Heatmap of the raw feature values: features along x, samples along y.
raw_heatmap = go.Heatmap(z=data.to_numpy(), x=columns, y=rows)
fig = go.Figure(data=raw_heatmap)

# Fixed-size canvas (autosize disabled).
fig.update_layout(autosize=False, width=600, height=2400)

## pyclustering

### Algorithms used:
#### [1] CLARANS, [2] CURE, [3] Expectation Maximisation Algorithm, [4] Genetic Algorithm, [5] Fuzzy C-Means, [6] CLIQUE, [7] BANG, [8] BIRCH, [9] Self Organising Maps, [10] DBSCAN, [11] OPTICS, [12] G-Means

In [None]:
# [1] BANG 
from pyclustering.cluster.bang import bang, bang_visualizer

# [2] BIRCH
from pyclustering.cluster.birch import birch

# [3] CLARANS 
from pyclustering.cluster.clarans import clarans

# [4] CLIQUE
from pyclustering.cluster.clique import clique, clique_visualizer

# [5] CURE
from pyclustering.cluster.cure import cure

# [6] DBSCAN
from pyclustering.cluster.dbscan import dbscan

# [7] Expectation Maximisation Algorithm 
from pyclustering.cluster.ema import ema, ema_visualizer

# [8] Fuzzy C-Means
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.fcm import fcm

# [9] Genetic Algorithm
from pyclustering.cluster.ga import genetic_algorithm, ga_observer

# [10] G-Means
from pyclustering.cluster.gmeans import gmeans

# [10] Self Organising Maps
from pyclustering.cluster.somsc import somsc

# [11] OPTICS 
from pyclustering.cluster.optics import optics, ordering_analyser, ordering_visualizer

from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

from sklearn.metrics import pairwise_distances

import pygad

In [None]:
# Min-max scale every feature column to the [0, 1] range.
col_min = data.min()
col_range = data.max() - col_min

# A constant column has a zero range; dividing by it would fill the column
# with NaN, and those NaNs would flow into `sample` and every clustering call.
# Dividing such columns by 1 instead maps them to all zeros.
col_range = col_range.replace(0, 1)

df_min_max_scaled = (data - col_min) / col_range

# Show the scaled table as the cell output.
df_min_max_scaled

In [None]:
# Heatmap of the min-max scaled feature values: features along x, samples along y.
scaled_heatmap = go.Heatmap(z=df_min_max_scaled.to_numpy(), x=columns, y=rows)
fig = go.Figure(data=scaled_heatmap)

# Fixed-size canvas (autosize disabled).
fig.update_layout(autosize=False, width=600, height=2400)

In [None]:
# Normalised feature matrix shared by the statements below.
scaled_matrix = df_min_max_scaled.to_numpy()

# Clustering input: every value shifted by 1e-6 and converted to nested lists.
sample = (scaled_matrix + 1e-6).tolist()

# Working copy of the full input table; the clustering cells add result columns to it.
df_temp = df.copy()

# Half of the largest pairwise distance between normalised samples; used as the
# upper bound of the radius/diameter genes in the GA search spaces.
max_radius = pairwise_distances(scaled_matrix).max() / 2

In [None]:
# Validity index used by the genetic-algorithm fitness functions and to pick the
# output file name. The values checked in this notebook are "Silhouette",
# "Calinski" and "Davis"; uncomment exactly one assignment.
#fitness_measure = "Silhouette"

#fitness_measure = "Calinski"

fitness_measure = "Davis"

#### BANG Instance

In [None]:
# Sweep the BANG `levels` parameter and save the labels plus three validity
# indices for every setting.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/BANG"
# DataFrame.to_csv does not create missing folders, so create the target up
# front instead of failing after the first clustering run has finished.
os.makedirs(output_dir, exist_ok=True)

# The validity indices are computed on the un-normalised feature values.
# NOTE(review): clustering itself uses the normalised `sample` - confirm the mix is intended.
score_matrix = data.to_numpy()
# Each score is written to the first row of its column only; other rows stay blank.
blank_tail = [""] * (nSamples - 1)
score_columns = ("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz")

for levels in range(2, 30):

    # Create and run the BANG process
    model_instance = bang(sample, levels)
    model_instance.process()

    # Obtain clustering results
    model_clusters = model_instance.get_clusters()
    model_noise = model_instance.get_noise()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        num_cluster = itr + 1
        clustering_result[np.asarray(clusters, dtype=int)] = num_cluster

    df_temp["Cluster"] = clustering_result

    if np.unique(clustering_result).size < 2:
        # The indices need at least two distinct labels; store 0 as a placeholder.
        scores = (0, 0, 0)
    else:
        scores = (silhouette_score(score_matrix, clustering_result, metric='euclidean'),
                  davies_bouldin_score(score_matrix, clustering_result),
                  calinski_harabasz_score(score_matrix, clustering_result))

    for score_column, score in zip(score_columns, scores):
        df_temp[score_column] = [score] + blank_tail

    df_temp.to_csv(f"{output_dir}/BANG_levels_{levels}.csv")

#### BIRCH Instance

In [None]:
# Genetic-algorithm settings for the BIRCH parameter search (passed to pygad.GA).

# Run length and population size.
num_generations, sol_per_pop = 100, 10

# Chromosome layout: gene 0 is the number of clusters (integer), gene 1 is the
# BIRCH diameter (float with precision 4), bounded above by `max_radius`.
num_genes = 2
gene_type = [int, [float, 4]]
gene_space = [
    {'low': 2, 'high': 12},
    {'low': 0.01, 'high': max_radius},
]

# Parent selection.
num_parents_mating = 4
keep_parents = 4
parent_selection_type = "sss"

# Crossover and mutation.
crossover_type, crossover_probability = "uniform", 0.2
mutation_type, mutation_probability = "random", 0.2
mutation_percent_genes = 10

In [None]:
def fitness_func_BIRCH(solution,solution_idx):
    """Score one BIRCH parameter set for the genetic algorithm.

    pygad maximises the returned value, so every measure is oriented so that
    larger means better: Silhouette and Calinski-Harabasz are returned as-is,
    Davies-Bouldin (lower is better) is negated.

    Parameters
    ----------
    solution : sequence
        ``solution[0]`` is the number of clusters, ``solution[1]`` the BIRCH
        ``diameter``.
    solution_idx : int
        Position of the solution in the population (unused).

    Returns
    -------
    float
        Fitness of the clustering, or a large negative penalty when fewer than
        two distinct labels are produced.

    Raises
    ------
    ValueError
        If the module-level ``fitness_measure`` is not "Silhouette",
        "Calinski" or "Davis".
    """
    # A partition with a single label cannot be scored. It must rank below
    # every valid solution; returning 0 would beat every negated
    # Davies-Bouldin value and steer the search towards degenerate results.
    degenerate_fitness = -1.0e9

    nClusters = int(solution[0])
    diameter = float(solution[1])

    # Create and run the BIRCH process
    model_instance = birch(sample, nClusters, diameter=diameter)
    model_instance.process()
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    # The labels stay local: the function may be evaluated in parallel, so the
    # shared `df_temp` is deliberately not touched here.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        clustering_result[np.asarray(clusters, dtype=int)] = itr + 1

    if np.unique(clustering_result).size < 2:
        return degenerate_fitness

    # NOTE(review): scores use the un-normalised `data` while clustering uses
    # the normalised `sample` - confirm this is intended.
    features = data.to_numpy()
    if fitness_measure == "Silhouette":
        return silhouette_score(features, clustering_result, metric='euclidean')
    if fitness_measure == "Calinski":
        return calinski_harabasz_score(features, clustering_result)
    if fitness_measure == "Davis":
        return -davies_bouldin_score(features, clustering_result)
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

In [None]:
# Collect the optimiser settings for BIRCH in one mapping, then build the
# pygad instance from it. `parallel_processing=4` asks pygad to evaluate
# fitness with 4 workers (see the pygad docs for the installed version).
birch_ga_settings = dict(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func_BIRCH,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_type=gene_type,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    crossover_probability=crossover_probability,
    mutation_type=mutation_type,
    mutation_probability=mutation_probability,
    mutation_percent_genes=mutation_percent_genes,
    parallel_processing=4,
)
ga_instance_BIRCH = pygad.GA(**birch_ga_settings)

In [None]:
# Run the BIRCH optimiser five times and store the best parameter set found
# after each run.
# NOTE(review): the same GA instance is reused, so later runs may continue from
# the previous population instead of restarting - confirm for the installed pygad.

# File-name suffix for each supported validity index; checked before the
# (expensive) runs so an unsupported measure is reported instead of the
# results silently not being saved.
index_suffix = {"Silhouette": "SilhouetteIndex", "Calinski": "CHIndex", "Davis": "DBIndex"}
if fitness_measure not in index_suffix:
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

data_saver = []
for itr in range(5):
    ga_instance_BIRCH.run()

    best_solution, best_solution_fitness, best_solution_idx = ga_instance_BIRCH.best_solution()
    nClust = best_solution[0]
    diameter = best_solution[1]
    data_saver.append([nClust, diameter, best_solution_fitness])

# Column headers name what the genes really are (gene 1 is the BIRCH diameter)
# and which measure produced the fitness value.
data_saver_columns = ["number of clusters", "diameter", f"Best solution fitness ({fitness_measure})"]
df_data_saver = pd.DataFrame(data_saver, columns=data_saver_columns)

# DataFrame.to_csv does not create missing folders.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/BIRCH"
os.makedirs(output_dir, exist_ok=True)
df_data_saver.to_csv(f"{output_dir}/BIRCH_{index_suffix[fitness_measure]}.csv")

#### CLARANS Instance

In [None]:
# Genetic-algorithm settings for the CLARANS parameter search (passed to pygad.GA).

# Run length and population size.
num_generations, sol_per_pop = 100, 10

# Chromosome layout: gene 0 is the number of clusters, gene 1 the CLARANS
# neighbour count; both are integers.
num_genes = 2
gene_type = [int, int]
gene_space = [
    {'low': 2, 'high': 11},
    {'low': 0, 'high': 5},
]

# Parent selection.
num_parents_mating = 4
keep_parents = 4
parent_selection_type = "sss"

# Crossover and mutation.
crossover_type, crossover_probability = "uniform", 0.2
mutation_type, mutation_probability = "random", 0.2
mutation_percent_genes = 10

In [None]:
def fitness_func_CLARANS(solution,solution_idx):
    """Score one CLARANS parameter set for the genetic algorithm.

    pygad maximises the returned value, so every measure is oriented so that
    larger means better: Silhouette and Calinski-Harabasz are returned as-is,
    Davies-Bouldin (lower is better) is negated.

    Parameters
    ----------
    solution : sequence
        ``solution[0]`` is the number of clusters, ``solution[1]`` the
        neighbour count passed as the last CLARANS argument.
    solution_idx : int
        Position of the solution in the population (unused).

    Returns
    -------
    float
        Fitness of the clustering, or a large negative penalty when fewer than
        two distinct labels are produced.

    Raises
    ------
    ValueError
        If the module-level ``fitness_measure`` is not "Silhouette",
        "Calinski" or "Davis".
    """
    # A partition with a single label cannot be scored. It must rank below
    # every valid solution; returning 0 would beat every negated
    # Davies-Bouldin value and steer the search towards degenerate results.
    degenerate_fitness = -1.0e9

    nClusters = int(solution[0])
    neighbours = int(solution[1])

    # Create and run the CLARANS process (third argument fixed at 10).
    model_instance = clarans(sample, nClusters, 10, neighbours)
    model_instance.process()
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    # The labels stay local: the function may be evaluated in parallel, so the
    # shared `df_temp` is deliberately not touched here.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        clustering_result[np.asarray(clusters, dtype=int)] = itr + 1

    if np.unique(clustering_result).size < 2:
        return degenerate_fitness

    # NOTE(review): scores use the un-normalised `data` while clustering uses
    # the normalised `sample` - confirm this is intended.
    features = data.to_numpy()
    if fitness_measure == "Silhouette":
        return silhouette_score(features, clustering_result, metric='euclidean')
    if fitness_measure == "Calinski":
        return calinski_harabasz_score(features, clustering_result)
    if fitness_measure == "Davis":
        return -davies_bouldin_score(features, clustering_result)
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

In [None]:
# Collect the optimiser settings for CLARANS in one mapping, then build the
# pygad instance from it. `parallel_processing=4` asks pygad to evaluate
# fitness with 4 workers (see the pygad docs for the installed version).
clarans_ga_settings = dict(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func_CLARANS,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_type=gene_type,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    crossover_probability=crossover_probability,
    mutation_type=mutation_type,
    mutation_probability=mutation_probability,
    mutation_percent_genes=mutation_percent_genes,
    parallel_processing=4,
)
ga_instance_CLARANS = pygad.GA(**clarans_ga_settings)

In [None]:
# Run the CLARANS optimiser five times and store the best parameter set found
# after each run.
# NOTE(review): the same GA instance is reused, so later runs may continue from
# the previous population instead of restarting - confirm for the installed pygad.

# File-name suffix for each supported validity index; checked before the
# (expensive) runs so an unsupported measure is reported instead of the
# results silently not being saved.
index_suffix = {"Silhouette": "SilhouetteIndex", "Calinski": "CHIndex", "Davis": "DBIndex"}
if fitness_measure not in index_suffix:
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

data_saver = []
for itr in range(5):
    ga_instance_CLARANS.run()

    best_solution, best_solution_fitness, best_solution_idx = ga_instance_CLARANS.best_solution()
    nClust = best_solution[0]
    nNeighbours = best_solution[1]
    data_saver.append([nClust, nNeighbours, best_solution_fitness])

# The last header names the measure actually optimised instead of always
# claiming it is the CH index.
data_saver_columns = ["number of clusters", "number of neighbours", f"Best solution fitness ({fitness_measure})"]
df_data_saver = pd.DataFrame(data_saver, columns=data_saver_columns)

# DataFrame.to_csv does not create missing folders.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/CLARANS"
os.makedirs(output_dir, exist_ok=True)
df_data_saver.to_csv(f"{output_dir}/CLARANS_{index_suffix[fitness_measure]}.csv")

#### CLIQUE Instance

In [None]:
# Sweep the number of CLIQUE intervals (fixed threshold) and save the labels
# plus three validity indices for every setting.
threshold = 0

output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/CLIQUE"
# DataFrame.to_csv does not create missing folders, so create the target up
# front instead of failing after the first clustering run has finished.
os.makedirs(output_dir, exist_ok=True)

# The validity indices are computed on the un-normalised feature values.
# NOTE(review): clustering itself uses the normalised `sample` - confirm the mix is intended.
score_matrix = data.to_numpy()
# Each score is written to the first row of its column only; other rows stay blank.
blank_tail = [""] * (nSamples - 1)
score_columns = ("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz")

for nIntervals in range(10, 100, 5):
    # Create and run the CLIQUE process
    model_instance = clique(sample, nIntervals, threshold)
    model_instance.process()

    # Obtain clustering results
    model_clusters = model_instance.get_clusters()
    model_noise = model_instance.get_noise()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        num_cluster = itr + 1
        clustering_result[np.asarray(clusters, dtype=int)] = num_cluster

    df_temp["Cluster"] = clustering_result

    if np.unique(clustering_result).size < 2:
        # The indices need at least two distinct labels; store 0 as a placeholder.
        scores = (0, 0, 0)
    else:
        scores = (silhouette_score(score_matrix, clustering_result, metric='euclidean'),
                  davies_bouldin_score(score_matrix, clustering_result),
                  calinski_harabasz_score(score_matrix, clustering_result))

    for score_column, score in zip(score_columns, scores):
        df_temp[score_column] = [score] + blank_tail

    df_temp.to_csv(f"{output_dir}/CLIQUE_nIntervals_{nIntervals}.csv")

In [None]:
# Run CLIQUE once with 10 intervals and the current `threshold`, and keep the
# resulting grid cells for inspection.
model_instance = clique(sample,10,threshold)
model_instance.process()
cells = model_instance.get_cells()

In [None]:
# Display the clusters held by the current `model_instance` as the cell output.
model_instance.get_clusters()

#### CURE Instance

In [None]:
# Genetic-algorithm settings for the CURE parameter search (passed to pygad.GA).

# Run length and population size.
num_generations, sol_per_pop = 100, 10

# Chromosome layout: gene 0 is the number of clusters, gene 1 the number of
# representative points per cluster; both are integers.
num_genes = 2
gene_type = [int, int]
gene_space = [
    {'low': 2, 'high': 11},
    {'low': 1, 'high': 5},
]

# Parent selection.
num_parents_mating = 4
keep_parents = 4
parent_selection_type = "sss"

# Crossover and mutation.
crossover_type, crossover_probability = "uniform", 0.2
mutation_type, mutation_probability = "random", 0.2
mutation_percent_genes = 10

In [None]:
def fitness_func_CURE(solution,solution_idx):
    """Score one CURE parameter set for the genetic algorithm.

    pygad maximises the returned value, so every measure is oriented so that
    larger means better: Silhouette and Calinski-Harabasz are returned as-is,
    Davies-Bouldin (lower is better) is negated.

    Parameters
    ----------
    solution : sequence
        ``solution[0]`` is the number of clusters, ``solution[1]`` the number
        of representative points (``number_represent_points``).
    solution_idx : int
        Position of the solution in the population (unused).

    Returns
    -------
    float
        Fitness of the clustering, or a large negative penalty when fewer than
        two distinct labels are produced.

    Raises
    ------
    ValueError
        If the module-level ``fitness_measure`` is not "Silhouette",
        "Calinski" or "Davis".
    """
    # A partition with a single label cannot be scored. It must rank below
    # every valid solution; returning 0 would beat every negated
    # Davies-Bouldin value and steer the search towards degenerate results.
    degenerate_fitness = -1.0e9

    nClusters = int(solution[0])
    nRepresentatives = int(solution[1])

    # Create and run the CURE process
    model_instance = cure(sample, nClusters, number_represent_points=nRepresentatives)
    model_instance.process()
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    # The labels stay local: the function may be evaluated in parallel, so the
    # shared `df_temp` is deliberately not touched here.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        clustering_result[np.asarray(clusters, dtype=int)] = itr + 1

    if np.unique(clustering_result).size < 2:
        return degenerate_fitness

    # NOTE(review): scores use the un-normalised `data` while clustering uses
    # the normalised `sample` - confirm this is intended.
    features = data.to_numpy()
    if fitness_measure == "Silhouette":
        return silhouette_score(features, clustering_result, metric='euclidean')
    if fitness_measure == "Calinski":
        return calinski_harabasz_score(features, clustering_result)
    if fitness_measure == "Davis":
        return -davies_bouldin_score(features, clustering_result)
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

In [None]:
# Collect the optimiser settings for CURE in one mapping, then build the
# pygad instance from it. `parallel_processing=4` asks pygad to evaluate
# fitness with 4 workers (see the pygad docs for the installed version).
cure_ga_settings = dict(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func_CURE,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_type=gene_type,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    crossover_probability=crossover_probability,
    mutation_type=mutation_type,
    mutation_probability=mutation_probability,
    mutation_percent_genes=mutation_percent_genes,
    parallel_processing=4,
)
ga_instance_CURE = pygad.GA(**cure_ga_settings)

In [None]:
# Run the CURE optimiser five times and store the best parameter set found
# after each run.
# NOTE(review): the same GA instance is reused, so later runs may continue from
# the previous population instead of restarting - confirm for the installed pygad.

# File-name suffix for each supported validity index; checked before the
# (expensive) runs so an unsupported measure is reported instead of the
# results silently not being saved.
index_suffix = {"Silhouette": "SilhouetteIndex", "Calinski": "CHIndex", "Davis": "DBIndex"}
if fitness_measure not in index_suffix:
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

data_saver = []
for itr in range(5):
    ga_instance_CURE.run()

    best_solution, best_solution_fitness, best_solution_idx = ga_instance_CURE.best_solution()
    nClust = best_solution[0]
    nRepresentatives = best_solution[1]
    data_saver.append([nClust, nRepresentatives, best_solution_fitness])

# Column headers name what the genes really are (gene 1 is the number of
# representative points) and which measure produced the fitness value.
data_saver_columns = ["number of clusters", "number of representative points", f"Best solution fitness ({fitness_measure})"]
df_data_saver = pd.DataFrame(data_saver, columns=data_saver_columns)

# DataFrame.to_csv does not create missing folders.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/CURE"
os.makedirs(output_dir, exist_ok=True)
df_data_saver.to_csv(f"{output_dir}/CURE_{index_suffix[fitness_measure]}.csv")

####  DBSCAN Instance

In [None]:
# Genetic-algorithm settings for the DBSCAN parameter search (passed to pygad.GA).

# Run length and population size.
num_generations, sol_per_pop = 100, 10

# Chromosome layout: gene 0 is the neighbour count (integer), gene 1 is the
# DBSCAN radius (float with precision 4), bounded above by `max_radius`.
num_genes = 2
gene_type = [int, [float, 4]]
gene_space = [
    {'low': 2, 'high': 11},
    {'low': 0.01, 'high': max_radius},
]

# Parent selection.
num_parents_mating = 4
keep_parents = 4
parent_selection_type = "sss"

# Crossover and mutation.
crossover_type, crossover_probability = "uniform", 0.2
mutation_type, mutation_probability = "random", 0.2
mutation_percent_genes = 10

In [None]:
def fitness_func_DBSCAN(solution,solution_idx):
    """Score one DBSCAN parameter set for the genetic algorithm.

    pygad maximises the returned value, so every measure is oriented so that
    larger means better: Silhouette and Calinski-Harabasz are returned as-is,
    Davies-Bouldin (lower is better) is negated.

    Parameters
    ----------
    solution : sequence
        ``solution[0]`` is the neighbour count and ``solution[1]`` the radius;
        they are passed to ``dbscan`` as its third and second argument.
    solution_idx : int
        Position of the solution in the population (unused).

    Returns
    -------
    float
        Fitness of the clustering, or a large negative penalty when fewer than
        two distinct labels are produced.

    Raises
    ------
    ValueError
        If the module-level ``fitness_measure`` is not "Silhouette",
        "Calinski" or "Davis".
    """
    # A partition with a single label cannot be scored. It must rank below
    # every valid solution; returning 0 would beat every negated
    # Davies-Bouldin value and steer the search towards degenerate results.
    degenerate_fitness = -1.0e9

    nNeighbours = int(solution[0])
    diameter = float(solution[1])

    # Create and run the DBSCAN process
    model_instance = dbscan(sample, diameter, nNeighbours)
    model_instance.process()
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0
    # and are therefore scored as one extra group.
    # The labels stay local: the function may be evaluated in parallel, so the
    # shared `df_temp` is deliberately not touched here.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        clustering_result[np.asarray(clusters, dtype=int)] = itr + 1

    if np.unique(clustering_result).size < 2:
        return degenerate_fitness

    # NOTE(review): scores use the un-normalised `data` while clustering uses
    # the normalised `sample` - confirm this is intended.
    features = data.to_numpy()
    if fitness_measure == "Silhouette":
        return silhouette_score(features, clustering_result, metric='euclidean')
    if fitness_measure == "Calinski":
        return calinski_harabasz_score(features, clustering_result)
    if fitness_measure == "Davis":
        return -davies_bouldin_score(features, clustering_result)
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

In [None]:
# Collect the optimiser settings for DBSCAN in one mapping, then build the
# pygad instance from it. `parallel_processing=4` asks pygad to evaluate
# fitness with 4 workers (see the pygad docs for the installed version).
dbscan_ga_settings = dict(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func_DBSCAN,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_type=gene_type,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    crossover_probability=crossover_probability,
    mutation_type=mutation_type,
    mutation_probability=mutation_probability,
    mutation_percent_genes=mutation_percent_genes,
    parallel_processing=4,
)
ga_instance_DBSCAN = pygad.GA(**dbscan_ga_settings)

In [None]:
# Run the DBSCAN optimiser five times and store the best parameter set found
# after each run.
# NOTE(review): the same GA instance is reused, so later runs may continue from
# the previous population instead of restarting - confirm for the installed pygad.

# File-name suffix for each supported validity index; checked before the
# (expensive) runs so an unsupported measure is reported instead of the
# results silently not being saved.
index_suffix = {"Silhouette": "SilhouetteIndex", "Calinski": "CHIndex", "Davis": "DBIndex"}
if fitness_measure not in index_suffix:
    raise ValueError(f"Unknown fitness_measure: {fitness_measure!r}")

data_saver = []
for itr in range(5):
    ga_instance_DBSCAN.run()

    best_solution, best_solution_fitness, best_solution_idx = ga_instance_DBSCAN.best_solution()
    # Gene order matches fitness_func_DBSCAN: neighbour count first, radius second.
    nNeighbours = best_solution[0]
    radius = best_solution[1]
    data_saver.append([nNeighbours, radius, best_solution_fitness])

# Column headers name what the genes really are: DBSCAN has no cluster-count
# gene, so the previous "number of clusters"/"number of neighbours" labels were
# shifted by one relative to the stored values.
data_saver_columns = ["number of neighbours", "radius", f"Best solution fitness ({fitness_measure})"]
df_data_saver = pd.DataFrame(data_saver, columns=data_saver_columns)

# DataFrame.to_csv does not create missing folders.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/DBSCAN"
os.makedirs(output_dir, exist_ok=True)
df_data_saver.to_csv(f"{output_dir}/DBSCAN_{index_suffix[fitness_measure]}.csv")
    

#### Expectation Maximisation Algorithm Instance

In [None]:
# Sweep the number of clusters for the Expectation-Maximisation algorithm
# (Gaussian mixture model) and save the labels plus three validity indices
# for every setting.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/EMA"
# DataFrame.to_csv does not create missing folders, so create the target up
# front instead of failing after the first clustering run has finished.
os.makedirs(output_dir, exist_ok=True)

# The validity indices are computed on the un-normalised feature values.
# NOTE(review): clustering itself uses the normalised `sample` - confirm the mix is intended.
score_matrix = data.to_numpy()
# Each score is written to the first row of its column only; other rows stay blank.
blank_tail = [""] * (nSamples - 1)
score_columns = ("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz")

for nClusters in range(2, 15):
    # Create and run the EM process
    model_instance = ema(sample, nClusters)
    model_instance.process()

    # Obtain clustering results
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        num_cluster = itr + 1
        clustering_result[np.asarray(clusters, dtype=int)] = num_cluster

    df_temp["Cluster"] = clustering_result

    if np.unique(clustering_result).size < 2:
        # The indices need at least two distinct labels; store 0 as a placeholder.
        scores = (0, 0, 0)
    else:
        scores = (silhouette_score(score_matrix, clustering_result, metric='euclidean'),
                  davies_bouldin_score(score_matrix, clustering_result),
                  calinski_harabasz_score(score_matrix, clustering_result))

    for score_column, score in zip(score_columns, scores):
        df_temp[score_column] = [score] + blank_tail

    df_temp.to_csv(f"{output_dir}/EMA_nClusters_{nClusters}.csv")

#### Fuzzy C-Means Instance

In [None]:
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

# Sweep the number of clusters for Fuzzy C-Means and save the labels plus
# three validity indices for every setting.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/Fuzzy_C_Means"
# DataFrame.to_csv does not create missing folders, so create the target up
# front instead of failing after the first clustering run has finished.
os.makedirs(output_dir, exist_ok=True)

# The validity indices are computed on the un-normalised feature values.
# NOTE(review): clustering itself uses the normalised `sample` - confirm the mix is intended.
score_matrix = data.to_numpy()
# Each score is written to the first row of its column only; other rows stay blank.
blank_tail = [""] * (nSamples - 1)
score_columns = ("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz")

for nClusters in range(2, 15):
    # Initial centres from k-means++ with the farthest-candidate option.
    initial_centers = kmeans_plusplus_initializer(sample,
                                                  nClusters,
                                                  kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    # Create and run the Fuzzy C-Means process
    model_instance = fcm(sample, initial_centers)
    model_instance.process()

    # Obtain clustering results
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        num_cluster = itr + 1
        clustering_result[np.asarray(clusters, dtype=int)] = num_cluster

    df_temp["Cluster"] = clustering_result

    if np.unique(clustering_result).size < 2:
        # The indices need at least two distinct labels; store 0 as a placeholder.
        scores = (0, 0, 0)
    else:
        scores = (silhouette_score(score_matrix, clustering_result, metric='euclidean'),
                  davies_bouldin_score(score_matrix, clustering_result),
                  calinski_harabasz_score(score_matrix, clustering_result))

    for score_column, score in zip(score_columns, scores):
        df_temp[score_column] = [score] + blank_tail

    df_temp.to_csv(f"{output_dir}/FCM_nClusters_{nClusters}.csv")

#### Genetic Algorithm Instance

In [None]:
# Sweep the number of clusters for pyclustering's genetic-algorithm clustering
# and save the labels plus three validity indices for every setting.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/GA"
# DataFrame.to_csv does not create missing folders, so create the target up
# front instead of failing after the first clustering run has finished.
os.makedirs(output_dir, exist_ok=True)

# The validity indices are computed on the un-normalised feature values.
# NOTE(review): clustering itself uses the normalised `sample` - confirm the mix is intended.
score_matrix = data.to_numpy()
# Each score is written to the first row of its column only; other rows stay blank.
blank_tail = [""] * (nSamples - 1)
score_columns = ("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz")

for nClusters in range(2, 15):
    # Create and run the genetic-algorithm clustering process
    model_instance = genetic_algorithm(data=sample,
                                       count_clusters=nClusters,
                                       chromosome_count=100,
                                       population_count=200,
                                       count_mutation_gens=2)
    model_instance.process()

    # Obtain clustering results
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        num_cluster = itr + 1
        clustering_result[np.asarray(clusters, dtype=int)] = num_cluster

    df_temp["Cluster"] = clustering_result

    if np.unique(clustering_result).size < 2:
        # The indices need at least two distinct labels; store 0 as a placeholder.
        scores = (0, 0, 0)
    else:
        scores = (silhouette_score(score_matrix, clustering_result, metric='euclidean'),
                  davies_bouldin_score(score_matrix, clustering_result),
                  calinski_harabasz_score(score_matrix, clustering_result))

    for score_column, score in zip(score_columns, scores):
        df_temp[score_column] = [score] + blank_tail

    df_temp.to_csv(f"{output_dir}/GA_nClusters_{nClusters}.csv")

#### G-Means Instance

In [None]:
# Sweep the G-Means cluster limit and save the labels plus three validity
# indices for every setting.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/GMeans"
# DataFrame.to_csv does not create missing folders, so create the target up
# front instead of failing after the first clustering run has finished.
os.makedirs(output_dir, exist_ok=True)

# The validity indices are computed on the un-normalised feature values.
# NOTE(review): clustering itself uses the normalised `sample` - confirm the mix is intended.
score_matrix = data.to_numpy()
# Each score is written to the first row of its column only; other rows stay blank.
blank_tail = [""] * (nSamples - 1)
score_columns = ("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz")

# Despite its name, `initClusters` is passed as `k_max`; `k_init` stays at 1.
for initClusters in range(2, 25):
    # Create and run the G-Means process
    model_instance = gmeans(sample, k_init=1, repeat=10, k_max=initClusters)
    model_instance.process()

    # Obtain clustering results
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        num_cluster = itr + 1
        clustering_result[np.asarray(clusters, dtype=int)] = num_cluster

    df_temp["Cluster"] = clustering_result

    if np.unique(clustering_result).size < 2:
        # The indices need at least two distinct labels; store 0 as a placeholder.
        scores = (0, 0, 0)
    else:
        scores = (silhouette_score(score_matrix, clustering_result, metric='euclidean'),
                  davies_bouldin_score(score_matrix, clustering_result),
                  calinski_harabasz_score(score_matrix, clustering_result))

    for score_column, score in zip(score_columns, scores):
        df_temp[score_column] = [score] + blank_tail

    df_temp.to_csv(f"{output_dir}/GMeans_maxCentres_{initClusters}.csv")
    

#### Self Organising Maps Instance

In [None]:
# Sweep the number of clusters for self-organising-map clustering (SOM-SC)
# and save the labels plus three validity indices for every setting.
output_dir = "Output_Reduced_Dimensions_Normalised/Pyclustering/SOFM"
# DataFrame.to_csv does not create missing folders, so create the target up
# front instead of failing after the first clustering run has finished.
os.makedirs(output_dir, exist_ok=True)

# The validity indices are computed on the un-normalised feature values.
# NOTE(review): clustering itself uses the normalised `sample` - confirm the mix is intended.
score_matrix = data.to_numpy()
# Each score is written to the first row of its column only; other rows stay blank.
blank_tail = [""] * (nSamples - 1)
score_columns = ("Silhouette Score", "Davis-Bouldin", "Calinski-Harbasz")

for nClusters in range(2, 15):
    # Create and run the SOM-SC process
    model_instance = somsc(sample, nClusters)
    model_instance.process()

    # Obtain clustering results
    model_clusters = model_instance.get_clusters()

    # Cluster labels start at 1; samples that belong to no cluster keep label 0.
    clustering_result = np.zeros(rows.shape)
    for itr, clusters in enumerate(model_clusters):
        num_cluster = itr + 1
        clustering_result[np.asarray(clusters, dtype=int)] = num_cluster

    df_temp["Cluster"] = clustering_result

    if np.unique(clustering_result).size < 2:
        # The indices need at least two distinct labels; store 0 as a placeholder.
        scores = (0, 0, 0)
    else:
        scores = (silhouette_score(score_matrix, clustering_result, metric='euclidean'),
                  davies_bouldin_score(score_matrix, clustering_result),
                  calinski_harabasz_score(score_matrix, clustering_result))

    for score_column, score in zip(score_columns, scores):
        df_temp[score_column] = [score] + blank_tail

    df_temp.to_csv(f"{output_dir}/SOFM_nClusters_{nClusters}.csv")
    