## Imports

In [None]:
#directory
import os

In [None]:
#data
import numpy as np
import pandas as pd

In [None]:
#plotting
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
#Clustering
from sklearn import metrics
from sklearn.metrics import pairwise_distances

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import estimate_bandwidth

## Load Files

In [None]:
# Input location: a "Data" folder directly under the current working directory.
DATA_FILE = "Data_File.csv"
DATA_DIR = f"{os.getcwd()}/Data/"

In [None]:
# Load the CSV; its first line is used as the column header.
df = pd.read_csv(f"{DATA_DIR}{DATA_FILE}", header=0)

In [None]:
# All column labels of the loaded frame as a NumPy array (sliced positionally below).
columns = np.asarray(df.columns)

In [None]:
# Keep the columns from position 5 onwards, dropping the very last one.
needed_columns = columns[5:-1]

In [None]:
# Restrict the frame to the selected columns; this is the matrix that gets clustered.
data = df.loc[:, needed_columns]

In [None]:
# Axis labels for the heatmaps: column names as-is, row index values as strings.
columns = data.columns.to_numpy()
rows = data.index.to_numpy(dtype=str)

In [None]:
### display data
# Heatmap of the selected data (x: column names, y: row index labels).
raw_heatmap = go.Heatmap(z=data.to_numpy(), x=columns, y=rows)
fig = go.Figure(data=raw_heatmap)

# Fixed, very large canvas. update_layout returns the figure, so leaving it as
# the last expression of the cell also renders it in the notebook.
fig.update_layout(autosize=False, width=4880, height=2400)

## K-Means clustering

In [None]:
# Fresh accumulators for the K-Means sweep: one score and one cluster count per run.
silhouette_scores, x_itr = [], []

In [None]:
# Fit K-Means for every cluster count from 2 to 59 and record the silhouette
# score (Euclidean distance) of each resulting partition.
for itr in range(2, 60):
    kmeans_model = KMeans(n_clusters=itr, random_state=1)
    labels = kmeans_model.fit_predict(data)
    score = metrics.silhouette_score(data, labels, metric='euclidean')
    x_itr.append(itr)
    silhouette_scores.append(score)

In [None]:
# Line plot of the silhouette score against the number of clusters tried.
fig = go.Figure(data=go.Scatter(x=x_itr, y=silhouette_scores))
fig.update_layout(
    title="Silhouette Scores for K-Means",
    yaxis_title="Score",
    xaxis_title="Number of clusters",
    width=1000, height=500,
    autosize=False,
)

# Create the output folder up front so the export does not fail when it is missing.
os.makedirs("images", exist_ok=True)
fig.write_image("images/Silhouette_scores_K_Means.png")
fig.show()

In [None]:
## AffinityPropagation clustering

In [None]:
# Fresh accumulators for the AffinityPropagation sweep: score, damping value
# and cluster count per run.
silhouette_scores, x_itr = [], []
n_clusters = []

In [None]:
# Sweep the damping factor from 0.5 to 0.99 in steps of 0.01, recording for
# each fit the number of clusters found and the silhouette score.
for itr in range(50):
    damping = 0.5 + (0.5*itr/50)
    AP_model = AffinityPropagation(damping=damping, random_state=1).fit(data)
    labels = AP_model.labels_

    # Cluster labels are 0-based, so the cluster count is the number of
    # distinct labels, not the largest label (which is one too small).
    # A label of -1 marks samples of a run that produced no exemplars and
    # must not be counted as a cluster.
    n_found = np.unique(labels[labels >= 0]).size
    n_clusters.append(n_found)

    # The silhouette score is only defined for 2 .. n_samples-1 clusters;
    # record 0 for degenerate partitions instead of letting the sweep crash.
    if 2 <= n_found < len(labels):
        score = metrics.silhouette_score(data, labels, metric='euclidean')
    else:
        score = 0
    silhouette_scores.append(score)
    x_itr.append(damping)

In [None]:
# Two stacked panels, both plotted against the damping values:
# silhouette score on top, number of clusters below.
fig = make_subplots(rows=2, cols=1)
fig.add_trace(go.Scatter(x=x_itr, y=silhouette_scores), row=1, col=1)
fig.add_trace(go.Scatter(x=x_itr, y=n_clusters), row=2, col=1)

fig.update_yaxes(title_text="Score", row=1, col=1)
fig.update_yaxes(title_text="#clusters", row=2, col=1)
fig.update_xaxes(title_text="Damping", row=2, col=1)

fig.update_layout(
    title="Silhouette Scores for AffinityPropagation",
    width=1000, height=750,
    autosize=False,
)

# Create the output folder up front so the export does not fail when it is missing.
os.makedirs("images", exist_ok=True)
fig.write_image("images/Silhouette_scores_AffinityPropagation.png")
fig.show()


## Mean Shift

In [None]:
# Fresh accumulators for the MeanShift sweep, one entry per run.
silhouette_scores, x_itr = [], []
quantile_itr, bandwidth_itr = [], []
n_clusters = []

In [None]:
# Sweep the quantile used for bandwidth estimation from 0.05 to 0.93 in steps
# of 0.02. For each run record the estimated bandwidth, the number of clusters
# MeanShift finds with it, and the silhouette score of the partition.
for itr in range(45):
    quantile = 0.05 + (1*itr/50)
    bandwidth = estimate_bandwidth(data, quantile=quantile)
    MS_model = MeanShift(bandwidth=bandwidth).fit(data)
    labels = MS_model.labels_

    # Cluster labels are 0-based, so the cluster count is the number of
    # distinct labels, not the largest label (which is one too small).
    n_found = np.unique(labels).size
    n_clusters.append(n_found)

    # The silhouette score is only defined for 2 .. n_samples-1 clusters;
    # record 0 when everything collapses into one cluster (or every sample
    # ends up in its own cluster).
    if 2 <= n_found < len(labels):
        score = metrics.silhouette_score(data, labels, metric='euclidean')
    else:
        score = 0

    silhouette_scores.append(score)
    quantile_itr.append(quantile)
    bandwidth_itr.append(bandwidth)
    x_itr.append(itr)

In [None]:
# Three stacked panels, all plotted against the bandwidth-estimation quantile:
# silhouette score, estimated bandwidth, and number of clusters.
fig = make_subplots(rows=3, cols=1)
fig.add_trace(go.Scatter(x=quantile_itr, y=silhouette_scores), row=1, col=1)
fig.add_trace(go.Scatter(x=quantile_itr, y=bandwidth_itr), row=2, col=1)
fig.add_trace(go.Scatter(x=quantile_itr, y=n_clusters), row=3, col=1)

fig.update_yaxes(title_text="Score", row=1, col=1)
fig.update_yaxes(title_text="Bandwidth estimate", row=2, col=1)
fig.update_yaxes(title_text="#clusters", row=3, col=1)
fig.update_xaxes(title_text="Quantile for bandwidth estimation", row=3, col=1)

fig.update_layout(
    title="Silhouette Scores for MeanShift",
    width=1000, height=1000,
    autosize=False,
)

# Create the output folder up front so the export does not fail when it is missing.
os.makedirs("images", exist_ok=True)
fig.write_image("images/Silhouette_scores_MeanShift.png")
fig.show()

## Spectral clustering

In [None]:
# Fresh accumulators for the spectral clustering sweep: one score and one
# cluster count per run.
silhouette_scores, x_itr = [], []

In [None]:
# Fit spectral clustering for every cluster count from 2 to 59 and record the
# silhouette score (Euclidean distance) of each resulting partition.
for itr in range(2, 60):
    Spectral_model = SpectralClustering(n_clusters=itr, random_state=1)
    labels = Spectral_model.fit_predict(data)
    score = metrics.silhouette_score(data, labels, metric='euclidean')
    x_itr.append(itr)
    silhouette_scores.append(score)

In [None]:
# Line plot of the silhouette score against the number of clusters tried.
fig = go.Figure(data=go.Scatter(x=x_itr, y=silhouette_scores))
fig.update_layout(
    title="Silhouette Scores for Spectral Clustering",
    yaxis_title="Score",
    xaxis_title="Number of clusters",
    width=1000, height=500,
    autosize=False,
)

# Create the output folder up front so the export does not fail when it is missing.
os.makedirs("images", exist_ok=True)
fig.write_image("images/Silhouette_scores_Spectral.png")
fig.show()

## Final test

In [None]:
# Final model: spectral clustering into two clusters with a fixed seed.
Spectral_model = SpectralClustering(n_clusters=2, random_state=1)
labels = Spectral_model.fit_predict(data)

In [None]:
# Reorder the rows by cluster label so members of the same cluster sit next to
# each other in the heatmap. The permutation is computed once and reused for
# both the data and the y-axis labels so the two always stay aligned.
order = np.argsort(labels)
fit_data = data.to_numpy()[order]
fig = go.Figure(data=go.Heatmap(z=fit_data, x=columns, y=rows[order]))
fig.update_layout(width=4880, height=2400, autosize=False)

# Create the output folder up front so the export does not fail when it is missing.
os.makedirs("images", exist_ok=True)
fig.write_image("images/Silhouette_spectral_clustering.png")

In [None]:
# Display the current cluster label array.
# NOTE(review): these are the final two-cluster spectral labels only if the cells were run in order.
labels

In [None]:
# Display the row labels reordered by cluster label (same ordering expression as the clustered heatmap's y-axis).
rows[np.argsort(labels)]