In [None]:
import pandas as pd
import numpy as np
import datetime

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering

In [None]:
# Load the raw dataset from "data.csv" (path is relative to the working directory).
pre_process_df=pd.read_csv("data.csv")

In [None]:
#SCALED
# Standardize every column except the first one and keep the column names.
# The frame loaded above is `pre_process_df`; the previous code read an
# undefined name `df`, which fails on a fresh kernel (Restart & Run All).
# NOTE(review): the first column is skipped, presumably because it is an
# identifier rather than a feature - confirm. If a preprocessing step between
# loading and scaling was intended, apply it to `pre_process_df` before this cell.
scaler = StandardScaler()
feature_df = pre_process_df.iloc[:, 1:]
scaled_df = pd.DataFrame(
    data=scaler.fit_transform(feature_df),
    columns=feature_df.columns,
)

In [None]:
def kmeans_clustering(df, n_clusters):
    """Cluster ``df`` with K-Means and score the resulting partition.

    Args:
        df: Feature matrix passed straight to scikit-learn.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(labels, sil_score, db_index)``: the predicted cluster label
        per row, the silhouette score and the Davies-Bouldin index of that
        labelling. The first two positions are unchanged from the earlier
        two-element return value.
    """
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans_model.fit_predict(df)
    sil_score = metrics.silhouette_score(df, labels)
    # The Davies-Bouldin index is returned as a third element, like every other
    # clustering helper in this notebook; the model-selection cell reads index [2].
    db_index = metrics.davies_bouldin_score(df, labels)
    return (labels, sil_score, db_index)

def kmedoids_clustering(df, n_clusters):
    """Fit K-Medoids on ``df`` and report cluster-quality metrics.

    Args:
        df: Feature matrix passed straight to the estimator.
        n_clusters: Number of medoids / clusters to fit.

    Returns:
        Tuple ``(labels, sil_score, db_index)``: predicted label per row,
        silhouette score and Davies-Bouldin index of that labelling.
    """
    # random_state is pinned so repeated calls with the same k agree.
    assignments = KMedoids(n_clusters=n_clusters, random_state=0).fit_predict(df)
    silhouette = metrics.silhouette_score(df, assignments)
    davies_bouldin = metrics.davies_bouldin_score(df, assignments)
    return (assignments, silhouette, davies_bouldin)

def agglomerative_clustering(df, n_clusters):
    """Run agglomerative (hierarchical) clustering on ``df`` and score it.

    Args:
        df: Feature matrix passed straight to the estimator.
        n_clusters: Number of clusters at which merging stops.

    Returns:
        Tuple ``(labels, sil_score, db_index)``: predicted label per row,
        silhouette score and Davies-Bouldin index of that labelling.
    """
    assignments = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(df)
    silhouette = metrics.silhouette_score(df, assignments)
    davies_bouldin = metrics.davies_bouldin_score(df, assignments)
    return (assignments, silhouette, davies_bouldin)

def birch_clustering(df, n_clusters):
    """Run BIRCH clustering on ``df`` and score the resulting labelling.

    Args:
        df: Feature matrix passed straight to the estimator.
        n_clusters: Number of clusters requested from BIRCH.

    Returns:
        Tuple ``(labels, sil_score, db_index)``: predicted label per row,
        silhouette score and Davies-Bouldin index of that labelling.
    """
    assignments = Birch(n_clusters=n_clusters).fit_predict(df)
    silhouette = metrics.silhouette_score(df, assignments)
    davies_bouldin = metrics.davies_bouldin_score(df, assignments)
    return (assignments, silhouette, davies_bouldin)

def Gaussian_Modelling(df,n_clusters):
    """Fit a Gaussian mixture on ``df`` and score its hard assignments.

    Args:
        df: Feature matrix passed straight to the estimator.
        n_clusters: Number of mixture components, used as the cluster count.

    Returns:
        Tuple ``(labels, sil_score, db_index)``: predicted component per row,
        silhouette score and Davies-Bouldin index of that labelling.
    """
    # random_state is pinned so repeated calls with the same component count agree.
    mixture = GaussianMixture(n_components=n_clusters, random_state=0)
    assignments = mixture.fit_predict(df)
    silhouette = metrics.silhouette_score(df, assignments)
    davies_bouldin = metrics.davies_bouldin_score(df, assignments)
    return (assignments, silhouette, davies_bouldin)

def dbscan_clustering(df, r,ms):
    """Run DBSCAN on ``df`` and score the resulting labelling.

    Args:
        df: Feature matrix passed straight to the estimator.
        r: Neighbourhood radius, forwarded as DBSCAN's ``eps``.
        ms: Forwarded as DBSCAN's ``min_samples``.

    Returns:
        Tuple ``(labels, sil_score, db_index)``: predicted label per row,
        silhouette score and Davies-Bouldin index of that labelling.

    NOTE(review): the labels are scored exactly as DBSCAN returns them. The
    scikit-learn scorers are expected to raise ``ValueError`` when the
    labelling has a single distinct value (e.g. everything marked as noise);
    callers sweeping ``r``/``ms`` should be prepared for that - confirm
    against the installed scikit-learn version.
    """
    assignments = DBSCAN(eps=r, min_samples=ms).fit_predict(df)
    silhouette = metrics.silhouette_score(df, assignments)
    davies_bouldin = metrics.davies_bouldin_score(df, assignments)
    return (assignments, silhouette, davies_bouldin)

def spectral_clustering(df,n_clusters):
    """Run spectral clustering (RBF affinity) on ``df`` and score the result.

    Args:
        df: Feature matrix passed straight to the estimator.
        n_clusters: Number of clusters to produce.

    Returns:
        Tuple ``(labels, sil_score, db_index)``: predicted label per row,
        silhouette score and Davies-Bouldin index of that labelling.
    """
    # random_state is pinned like the other seeded helpers so that two calls
    # with the same n_clusters give the same labels.
    spectral_model_rbf = SpectralClustering(
        n_clusters=n_clusters, affinity='rbf', random_state=0
    )
    # The predictions used to be stored as `labels_rbf` while the scoring and
    # return statements read an undefined `labels`, raising NameError.
    labels = spectral_model_rbf.fit_predict(df)
    sil_score = metrics.silhouette_score(df, labels)
    db_index = metrics.davies_bouldin_score(df, labels)
    return (labels, sil_score, db_index)

## K-means Clustering

The K-means clustering algorithm computes centroids and repeats until the optimal centroids are found. The number of clusters is assumed to be known in advance. It is also known as a flat clustering algorithm. The number of clusters the method looks for in the data is denoted by the letter ‘K’ in K-means.

1. First, we need to provide the number of clusters, K, that need to be generated by this algorithm.
2. Next, choose K data points at random and use each one as the initial centroid of its own cluster.
3. The cluster centroids will now be computed.
4. Iterate the steps below until the centroids are stable, i.e. the assignment of data points to clusters no longer changes.

4.1. The sum of squared distances between data points and centroids would be calculated first.
4.2. At this point, we need to allocate each data point to the cluster whose centroid is closest to it.
4.3. Finally, compute the centroids for the clusters by averaging all of the cluster’s data points.

In [None]:
# Cluster only on the feature columns: the label columns this notebook writes
# into scaled_df must never be fed back into a model as inputs.
feature_cols = [c for c in scaled_df.columns
                if not str(c).endswith(("cluster_no_db", "cluster_no_sil"))]
features = scaled_df[feature_cols]

# Fit once per k and keep the labels, so the labels stored below are exactly
# the ones that were scored (no second, redundant fit).
k_values = list(range(2, 12))
kmeans_runs = [kmeans_clustering(features, k) for k in k_values]
kmeans_labels = [run[0] for run in kmeans_runs]
sil_scores = [run[1] for run in kmeans_runs]
# The Davies-Bouldin index is computed from the returned labels directly, so
# this cell only relies on positions [0] and [1] of the helper's result.
db_index = [metrics.davies_bouldin_score(features, labels) for labels in kmeans_labels]

print ("Max Sil score:",max(sil_scores))
print ("Min DB index:",min(db_index))
scaled_df['cluster_no_db']=kmeans_labels[db_index.index(min(db_index))]
scaled_df['cluster_no_sil']=kmeans_labels[sil_scores.index(max(sil_scores))]
print ("DB Clusters:\n",scaled_df['cluster_no_db'].value_counts())
print("Sil Clusters:\n",scaled_df['cluster_no_sil'].value_counts())

### K Medoids Clustering

The steps followed by the K-Medoids algorithm for clustering are as follows:

1. Randomly choose ‘k’ points from the input data (‘k’ is the number of clusters to be formed). The correctness of the choice of k’s value can be assessed using methods such as silhouette method.

2. Each data point gets assigned to the cluster to which its nearest medoid belongs.

3. For each data point of cluster i, its distance from all other data points is computed and added. The point of ith cluster for which the computed sum of distances from other points is minimal is assigned as the medoid for that cluster.

4. Steps (2) and (3) are repeated until convergence is reached i.e. the medoids stop moving.

In [None]:
# Cluster only on the feature columns: the label columns this notebook writes
# into scaled_df must never be fed back into a model as inputs.
feature_cols = [c for c in scaled_df.columns
                if not str(c).endswith(("cluster_no_db", "cluster_no_sil"))]
features = scaled_df[feature_cols]

# Fit once per k and keep the labels, so the labels stored below are exactly
# the ones that were scored (no second, redundant fit).
k_values = list(range(2, 12))
kmedoids_runs = [kmedoids_clustering(features, k) for k in k_values]
sil_scores = [run[1] for run in kmedoids_runs]
db_index = [run[2] for run in kmedoids_runs]

print ("Max Sil score:",max(sil_scores))
print ("Min DB index:",min(db_index))
scaled_df['cluster_no_db']=kmedoids_runs[db_index.index(min(db_index))][0]
scaled_df['cluster_no_sil']=kmedoids_runs[sil_scores.index(max(sil_scores))][0]
print ("DB Clusters:\n",scaled_df['cluster_no_db'].value_counts())
print("Sil Clusters:\n",scaled_df['cluster_no_sil'].value_counts())

### Agglomerative_clustering

1. Initially, all the data-points are a cluster of its own.
2. Take two nearest clusters and join them to form one single cluster.
3. Repeat step 2 until you obtain the desired number of clusters.

There are some methods which are used to calculate the similarity between two clusters:

1. Distance between two closest points in two clusters.
2. Distance between two farthest points in two clusters.
3. The average distance between all points in the two clusters.
4. Distance between centroids of two clusters.


In [None]:
# Cluster only on the feature columns: the label columns this notebook writes
# into scaled_df must never be fed back into a model as inputs.
feature_cols = [c for c in scaled_df.columns
                if not str(c).endswith(("cluster_no_db", "cluster_no_sil"))]
features = scaled_df[feature_cols]

# Fit once per k and keep the labels, so the labels stored below are exactly
# the ones that were scored (no second, redundant fit).
k_values = list(range(2, 12))
agglomerative_runs = [agglomerative_clustering(features, k) for k in k_values]
sil_scores = [run[1] for run in agglomerative_runs]
db_index = [run[2] for run in agglomerative_runs]

print ("Max Sil score:",max(sil_scores))
print ("Min DB index:",min(db_index))
scaled_df['cluster_no_db']=agglomerative_runs[db_index.index(min(db_index))][0]
scaled_df['cluster_no_sil']=agglomerative_runs[sil_scores.index(max(sil_scores))][0]
print ("DB Clusters:\n",scaled_df['cluster_no_db'].value_counts())
print("Sil Clusters:\n",scaled_df['cluster_no_sil'].value_counts())

### Birch_clustering

Balanced Iterative Reducing and Clustering using Hierarchies (BIRCH) is a clustering algorithm that can cluster large datasets by first generating a small and compact summary of the large dataset that retains as much information as possible. This smaller summary is then clustered instead of clustering the larger dataset.

The summary is stored in a clustering feature (CF) tree, and the limit on how large a sub-cluster in a leaf may grow is called the threshold. Parameters of the BIRCH algorithm:

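1. threshold : the maximum radius a sub-cluster in a leaf node of the CF tree may have after absorbing a new sample; if merging the sample into the closest sub-cluster would exceed this radius, a new sub-cluster is started instead.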
2. branching_factor : This parameter specifies the maximum number of CF sub-clusters in each node (internal node).
3. n_clusters : The number of clusters to be returned after the entire BIRCH algorithm is complete i.e., number of clusters after the final clustering step. If set to None, the final clustering step is not performed and intermediate clusters are returned.

In [None]:
# Cluster only on the feature columns: the label columns this notebook writes
# into scaled_df must never be fed back into a model as inputs.
feature_cols = [c for c in scaled_df.columns
                if not str(c).endswith(("cluster_no_db", "cluster_no_sil"))]
features = scaled_df[feature_cols]

# Fit once per k and keep the labels, so the labels stored below are exactly
# the ones that were scored (no second, redundant fit).
k_values = list(range(2, 12))
birch_runs = [birch_clustering(features, k) for k in k_values]
sil_scores = [run[1] for run in birch_runs]
db_index = [run[2] for run in birch_runs]

print ("Max Sil score:",max(sil_scores))
print ("Min DB index:",min(db_index))
scaled_df['cluster_no_db']=birch_runs[db_index.index(min(db_index))][0]
scaled_df['cluster_no_sil']=birch_runs[sil_scores.index(max(sil_scores))][0]
print ("DB Clusters:\n",scaled_df['cluster_no_db'].value_counts())
print("Sil Clusters:\n",scaled_df['cluster_no_sil'].value_counts())

## Gaussian Mixture Model

Gaussian Mixture Models (GMMs) assume that the data is generated by a certain number of Gaussian distributions, each of which represents a cluster.
For example, suppose we have three Gaussian distributions – GD1, GD2, and GD3 – with means (μ1, μ2, μ3) and variances (σ1², σ2², σ3²) respectively. For a given set of data points, the GMM estimates the probability of each data point belonging to each of these distributions.

Gaussian Mixture Models are probabilistic models and use the soft clustering approach for distributing the points in different clusters

Expectation-Maximization (EM) is a statistical algorithm for finding the right model parameters.
The Expectation-Maximization algorithm has two steps:

E-step: In this step, the available data is used to estimate (guess) the values of the missing variables
M-step: Based on the estimated values generated in the E-step, the complete data is used to update the parameters



In [None]:
# Cluster only on the feature columns: the label columns this notebook writes
# into scaled_df must never be fed back into a model as inputs.
feature_cols = [c for c in scaled_df.columns
                if not str(c).endswith(("cluster_no_db", "cluster_no_sil"))]
features = scaled_df[feature_cols]

# Fit once per component count and keep the labels, so the labels stored below
# are exactly the ones that were scored (no second, redundant fit).
k_values = list(range(2, 12))
gmm_runs = [Gaussian_Modelling(features, k) for k in k_values]
sil_scores = [run[1] for run in gmm_runs]
db_index = [run[2] for run in gmm_runs]

print ("Max Sil score:",max(sil_scores))
print ("Min DB index:",min(db_index))
scaled_df['cluster_no_db']=gmm_runs[db_index.index(min(db_index))][0]
scaled_df['cluster_no_sil']=gmm_runs[sil_scores.index(max(sil_scores))][0]
print ("DB Clusters:\n",scaled_df['cluster_no_db'].value_counts())
print("Sil Clusters:\n",scaled_df['cluster_no_sil'].value_counts())

## DBSCAN Clustering

DBSCAN stands for Density-Based Spatial Clustering of Applications with Noise.

DBSCAN is a density-based clustering algorithm that works on the assumption that clusters are dense regions in space separated by regions of lower density. 

It groups ‘densely grouped’ data points into a single cluster. It can identify clusters in large spatial datasets by looking at the local density of the data points. The most exciting feature of DBSCAN clustering is that it is robust to outliers. It also does not require the number of clusters to be told beforehand, unlike K-Means, where we have to specify the number of centroids.

DBSCAN requires only two parameters: epsilon and minPoints. Epsilon is the radius of the circle to be created around each data point to check the density and minPoints is the minimum number of data points required inside that circle for that data point to be classified as a Core point.

DBSCAN creates a circle of epsilon radius around every data point and classifies each point as a Core point, a Border point, or Noise. A data point is a Core point if the circle around it contains at least ‘minPoints’ points. If the circle contains fewer than minPoints points but the point lies within epsilon of a Core point, it is classified as a Border point; a point that is neither a Core point nor a Border point is treated as Noise.



In [None]:
# Cluster only on the feature columns of scaled_df (the frame every other
# section uses; `data` was never defined in this notebook). Label columns
# written by this notebook must never be fed back into a model as inputs.
feature_cols = [c for c in scaled_df.columns
                if not str(c).endswith(("cluster_no_db", "cluster_no_sil"))]
features = scaled_df[feature_cols]

# Sweep the same (eps, min_samples) grid for both metrics, fitting each
# combination once and remembering its parameters and labels.
dbscan_runs = []
last_error = None
for eps in np.arange(0.1, 0.5, 0.01):
    for min_samples in range(2, 6):
        try:
            labels, sil, db = dbscan_clustering(features, eps, min_samples)
        except ValueError as err:
            # The scores are undefined when DBSCAN yields a single distinct
            # label (e.g. every point is noise); skip such combinations.
            last_error = err
            continue
        dbscan_runs.append((eps, min_samples, labels, sil, db))

if not dbscan_runs:
    raise ValueError(
        "DBSCAN produced no scorable clustering on the eps/min_samples grid; "
        "widen the search ranges."
    ) from last_error

sil_scores = [run[3] for run in dbscan_runs]
db_index = [run[4] for run in dbscan_runs]
best_sil_run = dbscan_runs[sil_scores.index(max(sil_scores))]
best_db_run = dbscan_runs[db_index.index(min(db_index))]

print ("Max Sil score:",max(sil_scores),"eps:",best_sil_run[0],"min_samples:",best_sil_run[1])
print ("Min DB index:",min(db_index),"eps:",best_db_run[0],"min_samples:",best_db_run[1])
scaled_df['dbscan_cluster_no_db']=best_db_run[2]
scaled_df['dbscan_cluster_no_sil']=best_sil_run[2]
print ("DB Clusters:\n",scaled_df['dbscan_cluster_no_db'].value_counts())
print("Sil Clusters:\n",scaled_df['dbscan_cluster_no_sil'].value_counts())

## Spectral clustering (graph distance)

Spectral clustering treats each data point as a graph-node and thus transforms the clustering problem into a graph-partitioning problem.

The three major steps involved in spectral clustering are: constructing a similarity graph, projecting data onto a lower-dimensional space, and clustering the data. Given a set of points S in a higher-dimensional space, it can be elaborated as follows:

1. Form a distance matrix
2. Transform the distance matrix into an affinity matrix A
3. Compute the degree matrix D and the Laplacian matrix L = D – A.
4. Find the eigenvalues and eigenvectors of L.
5. Form a matrix from the eigenvectors corresponding to the k smallest eigenvalues computed in the previous step.
6. Normalize the vectors.
7. Cluster the data points in k-dimensional space.

In [None]:
# Cluster only on the feature columns: the label columns this notebook writes
# into scaled_df must never be fed back into a model as inputs.
feature_cols = [c for c in scaled_df.columns
                if not str(c).endswith(("cluster_no_db", "cluster_no_sil"))]
features = scaled_df[feature_cols]

# Fit once per k and keep the labels, so the labels stored below are exactly
# the ones that were scored rather than the result of a separate re-fit.
k_values = list(range(2, 12))
spectral_runs = [spectral_clustering(features, k) for k in k_values]
sil_scores = [run[1] for run in spectral_runs]
db_index = [run[2] for run in spectral_runs]

print ("Max Sil score:",max(sil_scores))
print ("Min DB index:",min(db_index))
scaled_df['cluster_no_db']=spectral_runs[db_index.index(min(db_index))][0]
scaled_df['cluster_no_sil']=spectral_runs[sil_scores.index(max(sil_scores))][0]
print ("DB Clusters:\n",scaled_df['cluster_no_db'].value_counts())
print("Sil Clusters:\n",scaled_df['cluster_no_sil'].value_counts())