# Agglomerative Clustering

In [None]:
import numpy as np
import pandas as pd

## The Data

In [None]:
# Load the clustering data set from the notebook's working directory.
df = pd.read_csv("hclusters.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved index column -- verify
# against the CSV) so it is not used as a clustering feature.
# errors="ignore" keeps this cell safe to re-run: without it, a second run
# raises KeyError because the column has already been dropped in place.
df.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")
df.head()

## Using Scikit-Learn

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

**metric : str or callable, default=“euclidean”** (named `affinity` before scikit-learn 1.2)

Metric used to compute the linkage. Can be “euclidean”, “l1”, “l2”, “manhattan”, “cosine”, or “precomputed”. If linkage is “ward”, only “euclidean” is accepted. If “precomputed”, a distance matrix is needed as input for the fit method. If connectivity is None, linkage is “single” and metric is not “precomputed”, any valid pairwise distance metric can be assigned.


**linkage : {‘ward’, ‘complete’, ‘average’, ‘single’}, default=‘ward’**

Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observations. The algorithm will merge the pairs of clusters that minimize this criterion.

‘ward’ minimizes the variance of the clusters being merged.

‘average’ uses the average of the distances of each observation of the two sets.

‘complete’ or ‘maximum’ linkage uses the maximum distances between all observations of the two sets.

‘single’ uses the minimum of the distances between all observations of the two sets.

In [None]:
from itertools import product

# Exhaustive search over cluster count, distance metric and linkage criterion,
# keeping the configuration with the highest silhouette score.
s = []  # silhouette score of every candidate, in search order
linkage_list = ["average", "single"]
affinity_list = ["euclidean", "manhattan", "cosine"]
n = np.arange(2, 20)  # candidate cluster counts: 2..19

best = -1  # silhouette scores lie in [-1, 1], so the first candidate always replaces this
best_affinity = " "
best_n = 1
best_linkage = " "

# product() iterates in the same order as the nested loops it replaces:
# cluster count outermost, then metric, then linkage.
for n_clusters, affinity, linkage in product(n, affinity_list, linkage_list):
    # The distance is passed as `metric`; the older `affinity` keyword was
    # deprecated in scikit-learn 1.2 and removed in 1.4.
    model = AgglomerativeClustering(n_clusters=n_clusters, metric=affinity, linkage=linkage)
    model = model.fit(df)
    # NOTE(review): no metric is passed here, so every candidate is scored with
    # silhouette_score's default distance regardless of the metric used for
    # clustering -- confirm that a common scoring metric is intended.
    silhouette = silhouette_score(df, model.labels_)
    s.append(silhouette)
    # `>=` (not `>`): on a tie the later candidate in search order wins.
    if silhouette >= best:
        best_n = n_clusters
        best = silhouette
        best_affinity = affinity
        best_linkage = linkage

print(
    f"Best agglomerative clustering model for this data is number of clusters= {best_n} "
    f"with linkage of {best_linkage} and affinity of {best_affinity}. "
    f"Silhouette Score is {best}."
)

In [None]:
# Two-cluster model with Euclidean distance and average linkage.
# `metric` replaces the `affinity` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
model = AgglomerativeClustering(n_clusters=2, metric="euclidean", linkage="average")

In [None]:
# Fit the model on the full frame and keep the per-row cluster assignment.
cluster_labels = model.fit(df).labels_

In [None]:
# Display the cluster label assigned to each row.
cluster_labels

In [None]:
# Silhouette score of the current `cluster_labels`; no metric argument is
# passed, so silhouette_score's default distance is used.
silhouette_score(df, cluster_labels)

In [None]:
# Same two-cluster Euclidean setup, this time with complete (maximum) linkage.
# `metric` replaces the `affinity` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
model = AgglomerativeClustering(n_clusters=2, metric="euclidean", linkage="complete")
cluster_labels = model.fit_predict(df)

In [None]:
# Silhouette score of the current `cluster_labels`; no metric argument is
# passed, so silhouette_score's default distance is used.
silhouette_score(df, cluster_labels)