In [None]:
# Import the necessary libraries (numpy, pandas, scikit-learn packages metrics and clustering)
import numpy as np
import pandas as pd
from sklearn import cluster
from sklearn import metrics

In [None]:
# Load the Balance Scale dataset (http://archive.ics.uci.edu/ml/datasets/balance+scale).
# The file is expected in the same folder as this notebook.
# The raw UCI file ships without a header row, so header=None is required:
# without it pandas consumes the first sample as column names and one
# observation is silently dropped from every later step.
# NOTE(review): if your local copy was edited to include a header line, drop
# header=None/names again.
data = pd.read_csv(
    "balance-scale.data",
    header=None,
    names=["Class", "Left-Weight", "Left-Distance", "Right-Weight", "Right-Distance"],
)

In [None]:
# Take a look at the dataset.
data # bare last expression, so the notebook displays the loaded DataFrame

In [None]:
data.values # the same dataset exposed as a 2D NumPy array (rows = samples)

In [None]:
# Background: this data set was generated to model psychological experimental results.
# Every example is labelled as the balance scale tipping right, tipping left, or being balanced.
# The attributes are the left weight, the left distance, the right weight and the right distance.
# The class follows from comparing (left-distance * left-weight) with
# (right-distance * right-weight): the larger product wins, equal products mean balanced.
#
# Separate the outcome (column 0) from the attributes (columns 1 to 4) so that
# only the attributes are used for clustering.
Y, X = data.values[:, 0], data.values[:, 1:5]

In [None]:
# Show the attribute matrix and the label vector, separated by a rule.
print(X, "---------------------------------", Y, sep="\n")

In [None]:
# List the distinct class labels present in the outcome column.
print(np.unique(Y))

In [None]:
# Scale the data that we are going to use for clustering
from sklearn.preprocessing import scale

In [None]:
# Toy example (unrelated to the balance-scale data) to illustrate what scaling does.
arr1 = np.array(
    [
        [1, 55, 120],
        [4, 2, 334],
        [2, 20, 300],
        [1, 26, 923],
        [3, 43, 876],
        [1, 53, 55],
    ]
)
print(arr1)
print("------------------------")
# Per-column statistics (axis=0) of the raw values.
for stat_name in ("mean", "std"):
    print(f"Before scaling, {stat_name}:", getattr(arr1, stat_name)(axis=0))

In [None]:
# Scale the toy array and print its per-column mean and standard deviation
# so they can be compared with the values before scaling.
scaled_arr1 = scale(arr1)
print(scaled_arr1)
print("------------------------")
print("After scaling, mean:", np.mean(scaled_arr1, axis=0))
print("After scaling, std:", np.std(scaled_arr1, axis=0))

In [None]:
# Side-by-side comparison: for each statistic, print the raw value first and
# the scaled value right below it.
for stat_name in ("mean", "std"):
    print(f"Before scaling, {stat_name}:", getattr(arr1, stat_name)(axis=0))
    print(f"After scaling, {stat_name}:", getattr(scaled_arr1, stat_name)(axis=0))

In [None]:
# Back to the balance-scale dataset.
# Scale the attribute matrix X; every clustering step below works on
# `scaled_data` rather than on the raw attributes.
scaled_data = scale(X)
# Bare last expression so the notebook displays the scaled array.
scaled_data

In [None]:
# Compare per-column statistics of the scaled attributes with the raw ones.
# X is first converted to float64 so that the numeric reductions below work
# on a plain numeric array.
X = X.astype(np.float64)
for heading, matrix in (("scaled_data", scaled_data), ("Not scaled_data", X)):
    print(heading)
    for stat_name in ("mean", "std", "min", "max"):
        print(stat_name + "=", getattr(matrix, stat_name)(axis=0))

In [None]:
# We know that there are 3 possible categories for the data.
# Create 3 data clusters using Agglomerative Hierarchical Clustering.
# What are the silhouette score, homogeneity and completeness for these clusters?
# (Hint: sklearn.preprocessing.LabelEncoder() converts string labels to integers.)

from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers for the label-based metrics.
Y2 = LabelEncoder().fit_transform(Y)

n_samples, n_features = scaled_data.shape
# Number of true classes; used as the requested number of clusters.
n_digits = len(np.unique(Y))

# `metric=` replaces the former `affinity=` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (passing it there raises a TypeError).
# Requires scikit-learn >= 1.2.
model = cluster.AgglomerativeClustering(n_clusters=n_digits, linkage="average", metric="cosine")
model.fit(scaled_data)

# Silhouette: how consistent the clusters are, computed from the data alone.
print("silhouette_score = ", metrics.silhouette_score(scaled_data, model.labels_))

# Completeness: all data points of the same class end up in the same cluster.
print("completeness_score = ", metrics.completeness_score(Y2, model.labels_))
# Homogeneity: every cluster contains only data points of a single class.
print("homogeneity_score = ", metrics.homogeneity_score(Y2, model.labels_))

In [None]:
# Original string labels, their integer encoding, and the cluster assignments.
print(Y, Y2, model.labels_, sep="\n")

In [None]:
# What is the impact of different distance metrics and linkage criteria on the
# silhouette score, homogeneity and completeness for these clusters
# (options available at http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html)?
# What is the best combination?

n_samples, n_features = scaled_data.shape
n_digits = len(np.unique(Y))  # number of true classes = number of clusters requested
Y2 = LabelEncoder().fit_transform(Y)
aff = ["euclidean", "l1", "l2", "manhattan", "cosine"]
link = ["ward", "complete", "average"]
for metric_name in aff:
    for linkage_name in link:
        # This sweep only pairs ward linkage with the "euclidean" metric.
        if linkage_name == "ward" and metric_name != "euclidean":
            continue
        print(metric_name, linkage_name)
        # `metric=` replaces `affinity=` (deprecated in scikit-learn 1.2,
        # removed in 1.4). Requires scikit-learn >= 1.2.
        model = cluster.AgglomerativeClustering(
            n_clusters=n_digits, linkage=linkage_name, metric=metric_name
        )
        model.fit(scaled_data)
        print("silhouette_score = ", metrics.silhouette_score(scaled_data, model.labels_))
        print("completeness_score = ", metrics.completeness_score(Y2, model.labels_))
        print("homogeneity_score = ", metrics.homogeneity_score(Y2, model.labels_))

In [None]:
# Same sweep over distance metrics and linkage criteria as before
# (options available at http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html),
# but the scores are collected so the best combination per score can be reported.

n_samples, n_features = scaled_data.shape
n_digits = len(np.unique(Y))  # number of true classes = number of clusters requested
Y2 = LabelEncoder().fit_transform(Y)
aff = ["euclidean", "l1", "l2", "manhattan", "cosine"]
link = ["ward", "complete", "average"]

# Each row: [metric, linkage, silhouette, completeness, homogeneity]
result = []
for metric_name in aff:
    for linkage_name in link:
        # This sweep only pairs ward linkage with the "euclidean" metric.
        if linkage_name == "ward" and metric_name != "euclidean":
            continue
        # `metric=` replaces `affinity=` (deprecated in scikit-learn 1.2,
        # removed in 1.4). Requires scikit-learn >= 1.2.
        model = cluster.AgglomerativeClustering(
            n_clusters=n_digits, linkage=linkage_name, metric=metric_name
        )
        model.fit(scaled_data)
        result.append([
            metric_name,
            linkage_name,
            metrics.silhouette_score(scaled_data, model.labels_),
            metrics.completeness_score(Y2, model.labels_),
            metrics.homogeneity_score(Y2, model.labels_),
        ])

for row in result:
    print(row)

# Report the best row per score. max() is used instead of a manual search
# seeded with 0: a seed of 0 never matches when no score is above 0 (the
# silhouette can be negative), and the old code then reported the LAST row
# as the maximum. On ties max() keeps the first row, like the old strict '>'.
for score_name, column in (
    ("silhouette_score", 2),
    ("completeness_score", 3),
    ("homogeneity_score", 4),
):
    best_row = max(result, key=lambda row: row[column])
    print(f"Max {score_name}: ", best_row)

In [None]:
# Extend the sweep: besides distance metric and linkage criterion
# (options available at http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html),
# also vary the number of clusters from 2 to 9 and report the combination
# with the highest silhouette score.

n_samples, n_features = scaled_data.shape
n_digits = len(np.unique(Y))
Y2 = LabelEncoder().fit_transform(Y)
aff = ["euclidean", "l1", "l2", "manhattan", "cosine"]
link = ["ward", "complete", "average"]

# Each row: [metric, linkage, n_clusters, silhouette, completeness, homogeneity]
result = []
for metric_name in aff:
    for linkage_name in link:
        # This sweep only pairs ward linkage with the "euclidean" metric. The
        # check does not depend on the cluster count, so it is done once here
        # instead of on every inner iteration.
        if linkage_name == "ward" and metric_name != "euclidean":
            continue
        for k in range(2, 10):
            # `metric=` replaces `affinity=` (deprecated in scikit-learn 1.2,
            # removed in 1.4). Requires scikit-learn >= 1.2.
            model = cluster.AgglomerativeClustering(
                n_clusters=k, linkage=linkage_name, metric=metric_name
            )
            model.fit(scaled_data)
            result.append([
                metric_name,
                linkage_name,
                k,
                metrics.silhouette_score(scaled_data, model.labels_),
                metrics.completeness_score(Y2, model.labels_),
                metrics.homogeneity_score(Y2, model.labels_),
            ])

# max() instead of a manual search seeded with 0: the silhouette score can be
# negative, in which case the old search never updated its index and reported
# the last row as the maximum.
best_row = max(result, key=lambda row: row[3])
print("Max silhouette_score: ", best_row)

In [None]:
# What are the silhouette score, homogeneity and completeness for different
# numbers of clusters (k = 2, 3, 4) created using KMeans?
n_samples, n_features = scaled_data.shape
n_digits = len(np.unique(Y))
Y2 = LabelEncoder().fit_transform(Y)
for k in range(2, 5):
    # KMeans starts from random centroids. Without a fixed random_state the
    # scores change on every run, so a Restart & Run All is not reproducible.
    # n_init is pinned because its default differs across scikit-learn
    # versions (10 historically, 'auto' in newer releases).
    kmeans = cluster.KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_data)
    print(k)
    print("silhouette_score = ", metrics.silhouette_score(scaled_data, kmeans.labels_))
    print("completeness_score = ", metrics.completeness_score(Y2, kmeans.labels_))
    print("homogeneity_score = ", metrics.homogeneity_score(Y2, kmeans.labels_))

In [None]:
# Dendrogram of the first 20 samples (raw attributes in X), built with
# complete-linkage hierarchical clustering.
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt

Z = hierarchy.linkage(X[:20], method='complete')
fig, ax = plt.subplots(figsize=(10, 5))
dn = hierarchy.dendrogram(Z, ax=ax)
plt.show()