<a href="https://colab.research.google.com/github/SarathSabu/SarathSabu/blob/main/Agglomerative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# For Google Colab integration
import os
from google.colab import drive
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_samples, silhouette_score

# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# For data manipulation
import pandas as pd
import numpy as np

## Install the pywaffle package (provides the Waffle figure class used for the waffle charts)
!pip install pywaffle matplotlib
from pywaffle import Waffle


In [None]:
# Read Mall_Customers.csv from the mounted Google Drive into a DataFrame.
file_path = '/content/drive/MyDrive/Infor648/Data/Mall_Customers.csv'
df = pd.read_csv(file_path)


# Preview the first rows; as the last expression of the cell it is displayed.
df.head()

#Select variables of interest

In [None]:
# Keep only the two features used for clustering: income and spending score.
df_sub = df.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']]

#Normalize the data (or Standardize if needed)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: fit the scaler on the selected features, then apply it.
norm_scaler = MinMaxScaler().fit(df_sub)
# data_norm is the normalized feature matrix passed to every clustering step.
data_norm = norm_scaler.transform(df_sub)


#Look for the largest vertical gap in the dendrogram, as this indicates a significant increase in dissimilarity between the clusters being merged.
#Cut the tree just before that large vertical gap to avoid merging clusters that are too dissimilar and should remain separate.

#View the hierarchy in a dendrogram

In [None]:
# dendrogram and linkage are already imported in the notebook's first cell.
# Build the merge hierarchy on the normalized features using single linkage.
linked = linkage(data_norm, 'single')


dendrogram(linked,
           orientation='top',
           show_leaf_counts=True)
# Label the figure so it can be told apart from the other linkage dendrograms.
plt.title("Dendrogram (Single Linkage)")
plt.xlabel("Observation index")
plt.ylabel("Merge distance")
plt.show()

In [None]:
# dendrogram and linkage are already imported in the notebook's first cell.
# Build the merge hierarchy on the normalized features using complete linkage.
linked = linkage(data_norm, 'complete')


dendrogram(linked,
           orientation='top',
           show_leaf_counts=True)
# Label the figure so it can be told apart from the other linkage dendrograms.
plt.title("Dendrogram (Complete Linkage)")
plt.xlabel("Observation index")
plt.ylabel("Merge distance")
plt.show()

In [None]:
# dendrogram and linkage are already imported in the notebook's first cell.
# Build the merge hierarchy on the normalized features using average linkage.
linked = linkage(data_norm, 'average')


dendrogram(linked,
           orientation='top',
           show_leaf_counts=True)
# Label the figure so it can be told apart from the other linkage dendrograms.
plt.title("Dendrogram (Average Linkage)")
plt.xlabel("Observation index")
plt.ylabel("Merge distance")
plt.show()

#Calculate Silhouette score

In [None]:
# Candidate cluster counts (2 through 9); adjust the range to test other values.
range_n_clusters = list(range(2, 10))
silhouette_scores = []

for n_clusters in range_n_clusters:
    # Single-linkage clustering for this k, scored by its mean silhouette.
    agglom = AgglomerativeClustering(n_clusters=n_clusters, linkage='single')
    cluster_labels = agglom.fit_predict(data_norm)
    silhouette_scores.append(silhouette_score(data_norm, cluster_labels))

# Silhouette score as a function of the number of clusters
fig, ax = plt.subplots()
ax.plot(range_n_clusters, silhouette_scores, marker='o')
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Silhouette Score")
ax.set_title("Silhouette Scores for Single Agglomerative Clustering")
plt.show()



In [None]:
# Candidate cluster counts (2 through 9); adjust the range to test other values.
range_n_clusters = list(range(2, 10))
silhouette_scores = []

for n_clusters in range_n_clusters:
    # Complete-linkage clustering for this k, scored by its mean silhouette.
    agglom = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    cluster_labels = agglom.fit_predict(data_norm)
    silhouette_scores.append(silhouette_score(data_norm, cluster_labels))

# Silhouette score as a function of the number of clusters
fig, ax = plt.subplots()
ax.plot(range_n_clusters, silhouette_scores, marker='o')
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Silhouette Score")
ax.set_title("Silhouette Scores for Complete Agglomerative Clustering")
plt.show()


In [None]:
# Candidate cluster counts (2 through 9); adjust the range to test other values.
range_n_clusters = list(range(2, 10))
silhouette_scores = []

for n_clusters in range_n_clusters:
    # Average-linkage clustering for this k, scored by its mean silhouette.
    agglom = AgglomerativeClustering(n_clusters=n_clusters, linkage='average')
    cluster_labels = agglom.fit_predict(data_norm)
    silhouette_scores.append(silhouette_score(data_norm, cluster_labels))

# Silhouette score as a function of the number of clusters
fig, ax = plt.subplots()
ax.plot(range_n_clusters, silhouette_scores, marker='o')
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Silhouette Score")
ax.set_title("Silhouette Scores for Average Agglomerative Clustering")
plt.show()

#Apply agglomerative clustering

In [None]:
# Single-linkage AGNES with Euclidean distance; n_clusters is the value to tune.
AGNES_MIN = AgglomerativeClustering(linkage='single', metric='euclidean', n_clusters=2)
# Copy of the selected features with the predicted label in a 'cluster' column.
df_min = df_sub.assign(cluster=AGNES_MIN.fit_predict(data_norm))



In [None]:
# Scatter plot of the single-linkage clusters in normalized feature space.
unique_clusters = np.unique(df_min['cluster'])  # distinct cluster labels
# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts the
# same (name, lut) arguments and returns a colormap resampled to one color
# per cluster.
colors = plt.get_cmap('plasma', len(unique_clusters))

for cluster_id in unique_clusters:
    # Boolean NumPy mask selecting the rows of data_norm assigned to this cluster.
    in_cluster = (df_min['cluster'] == cluster_id).to_numpy()
    plt.scatter(data_norm[in_cluster, 0],
                data_norm[in_cluster, 1],
                s=100, c=[colors(cluster_id)], label=f'Cluster {cluster_id}')

# Columns 0 and 1 of data_norm follow the column order of df_sub.
plt.xlabel("Annual Income (k$), min-max normalized")
plt.ylabel("Spending Score (1-100), min-max normalized")
plt.legend()
plt.title("Agglomerative Clustering (Single Linkage) - Cluster Visualization")
plt.show()

In [None]:
# Average-linkage AGNES with Euclidean distance; n_clusters is the value to tune.
AGNES_AVG = AgglomerativeClustering(linkage='average', metric='euclidean', n_clusters=5)
# Copy of the selected features with the predicted label in a 'cluster' column.
df_avg = df_sub.assign(cluster=AGNES_AVG.fit_predict(data_norm))

In [None]:
# Scatter plot of the average-linkage clusters in normalized feature space.
unique_clusters = np.unique(df_avg['cluster'])  # distinct cluster labels
# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts the
# same (name, lut) arguments and returns a colormap resampled to one color
# per cluster.
colors = plt.get_cmap('plasma', len(unique_clusters))

for cluster_id in unique_clusters:
    # Boolean NumPy mask selecting the rows of data_norm assigned to this cluster.
    in_cluster = (df_avg['cluster'] == cluster_id).to_numpy()
    plt.scatter(data_norm[in_cluster, 0],
                data_norm[in_cluster, 1],
                s=100, c=[colors(cluster_id)], label=f'Cluster {cluster_id}')

# Columns 0 and 1 of data_norm follow the column order of df_sub.
plt.xlabel("Annual Income (k$), min-max normalized")
plt.ylabel("Spending Score (1-100), min-max normalized")
plt.legend()
plt.title("Agglomerative Clustering (Average Linkage) - Cluster Visualization")
plt.show()

In [None]:
# Complete-linkage AGNES with Euclidean distance; n_clusters is the value to tune.
AGNES_MAX = AgglomerativeClustering(linkage='complete', metric='euclidean', n_clusters=5)
# Copy of the selected features with the predicted label in a 'cluster' column.
df_max = df_sub.assign(cluster=AGNES_MAX.fit_predict(data_norm))

In [None]:
# Scatter plot of the complete-linkage clusters in normalized feature space.
unique_clusters = np.unique(df_max['cluster'])  # distinct cluster labels
# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts the
# same (name, lut) arguments and returns a colormap resampled to one color
# per cluster.
colors = plt.get_cmap('plasma', len(unique_clusters))

for cluster_id in unique_clusters:
    # Boolean NumPy mask selecting the rows of data_norm assigned to this cluster.
    in_cluster = (df_max['cluster'] == cluster_id).to_numpy()
    plt.scatter(data_norm[in_cluster, 0],
                data_norm[in_cluster, 1],
                s=100, c=[colors(cluster_id)], label=f'Cluster {cluster_id}')

# Columns 0 and 1 of data_norm follow the column order of df_sub.
plt.xlabel("Annual Income (k$), min-max normalized")
plt.ylabel("Spending Score (1-100), min-max normalized")
plt.legend()
plt.title("Agglomerative Clustering (Complete Linkage) - Cluster Visualization")
plt.show()

In [None]:
# Per-sample silhouette values and their overall average for the single-linkage labels
sample_silhouette_values_min = silhouette_samples(data_norm, df_min['cluster'])
silhouette_avg_min = silhouette_score(data_norm, df_min['cluster'])

# Report the average silhouette score
print(f"Average silhouette score for Single Linkage: {silhouette_avg_min}")


# Optionally, you can print silhouette samples
#print("Sample silhouette values:")
#print(sample_silhouette_values_min)

In [None]:
# Per-sample silhouette values and their overall average for the complete-linkage labels
sample_silhouette_values_max = silhouette_samples(data_norm, df_max['cluster'])
silhouette_avg_max = silhouette_score(data_norm, df_max['cluster'])

# Report the average silhouette score
print(f"Average silhouette score for complete linkage: {silhouette_avg_max }")

# Optionally, you can print silhouette samples
#print("Sample silhouette values:")
#print(sample_silhouette_values_max)

In [None]:
# Per-sample silhouette values and their overall average for the average-linkage labels
sample_silhouette_values_avg = silhouette_samples(data_norm, df_avg['cluster'])
silhouette_avg_avg = silhouette_score(data_norm, df_avg['cluster'])

# Report the average silhouette score
print(f"Average silhouette score for average linkage: {silhouette_avg_avg}")

# Optionally, you can print or explore silhouette samples
#print("Sample silhouette values:")
#print(sample_silhouette_values_avg)

#Next, examine the detailed statistics within each cluster

In [None]:
##You do not need to change anything here! but if you do not like the color you can change the color here
def generate_cluster_profile_agnes(data_norm, k, df_original, metrics_select, linkage_used):
    """Fit AGNES and return a styled table profiling each cluster.

    Parameters
    ----------
    data_norm
        Data passed to ``AgglomerativeClustering.fit_predict``.
        NOTE(review): assumed to be row-aligned with ``df_original`` - confirm.
    k
        Number of clusters (forwarded as ``n_clusters``).
    df_original
        DataFrame whose columns are averaged per cluster; it is copied, not
        modified.
    metrics_select
        Forwarded as the ``metric`` argument of ``AgglomerativeClustering``.
    linkage_used
        Forwarded as the ``linkage`` argument of ``AgglomerativeClustering``.

    Returns
    -------
    pandas Styler
        One row per cluster ("Cluster <label>"), sorted by ``Count`` in
        descending order, followed by an "Overall" row. Columns are the mean
        of every column of ``df_original`` plus ``Count`` and ``Percentage``.
        ``Count`` is formatted with no decimals, all other columns with two,
        and a 'Purples' background gradient is applied (change the cmap name
        to e.g. Blues, Oranges, Reds, Greens or Greys for another color).
    """
    model = AgglomerativeClustering(n_clusters=k, metric=metrics_select, linkage=linkage_used)

    # Work on a copy carrying both the numeric label and a readable label.
    labelled = df_original.copy()
    labelled['cluster'] = model.fit_predict(data_norm)
    labelled['cluster_result'] = 'Cluster ' + labelled['cluster'].astype(str)

    # The numeric label must not be averaged; the readable one is the group key.
    features_with_name = labelled.drop(columns=['cluster'])
    features_only = features_with_name.drop(columns=['cluster_result'])

    # Single-row frame holding the mean of every feature over all rows.
    overall_row = features_only.mean().to_frame().T
    overall_row.index = ['Overall']

    # Per-cluster feature means, with the overall row appended underneath.
    per_cluster_means = features_with_name.groupby('cluster_result').mean()
    profile = pd.concat([per_cluster_means, overall_row], axis=0)

    # Cluster sizes as absolute counts and as a share of all rows.
    sizes = labelled['cluster_result'].value_counts()
    shares = (sizes / sizes.sum()) * 100
    size_table = pd.DataFrame({'Count': sizes, 'Percentage': shares})
    size_table.loc['Overall'] = [len(labelled), 100.0]

    # Join means and sizes by row label.
    profile = pd.concat([profile, size_table], axis=1)

    # Set "Overall" aside so that only the clusters are sorted, then put it last.
    overall_full = profile.loc['Overall']
    profile = profile.drop(index='Overall').sort_values(by='Count', ascending=False)
    profile = pd.concat([profile, overall_full.to_frame().T])

    # Count without decimals; every other column with two decimal places.
    number_formats = {"Count": "{:.0f}"}
    number_formats.update({col: "{:.2f}" for col in profile.columns if col != "Count"})

    return profile.style.format(number_formats).background_gradient(cmap='Purples')



In [None]:
## Single linkage cluster stats
df_profile_min = generate_cluster_profile_agnes(
    data_norm,
    k=2,
    df_original=df_sub,
    metrics_select='euclidean',
    linkage_used='single',
)

## Average linkage cluster stats
df_profile_avg = generate_cluster_profile_agnes(
    data_norm,
    k=5,
    df_original=df_sub,
    metrics_select='euclidean',
    linkage_used='average',
)

## Complete linkage cluster stats
df_profile_max = generate_cluster_profile_agnes(
    data_norm,
    k=5,
    df_original=df_sub,
    metrics_select='euclidean',
    linkage_used='complete',
)

# Show the three profiles in the order single, average, complete,
# each followed by a blank line.
for profile in (df_profile_min, df_profile_avg, df_profile_max):
    display(profile)
    print()

In [None]:
from pywaffle import Waffle


# Number of rows assigned to each single-linkage cluster.
cluster_counts = df_min['cluster'].value_counts()
total = sum(cluster_counts)

# One legend entry per cluster: "Cluster <id> (<count> - <percent>%)"
waffle_labels = [
    f"Cluster {i} ({count} - {round((count / total) * 100, 2)}%)"
    for i, count in cluster_counts.items()
]

# Waffle chart of the cluster sizes
fig = plt.figure(
    FigureClass=Waffle,
    rows=5,  # number of rows in the waffle grid; adjust as needed
    values=cluster_counts,  # one value per cluster
    title={'label': 'Cluster Distribution for Single Linkage', 'loc': 'center'},
    labels=waffle_labels,
    legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1.2)},  # tweak to move the legend
    figsize=(8, 5),  # figure size
)

plt.show()


In [None]:
# Number of rows assigned to each average-linkage cluster.
cluster_counts = df_avg['cluster'].value_counts()
total = sum(cluster_counts)

# One legend entry per cluster: "Cluster <id> (<count> - <percent>%)"
waffle_labels = [
    f"Cluster {i} ({count} - {round((count / total) * 100, 2)}%)"
    for i, count in cluster_counts.items()
]

# Waffle chart of the cluster sizes
fig = plt.figure(
    FigureClass=Waffle,
    rows=5,  # number of rows in the waffle grid; adjust as needed
    values=cluster_counts,  # one value per cluster
    title={'label': 'Cluster Distribution for Average Linkage', 'loc': 'center'},
    labels=waffle_labels,
    legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1.2)},  # tweak to move the legend
    figsize=(8, 5),  # figure size
)

plt.show()

In [None]:
# Number of rows assigned to each complete-linkage cluster (df_max holds the
# labels produced by the linkage='complete' model).
cluster_counts = df_max['cluster'].value_counts()
total = sum(cluster_counts)

# Plot the waffle chart with counts and percentages added to the labels
fig = plt.figure(
    FigureClass=Waffle,
    rows=5,  # Number of rows in the waffle chart, you can adjust here
    values=cluster_counts,  # Values for each cluster
    # The title previously said "Average Linkage" (copy-paste error); this
    # chart shows the complete-linkage clustering.
    title={'label': 'Cluster Distribution for Complete Linkage', 'loc': 'center'},
    labels=[f"Cluster {i} ({count} - {round((count / total) * 100, 2)}%)" for i, count in cluster_counts.items()],
    legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1.2)}, #You can also adjust the number to adjust the legend location
    figsize=(8, 5) #adjust figure size
)

plt.show()