In [None]:
## Install the packages this notebook needs that may be missing from the runtime:
## pywaffle (used for the waffle chart of cluster sizes) and matplotlib (plotting)
!pip install pywaffle matplotlib

In [None]:
# For Google Colab integration
import os
from google.colab import drive
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.metrics import silhouette_score


# Mount Google Drive at /content/drive so the CSV stored on Drive can be read by path.
# NOTE(review): `drive` is already imported at the top of this cell; the repeated import is redundant but harmless.
from google.colab import drive
drive.mount('/content/drive')

# For data manipulation
import pandas as pd
import numpy as np


In [None]:
# Path of the customer CSV on the mounted Google Drive
file_path = '/content/drive/MyDrive/Infor648/Data/Mall_Customers.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (the default for head())
df.head(n=5)

#Select variables of interest

In [None]:
# Keep only the two features used for clustering.
# .copy() makes df_sub an independent DataFrame, so assigning the 'cluster'
# column to it later does not raise pandas' SettingWithCopyWarning.
df_sub = df[['Annual Income (k$)', 'Spending Score (1-100)']].copy()

# Initial Scatter Plot of Selected Variables (only for two variables)
## Note: This scatter plot is two-dimensional, as it visualizes the relationship between two variables.


In [None]:
import matplotlib.pyplot as plt


# Scatter plot of the two selected features, before any clustering
fig_raw, ax_raw = plt.subplots(figsize=(8, 6))
ax_raw.scatter(
    df_sub['Annual Income (k$)'],
    df_sub['Spending Score (1-100)'],
    c='blue',
    label='Customer',
)

# Axis labels, title and legend
ax_raw.set_xlabel('Annual Income (k$)')
ax_raw.set_ylabel('Spending Score (1-100)')
ax_raw.set_title('Scatter Plot of Original Data')
ax_raw.legend()
ax_raw.grid(False)
plt.show()


#Normalize the data (or Standardize if needed)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max normalization of the selected features
norm_scaler = MinMaxScaler()
data_norm = norm_scaler.fit_transform(df_sub)
## data_norm is the normalized data; the clustering cells below are fitted on it

In [None]:
# Standardize the selected variables (an alternative to the min-max normalization above)
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
data_std = std_scaler.fit_transform(df_sub)
## data_std is the standardized data (the normalized data is data_norm)

# Perform clustering on the selected variables


In [None]:
from sklearn.cluster import KMeans

### n_clusters: Specifies the number of clusters to form, which is 7 in this first attempt
### init: The method for initialization, 'k-means++' ensures better initial centroids
### n_init: Number of times it performs clustering with different initializations of centroids, and chooses the best result (lowest sum of squared distances)

### random_state: Ensures reproducibility of the results by fixing the random number generator

model = KMeans(n_clusters = 7, init = 'k-means++', n_init = 10, random_state = 1)

# fit_predict fits the model and returns the cluster label of every row in one call,
# so a separate model.fit(data_norm) beforehand would only repeat the same work.
df_sub['cluster'] = model.fit_predict(data_norm)

#Plot the clusters (only for two variables)

In [None]:
# Scatter plot of the customers, colored by their assigned cluster
fig_km, ax_km = plt.subplots(figsize=(8, 6))
points_km = ax_km.scatter(
    df_sub['Annual Income (k$)'],
    df_sub['Spending Score (1-100)'],
    c=df_sub['cluster'],
    cmap='viridis',
    label='Customer',
)

# Axis labels, title and a colorbar that maps colors to cluster ids
ax_km.set_xlabel('Annual Income (k$)')
ax_km.set_ylabel('Spending Score (1-100)')
ax_km.set_title('K-means Clustering')
fig_km.colorbar(points_km, ax=ax_km, label='Cluster')

ax_km.grid(False)
plt.show()

#The elbow method looks for the point where the decrease in WCSS becomes insignificant relative to the previous values of k. Beyond that point, adding more clusters doesn’t significantly improve the fit.

In [None]:
# Cluster counts to evaluate (1 to 10). Change the range here only:
# both the loop and the plot use it, so they can never get out of sync.
k_values = range(1, 11)

# List to store WCSS (within-cluster sum of squares), one entry per k
wcss = []

# Fit one model per k and record its inertia_ (the WCSS of that fit)
for i in k_values:
    kmeans_WCSS = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=1)
    kmeans_WCSS.fit(data_norm)
    wcss.append(kmeans_WCSS.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 6)) #Adjust figure size here
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.grid(True)
plt.show()

#A straightforward way to help you find the best k

##Distortion score is WCSS
##WCSS focuses on minimizing the variance within clusters (intra-cluster compactness)


In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Initialize the KMeans model with the same parameters as before; n_clusters is left unset because the visualizer is given the k values to try
model_Elbow = KMeans(init='k-means++', n_init=10, random_state=1)

# Initialize the visualizer with the number of clusters to explore, starting from 2 (k=1 does not provide meaningful information)
# NOTE(review): yellowbrick documents a 2-tuple k as a range with an exclusive upper bound, so (2, 10) would try k = 2..9 — confirm whether k = 10 should be included
visualizer = KElbowVisualizer(model_Elbow, k=(2, 10)) #You can adjust the k value here


# Fit the visualizer on the normalized data
visualizer.fit(data_norm)

# Display the elbow plot
visualizer.show()

##distortion score is WCSS


#Use the Silhouette score (it can give you a different optimal k)
##The Silhouette score looks at both compactness (cohesion) and how well-separated the clusters are from each other (separation). If clusters are close to each other, it might suggest fewer clusters



##WCSS focuses on minimizing the variance within clusters (intra-cluster compactness). It might suggest more clusters

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

# Initialize the KMeans model with the same parameters; n_clusters is left unset because the visualizer is given the k values to try
model_sil = KMeans(init='k-means++', n_init=10, random_state=1)


# Set the metric to 'silhouette' instead of the default 'distortion' (WCSS)
# Initialize the visualizer with the number of clusters to explore (start from 2)
# NOTE(review): yellowbrick documents a 2-tuple k as a range with an exclusive upper bound, so (2, 10) would try k = 2..9 — confirm
visualizer = KElbowVisualizer(model_sil, k=(2, 10), metric='silhouette') ##you can change the value of k here


# Fit the visualizer on the normalized data
visualizer.fit(data_norm)

# Display the elbow plot with silhouette score
visualizer.show()


#Now we refine our model

In [None]:
optimal_k = 5  # Replace with the number of clusters determined from the Elbow Method
# Same KMeans settings as the earlier models, with only the number of clusters changed
kmeans_optimal= KMeans(n_clusters=optimal_k, init='k-means++', n_init=10, random_state=1)

# Fit the refined model and overwrite the 'cluster' column with its labels
df_sub['cluster'] = kmeans_optimal.fit_predict(data_norm)


In [None]:
# Calculate the silhouette score of the refined model, using the normalized data and the labels stored in df_sub['cluster']
silhouette_avg = silhouette_score(data_norm, df_sub['cluster'])
# Print the silhouette score together with the k it belongs to
print(f'\nSilhouette Score for k={optimal_k}: {silhouette_avg}\n')


# Silhouette plot for the same model (kmeans_optimal) on the same data
visualizer = SilhouetteVisualizer(kmeans_optimal, colors='yellowbrick')
visualizer.fit(data_norm)
visualizer.show()

In [None]:
# Scatter plot in the original units, colored by the refined cluster labels
fig_opt, ax_opt = plt.subplots(figsize=(8, 6))
points_opt = ax_opt.scatter(
    df_sub['Annual Income (k$)'],
    df_sub['Spending Score (1-100)'],
    c=df_sub['cluster'],
    cmap='viridis',
    label='Customer',
)

# Axis labels, title and a colorbar that maps colors to cluster ids
ax_opt.set_xlabel('Annual Income (k$)')
ax_opt.set_ylabel('Spending Score (1-100)')
ax_opt.set_title('K-means Clustering (Original Data)')
fig_opt.colorbar(points_opt, ax=ax_opt, label='Cluster')

ax_opt.grid(False)
plt.show()

#Visualize the cluster

In [None]:
# Display df_sub: the two selected features plus the 'cluster' label column
df_sub

In [None]:
# Number of data points in each cluster, and the grand total
cluster_counts = df_sub['cluster'].value_counts()
total = cluster_counts.sum()

# Share of all data points that falls in each cluster, in percent with 2 decimals
cluster_share = round((cluster_counts / total) * 100, 2)

# Counts and percentages side by side, one row per cluster
cluster_summary = pd.DataFrame({'Count': cluster_counts, 'Percentage': cluster_share})

# Render the summary table
display(cluster_summary)

In [None]:
from pywaffle import Waffle


# Cluster sizes and their total, recomputed so this cell does not depend on the summary cell
cluster_counts = df_sub['cluster'].value_counts()
total = sum(cluster_counts)

# One legend entry per cluster: id, count and percentage of all data points
waffle_labels = [
    f"Cluster {i} ({count} - {round((count / total) * 100, 2)}%)"
    for i, count in cluster_counts.items()
]

# Waffle chart of the cluster sizes
fig = plt.figure(
    FigureClass=Waffle,
    rows=5,  # number of rows in the chart; adjustable
    values=cluster_counts,
    title={'label': 'Cluster Distribution KMeans', 'loc': 'center'},
    labels=waffle_labels,
    legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1.2)},  # legend position; adjustable
    figsize=(8, 5),  # figure size; adjustable
)

plt.show()


#We need to understand the detailed stats inside each cluster


In [None]:
####You do not need to change anything here just run it to enable the generate_cluster_profile function#########
def generate_cluster_profile(data_norm, k, df_original):
    """Fit K-means with ``k`` clusters and build a styled per-cluster profile table.

    Parameters
    ----------
    data_norm : array-like
        Scaled feature matrix passed to ``KMeans.fit_predict``. Its rows must
        line up one-to-one with the rows of ``df_original``.
    k : int
        Number of clusters to form.
    df_original : pandas.DataFrame
        The features in their original units. It is copied, never modified.
        An existing 'cluster' or 'cluster_result' column is overwritten in the
        copy. Non-numeric columns are left out of the averages.

    Returns
    -------
    pandas.io.formats.style.Styler
        One row per cluster, sorted by cluster size (largest first), followed
        by an 'Overall' row. The columns are the feature means plus 'Count'
        and 'Percentage', with a purple background gradient applied.
    """
    # Cluster on the scaled data, but profile the clusters in the original units
    kmeans_model = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=1)
    df_calculate = df_original.copy()
    df_calculate['cluster'] = kmeans_model.fit_predict(data_norm)
    df_calculate['cluster_result'] = 'Cluster ' + df_calculate['cluster'].astype(str)

    # The numeric label must not be averaged; the readable label is the grouping key
    df_mean_calculation = df_calculate.drop(columns=['cluster'])
    df_mean_feature = df_mean_calculation.drop(columns=['cluster_result'])

    # Overall mean of every feature.
    # numeric_only=True keeps text columns from raising a TypeError in mean().
    overall_means = df_mean_feature.mean(numeric_only=True).to_frame().T
    overall_means.index = ['Overall']

    # Mean of every feature within each cluster
    df_cluster_summary = df_mean_calculation.groupby('cluster_result').mean(numeric_only=True)

    # Cluster rows first, then the 'Overall' row
    df_profile = pd.concat([df_cluster_summary, overall_means], axis=0)

    # Size of each cluster, as a count and as a percentage of all rows
    cluster_counts = df_calculate['cluster_result'].value_counts()
    cluster_percentages = (cluster_counts / cluster_counts.sum()) * 100
    df_count_percentage = pd.DataFrame({
        'Count': cluster_counts,
        'Percentage': cluster_percentages
    })
    df_count_percentage.loc['Overall'] = [len(df_calculate), 100.0]

    # Attach the size columns to the means (aligned on the row labels)
    df_profile = pd.concat([df_profile, df_count_percentage], axis=1)

    # Sort only the cluster rows by size, then put 'Overall' back as the last row
    df_overall = df_profile.loc['Overall']
    df_profile = df_profile.drop(index='Overall')
    df_profile = df_profile.sort_values(by='Count', ascending=False)
    df_profile = pd.concat([df_profile, df_overall.to_frame().T])

    # Counts as whole numbers, every other column with two decimals
    styled_profile = df_profile.style.format({
        "Count": "{:.0f}",
        **{col: "{:.2f}" for col in df_profile.columns if col != "Count"}
    }).background_gradient(cmap='Purples')

    return styled_profile



#To generate the cluster profile call the function

In [None]:
### Profile for k = 5; change the `k` argument to profile a different number of clusters
## data_norm: the normalized data the clustering is fitted on
## df_sub: the selected variables of interest, in their original units

df_profile_k5 = generate_cluster_profile(data_norm=data_norm, k=5, df_original=df_sub)
display(df_profile_k5)

#Let's compare k = 5 and k = 4

In [None]:
# K-means model with k=5 clusters, adjust n_clusters
# n_init=10 matches the other KMeans models in this notebook, so the scores are comparable
model_1 = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=1) ##adjust cluster value
model_1.fit(data_norm)
# Pass model_1 itself, so the silhouette plot shows the same k=5 clustering that is scored below
visualizer = SilhouetteVisualizer(model_1, colors='yellowbrick')
visualizer.fit(data_norm)  # data_norm is your normalized dataset

# Calculate and display the average silhouette score
avg_silhouette_5 = silhouette_score(data_norm, model_1.labels_)
print(f"Average Silhouette Score for k=5: {avg_silhouette_5:.3f}")
visualizer.show()  # Display the silhouette plot


# K-means model with k=4 clusters for comparison
model_2 = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=1) ###adjust cluster value
model_2.fit(data_norm)
# Pass model_2 itself, so the silhouette plot shows the same k=4 clustering that is scored below
visualizer_2 = SilhouetteVisualizer(model_2, colors='yellowbrick')
visualizer_2.fit(data_norm)  # data_norm is your normalized dataset

# Calculate and display the average silhouette score
avg_silhouette_4 = silhouette_score(data_norm, model_2.labels_)
print(f"Average Silhouette Score for k=4: {avg_silhouette_4:.3f}")

visualizer_2.show()  # Display the silhouette plot



In [None]:
### Build the cluster profiles for k = 4 and k = 5 with generate_cluster_profile, then show them one after the other

df_profile_k4 = generate_cluster_profile(data_norm=data_norm, k=4, df_original=df_sub)
df_profile_k5 = generate_cluster_profile(data_norm=data_norm, k=5, df_original=df_sub)

display(df_profile_k4)
print()
display(df_profile_k5)