In [None]:
# For Google Colab integration
import os
from google.colab import drive
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import pairwise_distances

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path read later in this
# notebook lives under this mount point.
drive.mount('/content/drive')




In [None]:
# Read the Mall Customers CSV from the mounted Drive folder into a DataFrame.
file_path = '/content/drive/MyDrive/Infor648/Data/Mall_Customers.csv'
df = pd.read_csv(file_path)


# Preview the first rows of the loaded data.
df.head()

# Select variables of interest

In [None]:
# Take an explicit copy of the two clustering features so that adding columns
# to df_sub later (e.g. the DBSCAN labels) does not trigger pandas'
# SettingWithCopyWarning and never writes through to the original df.
df_sub = df[['Annual Income (k$)', 'Spending Score (1-100)']].copy()

# Normalization or standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Data standardization.
# The two feature columns are selected explicitly so that re-running this cell
# after a 'cluster' column has been added to df_sub does not scale the cluster
# labels as if they were a feature.
scaler = StandardScaler()
data_std = scaler.fit_transform(df_sub[['Annual Income (k$)', 'Spending Score (1-100)']])

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Data normalization.
# The two feature columns are selected explicitly so that re-running this cell
# after a 'cluster' column has been added to df_sub still yields a two-column
# matrix (the later cells index columns 0 and 1 of data_norm).
norm_scaler = MinMaxScaler()
data_norm = norm_scaler.fit_transform(df_sub[['Annual Income (k$)', 'Spending Score (1-100)']])
## data_norm is the normalized data

In [None]:
from sklearn.cluster import DBSCAN
# Cluster the normalized features with DBSCAN (label -1 marks noise points).
model = DBSCAN(eps=0.1, min_samples=5)
# fit_predict fits the model and returns the labels in one pass, so a separate
# model.fit() call beforehand would only repeat the same work.
cluster_labels = model.fit_predict(data_norm)
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning
# when df_sub was created as a column slice of df.
df_sub = df_sub.assign(cluster=cluster_labels)






In [None]:
# Plot the DBSCAN result in the normalized feature space.
fig, ax = plt.subplots(figsize=(10, 6))

# Work on a plain array of labels so one boolean mask per cluster can index
# both columns of data_norm.
labels = df_sub['cluster'].to_numpy()

# np.unique returns the labels sorted, so the legend lists the outliers (-1)
# first and then Cluster 0, 1, 2, ... instead of first-appearance order.
unique_clusters = np.unique(labels)

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap takes the same (name, lut) arguments.
colors = plt.get_cmap('twilight', len(unique_clusters))

for cluster in unique_clusters:
    mask = labels == cluster
    if cluster == -1:
        # Noise points (outliers) are always drawn in black.
        ax.scatter(data_norm[mask, 0], data_norm[mask, 1],
                   color='k', label='Outliers')
    else:
        # Regular clusters take their colour from the colormap by label.
        ax.scatter(data_norm[mask, 0], data_norm[mask, 1],
                   color=colors(cluster), label=f'Cluster {cluster}')

ax.set_title('DBSCAN Clustering')
ax.set_xlabel('Normalized Annual Income (k$)')
ax.set_ylabel('Normalized Spending Score (1-100)')
ax.legend(loc='best')
ax.grid(False)

# Show the plot
plt.show()

# Evaluation
# Low Outlier Ratio (< 5%)
# Moderate Outlier Ratio (5% - 20%)
# High Outlier Ratio (> 20%)

### What counts as a good outlier ratio depends heavily on your specific dataset and problem domain

In [None]:
##you do not need to change anything here
from scipy.spatial.distance import cdist

def calculate_dbscan_metrics(data_norm, eps, min_samples):
    """Run DBSCAN once and summarise the clustering in a one-row DataFrame.

    Parameters
    ----------
    data_norm : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    eps : float
        Passed to ``DBSCAN`` as ``eps``.
    min_samples : int
        Passed to ``DBSCAN`` as ``min_samples``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``eps``, ``min_samples``,
        ``n_clusters``, ``outlier_ratio``, ``SSE``, ``SSB``, ``SSE + SSB``
        and ``Silhouette Score``.

        * ``n_clusters`` counts the distinct labels other than ``-1``.
        * ``outlier_ratio`` is the fraction of points labelled ``-1``.
        * ``SSE`` sums the squared distances of every clustered point to the
          mean of its cluster; noise points are skipped.
        * ``SSB`` sums, over clusters, the cluster size times the squared
          distance between the cluster mean and the overall mean.
        * ``Silhouette Score`` is computed on the clustered points only and is
          ``NaN`` whenever it is undefined (see the guard below).
    """
    data_norm = np.asarray(data_norm)

    # Fit DBSCAN model
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data_norm)

    # Boolean mask of the points that belong to some cluster (i.e. not noise).
    clustered = labels != -1
    cluster_ids = np.unique(labels[clustered])

    # Number of clusters (excluding outliers)
    n_clusters = len(cluster_ids)

    # Outlier ratio
    outlier_ratio = np.sum(~clustered) / len(labels)

    # NOTE: the overall mean is taken over ALL points, noise included, while
    # SSE and SSB only accumulate over clustered points. This matches the
    # notebook's original definition of the two quantities.
    overall_mean = np.mean(data_norm, axis=0)

    sse = 0.0  # within-cluster sum of squared errors
    ssb = 0.0  # between-cluster sum of squares
    for cluster_id in cluster_ids:
        members = data_norm[labels == cluster_id]
        center = np.mean(members, axis=0)
        sse += np.sum((members - center) ** 2)
        ssb += len(members) * np.sum((center - overall_mean) ** 2)

    # Total SSE + SSB
    sse_ssb_sum = sse + ssb

    # Silhouette score (ignoring outliers). silhouette_score requires
    # 2 <= number of labels <= n_samples - 1 and raises ValueError otherwise,
    # e.g. when min_samples=1 turns every remaining point into its own
    # cluster, so both bounds are checked here.
    n_clustered = int(np.sum(clustered))
    if 1 < n_clusters < n_clustered:
        silhouette_avg = silhouette_score(data_norm[clustered], labels[clustered])
    else:
        silhouette_avg = np.nan

    # One-row frame so the caller can concatenate the results of many runs.
    return pd.DataFrame({
        'eps': [eps],
        'min_samples': [min_samples],
        'n_clusters': [n_clusters],
        'outlier_ratio': [outlier_ratio],
        'SSE': [sse],
        'SSB': [ssb],
        'SSE + SSB': [sse_ssb_sum],
        'Silhouette Score': [silhouette_avg]
    })

# Consider other possible eps and min_samples values

In [None]:
# Define parameter ranges
# eps takes the values 0.1, 0.2, ..., 0.9 (the stop value 1.0 is exclusive).
# np.arange with a float step accumulates rounding error (for example
# 0.30000000000000004 instead of 0.3), so the grid is rounded to remove that
# noise; 10 decimals is fine enough to keep working for smaller step sizes.
eps_values = np.round(np.arange(0.1, 1.0, 0.1), 10) # change eps_value here
min_samples_values = np.arange(3, 10) # change min_sample here

# One single-row metrics DataFrame per (eps, min_samples) combination
results = []

# Loop through each combination of eps and min_samples
for eps in eps_values:
    for min_samples in min_samples_values:
        # Apply DBSCAN and calculate metrics
        df_metrics = calculate_dbscan_metrics(data_norm, eps, min_samples)
        results.append(df_metrics)

# Concatenate all results once, after the loop
results_df = pd.concat(results, ignore_index=True)
# Notebook-wide display setting: show floats with two decimals
pd.options.display.float_format = "{:.2f}".format

# View 1: sorted by SSE + SSB, smallest first
results_by_sse_ssb = results_df.sort_values(by='SSE + SSB', ascending=True)
display(results_by_sse_ssb)

# View 2: sorted by Silhouette Score, largest first (NaN rows end up last)
results_by_silhouette = results_df.sort_values(by='Silhouette Score', ascending=False)
display(results_by_silhouette)

# Keep the original name bound to the last view, as before.
results_df_sorted = results_by_silhouette


# Rerun the model with better eps and min_samples

In [None]:
# Candidate configuration: eps = 0.1, min_samples = 9.
eps, min_samples = 0.1, 9
df_metrics = calculate_dbscan_metrics(data_norm, eps=eps, min_samples=min_samples)

# Show the one-row metrics table for this configuration.
display(df_metrics)

In [None]:
# Candidate configuration: eps = 0.1, min_samples = 8.
eps, min_samples = 0.1, 8
df_metrics = calculate_dbscan_metrics(data_norm, eps=eps, min_samples=min_samples)

# Show the one-row metrics table for this configuration.
display(df_metrics)

In [None]:
###You do not need to change anything here
def generate_cluster_profile_dbscan(data_norm, eps, min_samples, df_original):
    """Build a styled per-cluster profile table for one DBSCAN configuration.

    Parameters
    ----------
    data_norm : array-like
        Feature matrix handed to ``DBSCAN.fit_predict``. It must have one row
        per row of ``df_original``.
    eps : float
        Passed to ``DBSCAN`` as ``eps``.
    min_samples : int
        Passed to ``DBSCAN`` as ``min_samples``.
    df_original : pandas.DataFrame
        Data to be profiled. It is copied, never modified. An existing
        ``cluster`` column is overwritten in the copy. Only numeric columns
        are averaged, so extra non-numeric columns are ignored.

    Returns
    -------
    pandas.io.formats.style.Styler
        One row per cluster, sorted by descending ``Count``, with the noise
        label shown as ``Outliers``, followed by an ``Overall`` row. Columns
        are the per-cluster feature means plus ``Count`` and ``Percentage``.
    """
    # Apply DBSCAN with customizable eps and min_samples
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data_norm)

    # Work on a copy so the caller's frame is left untouched.
    df_calculate = df_original.copy()
    df_calculate['cluster'] = labels
    df_calculate['cluster_result'] = 'Cluster ' + df_calculate['cluster'].astype(str)

    # The raw integer label must not be averaged as if it were a feature.
    df_mean_calculation = df_calculate.drop(columns=['cluster'])
    df_mean_feature = df_mean_calculation.drop(columns=['cluster_result'])

    # numeric_only=True: pandas >= 2.0 raises on non-numeric columns instead
    # of silently skipping them, so request the skip explicitly.
    overall_means = df_mean_feature.mean(numeric_only=True).to_frame().T
    overall_means.index = ['Overall']

    df_cluster_summary = df_mean_calculation.groupby('cluster_result').mean(numeric_only=True)

    df_profile = pd.concat([df_cluster_summary, overall_means], axis=0)

    # Size of each cluster, absolute and as a percentage of all rows
    cluster_counts = df_calculate['cluster_result'].value_counts()
    cluster_percentages = (cluster_counts / cluster_counts.sum()) * 100

    df_count_percentage = pd.DataFrame({
        'Count': cluster_counts,
        'Percentage': cluster_percentages
    })

    # Add a row for "Overall"
    df_count_percentage.loc['Overall'] = [len(df_calculate), 100.0]
    df_profile = pd.concat([df_profile, df_count_percentage], axis=1)

    # Set the "Overall" row aside so it is not part of the sort below.
    df_overall = df_profile.loc['Overall']
    df_profile = df_profile.drop(index='Overall')

    # Sort the clusters by the Count column
    df_profile = df_profile.sort_values(by='Count', ascending=False)

    # Show the DBSCAN noise label under a readable name. An exact index
    # rename only touches the 'Cluster -1' row.
    df_profile = df_profile.rename(index={'Cluster -1': 'Outliers'})

    # Append the "Overall" row back to the sorted DataFrame
    df_profile = pd.concat([df_profile, df_overall.to_frame().T])

    # Count is shown as a whole number, every other column with two decimals.
    number_formats = {
        "Count": "{:.0f}",
        **{col: "{:.2f}" for col in df_profile.columns if col != "Count"}
    }
    styled_profile = df_profile.style.format(number_formats).background_gradient(
        cmap='Purples'  ###### Change color here to 'Blues', 'Purples', 'Oranges', etc.
    )

    return styled_profile



In [None]:
# Profile two candidate configurations on the normalized data (data_norm).
# Both use eps = 0.1; they differ only in min_samples (8 vs. 9).
# df_sub is the selected subset of the dataset holding the features of interest.
profile_1 = generate_cluster_profile_dbscan(
    data_norm,
    eps=0.1,
    min_samples=8,
    df_original=df_sub,
)
display(profile_1)
print()  # blank line between the two tables

profile_2 = generate_cluster_profile_dbscan(
    data_norm,
    eps=0.1,
    min_samples=9,
    df_original=df_sub,
)
display(profile_2)
