In [144]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

In [145]:
# Read data
def load_data(file_path):
    return pd.read_csv(file_path)


In [146]:
def preprocess_data(df):
    # Drop the "CustomerID" column
    # df = df.dropna(inplace=True)
    df = df.drop(["CustomerID"], axis=1)
       
    # Scale the numerical columns
    scaler = StandardScaler()
    numerical_columns = ["Age", "Annual_Income", "Spending_Score"]
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    
    return df

In [147]:
def perform_hierarchical_clustering(df, n_clusters):
    # Perform hierarchical clustering
    model = AgglomerativeClustering(n_clusters=n_clusters)
    df["cluster2"] = model.fit_predict(df)
    
    return df

In [148]:
df = load_data("segmented_customers.csv")
df.head()


Unnamed: 0,CustomerID,Gender,Age,Annual_Income,Spending_Score,cluster
0,1,1,19,15,39,3
1,2,1,21,15,81,4
2,3,0,20,16,6,3
3,4,0,23,16,77,4
4,5,0,31,17,40,3


In [149]:
df = preprocess_data(df)

In [150]:
df.head()

Unnamed: 0,Gender,Age,Annual_Income,Spending_Score,cluster
0,1,-1.424569,-1.738999,-0.434801,3
1,1,-1.281035,-1.738999,1.195704,4
2,0,-1.352802,-1.70083,-1.715913,3
3,0,-1.137502,-1.70083,1.040418,4
4,0,-0.563369,-1.66266,-0.39598,3


In [151]:
df = perform_hierarchical_clustering(df, n_clusters=5)
print(df)

     Gender       Age  Annual_Income  Spending_Score  cluster  cluster2
0         1 -1.424569      -1.738999       -0.434801        3         0
1         1 -1.281035      -1.738999        1.195704        4         3
2         0 -1.352802      -1.700830       -1.715913        3         0
3         0 -1.137502      -1.700830        1.040418        4         3
4         0 -0.563369      -1.662660       -0.395980        3         0
..      ...       ...            ...             ...      ...       ...
195       0 -0.276302       2.268791        1.118061        1         1
196       0  0.441365       2.497807       -0.861839        0         2
197       1 -0.491602       2.497807        0.923953        1         1
198       1 -0.491602       2.917671       -1.250054        0         2
199       1 -0.635135       2.917671        1.273347        1         1

[200 rows x 6 columns]


In [152]:
#calculate the score
c = 0
for i in range(len(df['cluster'])):
    if df['cluster'][i] == df['cluster2'][i]:
        c += 1
print(c/len(df['cluster']))


0.195


In [153]:
import numpy as np

class AgglomerativeClustering1:
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
    
    def fit_predict(self, X):
        n_samples, _ = X.shape
        distances = self._calculate_distance_matrix(X)
        cluster_assignments = np.arange(n_samples)
        current_cluster_count = n_samples
        
        while current_cluster_count > self.n_clusters:
            min_distance = np.inf
            min_i, min_j = None, None
            for i in range(n_samples):
                for j in range(i+1, n_samples):
                    if cluster_assignments[i] != cluster_assignments[j] and distances[i, j] < min_distance:
                        min_distance = distances[i, j]
                        min_i, min_j = i, j
            cluster_assignments[cluster_assignments == cluster_assignments[min_j]] = cluster_assignments[min_i]
            current_cluster_count -= 1
        
        return cluster_assignments
    
    def _calculate_distance_matrix(self, X):
        n_samples = X.shape[0]
        distances = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(i+1, n_samples):
                distances[i, j] = np.linalg.norm(X[i] - X[j])
                distances[j, i] = distances[i, j]
        return distances


In [154]:
def perform_hierarchical_clustering(df, n_clusters):
    # Perform hierarchical clustering
    model = AgglomerativeClustering1(n_clusters=n_clusters)
    
    return df

In [156]:
df = perform_hierarchical_clustering(df, n_clusters=5)
df.head()

Unnamed: 0,Gender,Age,Annual_Income,Spending_Score,cluster,cluster2
0,1,-1.424569,-1.738999,-0.434801,3,0
1,1,-1.281035,-1.738999,1.195704,4,3
2,0,-1.352802,-1.70083,-1.715913,3,0
3,0,-1.137502,-1.70083,1.040418,4,3
4,0,-0.563369,-1.66266,-0.39598,3,0


In [159]:
#calculate the score
c = 0
for i in range(len(df['cluster'])):
    if df['cluster'][i] != df['cluster2'][i]:
        c += 1
print(c)


161


In [160]:
#silouette score
from sklearn.metrics import silhouette_score
silhouette_score(df, df['cluster2'])

0.5476242034629109