In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

In [2]:
# Read data
def load_data(file_path):
    return pd.read_csv(file_path)


In [3]:
def preprocess_data(df):
    df.dropna(inplace=True)
    # Scale the numerical columns
    scaler = StandardScaler()
    numerical_columns = list(df.columns)
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    
    return df

In [4]:
def perform_hierarchical_clustering(df, n_clusters):
    # Perform hierarchical clustering
    model = AgglomerativeClustering(n_clusters=n_clusters)
    df['cluster2'] = model.fit_predict(df)
    
    return df

In [5]:
df = load_data("creditcard.csv")
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
df = preprocess_data(df)

In [7]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.996583,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,...,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.39217,0.330892,-0.063781,0.244964,-0.041599
1,-1.996583,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,...,-0.307377,-0.880077,0.162201,-0.561131,0.320694,0.261069,-0.022256,0.044608,-0.342475,-0.041599
2,-1.996562,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,...,0.337632,1.063358,1.45632,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,1.160686,-0.041599
3,-1.996562,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,...,-0.147443,0.007267,-0.304777,-1.941027,1.241904,-0.460217,0.155396,0.186189,0.140534,-0.041599
4,-1.996541,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,...,-0.012839,1.100011,-0.220123,0.23325,-0.395202,1.041611,0.54362,0.651816,-0.073403,-0.041599


In [None]:
df = perform_hierarchical_clustering(df, n_clusters=5)
print(df)

In [9]:
# Calculate the silhouette score
score = silhouette_score(df, df['cluster2'])
print(score)

0.5476242034629109


In [10]:
class AgglomerativeClustering1:
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
    
    def fit_predict(self, X):
        n_samples, _ = X.shape
        distances = self._calculate_distance_matrix(X)
        cluster_assignments = np.arange(n_samples)
        current_cluster_count = n_samples
        
        while current_cluster_count > self.n_clusters:
            min_distance = np.inf
            min_i, min_j = None, None
            for i in range(n_samples):
                for j in range(i+1, n_samples):
                    if cluster_assignments[i] != cluster_assignments[j] and distances[i, j] < min_distance:
                        min_distance = distances[i, j]
                        min_i, min_j = i, j
            cluster_assignments[cluster_assignments == cluster_assignments[min_j]] = cluster_assignments[min_i]
            current_cluster_count -= 1
        
        return cluster_assignments
    
    def _calculate_distance_matrix(self, X):
        n_samples = X.shape[0]
        distances = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(i+1, n_samples):
                distances[i, j] = np.linalg.norm(X[i] - X[j])
                distances[j, i] = distances[i, j]
        return distances


In [11]:
def perform_hierarchical_clustering_self(df, n_clusters):
    # Reset the index of the dataframe
    df = df.reset_index(drop=True)
    # Perform hierarchical clustering
    model = AgglomerativeClustering1(n_clusters=n_clusters)
    df["cluster3"] = model.fit_predict(df.values)
    return df

In [12]:
df_new = perform_hierarchical_clustering_self(df, n_clusters=5)
df_new = pd.DataFrame(df_new)
df_new.head()

Unnamed: 0,Gender,Age,Annual_Income,Spending_Score,cluster,cluster2,cluster3
0,1,-1.424569,-1.738999,-0.434801,3,0,0
1,1,-1.281035,-1.738999,1.195704,4,3,7
2,0,-1.352802,-1.70083,-1.715913,3,0,0
3,0,-1.137502,-1.70083,1.040418,4,3,7
4,0,-0.563369,-1.66266,-0.39598,3,0,0


In [14]:
#silouette score
silhouette_score(df_new, df_new['cluster3'])

0.7497501015197205

In [101]:
from collections import deque

class AgglomerativeClustering3:
    def __init__(self, n_clusters, distance_metric='euclidean'):
        self.n_clusters = n_clusters
        self.distance_metric = distance_metric
        
    def fit_predict(self, X):
        n_samples, _ = X.shape
        
        # Check if number of desired clusters is greater than number of samples
        if self.n_clusters > n_samples:
            raise ValueError("Number of desired clusters cannot be greater than number of samples")
        
        # Check if there are NaN values in the input
        if np.isnan(X).any():
            raise ValueError("Input contains NaN values")
        
        # Check if there are identical samples in the input
        unique_rows, unique_indices = np.unique(X, return_index=True, axis=0)
        if unique_rows.shape[0] != X.shape[0]:
            raise ValueError("Input contains identical samples")
        
        distances = self._calculate_distance_matrix(X)
        nearest_neighbors = self._calculate_nearest_neighbors(distances)
        cluster_assignments = np.arange(n_samples)
        current_cluster_count = n_samples
        
        while current_cluster_count > self.n_clusters:
            min_distance = np.inf
            min_i, min_j = None, None
            for i in range(n_samples):
                j = nearest_neighbors[i][0]
                if cluster_assignments[i] != cluster_assignments[j] and distances[i, j] < min_distance:
                    min_distance = distances[i, j]
                    min_i, min_j = i, j
            
            cluster_assignments[cluster_assignments == cluster_assignments[min_j]] = cluster_assignments[min_i]
            current_cluster_count -= 1
            nearest_neighbors[min_i].extend(nearest_neighbors[min_j])
            nearest_neighbors.pop(min_j)
            self._update_nearest_neighbors(nearest_neighbors, min_i, distances)
        
        return cluster_assignments
    
    def _calculate_distance_matrix(self, X):
        n_samples = X.shape[0]
        distances = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(i+1, n_samples):
                distances[i, j] = np.linalg.norm(X[i] - X[j])
                distances[j, i] = distances[i, j]
        return distances

    def _update_nearest_neighbors(self, nearest_neighbors, min_i, distances):
        n_samples = distances.shape[0]
        for j in range(n_samples):
            if min_i == j:
                continue
            if j in nearest_neighbors[min_i]:
                nearest_neighbors[min_i].remove(j)
            if distances[min_i, j] < distances[j, nearest_neighbors[j][0]]:
                nearest_neighbors[j][0] = min_i
            else:
                nearest_neighbors[j].append(min_i)


    def _calculate_nearest_neighbors(self, distances):
        n_samples = distances.shape[0]
        nearest_neighbors = [deque([j for j in range(n_samples) if j != i]) for i in range(n_samples)]
        for i in range(n_samples):
            nearest_neighbors[i] = deque(sorted(nearest_neighbors[i], key=lambda j: distances[i, j]))
        return nearest_neighbors


In [106]:
def perform_hierarchical_clustering_self_deque(df, n_clusters):
    # Reset the index of the dataframe
    df = df.reset_index(drop=True)
    # Perform hierarchical clustering
    model = AgglomerativeClustering1(n_clusters=n_clusters)
    df["cluster4"] = model.fit_predict(df.values)
    return df

In [109]:
df_performance = perform_hierarchical_clustering_self_deque(df, n_clusters=5)
df_performance = pd.DataFrame(df_performance)
df_performance.head()

Unnamed: 0,Gender,Age,Annual_Income,Spending_Score,cluster,cluster2,cluster3,cluster4
0,1,-1.424569,-1.738999,-0.434801,3,0,0,0
1,1,-1.281035,-1.738999,1.195704,4,4,7,7
2,0,-1.352802,-1.70083,-1.715913,3,0,0,0
3,0,-1.137502,-1.70083,1.040418,4,4,7,7
4,0,-0.563369,-1.66266,-0.39598,3,0,0,0


In [111]:
#silouette score
silhouette_score(df, df_performance['cluster4'])

0.7718081363433258