In [1]:
import numpy as np
import csv

# Matrix class to hold and manipulate 2D numerical data
class matrix:
    """Thin wrapper around a 2D NumPy array stored in ``array_2d``.

    ``array_2d`` stays ``None`` until data is loaded from a CSV file or
    assigned directly.
    """

    def __init__(self, filename=None):
        """Create an empty matrix, or load ``filename`` when one is given."""
        self.array_2d = None
        if filename:
            self.load_from_csv(filename)

    # Loads a CSV file into a 2D NumPy array
    def load_from_csv(self, filename):
        """Read ``filename`` as CSV and store its values as floats.

        Rows whose fields are all empty (for example a trailing blank line)
        are skipped so they cannot make the row list ragged.

        Raises:
            ValueError: if a non-blank field cannot be converted to float.
        """
        # The csv module asks for newline='' so it can handle line endings
        # (including newlines embedded in quoted fields) itself.
        with open(filename, 'r', newline='') as file:
            rows = [
                [float(val) for val in row]
                for row in csv.reader(file)
                if any(field.strip() for field in row)
            ]
        self.array_2d = np.array(rows)

    # Normalizes data in each column to [0, 1] using min-max scaling
    def standardise(self):
        """Rescale every column to the range [0, 1] with min-max scaling.

        Columns whose minimum equals their maximum are left unchanged, which
        avoids a division by zero. The result is always a float array: a
        float64 input is rescaled in place, any other dtype is converted
        first so the scaled values are not truncated to integers.
        """
        # asarray returns the same object for float64 input, so the in-place
        # update below still modifies the existing array in that case.
        data = np.asarray(self.array_2d, dtype=float)
        col_min = data.min(axis=0)
        col_range = data.max(axis=0) - col_min
        varying = col_range != 0
        data[:, varying] = (data[:, varying] - col_min[varying]) / col_range[varying]
        self.array_2d = data

    # Computes Euclidean distance from a specific row to all rows in another matrix
    def get_distance(self, other_matrix, row_i):
        """Return the Euclidean distance from row ``row_i`` to each row of ``other_matrix``.

        The result is a matrix with one column and one row per row of
        ``other_matrix``.
        """
        x = self.array_2d[row_i]
        distances = np.linalg.norm(other_matrix.array_2d - x, axis=1).reshape(-1, 1)
        return matrix_from_array(distances)

    # Computes weighted Euclidean distance from a row to all rows in another matrix
    def get_weighted_distance(self, other_matrix, weights, row_i):
        """Return the weighted Euclidean distance from row ``row_i`` to each row of ``other_matrix``.

        The first row of ``weights`` supplies one weight per column; each
        squared difference is multiplied by its weight before summing and
        taking the square root. The result is a single-column matrix.
        """
        x = self.array_2d[row_i]
        w = weights.array_2d[0]
        dists = np.sum(w * (other_matrix.array_2d - x)**2, axis=1)
        return matrix_from_array(np.sqrt(dists).reshape(-1, 1))

    # Counts occurrences of each unique value in a single column matrix
    def get_count_frequency(self):
        """Count how often each integer value occurs in a single-column matrix.

        Returns:
            A dict mapping each value (as a Python ``int``) to its count (as
            a Python ``int``), or ``0`` when the matrix does not have exactly
            one column.
        """
        if self.array_2d.shape[1] != 1:
            return 0
        # Cast before counting: values that truncate to the same integer are
        # then tallied together instead of overwriting one another's counts.
        values, counts = np.unique(self.array_2d.astype(int), return_counts=True)
        return dict(zip(values.tolist(), counts.tolist()))

# Helper function to create a matrix object from a NumPy array
def matrix_from_array(arr):
    """Build a ``matrix`` whose ``array_2d`` is a NumPy array made from ``arr``.

    The matrix is created without a file name, so nothing is read from disk;
    ``arr`` is passed through ``np.array`` before being stored.
    """
    wrapped = matrix()
    wrapped.array_2d = np.array(arr)
    return wrapped

# Initializes random weights that sum to 1 (for weighted distance calculation)
def get_initial_weights(m):
    """Return a 1 x ``m`` matrix of random weights normalised to sum to 1.

    The weights are drawn with ``np.random.rand`` and divided by their total.
    """
    raw = np.random.rand(m)
    normalised = raw / raw.sum()
    return matrix_from_array(normalised.reshape(1, -1))

# Computes centroids for each cluster based on current assignments
def get_centroids(data, S, K):
    """Return a matrix with one centroid row per cluster label 1..K.

    ``S`` holds the cluster label of every row of ``data``. A centroid is the
    column-wise mean of the rows carrying that label; a label with no rows
    gets an all-zero centroid.
    """
    points = data.array_2d
    labels = S.array_2d.flatten()

    def centroid_of(label):
        # Row positions currently assigned to this cluster.
        member_rows = np.flatnonzero(labels == label)
        if len(member_rows) > 0:
            return np.mean(points[member_rows], axis=0)
        return np.zeros(points.shape[1])

    rows = [centroid_of(k) for k in range(1, K + 1)]
    return matrix_from_array(np.array(rows))

# Computes within-cluster separation (intra-cluster variance)
def get_separation_within(data, centroids, S, K):
    """Return, per column, the sum of squared deviations of points from their centroid.

    For every cluster label 1..K the rows of ``data`` with that label are
    compared with row ``label - 1`` of ``centroids``; the squared differences
    are accumulated column by column. The result is a 1 x m matrix.
    """
    points = data.array_2d
    within = np.zeros(points.shape[1])
    labels = S.array_2d.flatten()
    for k in range(1, K + 1):
        members = points[np.flatnonzero(labels == k)]
        if len(members) == 0:
            continue  # empty clusters contribute nothing
        deviations = members - centroids.array_2d[k - 1]
        within += (deviations ** 2).sum(axis=0)
    return matrix_from_array(within.reshape(1, -1))

# Computes between-cluster separation (how far centroids are from overall mean)
def get_separation_between(data, centroids, S, K):
    """Return, per column, the size-weighted squared distance of centroids from the overall mean.

    For every non-empty cluster label 1..K, the squared difference between
    row ``label - 1`` of ``centroids`` and the column means of ``data`` is
    multiplied by the cluster size and accumulated. The result is a 1 x m
    matrix.
    """
    points = data.array_2d
    n_features = points.shape[1]
    grand_mean = points.mean(axis=0)
    between = np.zeros(n_features)
    labels = S.array_2d.flatten()
    for k in range(1, K + 1):
        cluster_size = len(np.flatnonzero(labels == k))
        if cluster_size == 0:
            continue  # empty clusters contribute nothing
        offset = centroids.array_2d[k - 1] - grand_mean
        between += cluster_size * offset ** 2
    return matrix_from_array(between.reshape(1, -1))

# Performs clustering using an iterative weighted k-means algorithm
def get_groups(data, K):
    """Cluster the rows of ``data`` into ``K`` groups and return their labels.

    ``data`` is standardised first (this modifies ``data``). Feature weights
    start random and centroids start as ``K`` distinct randomly chosen rows.
    Each pass assigns every row to the centroid with the smallest weighted
    squared distance, then recomputes centroids and feature weights. The loop
    stops when a pass produces the same assignment as the previous one.

    Returns:
        An n x 1 matrix of cluster labels in 1..K.
    """
    data.standardise()  # Normalize data
    points = data.array_2d
    n, m = points.shape

    # Random starting state: weights are drawn first, then the seed rows.
    weights = get_initial_weights(m)
    S = matrix_from_array(np.zeros((n, 1)))  # no row is assigned yet
    seed_rows = np.random.choice(n, K, replace=False)
    centroids = matrix_from_array(points[seed_rows])

    converged = False
    while not converged:
        feature_weights = weights.array_2d[0]
        centres = centroids.array_2d
        assignment = np.zeros((n, 1))
        for i, point in enumerate(points):
            # Weighted squared distance from this row to every centroid
            weighted_sq = np.sum(feature_weights * (centres - point) ** 2, axis=1)
            assignment[i] = np.argmin(weighted_sq) + 1  # labels are 1-based
        converged = np.array_equal(S.array_2d, assignment)
        if not converged:
            S.array_2d = assignment
            centroids = get_centroids(data, S, K)
            weights = get_new_weights(data, centroids, weights, S, K)
    return S

# Updates feature weights based on within and between-cluster separations
def get_new_weights(data, centroids, old_weights, S, K):
    """Return a 1 x m matrix of feature weights derived from cluster separations.

    Each feature's raw weight is its between-cluster separation divided by
    its within-cluster separation, or 0 when the within-cluster separation is
    0. The raw weights are normalised to sum to 1; if they sum to 0, every
    feature gets the equal weight 1/m. ``old_weights`` is accepted but not
    used.
    """
    within = get_separation_within(data, centroids, S, K).array_2d[0]
    between = get_separation_between(data, centroids, S, K).array_2d[0]
    n_features = len(within)

    # Only divide where the denominator is non-zero; the rest stay at 0.
    ratios = np.zeros(n_features)
    usable = within != 0
    ratios[usable] = between[usable] / within[usable]

    total = ratios.sum()
    if total == 0:
        ratios = np.ones(n_features) / n_features  # Avoid division by zero
    else:
        ratios = ratios / total
    return matrix_from_array(ratios.reshape(1, -1))

# Runs the clustering for k=2 to 10, 20 times each, and prints cluster sizes
def run_test(filename='DataAnubavam.csv', k_values=range(2, 11), repeats=20):
    """Cluster the data in ``filename`` repeatedly and print the cluster sizes.

    Args:
        filename: CSV file to load; defaults to 'DataAnubavam.csv'.
        k_values: iterable of cluster counts to try; defaults to 2..10.
        repeats: number of runs per cluster count; defaults to 20. Each run
            starts from a fresh random initialisation inside ``get_groups``.

    One line of the form ``"<k> = {label: size, ...}"`` is printed per run.
    """
    m = matrix(filename)  # Load data from CSV
    for k in k_values:
        for _ in range(repeats):
            S = get_groups(m, k)  # Get cluster assignments
            freq = S.get_count_frequency()  # Count points per cluster

            # int() accepts both NumPy integer scalars and Python ints, so the
            # dict prints without NumPy type wrappers either way.
            clean_freq = {int(label): int(size) for label, size in freq.items()}

            print(f"{k} = {clean_freq}")

# Entry point of the program: run the clustering experiment only when this
# file is executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    run_test()


2 = {1: 89, 2: 89}
2 = {1: 106, 2: 72}
2 = {1: 106, 2: 72}
2 = {1: 58, 2: 120}
2 = {1: 59, 2: 119}
2 = {1: 48, 2: 130}
2 = {1: 106, 2: 72}
2 = {1: 66, 2: 112}
2 = {1: 66, 2: 112}
2 = {1: 106, 2: 72}
2 = {1: 111, 2: 67}
2 = {1: 116, 2: 62}
2 = {1: 72, 2: 106}
2 = {1: 112, 2: 66}
2 = {1: 66, 2: 112}
2 = {1: 106, 2: 72}
2 = {1: 111, 2: 67}
2 = {1: 72, 2: 106}
2 = {1: 66, 2: 112}
2 = {1: 109, 2: 69}
3 = {1: 57, 2: 56, 3: 65}
3 = {1: 65, 2: 57, 3: 56}
3 = {1: 64, 2: 57, 3: 57}
3 = {1: 100, 2: 44, 3: 34}
3 = {1: 29, 2: 49, 3: 100}
3 = {1: 56, 2: 48, 3: 74}
3 = {1: 57, 2: 64, 3: 57}
3 = {1: 65, 2: 57, 3: 56}
3 = {1: 105, 2: 1, 3: 72}
3 = {1: 65, 2: 56, 3: 57}
3 = {1: 57, 2: 62, 3: 59}
3 = {1: 58, 2: 64, 3: 56}
3 = {1: 34, 2: 44, 3: 100}
3 = {1: 57, 2: 57, 3: 64}
3 = {1: 73, 2: 57, 3: 48}
3 = {1: 44, 2: 100, 3: 34}
3 = {1: 56, 2: 64, 3: 58}
3 = {1: 65, 2: 56, 3: 57}
3 = {1: 44, 2: 100, 3: 34}
3 = {1: 57, 2: 57, 3: 64}
4 = {1: 35, 2: 26, 3: 57, 4: 60}
4 = {1: 56, 2: 33, 3: 37, 4: 52}
4 = {1: 60