In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
class KModes:
    """K-modes clustering for categorical data held in a pandas DataFrame.

    Each cluster is represented by a "mode" vector (one categorical value per
    feature). The dissimilarity between a row and a centroid is the number of
    features on which they differ.

    Parameters
    ----------
    k : int
        Number of clusters.
    max_iter : int, default 100
        Maximum number of centroid updates performed by :meth:`fit`.

    Attributes set by :meth:`fit`
    -----------------------------
    centroids : numpy.ndarray of shape (k, n_features), dtype=object
    clusters : numpy.ndarray of shape (n_samples,)
        Index of the closest centroid for every row of the fitted data.
    """

    def __init__(self, k, max_iter=100):
        self.k = k
        self.max_iter = max_iter

    def initialize_centroids(self, data):
        """Draw ``k`` random starting centroids.

        Every feature of every centroid is sampled independently (via
        ``np.random.choice``) from the unique values observed in the
        corresponding column of ``data``.

        Returns
        -------
        numpy.ndarray of shape (k, n_features), dtype=object
        """
        n_features = data.shape[1]
        # The candidate values of a column do not depend on the centroid being
        # built, so compute them once instead of once per centroid.
        candidates = [np.unique(data.iloc[:, j]) for j in range(n_features)]

        # dtype=object keeps every category in its original type.
        centroids = np.empty((self.k, n_features), dtype=object)
        # Draw centroid by centroid, feature by feature, so that results for a
        # given np.random seed stay the same as with the naive nested loop.
        for i in range(self.k):
            for j, unique_values in enumerate(candidates):
                centroids[i, j] = np.random.choice(unique_values)
        return centroids

    def compute_distance(self, data, centroids):
        """Return the mismatch count between every row and every centroid.

        Returns
        -------
        numpy.ndarray of shape (n_samples, len(centroids)), dtype=int
            ``distance[r, c]`` is the number of features on which row ``r``
            of ``data`` differs from ``centroids[c]``.
        """
        # Work on a plain object array: one broadcast comparison per centroid
        # replaces a Python-level loop doing a pandas lookup for every row.
        values = data.to_numpy(dtype=object)
        centroids = np.asarray(centroids, dtype=object)

        distance = np.zeros((values.shape[0], len(centroids)), dtype=int)
        for i, centroid in enumerate(centroids):
            distance[:, i] = np.sum(values != centroid, axis=1)
        return distance

    def assign_clusters(self, data, centroids):
        """Return, for each row of ``data``, the index of its closest centroid.

        Ties are resolved in favour of the lowest centroid index
        (``np.argmin`` semantics).
        """
        distances = self.compute_distance(data, centroids)
        return np.argmin(distances, axis=1)

    def update_centroids(self, data, clusters):
        """Recompute every centroid as the per-feature mode of its members.

        Parameters
        ----------
        data : pandas.DataFrame
        clusters : array-like of shape (n_samples,)
            Cluster index of every row of ``data``.

        Returns
        -------
        numpy.ndarray of shape (k, n_features), dtype=object

        Raises
        ------
        ValueError
            If a feature has no mode in a cluster and no fallback value can
            be found either (no usable previous centroid and the whole column
            has no mode, e.g. it is entirely missing).

        Notes
        -----
        A cluster can end up without members (or with a feature that is
        entirely missing). In that case the previous value from
        ``self.centroids`` is kept when available; otherwise the mode of the
        whole column is used.
        """
        clusters = np.asarray(clusters)
        n_features = data.shape[1]

        # Previous centroids are only usable as a fallback if they exist and
        # have the expected shape.
        previous = getattr(self, "centroids", None)
        if previous is not None and np.shape(previous) != (self.k, n_features):
            previous = None

        # An explicit object array prevents NumPy from coercing mixed-type
        # centroids (e.g. ints and strings) to a single string dtype, which
        # would make every later comparison with the raw data a mismatch.
        centroids = np.empty((self.k, n_features), dtype=object)
        for cluster_index in range(self.k):
            # Keep only the rows currently assigned to this cluster.
            cluster_data = data[clusters == cluster_index]
            for j in range(n_features):
                modes = cluster_data.iloc[:, j].mode()
                if modes.empty and previous is not None:
                    centroids[cluster_index, j] = previous[cluster_index][j]
                    continue
                if modes.empty:
                    modes = data.iloc[:, j].mode()
                if modes.empty:
                    raise ValueError(
                        "cannot compute a mode for feature index "
                        f"{j} of cluster {cluster_index}"
                    )
                # mode() returns its values sorted; take the first one.
                centroids[cluster_index, j] = modes.iloc[0]
        return centroids

    def fit(self, data):
        """Cluster ``data`` and store the result on the instance.

        Alternates between assigning rows to their closest centroid and
        updating the centroids, stopping as soon as the centroids no longer
        change or after ``max_iter`` updates. Sets ``self.centroids`` and
        ``self.clusters``.
        """
        self.centroids = self.initialize_centroids(data)
        # Assign up front so that ``clusters`` exists even when max_iter == 0.
        clusters = self.assign_clusters(data, self.centroids)
        for _ in range(self.max_iter):
            new_centroids = self.update_centroids(data, clusters)
            converged = np.array_equal(new_centroids, self.centroids)
            self.centroids = new_centroids
            if converged:
                break
            # Re-assign after every change so the stored labels always match
            # the stored centroids, even if the iteration budget runs out.
            clusters = self.assign_clusters(data, self.centroids)
        self.clusters = clusters
