In [10]:
# Author: Hassan Ali
# Importing libraries
import pandas as pd  
import numpy as np  
from kmodes.kmodes import KModes  

In [12]:
# Three categorical variables with one single-letter label per observation (10 observations each).
# Each string below is split into its individual characters, so "MMN..." becomes ['M', 'M', 'N', ...].
v1 = np.array(list("MMNOONNNMN"))
v2 = np.array(list("QQPQRPPROP"))
v3 = np.array(list("XZZWYXWYXY"))

In [14]:
# Assemble the three categorical variables into a DataFrame (one row per observation),
# which is the input passed to the KModes algorithm below.
data = pd.DataFrame({'v1': v1, 'v2': v2, 'v3': v3})

# Initializing the KModes algorithm with specified parameters:
# - n_clusters=3 indicates that the algorithm will try to find 3 clusters.
# - init="random" means the initial centroids will be chosen randomly.
# - n_init=5 means the algorithm will run with 5 different centroid seeds and the run with the
#   lowest cost (reported as "cost" in the verbose log) will be kept.
# - verbose=1 will print out logging information about the process.
# - random_state=42 fixes the random initialization so that re-running the notebook
#   (Restart Kernel -> Run All) reproduces the same clusters.
kmode = KModes(n_clusters=3, init="random", n_init=5, verbose=1, random_state=42)

# Fitting the KModes model on the data and predicting a cluster label for each row.
clusters = kmode.fit_predict(data)

# Inserting the predicted clusters into the DataFrame as the first column.
# `data` is rebuilt at the top of this cell, so a "Cluster" column cannot already exist here.
# allow_duplicates is left at its default: it never overwrites a column; setting it to True would
# instead silently add a second "Cluster" column, whereas the default raises on a name clash.
data.insert(0, "Cluster", clusters)

# Printing the DataFrame to see the result with clusters.
print(data)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 9.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 3, cost: 8.0
Run 2, iteration: 2/100, moves: 0, cost: 8.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 2, cost: 9.0
Run 3, iteration: 2/100, moves: 1, cost: 9.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 8.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 10.0
Best run was number 2
   Cluster v1 v2 v3
0        2  M  Q  X
1        2  M  Q  Z
2        0  N  P  Z
3        0  O  Q  W
4        1  O  R  Y
5        0  N  P  X
6        0  N  P  W
7        1  N  R  Y
8        2  M  O  X
9        0  N  P  Y


In [16]:
# When KModes (k-modes clustering) is run with verbose=1, detailed information about the algorithm's progress is printed.
# This is particularly useful for debugging or understanding the convergence process.
# Here's a breakdown of what each part of an example output line means:

# "Run 1": This indicates the current execution of the k-modes algorithm. If multiple initializations are used
# to avoid local minima (n_init > 1), each "run" starts with a different set of randomly initialized centroids.

# "Iteration: 1/100": Shows the current iteration number out of a maximum set (in this case, 100). The k-modes
# algorithm iterates by reassigning data points to the cluster whose centroid (mode) they are least dissimilar to
# and then recalculating each centroid as the most frequent category of every variable among the current cluster
# members. This repeats until the algorithm meets its convergence criteria or reaches the maximum number of iterations.

# "Moves: 0": Indicates the number of data points that changed their cluster assignment in this iteration. 
# A move count of 0 means no points were reassigned to a different cluster, suggesting all points are currently 
# closest to their assigned centroid.

# "Cost: 8.0": Represents the total dissimilarity between each data point and its assigned cluster's centroid
# (mode). With the default matching dissimilarity, this is the number of variables whose category differs from
# the centroid's, summed over all data points -- not a sum of squared distances as in k-means.
# The k-modes algorithm aims to minimize this cost. A cost of 8.0 (e.g. the best run above) indicates the total
# cost at this iteration, with the goal being to reduce this value as the algorithm progresses and centroids are
# adjusted to better fit the data points.