In [1]:
!pip install kmodes

Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl.metadata (8.1 kB)
Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.12.2


In [None]:
#23BAI10355
#ANVESHA RASTOGI

**USING K-MODES LIBRARY**

In [6]:
import pandas as pd
from kmodes.kmodes import KModes
# Toy dataset: eight items described by three categorical features
data = {
     'F-Col': ['R', 'G', 'G', 'R', 'B', 'R', 'G', 'R'],
     'F-Diam': ['C1', 'C1', 'C2', 'C2', 'C1', 'C1', 'C1', 'C2'],
     'F-com': ['AZ', 'AZ', 'AB', 'AB', 'AZ', 'AB', 'AZ', 'AZ']
}
df = pd.DataFrame(data)

# 'Cao' initialization is deterministic, so kmodes collapses any n_init > 1 to a
# single run (it reports "Setting n_init to 1"). Ask for one run explicitly so the
# call says what actually happens.
km = KModes(n_clusters=2, init='Cao', n_init=1, verbose=1)
# verbose=1 logs the number of moves and the cost of each iteration while fitting;
# fit_predict returns one cluster label per row
clusters = km.fit_predict(df)
df['Cluster'] = clusters
print("\nData with cluster assignment:")
print(df)

# Labels are 0-based, while the printed headings number the clusters from 1
print("\nData points in Cluster 1:")
print(df[df['Cluster'] == 0])

print("\nData points in Cluster 2:")
print(df[df['Cluster']==1])

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 6.0

Data with cluster assignment:
  F-Col F-Diam F-com  Cluster
0     R     C1    AZ        0
1     G     C1    AZ        0
2     G     C2    AB        1
3     R     C2    AB        1
4     B     C1    AZ        0
5     R     C1    AB        0
6     G     C1    AZ        0
7     R     C2    AZ        0

Data points in Cluster 1:
  F-Col F-Diam F-com  Cluster
0     R     C1    AZ        0
1     G     C1    AZ        0
4     B     C1    AZ        0
5     R     C1    AB        0
6     G     C1    AZ        0
7     R     C2    AZ        0

Data points in Cluster 2:
  F-Col F-Diam F-com  Cluster
2     G     C2    AB        1
3     R     C2    AB        1


**WITHOUT USING K-MODES LIBRARY**

In [8]:
import pandas as pd
import numpy as np

# Toy dataset: one whitespace-separated string per feature, split into the
# eight categorical values of that column
data = {
    'F-Col': 'R G G R B R G R'.split(),
    'F-Diam': 'C1 C1 C2 C2 C1 C1 C1 C2'.split(),
    'F-com': 'AZ AZ AB AB AZ AB AZ AZ'.split(),
}
df = pd.DataFrame(data)

# Show the raw table before any clustering is applied
print("Original Data:")
print(df)

def categorical_dissim(a, b):
    """Count the positions at which two categorical vectors differ.

    Parameters
    ----------
    a, b : sequence or pandas.Series
        Equal-length vectors of category values. They are compared position by
        position, so lists, tuples, arrays and Series can be mixed freely.

    Returns
    -------
    numpy integer
        Number of positions where the two vectors hold different values.

    Raises
    ------
    ValueError
        If the two vectors do not have the same shape.
    """
    # Convert to object arrays first: `!=` on two plain lists compares the lists
    # as a whole (a single bool), which would cap the result at 0 or 1.
    # dtype=object also stops numpy from coercing mixed values to strings.
    a_values = np.asarray(a, dtype=object)
    b_values = np.asarray(b, dtype=object)
    if a_values.shape != b_values.shape:
        raise ValueError(
            f"Cannot compare vectors of different shapes: {a_values.shape} vs {b_values.shape}"
        )
    return np.sum(a_values != b_values)

def mode_of_column(series):
    """Return the most frequent value of a pandas Series.

    When several values tie for most frequent, the first entry of
    ``Series.mode()`` is used; pandas returns the modes in sorted order.
    """
    modes = series.mode()
    return modes.iloc[0]

# Seed the centroids with k distinct rows sampled from the data itself.
# The seed is fixed so the same rows are drawn on every run.
np.random.seed(42)
k = 2
centroid_indices = np.random.choice(df.index, size=k, replace=False)
centroids = df.loc[centroid_indices].reset_index(drop=True)

# Placeholder labels: -1 matches no cluster id, so the first assignment pass
# can never be mistaken for convergence.
labels = pd.Series([-1] * len(df))

for iteration in range(10):
    # Assignment step: each row joins the centroid it mismatches least
    # (idxmin keeps the first, i.e. lowest-numbered, centroid on ties)
    new_labels = pd.Series([
        centroids.apply(lambda centroid: categorical_dissim(row, centroid), axis=1).idxmin()
        for _, row in df.iterrows()
    ])

    # Stop as soon as an assignment pass leaves every label unchanged
    if new_labels.equals(labels):
        print(f"Converged after {iteration} iterations")
        break
    labels = new_labels

    # Update step: each centroid becomes the per-feature mode of its members;
    # a cluster with no members keeps its previous centroid
    new_centroids = []
    for cluster in range(k):
        cluster_points = df.loc[labels == cluster]
        new_centroids.append(
            centroids.loc[cluster] if cluster_points.empty
            else cluster_points.apply(mode_of_column)
        )
    centroids = pd.DataFrame(new_centroids).reset_index(drop=True)

# Attach the final labels to the frame as a new column
df['Cluster'] = labels

print("\nFinal cluster centroids:")
print(centroids)

# Cluster ids are 0-based; the headings number the clusters from 1
for cluster in range(k):
    print(f"\nCluster {cluster+1} points:")
    print(df.loc[df['Cluster'] == cluster])

Original Data:
  F-Col F-Diam F-com
0     R     C1    AZ
1     G     C1    AZ
2     G     C2    AB
3     R     C2    AB
4     B     C1    AZ
5     R     C1    AB
6     G     C1    AZ
7     R     C2    AZ
Converged after 1 iterations

Final cluster centroids:
  F-Col F-Diam F-com
0     G     C1    AZ
1     R     C1    AB

Cluster 1 points:
  F-Col F-Diam F-com  Cluster
0     R     C1    AZ        0
1     G     C1    AZ        0
2     G     C2    AB        0
4     B     C1    AZ        0
6     G     C1    AZ        0
7     R     C2    AZ        0

Cluster 2 points:
  F-Col F-Diam F-com  Cluster
3     R     C2    AB        1
5     R     C1    AB        1
