![](images/thinkbig.png)
# Demonstration of distributed K-Modes clustering

First load the library:

In [1]:
from pyspark_kmodes import *

Create some sample data:

In [2]:
# Create the data set
import numpy as np
data = np.random.choice(["a", "b", "c"], (50000, 10))
data2 = np.random.choice(["e", "f", "g"], (50000, 10))
data = list(data) + list(data2)

from random import shuffle
shuffle(data)

# Create a Spark RDD from our sample data and decrease partitions to max_partions
max_partitions = 32

rdd = sc.parallelize(data)
rdd = rdd.coalesce(max_partitions)

Specify 2 cluster centers and a maximum of 10 iterations:

In [3]:
n_clusters = 2
max_iter = 10

method = EnsembleKModes(n_clusters, max_iter)

Fit the model using PySpark:

In [4]:
model = method.fit(rdd)

Iteration  0
Iteration  1
Iteration  2
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 32.0
Avg cost/partition: 4.0
Final centroids:
['a' 'a' 'a' 'a' 'b' 'a' 'b' 'c' 'a' 'c']
['f' 'f' 'f' 'f' 'g' 'e' 'g' 'f' 'g' 'g']


Inspect the results:

In [7]:
print(model.clusters)
print(method.mean_cost)

[array(['a', 'a', 'a', 'a', 'b', 'a', 'b', 'c', 'a', 'c'], dtype=object), array(['f', 'f', 'f', 'f', 'g', 'e', 'g', 'f', 'g', 'g'], dtype=object)]
6.649289999999995


In [9]:
predictions = method.predictions
datapoints = method.indexed_rdd
combined = datapoints.zip(predictions)
print(combined.take(10))

model.predict(rdd).take(5)

[((0, array(['f', 'g', 'f', 'e', 'f', 'g', 'e', 'e', 'g', 'f'], 
      dtype='<U1')), (0, 1)), ((1, array(['a', 'c', 'a', 'b', 'c', 'c', 'b', 'b', 'b', 'a'], 
      dtype='<U1')), (1, 0)), ((2, array(['f', 'f', 'f', 'g', 'f', 'f', 'f', 'e', 'e', 'g'], 
      dtype='<U1')), (2, 1)), ((3, array(['a', 'c', 'a', 'b', 'a', 'c', 'c', 'b', 'b', 'c'], 
      dtype='<U1')), (3, 0)), ((4, array(['a', 'b', 'b', 'b', 'c', 'b', 'a', 'a', 'c', 'c'], 
      dtype='<U1')), (4, 0)), ((5, array(['b', 'a', 'b', 'c', 'c', 'b', 'b', 'a', 'b', 'a'], 
      dtype='<U1')), (5, 0)), ((6, array(['f', 'e', 'e', 'f', 'f', 'f', 'f', 'f', 'f', 'g'], 
      dtype='<U1')), (6, 1)), ((7, array(['g', 'e', 'g', 'f', 'f', 'f', 'e', 'g', 'f', 'e'], 
      dtype='<U1')), (7, 1)), ((8, array(['c', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'b', 'c'], 
      dtype='<U1')), (8, 0)), ((9, array(['g', 'f', 'e', 'g', 'g', 'f', 'g', 'g', 'f', 'f'], 
      dtype='<U1')), (9, 1))]


[1, 0, 1, 0, 0]

In [10]:
model.predict(sc.parallelize(['e', 'e', 'f', 'e', 'e', 'f', 'g', 'e', 'f', 'e'])).collect()  

[0, 0, 1, 0, 0, 1, 0, 0, 1, 0]