# [Clustering4Ever](https://github.com/Clustering4Ever/Clustering4Ever) on [SparkNotebook](http://spark-notebook.io/) by [LIPN](https://lipn.univ-paris13.fr/) [A3](https://lipn.univ-paris13.fr/accueil/equipe/a3/) team

# Scala _K_-Means

In [ ]:
import smile.plot._
import clustering4ever.scala.clustering.kmeans.KMeans
import clustering4ever.math.distances.scalar.{Euclidean, Cosine, Minkowski}
import _root_.scala.io.Source
import clustering4ever.spark.indexes.ExternalIndexes
import clustering4ever.scala.indexes.ExternalIndexes

import smile.plot._
import clustering4ever.scala.clustering.kmeans.KMeans
import clustering4ever.math.distances.scalar.{Euclidean, Cosine, Minkowski}
import _root_.scala.io.Source
import clustering4ever.spark.indexes.ExternalIndexes
import clustering4ever.scala.indexes.ExternalIndexes


## Import data as Array[(Int, Array[Double])]

In [ ]:
// Location of the input dataset: one observation per line, features separated by commas.
val path = "/home/KyBe/tmpDS/aggregation.csv"
// Parse every line into a vector of doubles and pair it with its 0-based line index,
// which serves as the observation ID.
// Source.fromFile does not release its file handle on its own and getLines is lazy, so the
// rows are materialised with toArray inside the try and the source is closed afterwards.
val data = {
  val source = Source.fromFile(path)
  try source.getLines().map(_.split(",").map(_.toDouble)).zipWithIndex.map(_.swap).toArray
  finally source.close()
}

path: String = /home/KyBe/tmpDS/aggregation.csv
data: Array[(Int, Array[Double])] = Array((0,Array(15.55, 28.65)), (1,Array(14.9, 27.55)), (2,Array(14.45, 28.35)), (3,Array(14.15, 28.8)), (4,Array(13.75, 28.05)), (5,Array(13.35, 28.45)), (6,Array(13.0, 29.15)), (7,Array(13.45, 27.5)), (8,Array(13.6, 26.5)), (9,Array(12.8, 27.35)), (10,Array(12.4, 27.85)), (11,Array(12.3, 28.4)), (12,Array(12.2, 28.65)), (13,Array(13.4, 25.1)), (14,Array(12.95, 25.95)), (15,Array(12.9, 26.5)), (16,Array(11.85, 27.0)), (17,Array(11.35, 28.0)), (18,Array(11.15, 28.7)), (19,Array(11.25, 27.4)), (20,Array(10.75, 27.7)), (21,Array(10.5, 28.35)), (22,Array(9.65, 28.45)), (23,Array(10.25, 27.25)), (24,Array(10.75, 26.55)), (25,Array(11.7, 26.35)), (26,Array(11.6, 25.9)), (27,Array(11.9, 25.05)), (28,Array(12.6,...

## Parameters 

In [ ]:
// Hyper-parameters handed to KMeans.run.
// k is the number of clusters; iterMax and epsilon are presumably the iteration cap and the
// convergence threshold -- confirm against the KMeans.run documentation.
val k: Int = 6
val iterMax: Int = 100
val epsilon: Double = 0.001
// Euclidean(true) is the true Euclidean distance with the square root applied;
// pass false instead to skip the square root.
val metric1: Euclidean = new Euclidean(true)
val metric2: Cosine = new Cosine
// The argument of Minkowski is the Minkowski parameter p.
val metric3: Minkowski = new Minkowski(4)

k: Int = 6
iterMax: Int = 100
epsilon: Double = 0.001
metric1: clustering4ever.math.distances.scalar.Euclidean = Euclidean with root applied
metric2: clustering4ever.math.distances.scalar.Cosine = clustering4ever.math.distances.scalar.Cosine@4ffc1b5c
metric3: clustering4ever.math.distances.scalar.Minkowski = clustering4ever.math.distances.scalar.Minkowski@73c74a41


## Run the algorithm

In [ ]:
// Fit the K-Means model on the indexed vectors with the parameters chosen above.
val model = KMeans.run(data, k, epsilon, iterMax, metric1)
// Prefix every (id, vector) pair with the cluster ID the model predicts for its vector.
val clusterized = data.map { point =>
  val (_, vector) = point
  (model.predict(vector), point)
}

model: clustering4ever.scala.clustering.kmeans.KMeansModel = clustering4ever.scala.clustering.kmeans.KMeansModel@2cd0d06e
clusterized: Array[(model.ClusterID, (Int, Array[Double]))] = Array((4,(0,Array(15.55, 28.65))), (1,(1,Array(14.9, 27.55))), (1,(2,Array(14.45, 28.35))), (1,(3,Array(14.15, 28.8))), (1,(4,Array(13.75, 28.05))), (1,(5,Array(13.35, 28.45))), (1,(6,Array(13.0, 29.15))), (1,(7,Array(13.45, 27.5))), (1,(8,Array(13.6, 26.5))), (1,(9,Array(12.8, 27.35))), (1,(10,Array(12.4, 27.85))), (1,(11,Array(12.3, 28.4))), (1,(12,Array(12.2, 28.65))), (1,(13,Array(13.4, 25.1))), (1,(14,Array(12.95, 25.95))), (1,(15,Array(12.9, 26.5))), (1,(16,Array(11.85, 27.0))), (1,(17,Array(11.35, 28.0))), (1,(18,Array(11.15, 28.7))), (1,(19,Array(11.25, 27.4))), (1,(20,Array(10.75, 27.7))), (1,(21,...

## Plot clustering results

In [ ]:
// Split the clustering output into two parallel arrays: the feature vectors themselves
// and the cluster ID that was assigned to each of them.
val rawData = clusterized.map(_._2._2)
val labels = clusterized.map(_._1)

// Plot the vectors with '*' markers, passing the cluster IDs as labels and the
// Palette.COLORS palette.
plot(rawData, labels, '*', Palette.COLORS)

rawData: Array[Array[Double]] = Array(Array(15.55, 28.65), Array(14.9, 27.55), Array(14.45, 28.35), Array(14.15, 28.8), Array(13.75, 28.05), Array(13.35, 28.45), Array(13.0, 29.15), Array(13.45, 27.5), Array(13.6, 26.5), Array(12.8, 27.35), Array(12.4, 27.85), Array(12.3, 28.4), Array(12.2, 28.65), Array(13.4, 25.1), Array(12.95, 25.95), Array(12.9, 26.5), Array(11.85, 27.0), Array(11.35, 28.0), Array(11.15, 28.7), Array(11.25, 27.4), Array(10.75, 27.7), Array(10.5, 28.35), Array(9.65, 28.45), Array(10.25, 27.25), Array(10.75, 26.55), Array(11.7, 26.35), Array(11.6, 25.9), Array(11.9, 25.05), Array(12.6, 24.05), Array(11.9, 24.5), Array(11.1, 25.2), Array(10.55, 25.15), Array(10.05, 25.95), Array(9.35, 26.6), Array(9.3, 27.25), Array(9.2, 27.8), Array(7.5, 28.25), Array(8.55, 27.45), Ar...

## Inspect performance metrics

In [ ]:
// The notebook imports two objects that are both called ExternalIndexes (the Spark one and
// the local Scala one), while nothing named SparkExternalsIndexes is defined or imported.
// Bind the Spark implementation to that name explicitly so the distributed NMI call resolves.
import clustering4ever.spark.indexes.{ExternalIndexes => SparkExternalsIndexes}

// Ground-truth labels, one integer per line. The source is closed explicitly because
// Source.fromFile keeps its file handle open otherwise; toArray forces the lazy iterator
// before the handle is released.
val trueLabels = {
  val labelSource = Source.fromFile("/home/KyBe/tmpDS/labels")
  try labelSource.getLines().map(_.toInt).toArray
  finally labelSource.close()
}

// Only the second component returned by prepareList is used.
// NOTE(review): judging by the name it holds the labels re-indexed from 0 -- confirm.
val (_, trueLabelsFrom0) = ExternalIndexes.prepareList(trueLabels)

// Cluster IDs predicted by the model, in the same order as the input data.
val predicts = clusterized.map(_._1)

// (true label, predicted label) pairs distributed as an RDD for the Spark implementation.
val trueAndPredictRDD2 = sc.parallelize(trueLabelsFrom0.zip(predicts))

// NMI computed twice, locally and with Spark, so that both results can be compared.
// NOTE(review): "sqrt" presumably selects the normalisation variant -- confirm.
val nmi = ExternalIndexes.nmi(trueLabelsFrom0, predicts, "sqrt")
val nmi2 = SparkExternalsIndexes.nmi(sc, trueAndPredictRDD2, "sqrt")

(nmi, nmi2)

trueLabels: Array[Int] = Array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...