# [Clustering4Ever](https://github.com/Clustering4Ever/Clustering4Ever) on [SparkNotebook](http://spark-notebook.io/) by [LIPN](https://lipn.univ-paris13.fr/) [A3](https://lipn.univ-paris13.fr/accueil/equipe/a3/) team

# Scala _K_-Means

In [ ]:
import _root_.smile.plot._
import _root_.clustering4ever.scala.clustering.kmeans.KMeans
import _root_.clustering4ever.math.distances.scalar.{Euclidean, Cosine, Minkowski}
import _root_.scala.io.Source
import _root_.clustering4ever.spark.indexes.ExternalIndexes
import _root_.clustering4ever.scala.indexes.{ExternalIndexes => ScalaExternalIndexes}
import _root_.clustering4ever.scala.indexes.NmiNormalizationNature._

import _root_.smile.plot._
import _root_.clustering4ever.scala.clustering.kmeans.KMeans
import _root_.clustering4ever.math.distances.scalar.{Euclidean, Cosine, Minkowski}
import _root_.scala.io.Source
import _root_.clustering4ever.spark.indexes.ExternalIndexes
import _root_.clustering4ever.scala.indexes.{ExternalIndexes=>ScalaExternalIndexes}
import _root_.clustering4ever.scala.indexes.NmiNormalizationNature._


## Download dataset Aggregation

In [ ]:
:sh wget -P /tmp/ http://www.clustering4ever.org/Datasets/Aggregation/aggregation.csv
:sh wget -P /tmp/ http://www.clustering4ever.org/Datasets/Aggregation/labels

--2018-04-08 16:46:50--  http://www.clustering4ever.org/Datasets/Aggregation/aggregation.csv
Resolving www.clustering4ever.org (www.clustering4ever.org)... 62.210.16.62
Connecting to www.clustering4ever.org (www.clustering4ever.org)|62.210.16.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8063 (7.9K) [text/csv]
Saving to: ‘/tmp/aggregation.csv.1’

     0K .......                                               100% 1.52G=0s

2018-04-08 16:46:50 (1.52 GB/s) - ‘/tmp/aggregation.csv.1’ saved [8063/8063]

:sh: Scheme missing.
--2018-04-08 16:46:50--  http://wget/
Resolving wget (wget)... failed: Name or service not known.
wget: unable to resolve host address ‘wget’
--2018-04-08 16:46:50--  http://www.clustering4ever.org/Datasets/Aggregation/labels
Reusing existing connection to www.clustering4ever.org:80.
HTTP request sent, awaiting response... 200 OK
Length: 1576 (1.5K)
Saving to: ‘/tmp/labels’

     0K .                                                     100% 

## Import data as Array[Array[Double]]

In [ ]:
val path = "/tmp/aggregation.csv"
val data = Source.fromFile(path).getLines.map(_.split(",").map(_.toDouble)).toArray
val labelsPath = "/tmp/labels"

path: String = /tmp/aggregation.csv
data: Array[Array[Double]] = Array(Array(15.55, 28.65), Array(14.9, 27.55), Array(14.45, 28.35), Array(14.15, 28.8), Array(13.75, 28.05), Array(13.35, 28.45), Array(13.0, 29.15), Array(13.45, 27.5), Array(13.6, 26.5), Array(12.8, 27.35), Array(12.4, 27.85), Array(12.3, 28.4), Array(12.2, 28.65), Array(13.4, 25.1), Array(12.95, 25.95), Array(12.9, 26.5), Array(11.85, 27.0), Array(11.35, 28.0), Array(11.15, 28.7), Array(11.25, 27.4), Array(10.75, 27.7), Array(10.5, 28.35), Array(9.65, 28.45), Array(10.25, 27.25), Array(10.75, 26.55), Array(11.7, 26.35), Array(11.6, 25.9), Array(11.9, 25.05), Array(12.6, 24.05), Array(11.9, 24.5), Array(11.1, 25.2), Array(10.55, 25.15), Array(10.05, 25.95), Array(9.35, 26.6), Array(9.3, 27.25), Array(9.2, 27.8), Array(7....

## Parameters 

In [ ]:
val k = 6
val iterMax = 100
val epsilon = 0.001
// True defines the true Euclidean with square root, set it to false to avoid the square
val metric1 = new Euclidean(true)
// Minkowski(p) where p is the Minkowski parameter
val metric3 = new Minkowski(4)

k: Int = 6
iterMax: Int = 100
epsilon: Double = 0.001
metric1: clustering4ever.math.distances.scalar.Euclidean = Euclidean with root applied
metric3: clustering4ever.math.distances.scalar.Minkowski = clustering4ever.math.distances.scalar.Minkowski@1e0610d


## Run the algorithm

In [ ]:
val model = KMeans.run(data, k, epsilon, iterMax, metric1)
val clusterized = data.map( v => (model.predict(v), v) )

model: clustering4ever.scala.clustering.kmeans.KMeansModel = clustering4ever.scala.clustering.kmeans.KMeansModel@7ba94bcd
clusterized: Array[(model.ClusterID, Array[Double])] = Array((1,Array(15.55, 28.65)), (5,Array(14.9, 27.55)), (5,Array(14.45, 28.35)), (5,Array(14.15, 28.8)), (5,Array(13.75, 28.05)), (5,Array(13.35, 28.45)), (5,Array(13.0, 29.15)), (5,Array(13.45, 27.5)), (5,Array(13.6, 26.5)), (5,Array(12.8, 27.35)), (5,Array(12.4, 27.85)), (5,Array(12.3, 28.4)), (5,Array(12.2, 28.65)), (5,Array(13.4, 25.1)), (5,Array(12.95, 25.95)), (5,Array(12.9, 26.5)), (5,Array(11.85, 27.0)), (5,Array(11.35, 28.0)), (5,Array(11.15, 28.7)), (5,Array(11.25, 27.4)), (5,Array(10.75, 27.7)), (5,Array(10.5, 28.35)), (5,Array(9.65, 28.45)), (5,Array(10.25, 27.25)), (5,Array(10.75, 26.55)), (5,Array(11...

## Plot clustering results

In [ ]:
val rawData = clusterized.map{ case (clusterID, (id, vector)) => vector }
val labels = clusterized.map{ case (clusterID, (id, vector)) => clusterID }

plot(rawData, labels, '*', Palette.COLORS)

rawData: Array[Array[Double]] = Array(Array(15.55, 28.65), Array(14.9, 27.55), Array(14.45, 28.35), Array(14.15, 28.8), Array(13.75, 28.05), Array(13.35, 28.45), Array(13.0, 29.15), Array(13.45, 27.5), Array(13.6, 26.5), Array(12.8, 27.35), Array(12.4, 27.85), Array(12.3, 28.4), Array(12.2, 28.65), Array(13.4, 25.1), Array(12.95, 25.95), Array(12.9, 26.5), Array(11.85, 27.0), Array(11.35, 28.0), Array(11.15, 28.7), Array(11.25, 27.4), Array(10.75, 27.7), Array(10.5, 28.35), Array(9.65, 28.45), Array(10.25, 27.25), Array(10.75, 26.55), Array(11.7, 26.35), Array(11.6, 25.9), Array(11.9, 25.05), Array(12.6, 24.05), Array(11.9, 24.5), Array(11.1, 25.2), Array(10.55, 25.15), Array(10.05, 25.95), Array(9.35, 26.6), Array(9.3, 27.25), Array(9.2, 27.8), Array(7.5, 28.25), Array(8.55, 27.45), Ar...

## Inspect performance metrics

In [ ]:
val trueLabels = Source.fromFile(labelsPath).getLines.map(_.toInt).toArray

val (_, trueLabelsFrom0) = ScalaExternalIndexes.prepareList(trueLabels)

val predicts = clusterized.map(_._1)

val trueAndPredictRDD2 = sc.parallelize(trueLabelsFrom0.zip(predicts))

val nmi = ScalaExternalIndexes.nmi(trueLabelsFrom0, predicts, SQRT)
val nmi2 = ExternalIndexes.nmi(sc, trueAndPredictRDD2, SQRT)

(nmi, nmi2)

trueLabels: Array[Int] = Array(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...