# [Clustering4Ever](https://github.com/Clustering4Ever/Clustering4Ever) on [SparkNotebook](http://spark-notebook.io/) by [LIPN](https://lipn.univ-paris13.fr/) [A3](https://lipn.univ-paris13.fr/accueil/equipe/a3/) team

# Spark _K_-Means

In [ ]:
import smile.plot._
import scala.io.Source
import scala.collection.{mutable, immutable}
import clustering4ever.spark.clustering.kmeans.KMeans
import clustering4ever.math.distances.scalar.{Euclidean, Cosine, Minkowski}
import clustering4ever.spark.indexes.ExternalIndexes
import clustering4ever.scala.indexes.NmiNormalizationNature._
import clustering4ever.util.SparkImplicits._
import clustering4ever.scala.clusterizables.RealClusterizable
import clustering4ever.scala.vectorizables.RealVector

import smile.plot._
import scala.io.Source
import scala.collection.{mutable, immutable}
import clustering4ever.spark.clustering.kmeans.KMeans
import clustering4ever.math.distances.scalar.{Euclidean, Cosine, Minkowski}
import clustering4ever.spark.indexes.ExternalIndexes
import clustering4ever.scala.indexes.NmiNormalizationNature._
import clustering4ever.util.SparkImplicits._
import clustering4ever.scala.clusterizables.RealClusterizable
import clustering4ever.scala.vectorizables.RealVector


## Download dataset Aggregation

In [ ]:
:sh wget -P /tmp/ http://www.clustering4ever.org/Datasets/Aggregation/aggregation.csv
:sh wget -P /tmp/ http://www.clustering4ever.org/Datasets/Aggregation/labels

--2018-09-19 14:31:42--  http://www.clustering4ever.org/Datasets/Aggregation/aggregation.csv
Resolving www.clustering4ever.org (www.clustering4ever.org)... 62.210.16.62
Connecting to www.clustering4ever.org (www.clustering4ever.org)|62.210.16.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8063 (7.9K) [text/csv]
Saving to: ‘/tmp/aggregation.csv.2’

     0K .......                                               100% 38.1K=0.2s

2018-09-19 14:31:43 (38.1 KB/s) - ‘/tmp/aggregation.csv.2’ saved [8063/8063]

:sh: Scheme missing.
--2018-09-19 14:31:43--  http://wget/
Resolving wget (wget)... failed: Name or service not known.
wget: unable to resolve host address ‘wget’
--2018-09-19 14:31:43--  http://www.clustering4ever.org/Datasets/Aggregation/labels
Reusing existing connection to www.clustering4ever.org:80.
HTTP request sent, awaiting response... 200 OK
Length: 1576 (1.5K)
Saving to: ‘/tmp/labels.11’

     0K .                                                     

## Import data as Array[Array[Double]]

In [ ]:
val datasetSize = 500000
val dim = 4
val useAggregationDS = true
val dp = 16

val path = "/tmp/aggregation.csv"
val rawRdd = if( useAggregationDS ) sc.textFile(path, dp).map( x => mutable.ArrayBuffer(x.split(",").map(_.toDouble):_*)) else sc.parallelize(List.fill(datasetSize)(mutable.ArrayBuffer.fill(dim)(scala.util.Random.nextDouble)), dp)
val rdd = rawRdd.zipWithIndex.map{ case (v, id) => new RealClusterizable(id, new RealVector(v)) }.cache
val labelsPath = "/tmp/labels"

rdd.count

datasetSize: Int = 500000
dim: Int = 4
useAggregationDS: Boolean = true
dp: Int = 16
path: String = /tmp/aggregation.csv
rawRdd: org.apache.spark.rdd.RDD[scala.collection.mutable.ArrayBuffer[Double]] = MapPartitionsRDD[2] at map at <console>:89
rdd: org.apache.spark.rdd.RDD[clustering4ever.scala.clusterizables.RealClusterizable[Long,scala.collection.mutable.ArrayBuffer[Double],scala.collection.mutable.ArrayBuffer[Double]]] = MapPartitionsRDD[4] at map at <console>:90
labelsPath: String = /tmp/labels
res4: Long = 788


## Parameters 

In [ ]:
val k = 7
val iterMax = 40
val epsilon = 0.5

k: Int = 7
iterMax: Int = 40
epsilon: Double = 0.5


## Run and measure the algorithm time, you can give a well prepared RDD[Clusterizable] or use implicit conversion and gives RDD[Seq[Double]]

In [ ]:
val t1 = System.currentTimeMillis
val model = KMeans.run(
  sc,
  rdd,
  k,
  epsilon,
  iterMax
)
val t2 = System.currentTimeMillis
(t2 - t1) / 1000D

t1: Long = 1537360307921
model: clustering4ever.spark.clustering.kmeans.KMeansModel[Long,scala.collection.mutable.ArrayBuffer[Double],scala.collection.mutable.ArrayBuffer[Double],clustering4ever.scala.clusterizables.RealClusterizable[Long,scala.collection.mutable.ArrayBuffer[Double],scala.collection.mutable.ArrayBuffer[Double]],clustering4ever.math.distances.scalar.Euclidean[scala.collection.mutable.ArrayBuffer[Double]]] = clustering4ever.spark.clustering.kmeans.KMeansModel@76383067
t2: Long = 1537360309053
res7: Double = 1.132


In [ ]:
val clusterized = model.centerPredict(rdd)
val lclusterized = clusterized.collect

clusterized: org.apache.spark.rdd.RDD[clustering4ever.scala.clusterizables.RealClusterizable[Long,scala.collection.mutable.ArrayBuffer[Double],scala.collection.mutable.ArrayBuffer[Double]]] = MapPartitionsRDD[34] at map at K-CommonsSpark.scala:154
lclusterized: Array[clustering4ever.scala.clusterizables.RealClusterizable[Long,scala.collection.mutable.ArrayBuffer[Double],scala.collection.mutable.ArrayBuffer[Double]]] = Array(RealClusterizable(0,clustering4ever.scala.vectorizables.RealVector@76c9fb00,None,Some(3)), RealClusterizable(1,clustering4ever.scala.vectorizables.RealVector@63b7052d,None,Some(0)), RealClusterizable(2,clustering4ever.scala.vectorizables.RealVector@35926016,None,Some(0)), RealClusterizable(3,clustering4ever.scala.vectorizables.RealVector@2a51cbe4,None,Some(0)), RealC...

## Plot clustering results

In [ ]:
val rawData = lclusterized.map(_.vector.toArray).toArray
val labels = lclusterized.map( cz => cz.clusterID.get ).toArray

plot(rawData, labels, '*', Palette.COLORS)

java.awt.HeadlessException
  at java.awt.GraphicsEnvironment.checkHeadless(GraphicsEnvironment.java:204)
  at java.awt.Window.<init>(Window.java:536)
  at java.awt.Frame.<init>(Frame.java:420)
  at javax.swing.JFrame.<init>(JFrame.java:233)
  at smile.plot.Window$.frame(Window.scala:49)
  at smile.plot.Window$.apply(Window.scala:33)
  at smile.plot.Operators$class.plot(Operators.scala:86)
  at smile.plot.package$.plot(package.scala:23)
  ... 69 elided


## Inspect performance metrics

In [ ]:
val trueLabels = sc.parallelize(Source.fromFile(labelsPath).getLines.map(_.toInt).toSeq)

val (_, trueLabelsFrom0) = ExternalIndexes.prepareLabels(trueLabels)

val predicts = clusterized.map(_.clusterID.get)

val trueAndPredictRDD = sc.parallelize(trueLabelsFrom0.collect.zip(predicts.collect).map{ case(gt, pred) => (gt.toInt, pred) })

val nmi = ExternalIndexes.nmi(sc, trueAndPredictRDD, SQRT)

nmi

trueLabels: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[35] at parallelize at <console>:100
trueLabelsFrom0: org.apache.spark.rdd.RDD[Long] = MapPartitionsRDD[40] at map at ExternalIndexes.scala:73
predicts: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[41] at map at <console>:104
trueAndPredictRDD: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[42] at parallelize at <console>:106
nmi: Double = 0.5988478670110992
res12: Double = 0.5988478670110992
