# [Clustering4Ever](https://github.com/Clustering4Ever/Clustering4Ever) on [SparkNotebook](http://spark-notebook.io/) by [LIPN](https://lipn.univ-paris13.fr/) [A3](https://lipn.univ-paris13.fr/accueil/equipe/a3/) team

# Spark _K_-Modes

In [ ]:
import smile.plot._
import clustering4ever.spark.clustering.kmodes.KModes
import clustering4ever.math.distances.binary.{Hamming, Vari, MeanMahanttan}
import _root_.scala.io.Source
import smile.feature.Standardizer
import smile.data.NumericAttribute
import smile.data.Attribute.Type._
import smile.plot._
import smile.projection._
import smile.manifold._
import java.awt.Color

import smile.plot._
import clustering4ever.spark.clustering.kmodes.KModes
import clustering4ever.math.distances.binary.{Hamming, Vari, MeanMahanttan}
import _root_.scala.io.Source
import smile.feature.Standardizer
import smile.data.NumericAttribute
import smile.data.Attribute.Type._
import smile.plot._
import smile.projection._
import smile.manifold._
import java.awt.Color


## Import data as Array[(Int, Array[Int])]

In [ ]:
val path = "/home/KyBe/tmpDS/digits.csv"
//val sdata = Source.fromFile(path).getLines.map(_.split(",").map(_.toInt))
//val sdata = Seq.fill(10000)(Array.fill(10)(scala.util.Random.nextInt(2)))
//val data = sc.parallelize(sdata).zipWithIndex.map(_.swap).cache
val data = sc.textFile(path).map(_.split(",").map(_.toInt)).zipWithIndex.map(_.swap).cache

path: String = /home/KyBe/tmpDS/digits.csv
data: org.apache.spark.rdd.RDD[(Long, Array[Int])] = MapPartitionsRDD[4] at map at <console>:77


## Parameters 

In [ ]:
val k = 6
val iterMax = 100
val epsilon = 0.001
val metric1 = new Hamming
val metric2 = new MeanMahanttan
val metric3 = new Vari

k: Int = 6
iterMax: Int = 100
epsilon: Double = 0.001
metric1: clustering4ever.math.distances.binary.Hamming = clustering4ever.math.distances.binary.Hamming@6ce84c97
metric2: clustering4ever.math.distances.binary.MeanMahanttan = clustering4ever.math.distances.binary.MeanMahanttan@7e3035e1
metric3: clustering4ever.math.distances.binary.Vari = clustering4ever.math.distances.binary.Vari@3beb0628


## Run the algorithm

In [ ]:
val model = KModes.run(sc, data, k, epsilon, iterMax, metric1)
val clusterized = data.map{ case (id, v) => (model.predict(v), (id, v)) }

model: clustering4ever.spark.clustering.kmodes.KModesModel = clustering4ever.spark.clustering.kmodes.KModesModel@2c709947
clusterized: org.apache.spark.rdd.RDD[(model.ClusterID, (Long, Array[Int]))] = MapPartitionsRDD[8] at map at <console>:82


### Transform binary data into a grid to visualize it. Here a 15 x 16 grid for digits data

In [ ]:
val centroids = model.centroids.toArray
val formatedCentroids = centroids.map(_._2).map(_.map(_.toDouble).grouped(15).toArray)

centroids: Array[(Int, Array[Int])] = Array((2,Array(0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0)), (5,Array(0, 0, 0, 1, 1...

### Visualize centroids

In [ ]:
formatedCentroids.foreach( centroid => hexmap(centroid, Array(Palette.BLACK, Palette.LIGHT_GRAY)))

## Standardize data and apply a PCA on it

In [ ]:
val types = for( i <- (0 until data.first._2.size).toArray ) yield(new NumericAttribute(i.toString))

val standardize = new Standardizer

val dataAsDouble = data.map(_._2.map(_.toDouble)).collect

standardize.learn(types.toArray, dataAsDouble)

val standardizedData = clusterized.map{ case (clusterID, (id, v)) => (clusterID, (id, (v.map(_.toDouble)))) }.collect

types: Array[smile.data.NumericAttribute] = Array(NUMERIC[0], NUMERIC[1], NUMERIC[2], NUMERIC[3], NUMERIC[4], NUMERIC[5], NUMERIC[6], NUMERIC[7], NUMERIC[8], NUMERIC[9], NUMERIC[10], NUMERIC[11], NUMERIC[12], NUMERIC[13], NUMERIC[14], NUMERIC[15], NUMERIC[16], NUMERIC[17], NUMERIC[18], NUMERIC[19], NUMERIC[20], NUMERIC[21], NUMERIC[22], NUMERIC[23], NUMERIC[24], NUMERIC[25], NUMERIC[26], NUMERIC[27], NUMERIC[28], NUMERIC[29], NUMERIC[30], NUMERIC[31], NUMERIC[32], NUMERIC[33], NUMERIC[34], NUMERIC[35], NUMERIC[36], NUMERIC[37], NUMERIC[38], NUMERIC[39], NUMERIC[40], NUMERIC[41], NUMERIC[42], NUMERIC[43], NUMERIC[44], NUMERIC[45], NUMERIC[46], NUMERIC[47], NUMERIC[48], NUMERIC[49], NUMERIC[50], NUMERIC[51], NUMERIC[52], NUMERIC[53], NUMERIC[54], NUMERIC[55], NUMERIC[56], NUMERIC[57], NUM...

In [ ]:
val readyToPca = standardizedData.map(_._2._2)

val pcaModel = pca(readyToPca)

pcaModel.setProjection(3)

val pcaizedData = standardizedData.map{ case (clusterID, (id, v)) => (clusterID, pcaModel.project(v)) }

readyToPca: Array[Array[Double]] = Array(Array(0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...

In [ ]:
val pureData = pcaizedData.map(_._2)
val labels = pcaizedData.map(_._1)
val pcizedCentroids = centroids.map{ case (_, centroid) =>  (Int.MaxValue, pcaModel.project(standardize.transform(centroid.map(_.toDouble)))) }

pureData: Array[Array[Double]] = Array(Array(0.8581670566923537, 4.491446006457944, 1.3492074366872653), Array(0.3812071416844144, 3.2096660209117225, 0.030868635243768594), Array(2.040289609732212, 3.281287381274164, 0.5883708950800963), Array(0.6009966267867405, 3.1202592201212456, 1.039892669726206), Array(2.188639715061769, 3.017027197526276, 0.15057016702522696), Array(0.33402248854777394, -1.0384366071886362, -0.9078965691317149), Array(2.0582940809526598, 4.122284171373905, 1.9544274974383697), Array(1.5809368270391944, 3.7739566822936377, 1.2726865712592756), Array(1.8419422874284388, 4.275665793297685, 1.8050967388646428), Array(1.2835788275741016, 3.3133122571689695, 1.8535870004193489), Array(1.563510656399107, 3.079184695975594, 0.9349226924825416), Array(2.0847380739786225,...

In [ ]:
plot(pureData, labels, '.', Palette.COLORS)

res28: smile.plot.Window = Window(javax.swing.JFrame[frame0,460,53,1000x1000,invalid,layout=java.awt.BorderLayout,title=Smile Plot 1,resizable,normal,defaultCloseOperation=DISPOSE_ON_CLOSE,rootPane=javax.swing.JRootPane[,5,25,990x970,invalid,layout=javax.swing.JRootPane$RootLayout,alignmentX=0.0,alignmentY=0.0,border=,flags=16777673,maximumSize=,minimumSize=,preferredSize=],rootPaneCheckingEnabled=true],smile.plot.PlotCanvas[,0,0,0x0,invalid,layout=java.awt.BorderLayout,alignmentX=0.0,alignmentY=0.0,border=,flags=9,maximumSize=,minimumSize=,preferredSize=])


In [ ]:
val window = plot(pureData, labels, '.', Palette.COLORS)
window.canvas.points("centroid", pcizedCentroids.map(_._2), 'Q', Color.BLACK)

window: smile.plot.Window = Window(javax.swing.JFrame[frame1,460,53,1000x1000,invalid,layout=java.awt.BorderLayout,title=Smile Plot 2,resizable,normal,defaultCloseOperation=DISPOSE_ON_CLOSE,rootPane=javax.swing.JRootPane[,5,25,990x970,invalid,layout=javax.swing.JRootPane$RootLayout,alignmentX=0.0,alignmentY=0.0,border=,flags=16777673,maximumSize=,minimumSize=,preferredSize=],rootPaneCheckingEnabled=true],smile.plot.PlotCanvas[,0,0,0x0,invalid,layout=java.awt.BorderLayout,alignmentX=0.0,alignmentY=0.0,border=,flags=9,maximumSize=,minimumSize=,preferredSize=])
res30: smile.plot.ScatterPlot = smile.plot.ScatterPlot@ea47185


## Visualization with t-sne

In [ ]:
val sne = tsne(readyToPca, 3)

In [ ]:
plot(sne.getCoordinates, labels, '.', Palette.COLORS)

## Include centroids

In [ ]:
val readyTsneCentroids = centroids.map{ case (_, centroid) =>  (Int.MaxValue, standardize.transform(centroid.map(_.toDouble))) }

val readyToTsne = readyToPca ++ readyTsneCentroids.map(_._2)

readyTsneCentroids: Array[(Int, Array[Double])] = Array((2147483647,Array(-0.4691973206300246, -0.724769091822159, -1.0047612781803712, 0.7311989859280826, 0.5818236344745712, 0.4731577041265375, 0.4050460242033411, 0.3752743814604061, 0.3838921816267829, 0.45004943939012954, 0.5440847285433487, -1.4007074931910235, -1.0209698667667724, -0.7473734209867955, -0.4818399603409751, -0.6701124716326963, -1.0595190793022655, 0.6889766255545046, 0.5386811852615968, 0.4419949476766309, 0.36391836974993397, 0.30373202063736837, 0.2784733628071156, 0.3076416819747157, 0.3577255499568549, 0.42899948314926056, 0.5502546049722432, -1.419712279393056, -1.0353755795234765, -0.6646353033940223, -0.7056049015940564, -1.1534022844770293, 0.6622910964242819, 0.5502546049722432, 0.4620448429462271, 0.45004...

In [ ]:
val sne2 = tsne(readyToTsne, 2)

sne2: smile.manifold.TSNE = smile.manifold.TSNE@444302c7


In [ ]:
val dataTsne = sne2.getCoordinates.take(standardizedData.size)
val dataCentroids = sne2.getCoordinates.takeRight(10)

val window = plot(dataTsne, labels, '.', Palette.COLORS)
window.canvas.points("centroid", dataCentroids, 'Q', Color.BLACK)

dataTsne: Array[Array[Double]] = Array(Array(18.12326109744452, 0.7448072405429023), Array(19.069433560915773, 4.330583829177866), Array(15.700965442159209, -2.0326731448993938), Array(14.739782902497167, 0.2552924471014526), Array(18.121068186767076, 4.9937268020856616), Array(-0.665269310861698, -5.532738482182505), Array(20.145151948533076, -0.7301585122130199), Array(18.539876913104802, 3.0017123437022737), Array(20.37164391415772, 0.5737364078924903), Array(17.116773666784393, -2.05760023561448), Array(19.468421227545768, -1.6753187209386633), Array(14.472768554725572, -0.6622259459328816), Array(14.737311863567891, 2.4925196651917325), Array(15.146246268648131, 0.697766823116357), Array(16.732671571610727, 3.3390819380191616), Array(16.715194899273694, 4.835664728371025), Array(13...