# KMeans function

API reference: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.clustering.KMeans.html



In [2]:
# K-means estimator and fitted-model classes
from pyspark.ml.clustering import KMeans,KMeansModel

#ClusteringEvaluator
from pyspark.ml.evaluation import ClusteringEvaluator
#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.ClusteringEvaluator.html
#The metric computes the Silhouette measure using the squared Euclidean distance.

from numpy import array
from math import sqrt

# Spark entry points (SparkContext / SparkSession) and environment helpers
from pyspark.context import SparkContext,SparkConf
from pyspark.rdd import RDD
from pyspark.sql import SparkSession
import os
import sys


# Config
# Raw string: in a regular literal "\P", "\J" and "\j" are invalid escape
# sequences (DeprecationWarning, SyntaxWarning on Python >= 3.12). The
# resulting value is unchanged.
# NOTE(review): machine-specific JRE location - adjust for your installation.
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jre1.8.0_301'
# Point both the Spark workers and the driver at the interpreter running
# this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


# getOrCreate() reuses an already running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("Kmeans").setMaster("local"))
print(f'sc:{sc}')
spark = SparkSession(sc)

sc:<SparkContext master=local appName=Kmeans>


In [14]:
from pyspark.ml.linalg import Vectors
# Four 2-D points; each one is paired with the same weight of 2.0.
points = [[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]]
data = [(Vectors.dense(point), 2.0) for point in points]
data

[(DenseVector([0.0, 0.0]), 2.0),
 (DenseVector([1.0, 1.0]), 2.0),
 (DenseVector([9.0, 8.0]), 2.0),
 (DenseVector([8.0, 9.0]), 2.0)]

In [6]:
# Build a two-column DataFrame from the (vector, weight) tuples.
# NOTE: the second column is spelled "weighCol" (no "t"); the same spelling is
# passed to kmeans.setWeightCol(...) later, so the two must stay in sync.
df = spark.createDataFrame(data, ["features", "weighCol"])
# Materialise all rows on the driver to inspect them (fine for 4 rows).
df.collect()

[Row(features=DenseVector([0.0, 0.0]), weighCol=2.0),
 Row(features=DenseVector([1.0, 1.0]), weighCol=2.0),
 Row(features=DenseVector([9.0, 8.0]), weighCol=2.0),
 Row(features=DenseVector([8.0, 9.0]), weighCol=2.0)]

In [15]:
# Create the (unfitted) estimator, asking for two clusters.
kmeans = KMeans(k=2)

In [16]:
# Set the estimator's random seed to a fixed value.
kmeans.setSeed(1)

KMeans_910f28006274

In [17]:
# Use the DataFrame column named "weighCol" as the per-row weight.
# The name must match the column created in df exactly (including spelling).
kmeans.setWeightCol("weighCol")

KMeans_910f28006274

In [18]:
# Limit the algorithm to at most 10 iterations.
kmeans.setMaxIter(10)

KMeans_910f28006274

In [19]:
# Read the maxIter param back to confirm the value that was just set.
kmeans.getMaxIter()

10

In [20]:
# Remove the explicitly set maxIter again. After this call the value of 10 set
# earlier no longer applies, so the fit below runs with the param's default.
kmeans.clear(kmeans.maxIter)

## Build Model

In [21]:
# Train on df; the result is the fitted model used in all following cells.
model = kmeans.fit(df)

In [23]:
# Show which distance measure the fitted model uses.
model.getDistanceMeasure()

'euclidean'

In [24]:
# Name the prediction output column "newPrediction"; the transform/select
# cell further down refers to the column by this name.
model.setPredictionCol("newPrediction")

KMeansModel: uid=KMeans_910f28006274, k=2, distanceMeasure=euclidean, numFeatures=2

In [25]:
# Predict the cluster index for a single feature vector: the first row of df.
model.predict(df.head().features)

0

In [26]:
# Fetch the cluster centers of the fitted model.
centers = model.clusterCenters()

In [27]:
# Number of centers returned; expected to equal k (2).
len(centers)

2

In [28]:
# Run the model over df, then keep only the input vector and the prediction
# column ("newPrediction", as configured on the model).
predictions = model.transform(df)
transformed = predictions.select("features", "newPrediction")

In [30]:
# Bring the (small) result set to the driver as a list of Row objects.
rows = transformed.collect()
# Display the rows as the cell output.
rows

[Row(features=DenseVector([0.0, 0.0]), newPrediction=0),
 Row(features=DenseVector([1.0, 1.0]), newPrediction=0),
 Row(features=DenseVector([9.0, 8.0]), newPrediction=1),
 Row(features=DenseVector([8.0, 9.0]), newPrediction=1)]

In [31]:
# The first two points ([0, 0] and [1, 1]) should share a cluster.
rows[0].newPrediction == rows[1].newPrediction


True

In [32]:
# The last two points ([9, 8] and [8, 9]) should share a cluster.
rows[2].newPrediction == rows[3].newPrediction


True

In [33]:
# Check that a training summary is available before accessing model.summary.
model.hasSummary


True

In [36]:
# Training summary of the fitted model.
summary = model.summary
# Number of points assigned to each cluster.
summary.clusterSizes

[2, 2]

In [37]:
# Training cost reported by the summary for the fitted model.
summary.trainingCost

4.0

### Save the estimator and the fitted model


In [45]:
# Show the absolute path of the current working directory.
print(os.path.abspath(os.curdir))

C:\Users\LeoShr\p_space\NTHU\MDA\CH3_K_means


In [48]:
# Absolute path of the current working directory; base folder for saved artefacts.
temp_path = os.path.abspath(os.curdir)

In [None]:
# Optional: persist the estimator and the fitted model, then load them back.
# The lines are left commented out; uncomment them to run.

# kmeans_path = temp_path + "/kmeans"
# kmeans.save(kmeans_path)
# kmeans2 = KMeans.load(kmeans_path)
# kmeans2.getK()

# model_path = temp_path + "/kmeans_model"
# model.save(model_path)
# model2 = KMeansModel.load(model_path)
# model2.hasSummary


# Sanity checks: the reloaded model should match the original.
# model.clusterCenters()[0] == model2.clusterCenters()[0]
# expected output: array([ True,  True], dtype=bool)
# model.clusterCenters()[1] == model2.clusterCenters()[1]
# expected output: array([ True,  True], dtype=bool)
# model.transform(df).take(1) == model2.transform(df).take(1)
