In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans

In [2]:
# Start (or reuse) the Spark session that the later cells read data through.
spark = (
    SparkSession.builder
    .appName('SparkSQL for ML')
    .getOrCreate()
)

In [3]:
# Folder holding the input data; data_path already ends with '/', so the
# file name is appended to it directly.
data_path = '/content/drive/MyDrive/Data/'
file_path = f'{data_path}utilization.json'

# Read the JSON file into a DataFrame and print a preview of its rows.
df = spark.read.json(file_path)
df.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [7]:
# Pack the selected columns into a single vector column named 'features'.
# NOTE(review): server_id looks like an identifier rather than a measurement,
# and the columns sit on very different scales -- confirm both are intended
# before using this vector for distance-based clustering.
VA = VectorAssembler(
    inputCols=[
        'cpu_utilization',
        'server_id',
        'session_count',
        'free_memory',
    ],
    outputCol='features',
)

# Append the 'features' column to the loaded data and preview the result.
vcluster_df = VA.transform(df)
vcluster_df.show()

+---------------+-------------------+-----------+---------+-------------+--------------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|            features|
+---------------+-------------------+-----------+---------+-------------+--------------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|[0.57,100.0,47.0,...|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|[0.47,100.0,43.0,...|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|[0.56,100.0,62.0,...|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|[0.57,100.0,50.0,...|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|[0.35,100.0,43.0,...|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|[0.41,100.0,48.0,...|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|[0.57,100.0,58.0,...|
|           0.41|03/05/2019 08:41:14|        0.4| 

In [10]:
# K-Means estimator configured for 3 clusters with a fixed seed of 1.
kmeans = KMeans(k=3, seed=1)

In [11]:
# Fit the K-Means estimator on the DataFrame carrying the 'features' column.
kcluster_model = kmeans.fit(dataset=vcluster_df)

In [13]:
# One centre per cluster; each array's values follow the assembler's
# inputCols order (cpu_utilization, server_id, session_count, free_memory).
cluster_centers = kcluster_model.clusterCenters()
cluster_centers

[array([  0.65736901, 138.60101718,  76.67382735,   0.34256349]),
 array([  0.53245533, 122.38340495,  53.89029363,   0.46675541]),
 array([  0.67888584, 110.66395752,  79.4234258 ,   0.3209278 ])]