In [8]:
from pyspark.sql import SparkSession
import numpy as np
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName('cluster').getOrCreate()

In [3]:
data = spark.read.csv('/home/sai/ex/ML/cls/seeds_dataset.csv', 
                      inferSchema=True, header=True)

In [4]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
assemble = VectorAssembler(inputCols=data.columns, outputCol='features')

In [7]:
input_df = assemble.transform(data)

In [9]:
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')

In [12]:
#get standard deviation information and sets scalar model
scaled_data = scaler.fit(input_df).transform(input_df)

In [13]:
#model
model = KMeans(featuresCol='scaled_features', k=3)

In [14]:
km = model.fit(scaled_data)

In [15]:
#To compute cost
print(km.computeCost(scaled_data))

428.60820118716356


In [16]:
#get cluster centers
km.clusterCenters()

[array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
         3.15410901, 10.38031464]),
 array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107]),
 array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
         1.80061978, 10.41913733])]

In [17]:
#get actual prediction
km.transform(scaled_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
+----------+
only showing top 20 rows



# EX2

In [18]:
data = spark.read.csv('/home/sai/ex/ML/cls/hack_data.csv', 
                      inferSchema=True, header=True)

In [19]:
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [22]:
cols = ['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed']

In [23]:
assemble = VectorAssembler(inputCols=cols, outputCol='features')

In [24]:
input_df = assemble.transform(data)

In [25]:
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')

In [26]:
#get standard deviation information and sets scalar model
scaled_data = scaler.fit(input_df).transform(input_df)

In [41]:
#model with k=2
model2 = KMeans(featuresCol='scaled_features', k=2)

In [42]:
km2 = model2.fit(scaled_data)

In [43]:
#To compute cost
print(km2.computeCost(scaled_data))

601.7707512676716


In [44]:
#check model
#get actual prediction
km2.transform(scaled_data).select('prediction').groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [45]:
#model with k=3
model3 = KMeans(featuresCol='scaled_features', k=3)
km3 = model3.fit(scaled_data)
print(km3.computeCost(scaled_data))

434.1492898715845


In [48]:
km3.transform(scaled_data).select('prediction').groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   83|
|         0|   84|
+----------+-----+

