In [1]:
import os

import findspark

# Prefer SPARK_HOME when it is set so the notebook is not tied to one machine's
# directory layout; otherwise fall back to the original install location.
findspark.init(os.environ.get('SPARK_HOME', '/usr/local/spark'))

In [2]:
from pyspark.sql import SparkSession
# Session used by every cell below; the application is named 'mllib'.
spark = (
    SparkSession.builder
    .appName('mllib')
    .getOrCreate()
)

In [3]:
from pyspark.ml.clustering import KMeans

### Reading a tab-separated text file and clustering it with K-Means

In [4]:
# Load the tab-separated customers file: first row is the header, column types are inferred.
dataset = (
    spark.read
    .format("csv")
    .option("sep", "\t")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("customers.txt")
)

In [5]:
# Show the loaded columns with their types and nullability as a sanity check on the read.
dataset.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zipcode: integer (nullable = true)



In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
# Numeric columns that are packed into the single 'features' vector column.
# NOTE(review): customer_id looks like a row identifier and customer_zipcode a code
# rather than a measurement -- confirm these are meaningful clustering inputs.
feat_cols = ['customer_id', 'customer_zipcode']
vec_assembler = VectorAssembler(outputCol='features', inputCols=feat_cols)

In [8]:
# Append the assembled 'features' vector column to the loaded rows.
final_data = vec_assembler.transform(dataset=dataset)

In [9]:
from pyspark.ml.feature import StandardScaler

In [10]:
# Scale each feature by its standard deviation; withMean=False leaves the data uncentred.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

In [11]:
# Fit the scaler on the assembled feature vectors.
scalerModel = scaler.fit(dataset=final_data)

In [12]:
# Add the 'scaledFeatures' column that the K-Means estimators below read.
cluster_final_data = scalerModel.transform(dataset=final_data)

In [13]:
# Two candidate models (k=3 and k=2) on the scaled features.
# An explicit seed makes centroid initialisation repeatable, so the reported WSSSE
# values can be reproduced; without it the default seed may differ between sessions.
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=1)
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=1)

In [14]:
# Fit both candidate estimators (k=3 first, then k=2) on the same scaled data.
model_k3, model_k2 = (
    estimator.fit(cluster_final_data) for estimator in (kmeans3, kmeans2)
)

In [15]:
# Within-set sum of squared errors for each fitted model.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0. Keep using
# it where it exists (same numbers as before) and otherwise fall back to the cost
# recorded in the training summary, so the cell also runs on newer Spark versions.
wssse_k3, wssse_k2 = (
    fitted.computeCost(cluster_final_data)
    if hasattr(fitted, "computeCost")
    else fitted.summary.trainingCost
    for fitted in (model_k3, model_k2)
)

In [16]:
# Report the k=3 result as a heading line followed by the WSSSE value.
print("\n".join((
    "With K=3",
    "Within Set Sum of Squared Errors = " + str(wssse_k3),
)))

With K=3
Within Set Sum of Squared Errors = 770.8508261684073


In [17]:
# Report the k=2 result as a heading line followed by the WSSSE value.
print("\n".join((
    "With K=2",
    "Within Set Sum of Squared Errors = " + str(wssse_k2),
)))

With K=2
Within Set Sum of Squared Errors = 1529.5757895948038


In [18]:
# Sweep k = 2..8 and print the WSSSE for each k so the curve can be compared by eye.
for k in range(2, 9):
    # Fixed seed: keeps centroid initialisation, and therefore each WSSSE, repeatable.
    kmeans = KMeans(featuresCol='scaledFeatures', k=k, seed=1)
    model = kmeans.fit(cluster_final_data)
    # computeCost was deprecated in Spark 2.4 and removed in 3.0; use it when present
    # and otherwise read the cost recorded in the training summary.
    if hasattr(model, "computeCost"):
        wssse = model.computeCost(cluster_final_data)
    else:
        wssse = model.summary.trainingCost
    print("With K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 1529.5757895948038
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 770.8508261684073
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 450.49372912128274
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 389.7294943265452
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 292.9036624803742
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 243.10707907676206
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 207.04074791503984
------------------------------------------------------------


### Reading a JSON file and clustering it with K-Means

In [19]:
# Load the products file through the JSON reader; no extra options are set.
dataset = (
    spark.read
    .format("json")
    .load("products.json")
)

In [20]:
# Show the loaded columns with their types and nullability as a sanity check on the read.
dataset.printSchema()

root
 |-- category_id: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_quantity: long (nullable = true)
 |-- salestxn_id: long (nullable = true)



In [21]:
# Only product_quantity is used here, so the 'features' vector has a single component.
feat_cols = ['product_quantity']
vec_assembler = VectorAssembler(outputCol='features', inputCols=feat_cols)

In [22]:
# Append the assembled 'features' vector column to the loaded rows.
final_data = vec_assembler.transform(dataset=dataset)

In [23]:
# Scale each feature by its standard deviation; withMean=False leaves the data uncentred.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

In [24]:
# Fit the scaler on the assembled feature vectors.
scalerModel = scaler.fit(dataset=final_data)

In [25]:
# Add the 'scaledFeatures' column that the K-Means estimators below read.
cluster_final_data = scalerModel.transform(dataset=final_data)

In [26]:
# k=3 model on the scaled features.
# An explicit seed makes centroid initialisation repeatable, so the reported WSSSE
# can be reproduced; without it the default seed may differ between sessions.
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=1)
model_k3 = kmeans3.fit(cluster_final_data)

In [27]:
# Within-set sum of squared errors for the k=3 model.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0. Keep using
# it where it exists and otherwise fall back to the cost recorded in the training
# summary, so the cell also runs on newer Spark versions.
if hasattr(model_k3, "computeCost"):
    wssse_k3 = model_k3.computeCost(cluster_final_data)
else:
    wssse_k3 = model_k3.summary.trainingCost

In [28]:
# Report the k=3 result as a heading line followed by the WSSSE value.
print("\n".join((
    "With K=3",
    "Within Set Sum of Squared Errors = " + str(wssse_k3),
)))

With K=3
Within Set Sum of Squared Errors = 1238.16644055922


In [29]:
# Sweep k = 2..8 and print the WSSSE for each k so the curve can be compared by eye.
# NOTE(review): the recorded output shows WSSSE of about 1e-22 from k=5 onward, which
# suggests the single feature takes only about five distinct values -- confirm before
# reading anything into the larger k values.
for k in range(2, 9):
    # Fixed seed: keeps centroid initialisation, and therefore each WSSSE, repeatable.
    kmeans = KMeans(featuresCol='scaledFeatures', k=k, seed=1)
    model = kmeans.fit(cluster_final_data)
    # computeCost was deprecated in Spark 2.4 and removed in 3.0; use it when present
    # and otherwise read the cost recorded in the training summary.
    if hasattr(model, "computeCost"):
        wssse = model.computeCost(cluster_final_data)
    else:
        wssse = model.summary.trainingCost
    print("With K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 2662.9046391531947
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 1238.16644055922
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 469.21314110503494
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 1.332943218572636e-22
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 1.332943218572636e-22
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 1.332943218572636e-22
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 1.332943218572636e-22
------------------------------------------------------------
