<a href="https://colab.research.google.com/github/Ricardo-Jaramillo/PySpark/blob/main/11_KMeans_clustering_exmaple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KMeans example

## Install pyspark and download the data file

In [1]:
# Install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=03de227869f00e4cc9febd28274fcb16cfef306abc5d2575ecacb90d47d88917
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [6]:
# Download the necessary data files
!wget https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/KMeans/seeds_dataset.csv

--2023-10-04 17:27:37--  https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/KMeans/seeds_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11739 (11K) [text/plain]
Saving to: ‘seeds_dataset.csv’


2023-10-04 17:27:37 (21.7 MB/s) - ‘seeds_dataset.csv’ saved [11739/11739]



## Read in the data

In [19]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [8]:
# Start a Spark session named 'cluster_example', or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('cluster_example')
    .getOrCreate()
)

In [9]:
# Load the CSV: the first line holds the column names, and column types are inferred
dataset = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('seeds_dataset.csv')
)

In [10]:
# Preview the first 20 rows (long cell values are truncated for display)
dataset.show(n=20, truncate=True)

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

## Assemble data features

In [12]:
# List the column names; all of them are fed to the assembler as input features
dataset.schema.names

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [14]:
# Build an assembler that packs every column of the dataset into one 'features' vector
feature_cols = dataset.columns
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [26]:
# Apply the assembler created above: appends the 'features' vector column to the dataset
final_data = assembler.transform(dataset=dataset)

In [27]:
# Inspect the assembled frame: column types first, then a 20-row preview
final_data.printSchema()
final_data.show(n=20, truncate=True)

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.

## Standardize features with `StandardScaler`
Scales each feature to unit standard deviation. With the default settings used below (`withMean=False`), the mean is not removed.

In [28]:
# Configure the scaler that turns 'features' into 'scaledFeatures'.
# The library defaults are spelled out here: each feature is divided by its
# standard deviation (withStd=True) but is not mean-centered (withMean=False).
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol='features',
    outputCol='scaledFeatures',
)

In [29]:
# Learn the per-feature scaling statistics from the assembled data
scaler_model = scaler.fit(dataset=final_data)

In [30]:
# Apply the fitted scaler: appends the 'scaledFeatures' column to final_data
final_data_scaled = scaler_model.transform(dataset=final_data)

In [36]:
# Show the final scaled data (first 20 rows, long values truncated)
final_data_scaled.show(n=20, truncate=True)

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      scaledFeatures|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|     

In [37]:
# Fetch the first row as a one-element list, so the full (untruncated) vectors are visible
final_data_scaled.head(n=1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

## Train and evaluate the model

In [33]:
# Create the KMeans estimator: 3 clusters, trained on the scaled feature vectors.
# KMeans initialization is random; fixing the seed makes the cluster assignments,
# centers and training cost reproducible across kernel restarts.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)

In [38]:
# Train the clustering model on the scaled data
model = kmeans.fit(dataset=final_data_scaled)

In [41]:
# Report the training cost from the model summary, labelled as WSSSE
wssse = model.summary.trainingCost
print(f'WSSSE: {wssse}')

WSSSE: 429.00761965459367


In [44]:
# Print out the centers
# The model was fit on the 'scaledFeatures' column, so these coordinates are in
# the scaled feature space rather than in the original measurement units.
centers = model.clusterCenters()
centers

[array([ 4.90455443, 10.919579  , 37.26051182, 12.3885095 ,  8.57467662,
         1.81659031, 10.38074771]),
 array([ 4.06133795, 10.13721767, 35.82681204, 11.81771972,  7.5087187 ,
         3.25852121, 10.4215732 ]),
 array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107])]

In [49]:
# Assign each row to a cluster; the result carries an added 'prediction' column
preds = model.transform(dataset=final_data_scaled)

In [52]:
# Display just the unscaled feature vector next to its assigned cluster
(
    preds
    .select('features', 'prediction')
    .show()
)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[15.26,14.84,0.87...|         0|
|[14.88,14.57,0.88...|         0|
|[14.29,14.09,0.90...|         0|
|[13.84,13.94,0.89...|         0|
|[16.14,14.99,0.90...|         0|
|[14.38,14.21,0.89...|         0|
|[14.69,14.49,0.87...|         0|
|[14.11,14.1,0.891...|         0|
|[16.63,15.46,0.87...|         2|
|[16.44,15.25,0.88...|         0|
|[15.26,14.85,0.86...|         0|
|[14.03,14.16,0.87...|         0|
|[13.89,14.02,0.88...|         0|
|[13.78,14.06,0.87...|         0|
|[13.74,14.05,0.87...|         0|
|[14.59,14.28,0.89...|         0|
|[13.99,13.83,0.91...|         0|
|[15.69,14.75,0.90...|         0|
|[14.7,14.21,0.915...|         0|
|[12.72,13.57,0.86...|         1|
+--------------------+----------+
only showing top 20 rows

