In [None]:
from pyspark.sql import SparkSession

In [None]:
# Reuse the already-active Spark session if there is one; otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Load the seeds CSV: the first row supplies the column names (header=True)
# and column types are inferred from the data (inferSchema=True).
df = spark.read.csv(
    path="/content/seeds.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Print each column's name, inferred type and nullability as a tree.
df.printSchema()

root
 |-- AREA: double (nullable = true)
 |-- PERIMETER: double (nullable = true)
 |-- COMPACTNESS: double (nullable = true)
 |-- LENGTH: double (nullable = true)
 |-- WIDTH: double (nullable = true)
 |-- ASSYMMETRY_COEFFICIENT: double (nullable = true)
 |-- GROOVE_LENGTH: double (nullable = true)
 |-- TYPE: integer (nullable = true)



In [None]:
# Preview the data. The arguments spell out show()'s defaults:
# at most 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+-----+---------+-----------+------+-----+----------------------+-------------+----+
| AREA|PERIMETER|COMPACTNESS|LENGTH|WIDTH|ASSYMMETRY_COEFFICIENT|GROOVE_LENGTH|TYPE|
+-----+---------+-----------+------+-----+----------------------+-------------+----+
|15.26|    14.84|      0.871| 5.763|3.312|                 2.221|         5.22|   1|
|14.88|    14.57|     0.8811| 5.554|3.333|                 1.018|        4.956|   1|
|14.29|    14.09|      0.905| 5.291|3.337|                 2.699|        4.825|   1|
|13.84|    13.94|     0.8955| 5.324|3.379|                 2.259|        4.805|   1|
|16.14|    14.99|     0.9034| 5.658|3.562|                 1.355|        5.175|   1|
|14.38|    14.21|     0.8951| 5.386|3.312|                 2.462|        4.956|   1|
|14.69|    14.49|     0.8799| 5.563|3.259|                 3.586|        5.219|   1|
|14.11|     14.1|     0.8911|  5.42|3.302|                   2.7|          5.0|   1|
|16.63|    15.46|     0.8747| 6.053|3.465|                  2.04|

In [None]:
# Count rows per TYPE label to check class balance.
# groupBy() gives no guarantee about output row order, so sort by TYPE to
# keep the displayed table stable from run to run.
df.groupBy("TYPE").count().orderBy("TYPE").show()

+----+-----+
|TYPE|count|
+----+-----+
|   1|   70|
|   2|   70|
+----+-----+



In [None]:
# Total number of rows in the dataset (shown as the cell's output value).
df.count()

140

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [None]:
# Every column except the 'TYPE' label is used as a clustering feature;
# the label is kept out so it cannot influence the clusters.
feature_cols = [name for name in df.columns if name != "TYPE"]

In [None]:
# Step 1: pack the individual feature columns into a single vector column
# ("unscaled_features"), the input format the scaler below consumes.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="unscaled_features",
)
df_vector = assembler.transform(df)

In [None]:
# Step 2: standardise the assembled vector - centre each feature on its mean
# (withMean) and scale it to unit standard deviation (withStd) - writing the
# result to the "features" column used by KMeans.
scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

In [None]:
# Fit KMeans with k=2 on the standardised features.
# KMeans initialisation is random: without a fixed seed the cluster ids (and
# potentially the clustering itself) can differ between runs, which makes the
# outputs below non-reproducible. seed=1 matches the seed used by the k sweep
# later in this notebook.
kmeans = KMeans(featuresCol="features", k=2, seed=1)
model = kmeans.fit(df_scaled)

In [None]:
# Assign every row to a cluster (adds a "prediction" column) and peek at the
# first five assignments.
predictions = model.transform(df_scaled)
predictions.select("prediction").show(n=5)

+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette score (squared Euclidean distance) computed on the scaled
# "features" column of the clustered rows.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    metricName="silhouette",
    distanceMeasure="squaredEuclidean",
)
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette:.4f}")

Silhouette Score: 0.6463


In [None]:
# Cross-tabulate the true TYPE label against the assigned cluster.
# Sort on both columns: ordering by TYPE alone leaves the order of rows within
# a TYPE arbitrary, so the table could change between runs.
predictions.groupBy("TYPE", "prediction").count().orderBy("TYPE", "prediction").show()


+----+----------+-----+
|TYPE|prediction|count|
+----+----------+-----+
|   1|         0|    2|
|   1|         1|   68|
|   2|         1|    7|
|   2|         0|   63|
+----+----------+-----+



In [None]:
# Sweep k = 2..9 and report the silhouette score for each clustering.
# Loop-local names (kmeans_k / model_k) are used on purpose: rebinding the
# notebook-level `kmeans` and `model` here would leave `model` pointing at the
# k=9 fit, so re-running the earlier prediction cells would silently use the
# wrong model.
for k in range(2, 10):
    kmeans_k = KMeans(featuresCol='features', k=k, seed=1)
    model_k = kmeans_k.fit(df_scaled)
    pred = model_k.transform(df_scaled)
    score = evaluator.evaluate(pred)
    print(f"k = {k}, Silhouette Score = {score:.4f}")


k = 2, Silhouette Score = 0.6463
k = 3, Silhouette Score = 0.4681
k = 4, Silhouette Score = 0.3617
k = 5, Silhouette Score = 0.4128
k = 6, Silhouette Score = 0.3776
k = 7, Silhouette Score = 0.4069
k = 8, Silhouette Score = 0.3820
k = 9, Silhouette Score = 0.3841
