In [None]:
!pip install PySpark

In [None]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Local, single-JVM Spark configuration for this notebook.
conf = SparkConf().setAppName('abc').setMaster('local')

# getOrCreate (rather than the bare SparkContext constructor) reuses an
# already-running context, so this cell can be re-executed without raising
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sc.setLogLevel('ERROR')

# The session attaches to the context above; the former `.config('', '')`
# call only registered an empty-named option and has been dropped.
spark = SparkSession.builder.appName('abc').getOrCreate()

In [None]:
from google.colab import drive

# Mount Google Drive so files under MyDrive are reachable from this runtime.
drive.mount('/content/drive')

# Read the CSV with no header option set and with type inference enabled;
# later cells address the columns by the names _c0, _c1, ...
iris_csv_path = '/content/drive/MyDrive/ColabInputs/iris.csv'
df = spark.read.csv(iris_csv_path, header=None, inferSchema=True)

# Preview the first five rows without truncating cell contents.
df.show(n=5, truncate=False)

In [None]:
# Display the column names of the loaded DataFrame.
df.columns

In [None]:
# Display the number of rows in the loaded DataFrame.
df.count()

In [None]:
# Print the DataFrame schema (column types were inferred at load time via inferSchema=True).
df.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Columns that are packed into the single vector column consumed by the ML stages.
feature_columns = ['_c0', '_c1', '_c2', '_c3']

# Combine the individual columns into one "features" vector per row.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data_df = assembler.transform(df)

# Preview the first five assembled rows without truncation.
data_df.show(n=5, truncate=False)

In [None]:
# Scale the assembled "features" vectors into a new "scaled_features" column.
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# This cell overwrites `data_df` with its own output, so on a re-run the input
# would already contain "scaled_features" and transform() would fail because the
# output column exists. Dropping it first (a no-op on the first run, when the
# column is absent) keeps the cell idempotent.
unscaled_df = data_df.drop("scaled_features")
scaler_model = scaler.fit(unscaled_df)
data_df = scaler_model.transform(unscaled_df)

# Preview the first five rows, including the scaled vectors, without truncation.
data_df.show(5, truncate=False)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Define a 2-cluster K-means model over the scaled feature vectors.
# K-means initialisation is randomised; an explicit seed keeps the fitted
# centroids, and everything derived from them below, reproducible across runs.
kmeans = KMeans(k=2, seed=42, featuresCol="scaled_features", predictionCol="cluster")
kmeans_model = kmeans.fit(data_df)

# Assign every row to a cluster; the index is written to the "cluster" column.
clustered_data = kmeans_model.transform(data_df)

In [None]:
# Evaluate the clustering produced above. With metricName='silhouette' the
# evaluator returns the silhouette score, NOT the within-set sum of squared
# errors, so the result is named and reported as such.
evaluator = ClusteringEvaluator(predictionCol='cluster', featuresCol='scaled_features', metricName='silhouette', distanceMeasure='squaredEuclidean')
silhouette = evaluator.evaluate(clustered_data)
print(f"Silhouette score (squared Euclidean) = {silhouette}")

In [None]:
# Collect the clustered rows into a pandas DataFrame so matplotlib can plot them.
clustered_data_pd = clustered_data.toPandas()

# Scatter the first two columns against each other, coloured by assigned cluster.
fig, ax = plt.subplots()
points = ax.scatter(
    clustered_data_pd["_c0"],
    clustered_data_pd["_c1"],
    c=clustered_data_pd["cluster"],
    cmap='viridis',
)
ax.set_xlabel("SepalLengthCm")
ax.set_ylabel("SepalWidthCm")
ax.set_title("K-means Clustering with PySpark MLlib")

# Colour bar mapping colours back to cluster indices.
cluster_bar = fig.colorbar(points, ax=ax)
cluster_bar.set_label("Cluster")
plt.show()

In [None]:
# Fit K-means for every K from 2 to 10 (inclusive) and record each model's
# silhouette score.
# NOTE: despite its name, `wssse_values` holds silhouette scores, not WSSSE;
# the name is kept because the plotting cell reads this variable.
wssse_values = []

# These models use KMeans' default output column, hence predictionCol='prediction'.
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='scaled_features', metricName='silhouette', distanceMeasure='squaredEuclidean')

for k in range(2, 11):
    # Fixed seed so the sweep gives the same scores on every run.
    kmeans_k = KMeans(featuresCol='scaled_features', k=k, seed=42)
    model_k = kmeans_k.fit(data_df)
    predictions_k = model_k.transform(data_df)
    score = evaluator.evaluate(predictions_k)
    wssse_values.append(score)
    print("Silhouette Score:", score)

In [None]:
# Plot the scores collected by the K sweep. `wssse_values` contains silhouette
# scores (it is filled from a silhouette ClusteringEvaluator), so the axis label
# and title say so rather than claiming WSSSE / an elbow curve.
# The sweep starts at K=2; deriving the x-range from the list length keeps the
# axes aligned with the data if the sweep bounds change.
k_values = range(2, 2 + len(wssse_values))
plt.plot(k_values, wssse_values)
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. Number of Clusters (K)')
plt.grid()
plt.show()