In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create a new
# session whose application name is "Clustering".
spark = (
    SparkSession.builder
    .appName("Clustering")
    .getOrCreate()
)

In [None]:
# Read the seeds CSV: the first row is used as the header and the column types
# are inferred from the values instead of defaulting to strings.
data = (
    spark.read
    .options(header = True, inferSchema = True)
    .csv("../input/clustering/seeds_dataset.csv")
)

In [None]:
# Preview the first 20 rows of the loaded dataset (long values are truncated).
data.show(n = 20, truncate = True)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns created by this notebook itself. They must never be fed back in as
# inputs, otherwise executing this cell a second time would try to assemble
# the previously generated vectors as well.
derived_cols = {"features", "scaled_features"}
feature_cols = [col for col in data.columns if col not in derived_cols]

# Pack every original column into a single vector column named "features".
assembler = VectorAssembler(inputCols = feature_cols,
                            outputCol = "features")

# Drop a "features" column left over from a previous run (a no-op the first
# time) so the cell can be re-executed without failing because the output
# column already exists.
data = assembler.transform(data.drop("features"))

In [None]:
# Preview the first 20 rows of the current DataFrame (long values are truncated).
data.show(n = 20, truncate = True)

## `StandardScaler`
`Documentación:` https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.StandardScaler.html

In [None]:
from pyspark.ml.feature import StandardScaler

# Standardise the "features" vector: withMean centres each component on zero
# and withStd scales it to unit standard deviation.
scaler = StandardScaler(inputCol = "features",
                        outputCol = "scaled_features",
                        withStd = True,
                        withMean = True)

# Remove a "scaled_features" column left over from a previous run (a no-op the
# first time); otherwise re-executing this cell fails because the output
# column already exists.
data = data.drop("scaled_features")

# Fit the scaler on the data and append the standardised vectors.
data = scaler.fit(data).transform(data)

data.show()

In [None]:
# Show only the standardised vectors, printing each one in full.
(
    data
    .select("scaled_features")
    .show(n = 20, truncate = False)
)

## `KMeans`

`Documentación:` https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.clustering.KMeans.html

In [None]:
from pyspark.ml.clustering import KMeans

# K-means with 3 clusters on the standardised features. The explicit seed pins
# the random centroid initialisation so the clusters (and the inertia reported
# further down) are reproducible across runs.
kmeans = KMeans(featuresCol = "scaled_features",
                predictionCol = "cluster",
                k = 3,
                distanceMeasure = "euclidean",
                seed = 42)

# Fit the clustering model on the full dataset.
model = kmeans.fit(data)

In [None]:
# Apply the fitted model to the data; the result is kept in `cluster`.
cluster = model.transform(dataset = data)

# Preview the first 20 rows of the result.
cluster.show(n = 20, truncate = True)

In [None]:
# Retrieve the cluster centers of the fitted model.
centers = model.clusterCenters()
# Bare expression: displayed as the cell output.
centers

In [None]:
# Training summary of the fitted model.
summary = model.summary

# Inertia: sum of the squared distances from every point to its nearest centroid
summary.trainingCost

## Ejercicio: Hacer el código para el método del codo en PySpark

In [None]:
import matplotlib.pyplot as plt

# Candidate numbers of clusters, defined once so the training loop and the
# x-axis of the plot cannot drift apart.
k_values = range(2, 31)

# Inertia (training cost) obtained for each candidate k, in the same order.
inercias = list()

for k in k_values:
    # Loop-local names on purpose: reusing `kmeans` / `model` here would
    # silently replace the k = 3 model that the earlier cells work with.
    # The fixed seed keeps the curve reproducible and comparable across k.
    kmeans_k = KMeans(featuresCol = "scaled_features",
                      predictionCol = "cluster",
                      k = k,
                      distanceMeasure = "euclidean",
                      seed = 42)

    model_k = kmeans_k.fit(data)

    # trainingCost is the sum of squared distances of the points to their
    # nearest centroid, i.e. the inertia plotted below.
    inercias.append(model_k.summary.trainingCost)

# Elbow plot: inertia as a function of the number of clusters.
plt.figure(figsize = (12, 8))
plt.plot(k_values, inercias, color = "blue", marker = "o")
plt.xlabel("K's")
plt.ylabel("Inercias")
plt.title("Elbow's Method")
plt.show()