In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# PySpark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, PCA as PCA_spark
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col
from pyspark.ml.evaluation import ClusteringEvaluator

# Para visualización
import matplotlib.pyplot as plt

# Start the SparkSession for this notebook (an already-running one is reused)
spark = (
    SparkSession.builder
    .appName("KMeans_Iris_PySpark")
    .getOrCreate()
)

In [None]:
# Load the Iris dataset through seaborn; the result is a pandas DataFrame
# with the columns:
# ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
iris = sns.load_dataset("iris")

# Preview the first rows. Leaving the expression bare (instead of wrapping it
# in print()) lets Jupyter render the DataFrame as a rich table.
iris.head()

In [None]:
# Scatter plot of sepal length vs. sepal width, colored by the original species
plt.figure(figsize=(6, 5))
sns.scatterplot(
    data=iris, x="sepal_length", y="sepal_width", hue="species", palette="deep"
).set_title("Distribución de Iris por especie (Sepal)")  # scatterplot returns the Axes it drew on
plt.show()

In [None]:
# Turn the pandas DataFrame into a Spark DataFrame, then inspect its schema
# and its first five rows
spark_df = spark.createDataFrame(data=iris)
spark_df.printSchema()
spark_df.show(n=5)

In [None]:
# Numeric measurement columns used as clustering features
feature_cols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# Pack the feature columns into a single vector column named "features"
assembler = VectorAssembler().setInputCols(feature_cols).setOutputCol("features")
assembled_df = assembler.transform(spark_df)

assembled_df.printSchema()

In [None]:
# K-Means estimator with 3 clusters and a fixed seed, reading the "features" column
kmeans = KMeans().setFeaturesCol("features").setK(3).setSeed(42)

# Fit on the assembled data, then run the fitted model over the same rows
model = kmeans.fit(assembled_df)
predictions = model.transform(assembled_df)

# Inspect the resulting schema and the first five rows
predictions.printSchema()
predictions.show(n=5)

In [None]:
# Print the centroid of every cluster of the fitted model, one per line
centers = model.clusterCenters()
for cluster_id, centroid in enumerate(centers):
    print(f"Centroide del cluster {cluster_id}: {centroid}")

In [None]:
# Candidate cluster counts and the metrics collected for each of them
ks = list(range(2, 11))
wssse_values = []
silhouette_values = []

# Silhouette evaluator, created once and reused for every k
evaluator = ClusteringEvaluator(
    featuresCol="features",
    metricName="silhouette",
    distanceMeasure="squaredEuclidean"  # or "cosine"
)

for k in ks:
    # Loop-local names (suffix _k) are used on purpose: rebinding `kmeans`,
    # `model` and `predictions` here would silently replace the k=3 fit from
    # the earlier cell with the k=10 fit once the sweep finishes, so re-running
    # the centroid cell afterwards would print the wrong centroids.
    kmeans_k = KMeans(featuresCol="features", k=k, seed=42)
    model_k = kmeans_k.fit(assembled_df)

    # WSSSE (inertia) as reported by the training summary
    wssse = model_k.summary.trainingCost

    # The silhouette is evaluated on the transformed DataFrame
    predictions_k = model_k.transform(assembled_df)
    silhouette = evaluator.evaluate(predictions_k)

    # Append only after both metrics were computed so the lists stay aligned
    wssse_values.append(wssse)
    silhouette_values.append(silhouette)

# Report one line per k with its WSSSE and silhouette
for k, wssse, silhouette in zip(ks, wssse_values, silhouette_values):
    print(f"k={k} => WSSSE={wssse:.2f}, Silhouette={silhouette:.3f}")


In [None]:
# Two side-by-side panels: inertia (left) and silhouette (right) against k
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Panel 1: inertia, where the elbow heuristic is applied
axes[0].plot(ks, wssse_values, marker='o', color='b')
axes[0].set_xlabel("Número de clusters (k)")
axes[0].set_ylabel("WSSSE (Inercia)")
axes[0].set_title("Método del codo - Inercia")

# Panel 2: silhouette coefficient. This curve is read by its maximum rather
# than by an elbow, so the panel is not titled as an elbow-method plot.
axes[1].plot(ks, silhouette_values, marker='o', color='r')
axes[1].set_xlabel("Número de clusters (k)")
axes[1].set_ylabel("Coeficiente de Silueta")
axes[1].set_title("Coeficiente de Silueta por k")

plt.tight_layout()
plt.show()

In [None]:
# K-Means with k=2 (same seed as the other fits)
kmeans_2 = KMeans(k=2, seed=42, featuresCol="features")
model_2 = kmeans_2.fit(assembled_df)
predictions_2 = model_2.transform(assembled_df)

# Bring only the columns needed for the plot back into pandas
predictions_2_pd = predictions_2.select(
    "sepal_length",
    "sepal_width",
    "prediction",
).toPandas()

# Scatter plot (sepal_length vs. sepal_width) colored by assigned cluster
plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=predictions_2_pd, x="sepal_length", y="sepal_width", hue="prediction", palette="Set2"
).set_title("K-Means con 2 Clusters")  # scatterplot returns the Axes it drew on
plt.show()

In [None]:
# K-Means with k=3 (same seed as the other fits)
kmeans_3 = KMeans(k=3, seed=42, featuresCol="features")
model_3 = kmeans_3.fit(assembled_df)
predictions_3 = model_3.transform(assembled_df)

# Bring only the columns needed for the plot back into pandas
predictions_3_pd = predictions_3.select(
    "sepal_length",
    "sepal_width",
    "prediction",
).toPandas()

# Scatter plot (sepal_length vs. sepal_width) colored by assigned cluster
plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=predictions_3_pd, x="sepal_length", y="sepal_width", hue="prediction", palette="Set2"
).set_title("K-Means con 3 Clusters")  # scatterplot returns the Axes it drew on
plt.show()

In [None]:
# K-Means with k=4 (same seed as the other fits)
kmeans_4 = KMeans(k=4, seed=42, featuresCol="features")
model_4 = kmeans_4.fit(assembled_df)
predictions_4 = model_4.transform(assembled_df)

# Bring only the columns needed for the plot back into pandas
predictions_4_pd = predictions_4.select(
    "sepal_length",
    "sepal_width",
    "prediction",
).toPandas()

# Scatter plot (sepal_length vs. sepal_width) colored by assigned cluster
plt.figure(figsize=(6, 4))
sns.scatterplot(
    data=predictions_4_pd, x="sepal_length", y="sepal_width", hue="prediction", palette="Set2"
).set_title("K-Means con 4 Clusters")  # scatterplot returns the Axes it drew on
plt.show()

In [None]:
# Stop the SparkSession created in the first cell now that the analysis is done
spark.stop()