In [157]:
!pip install pyspark

In [158]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Clustering")
    .getOrCreate()
)

In [159]:
# Load the e-commerce CSV: the first row is used as the header and column
# types are inferred from the file contents.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../input/pyspark-ml-clustering-dataset/e-commerce.csv")
)

In [160]:
# Discard every row that contains at least one null value.
data = data.dropna()

In [161]:
# Preview the first rows. We will keep the Quantity, UnitPrice, CustomerID
# and Country columns.
data.show(n=20, truncate=True)

In [162]:
# Print the DataFrame's column names and data types as a tree.
data.printSchema()

In [163]:
# from pyspark.sql.types import *

# data = data.withColumn("InvoiceNo", data["InvoiceNo"].cast(IntegerType()))
# print(data.printSchema())
# data.limit(3).toPandas()

In [164]:
# Count rows per country inside Spark and bring only the small aggregated
# result to the driver, instead of collecting the whole Country column into
# pandas just to count it. Countries are listed from most to least frequent.
(
    data.groupBy("Country")
    .count()
    .orderBy("count", ascending=False)
    .toPandas()
)

In [165]:
from pyspark.ml.feature import StringIndexer

# Indexers that map the string columns Country and InvoiceNo to numeric
# index columns (CountryIndex and InvoiceIndex).
country_indexer = (
    StringIndexer()
    .setInputCol("Country")
    .setOutputCol("CountryIndex")
)
invoice_indexer = (
    StringIndexer()
    .setInputCol("InvoiceNo")
    .setOutputCol("InvoiceIndex")
)

In [166]:
# Fit each indexer on the current frame, then append its index column.
country_index_model = country_indexer.fit(data)
data = country_index_model.transform(data)

invoice_index_model = invoice_indexer.fit(data)
data = invoice_index_model.transform(data)

data.show(5)

In [167]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single "features" vector used for clustering.
# NOTE(review): InvoiceIndex and CustomerID are identifiers rather than
# measurements; confirm that clustering on them is intended.
feature_columns = [
    "InvoiceIndex",
    "Quantity",
    "UnitPrice",
    "CustomerID",
    "CountryIndex",
]

assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
    handleInvalid="keep",
)

data = assembler.transform(data)

# Show the full, untruncated contents so the assembled vectors are readable.
data.show(truncate=False)

In [168]:
from pyspark.ml.feature import StandardScaler

# Standardise the feature vector: centre each component on its mean and
# scale it to unit standard deviation.
scaler = (
    StandardScaler(withMean=True, withStd=True)
    .setInputCol("features")
    .setOutputCol("scaled_features")
)

# The scaler has to be fitted first; the fitted model then adds the
# "scaled_features" column.
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

data.show()

In [169]:
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans

# Elbow method: fit KMeans for a range of cluster counts and record the
# training cost (inertia) of each fit so the curve can be inspected.
k_values = range(2, 31)

# Fixed seed so the centroid initialisation, and therefore the inertia
# curve, is reproducible across kernel restarts.
kmeans_seed = 42

inercias = list()

for k in k_values:
    kmeans = KMeans(featuresCol = "scaled_features",
                    predictionCol = "cluster",
                    k = k,
                    distanceMeasure = "euclidean",
                    seed = kmeans_seed)

    model = kmeans.fit(data)

    # trainingCost is the sum of squared distances from each point to its
    # nearest centroid.
    inercias.append(model.summary.trainingCost)

print(inercias)

# Plot inertia against k; the same k_values drives the loop and the x axis,
# so the two cannot drift apart.
fig, ax = plt.subplots(figsize = (12, 8))
ax.plot(k_values, inercias, color = "red", marker = "*")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia (training cost)")
ax.set_title("KMeans elbow curve")
plt.show()

In [182]:
# Group the data by customer: start again from the raw CSV (header row,
# inferred column types).
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../input/pyspark-ml-clustering-dataset/e-commerce.csv")
)

In [183]:
# Preview the first rows of the freshly loaded frame.
data.show(n=20, truncate=True)

In [184]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Line total per row (Quantity x UnitPrice), computed with a native Spark
# column expression instead of a Python UDF: the product is evaluated inside
# Spark without shipping every row to a Python worker, and a null operand
# yields null instead of raising a TypeError inside the lambda.
# The cast keeps the column as a double, matching the previous UDF's
# declared return type.
data = data.withColumn(
    "precio_producto",
    (data["Quantity"] * data["UnitPrice"]).cast("double"),
)

data.show()

In [185]:
# Collapse the rows to one per (customer, invoice, country), summing the
# quantity and the line totals of each group.
invoice_groups = data.groupBy("CustomerID", "InvoiceNo", "Country")
data = invoice_groups.agg({"Quantity": "sum", "precio_producto": "sum"})

data.show()

In [186]:
# Keep only the groups whose summed line value is positive.
# The column is referenced explicitly by name: the SQL string
# "sum(precio_producto) > 0" is parsed as a call to the SUM aggregate rather
# than as a reference to the column called "sum(precio_producto)", which makes
# the filter depend on how Spark resolves that aggregate.
data = data.filter(data["sum(precio_producto)"] > 0)

data.show()

In [187]:
# Remove every aggregated row that still contains a null value.
data = data.dropna()

data.show()

In [191]:
# Encode the Country string column as a numeric CountryIndex column.
country_indexer = (
    StringIndexer()
    .setInputCol("Country")
    .setOutputCol("CountryIndex")
)

country_index_model = country_indexer.fit(data)
data = country_index_model.transform(data)

data.show()

In [192]:
# Pack the two aggregated totals and the country index into one
# "features" vector.
aggregated_feature_columns = [
    "sum(precio_producto)",
    "sum(Quantity)",
    "CountryIndex",
]
assembler = VectorAssembler(
    inputCols=aggregated_feature_columns,
    outputCol="features",
)

data = assembler.transform(data)

data.show()

In [193]:
from pyspark.ml.feature import StandardScaler

# Standardise the assembled features. Either withStd or withMean can be used
# on its own, or both together; here both are enabled, so each component is
# centred on its mean and scaled to unit standard deviation.
scaler = (
    StandardScaler(withMean=True, withStd=True)
    .setInputCol("features")           # column to transform
    .setOutputCol("scaled_features")   # name given to the output column
)

# The scaler must be fitted before it can transform the data.
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

data.show()

In [194]:
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans

# Elbow method on the per-invoice aggregates: fit KMeans for a range of
# cluster counts and record the training cost (inertia) of each fit.
k_values = range(2, 31)

# Fixed seed so the centroid initialisation, and therefore the inertia
# curve, is reproducible across kernel restarts.
kmeans_seed = 42

inercias = list()

for k in k_values:
    kmeans = KMeans(featuresCol = "scaled_features",
                    predictionCol = "cluster",
                    k = k,
                    distanceMeasure = "euclidean",
                    seed = kmeans_seed)

    model = kmeans.fit(data)

    # trainingCost is the sum of squared distances from each point to its
    # nearest centroid.
    inercias.append(model.summary.trainingCost)

print(inercias)

# Plot inertia against k; the same k_values drives the loop and the x axis,
# so the two cannot drift apart.
fig, ax = plt.subplots(figsize = (12, 8))
ax.plot(k_values, inercias, color = "red", marker = "*")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia (training cost)")
ax.set_title("KMeans elbow curve (per-invoice aggregates)")
plt.show()