In [15]:
import pandas as pd

X = pd.read_csv('data/breadbasket_features.csv')
print(X.shape)
X.head()

(9531, 13)


Unnamed: 0,Transaction,OTHERS,beverage,bread,breakfast,breakfast_pastry,condiments,dessert,kids,lunch,snacks,hour,day
0,1,0,0,1,0,0,0,0,0,0,0,9,6
1,2,0,0,2,0,0,0,0,0,0,0,10,6
2,3,0,1,0,0,0,1,1,0,0,0,10,6
3,4,0,0,0,0,1,0,0,0,0,0,10,6
4,5,0,1,1,0,1,0,0,0,0,0,10,6


In [2]:
# Prepare a SparkContext
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [3]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession

spark = SparkSession(sc)
XX = spark.createDataFrame(X)

In [6]:
XX.show(3)

+-----------+------+--------+-----+---------+----------------+----------+-------+----+-----+------+----+---+
|Transaction|OTHERS|beverage|bread|breakfast|breakfast_pastry|condiments|dessert|kids|lunch|snacks|hour|day|
+-----------+------+--------+-----+---------+----------------+----------+-------+----+-----+------+----+---+
|          1|     0|       0|    1|        0|               0|         0|      0|   0|    0|     0|   9|  6|
|          2|     0|       0|    2|        0|               0|         0|      0|   0|    0|     0|  10|  6|
|          3|     0|       1|    0|        0|               0|         1|      1|   0|    0|     0|  10|  6|
+-----------+------+--------+-----+---------+----------------+----------+-------+----+-----+------+----+---+
only showing top 3 rows



In [9]:
XX_df = VectorAssembler(inputCols=XX.columns, outputCol="Features").transform(XX)

In [11]:
XX_df.show(3)

+-----------+------+--------+-----+---------+----------------+----------+-------+----+-----+------+----+---+--------------------+
|Transaction|OTHERS|beverage|bread|breakfast|breakfast_pastry|condiments|dessert|kids|lunch|snacks|hour|day|            Features|
+-----------+------+--------+-----+---------+----------------+----------+-------+----+-----+------+----+---+--------------------+
|          1|     0|       0|    1|        0|               0|         0|      0|   0|    0|     0|   9|  6|(13,[0,3,11,12],[...|
|          2|     0|       0|    2|        0|               0|         0|      0|   0|    0|     0|  10|  6|(13,[0,3,11,12],[...|
|          3|     0|       1|    0|        0|               0|         1|      1|   0|    0|     0|  10|  6|(13,[0,2,6,7,11,1...|
+-----------+------+--------+-----+---------+----------------+----------+-------+----+-----+------+----+---+--------------------+
only showing top 3 rows



In [12]:
X_data = XX_df.select('Features')


In [14]:
X_data.head(3)

[Row(Features=SparseVector(13, {0: 1.0, 3: 1.0, 11: 9.0, 12: 6.0})),
 Row(Features=SparseVector(13, {0: 2.0, 3: 2.0, 11: 10.0, 12: 6.0})),
 Row(Features=SparseVector(13, {0: 3.0, 2: 1.0, 6: 1.0, 7: 1.0, 11: 10.0, 12: 6.0}))]

In [17]:
from pyspark.mllib.clustering import KMeans

# Prepare data
data = X_data.rdd.map(lambda x: x[0].toArray()) 

# Train the clusters
num_clusters = 7
clusters = KMeans.train(data, num_clusters, maxIterations=15, initializationMode="random")


In [21]:
labeled_spark = X.reset_index()
labeled_spark["label_spark"] = clusters.predict(data).collect()

labeled_spark.head()

Unnamed: 0,index,Transaction,OTHERS,beverage,bread,breakfast,breakfast_pastry,condiments,dessert,kids,lunch,snacks,hour,day,label_spark
0,0,1,0,0,1,0,0,0,0,0,0,0,9,6,1
1,1,2,0,0,2,0,0,0,0,0,0,0,10,6,1
2,2,3,0,1,0,0,0,1,1,0,0,0,10,6,1
3,3,4,0,0,0,0,1,0,0,0,0,0,10,6,1
4,4,5,0,1,1,0,1,0,0,0,0,0,10,6,1


In [22]:
labeled_spark["label_spark"].value_counts()

5    1576
4    1526
6    1413
0    1319
3    1255
2    1224
1    1218
Name: label_spark, dtype: int64