In [1]:
# Imports & Spark setup
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from tools import setup_spark_config, read_parquet_files

# Create the two Spark handles used by the rest of the notebook via the
# project helper in `tools`. The string is presumably the Spark application
# name -- TODO confirm against tools.setup_spark_config.
sc, spark = setup_spark_config("Clustering Million Song Dataset")

In [2]:
# Read the songs data from parquet files into the DataFrame `songs_df`.
# `basedir` is a relative path -- NOTE(review): confirm which working
# directory read_parquet_files resolves it against.
basedir = 'parsed_songs'
songs_df = read_parquet_files(basedir, spark)

Reading songs from parquet files to DataFrame


In [3]:
# Preview the loaded songs (first 20 rows, long cell values truncated).
songs_df.show(n=20, truncate=True)

+------------+------+--------+----------+----+
|danceability|energy|loudness|   hotness|year|
+------------+------+--------+----------+----+
|         0.0|   0.0|  -9.636|0.54795295|2008|
|         0.0|   0.0| -11.061|0.47563848|2004|
|         0.0|   0.0|  -4.264| 0.7883882|1982|
|         0.0|   0.0|  -4.707|  0.681092|2004|
|         0.0|   0.0|  -4.523|0.40148672|2005|
|         0.0|   0.0|  -4.076| 0.6878737|2004|
|         0.0|   0.0|  -3.312|0.35528553|2001|
|         0.0|   0.0| -25.651|0.21508032|1982|
|         0.0|   0.0|  -6.052|0.87222904|2000|
|         0.0|   0.0| -15.433| 0.5968407|1981|
|         0.0|   0.0|  -4.325| 0.6248335|2007|
|         0.0|   0.0|  -5.193|0.42744657|2008|
|         0.0|   0.0|  -6.712|       0.0|2004|
|         0.0|   0.0|   -4.13| 0.4871122|2007|
|         0.0|   0.0|  -7.687|0.28848165|1978|
|         0.0|   0.0|  -7.687| 0.5675917|1995|
|         0.0|   0.0|  -21.82|0.50403434|2000|
|         0.0|   0.0|  -5.548|    0.5764|2005|
|         0.0

In [4]:
# Pack the four numeric song attributes into a single vector column called
# "features" so they can be fed to KMeans. Rows stay one-per-song; "year" is
# carried along as an ordinary column and is not part of the vector.
vecAssembler = VectorAssembler(
    inputCols=["danceability", "energy", "loudness", "hotness"],
    outputCol="features",
)
vec_df = vecAssembler.transform(songs_df)

In [5]:
# Preview the assembled frame, including the new "features" vector column
# (first 20 rows, long cell values truncated).
vec_df.show(n=20, truncate=True)

+------------+------+--------+----------+----+--------------------+
|danceability|energy|loudness|   hotness|year|            features|
+------------+------+--------+----------+----+--------------------+
|         0.0|   0.0|  -9.636|0.54795295|2008|[0.0,0.0,-9.63599...|
|         0.0|   0.0| -11.061|0.47563848|2004|[0.0,0.0,-11.0609...|
|         0.0|   0.0|  -4.264| 0.7883882|1982|[0.0,0.0,-4.26399...|
|         0.0|   0.0|  -4.707|  0.681092|2004|[0.0,0.0,-4.70699...|
|         0.0|   0.0|  -4.523|0.40148672|2005|[0.0,0.0,-4.52299...|
|         0.0|   0.0|  -4.076| 0.6878737|2004|[0.0,0.0,-4.07600...|
|         0.0|   0.0|  -3.312|0.35528553|2001|[0.0,0.0,-3.31200...|
|         0.0|   0.0| -25.651|0.21508032|1982|[0.0,0.0,-25.6509...|
|         0.0|   0.0|  -6.052|0.87222904|2000|[0.0,0.0,-6.05200...|
|         0.0|   0.0| -15.433| 0.5968407|1981|[0.0,0.0,-15.4329...|
|         0.0|   0.0|  -4.325| 0.6248335|2007|[0.0,0.0,-4.32499...|
|         0.0|   0.0|  -5.193|0.42744657|2008|[0

In [6]:
# Fit a 10-cluster KMeans model on the per-song feature vectors; the fixed
# seed keeps the initialisation reproducible between runs.
# NOTE(review): in the rows displayed above danceability and energy are all
# 0.0 and the features are not scaled, so loudness likely dominates the
# distance -- consider checking those columns and standardising first.
kmeans = KMeans(
    featuresCol="features",
    predictionCol="prediction",
    k=10,
    seed=1,
)
model = kmeans.fit(vec_df.select('features'))

In [7]:
# Assign every song to one of the fitted model's clusters. The result keeps
# all columns of vec_df and adds an integer "prediction" column holding the
# cluster index. The rows are individual songs, not data grouped by year.
transformed = model.transform(vec_df)

In [8]:
# Preview the clustered songs with their "prediction" column
# (first 20 rows, long cell values truncated).
transformed.show(n=20, truncate=True)

+------------+------+--------+----------+----+--------------------+----------+
|danceability|energy|loudness|   hotness|year|            features|prediction|
+------------+------+--------+----------+----+--------------------+----------+
|         0.0|   0.0|  -9.636|0.54795295|2008|[0.0,0.0,-9.63599...|         0|
|         0.0|   0.0| -11.061|0.47563848|2004|[0.0,0.0,-11.0609...|         9|
|         0.0|   0.0|  -4.264| 0.7883882|1982|[0.0,0.0,-4.26399...|         2|
|         0.0|   0.0|  -4.707|  0.681092|2004|[0.0,0.0,-4.70699...|         7|
|         0.0|   0.0|  -4.523|0.40148672|2005|[0.0,0.0,-4.52299...|         2|
|         0.0|   0.0|  -4.076| 0.6878737|2004|[0.0,0.0,-4.07600...|         2|
|         0.0|   0.0|  -3.312|0.35528553|2001|[0.0,0.0,-3.31200...|         2|
|         0.0|   0.0| -25.651|0.21508032|1982|[0.0,0.0,-25.6509...|         1|
|         0.0|   0.0|  -6.052|0.87222904|2000|[0.0,0.0,-6.05200...|         7|
|         0.0|   0.0| -15.433| 0.5968407|1981|[0.0,0