In [1]:
# Imports & Spark setup
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.types import FloatType, StringType
from pyspark.sql import Row
from pyspark.sql.functions import broadcast, udf
import numpy as np
from tools import setup_spark_config, read_parquet_files

# Start Spark through the project helper from `tools`; it hands back two
# objects that the rest of the notebook uses as `sc` and `spark`.
# NOTE(review): the string is presumably used as the Spark application name -
# confirm in tools.setup_spark_config.
APP_NAME = "Clustering Million Song Dataset"
sc, spark = setup_spark_config(APP_NAME)

In [2]:
# Load the parsed songs into a DataFrame.
# `basedir` is a relative path, so it is resolved against the working
# directory the notebook kernel was started in.
basedir = "parsed_songs"
songs_df = read_parquet_files(basedir, spark)

Reading songs from parquet files to DataFrame


In [3]:
# Preview the raw song rows (20 rows, long cell values truncated - the defaults).
songs_df.show(n=20, truncate=True)

+------------+------+--------+----------+----+
|danceability|energy|loudness|   hotness|year|
+------------+------+--------+----------+----+
|         0.0|   0.0|  -9.636|0.54795295|2008|
|         0.0|   0.0| -11.061|0.47563848|2004|
|         0.0|   0.0|  -4.264| 0.7883882|1982|
|         0.0|   0.0|  -4.707|  0.681092|2004|
|         0.0|   0.0|  -4.523|0.40148672|2005|
|         0.0|   0.0|  -4.076| 0.6878737|2004|
|         0.0|   0.0|  -3.312|0.35528553|2001|
|         0.0|   0.0| -25.651|0.21508032|1982|
|         0.0|   0.0|  -6.052|0.87222904|2000|
|         0.0|   0.0| -15.433| 0.5968407|1981|
|         0.0|   0.0|  -4.325| 0.6248335|2007|
|         0.0|   0.0|  -5.193|0.42744657|2008|
|         0.0|   0.0|  -6.712|       0.0|2004|
|         0.0|   0.0|   -4.13| 0.4871122|2007|
|         0.0|   0.0|  -7.687|0.28848165|1978|
|         0.0|   0.0|  -7.687| 0.5675917|1995|
|         0.0|   0.0|  -21.82|0.50403434|2000|
|         0.0|   0.0|  -5.548|    0.5764|2005|
|         0.0

In [4]:
# Pack the numeric song attributes into a single vector column ("features"),
# which is the input format the clustering step consumes.
# `year` is deliberately left out of the list, so it does not influence clustering.
input_cols = ["danceability", "energy", "loudness", "hotness"]
vecAssembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features",
)
vec_df = vecAssembler.transform(songs_df)

In [5]:
# Preview the songs together with their assembled "features" vector.
vec_df.show(n=20, truncate=True)

+------------+------+--------+----------+----+--------------------+
|danceability|energy|loudness|   hotness|year|            features|
+------------+------+--------+----------+----+--------------------+
|         0.0|   0.0|  -9.636|0.54795295|2008|[0.0,0.0,-9.63599...|
|         0.0|   0.0| -11.061|0.47563848|2004|[0.0,0.0,-11.0609...|
|         0.0|   0.0|  -4.264| 0.7883882|1982|[0.0,0.0,-4.26399...|
|         0.0|   0.0|  -4.707|  0.681092|2004|[0.0,0.0,-4.70699...|
|         0.0|   0.0|  -4.523|0.40148672|2005|[0.0,0.0,-4.52299...|
|         0.0|   0.0|  -4.076| 0.6878737|2004|[0.0,0.0,-4.07600...|
|         0.0|   0.0|  -3.312|0.35528553|2001|[0.0,0.0,-3.31200...|
|         0.0|   0.0| -25.651|0.21508032|1982|[0.0,0.0,-25.6509...|
|         0.0|   0.0|  -6.052|0.87222904|2000|[0.0,0.0,-6.05200...|
|         0.0|   0.0| -15.433| 0.5968407|1981|[0.0,0.0,-15.4329...|
|         0.0|   0.0|  -4.325| 0.6248335|2007|[0.0,0.0,-4.32499...|
|         0.0|   0.0|  -5.193|0.42744657|2008|[0

In [6]:
# Fit a KMeans model on the assembled feature vectors.
#
# The number of clusters is a modelling choice, not a property of the feature
# list: it has to equal the number of genre labels that are assigned to the
# centroids later in the notebook (four). Deriving it from len(input_cols)
# only worked because both numbers happen to be 4, and would silently change
# the clustering as soon as a feature is added or removed.
n_clusters = 4
kmeans = KMeans(k=n_clusters, seed=1)  # fixed seed for the random initialisation
model = kmeans.fit(vec_df.select('features'))

In [7]:
# Assign every song to its nearest centroid: the fitted model adds a
# "prediction" column holding the cluster id next to the existing columns.
transformed_df = model.transform(vec_df)

In [8]:
# Preview the songs with their predicted cluster id.
transformed_df.show(n=20, truncate=True)

+------------+------+--------+----------+----+--------------------+----------+
|danceability|energy|loudness|   hotness|year|            features|prediction|
+------------+------+--------+----------+----+--------------------+----------+
|         0.0|   0.0|  -9.636|0.54795295|2008|[0.0,0.0,-9.63599...|         3|
|         0.0|   0.0| -11.061|0.47563848|2004|[0.0,0.0,-11.0609...|         3|
|         0.0|   0.0|  -4.264| 0.7883882|1982|[0.0,0.0,-4.26399...|         0|
|         0.0|   0.0|  -4.707|  0.681092|2004|[0.0,0.0,-4.70699...|         0|
|         0.0|   0.0|  -4.523|0.40148672|2005|[0.0,0.0,-4.52299...|         0|
|         0.0|   0.0|  -4.076| 0.6878737|2004|[0.0,0.0,-4.07600...|         0|
|         0.0|   0.0|  -3.312|0.35528553|2001|[0.0,0.0,-3.31200...|         0|
|         0.0|   0.0| -25.651|0.21508032|1982|[0.0,0.0,-25.6509...|         2|
|         0.0|   0.0|  -6.052|0.87222904|2000|[0.0,0.0,-6.05200...|         0|
|         0.0|   0.0| -15.433| 0.5968407|1981|[0.0,0

In [9]:
# Build a small DataFrame with one row per centroid.
#
# model.clusterCenters() yields one coordinate array per cluster, with the
# coordinates in the same order as the VectorAssembler input columns.
cluster_centers = model.clusterCenters()

# Transpose to one list per feature column, then append the list of centroid
# ids as a final "column". The ids must be derived from the number of cluster
# centers - using the number of feature columns is only correct when the two
# happen to be equal, otherwise zip() below drops or mislabels centroids.
centroids = np.array(cluster_centers).T.tolist()
centroids.append(list(range(len(cluster_centers))))

# Column names follow input_cols so the schema stays in sync with the features.
R = Row(*(input_cols + ["centroid"]))
centroids_df = spark.createDataFrame([R(*r) for r in zip(*centroids)])

In [10]:
# Show the coordinates of every centroid.
centroids_df.show(n=20, truncate=True)

+------------+------+-------------------+-------------------+--------+
|danceability|energy|           loudness|            hotness|centroid|
+------------+------+-------------------+-------------------+--------+
|         0.0|   0.0|  -5.45759555498759|0.49935485538509156|       0|
|         0.0|   0.0|-14.927767576612869| 0.3679735965019948|       1|
|         0.0|   0.0|-25.208220417216673|0.36172923887685193|       2|
|         0.0|   0.0| -9.693064374493133| 0.4231813657638777|       3|
+------------+------+-------------------+-------------------+--------+



In [11]:
# Fictional genre label per centroid: the position in this list is the
# centroid id the label belongs to.
# NOTE(review): the labels are bound to cluster ids, so they should be
# re-checked against the centroid coordinates whenever the model is re-fitted.
genres = ["hot&loud", "plain", "mellow&soft", "mainstream"]


def add_genre(centroid):
    """Return the genre label stored in `genres` at index `centroid`."""
    genre_index = int(centroid)
    return genres[genre_index]


# Wrap the lookup as a Spark UDF returning a string column.
udf_add_genre = udf(add_genre, StringType())

# Keep only the centroid id -> genre mapping.
genres_df = (
    centroids_df
    .withColumn("genre", udf_add_genre("centroid"))
    .select("centroid", "genre")
)

In [12]:
# Show the centroid id -> genre mapping.
genres_df.show(n=20, truncate=True)

+--------+-----------+
|centroid|      genre|
+--------+-----------+
|       0|   hot&loud|
|       1|      plain|
|       2|mellow&soft|
|       3| mainstream|
+--------+-----------+



In [13]:
# Label every song with the genre of the centroid it was assigned to.
# genres_df has only one row per centroid, so it is passed with a broadcast hint.
song_genres_df = (
    transformed_df
    .join(
        broadcast(genres_df),
        on=transformed_df.prediction == genres_df.centroid,
        how="inner",
    )
    .select("danceability", "energy", "loudness", "hotness", "year", "genre")
)

In [14]:
# Preview the songs with their genre label.
song_genres_df.show(n=20, truncate=True)

+------------+------+--------+----------+----+-----------+
|danceability|energy|loudness|   hotness|year|      genre|
+------------+------+--------+----------+----+-----------+
|         0.0|   0.0|  -9.636|0.54795295|2008| mainstream|
|         0.0|   0.0| -11.061|0.47563848|2004| mainstream|
|         0.0|   0.0|  -4.264| 0.7883882|1982|   hot&loud|
|         0.0|   0.0|  -4.707|  0.681092|2004|   hot&loud|
|         0.0|   0.0|  -4.523|0.40148672|2005|   hot&loud|
|         0.0|   0.0|  -4.076| 0.6878737|2004|   hot&loud|
|         0.0|   0.0|  -3.312|0.35528553|2001|   hot&loud|
|         0.0|   0.0| -25.651|0.21508032|1982|mellow&soft|
|         0.0|   0.0|  -6.052|0.87222904|2000|   hot&loud|
|         0.0|   0.0| -15.433| 0.5968407|1981|      plain|
|         0.0|   0.0|  -4.325| 0.6248335|2007|   hot&loud|
|         0.0|   0.0|  -5.193|0.42744657|2008|   hot&loud|
|         0.0|   0.0|  -6.712|       0.0|2004|   hot&loud|
|         0.0|   0.0|   -4.13| 0.4871122|2007|   hot&lou

In [15]:
# Register the genre-labelled songs as the temp view "songs_with_genres"
# so they can be queried through spark.sql.
song_genres_df.createOrReplaceTempView("songs_with_genres")

In [16]:
# Count the songs of every genre with a single aggregation, instead of
# running one separate COUNT(*) query (and Spark job) per genre.
genre_counts = {
    row["genre"]: row["n_songs"]
    for row in spark.sql(
        "SELECT genre, COUNT(*) AS n_songs FROM songs_with_genres GROUP BY genre"
    ).collect()
}

# A genre that received no songs does not appear in the GROUP BY result,
# so fall back to 0 to match what a filtered COUNT(*) would report.
n_mainstream_songs = genre_counts.get("mainstream", 0)
n_hotnloud_songs = genre_counts.get("hot&loud", 0)
n_mellownsoft_songs = genre_counts.get("mellow&soft", 0)
n_plain_songs = genre_counts.get("plain", 0)

print("There are %d mainstream songs in the dataset" % (n_mainstream_songs))
print("There are %d hot&loud songs in the dataset" % (n_hotnloud_songs))
print("There are %d mellow&soft songs in the dataset" % (n_mellownsoft_songs))
print("There are %d plain songs in the dataset" % (n_plain_songs))

There are 1041 mainstream songs in the dataset
There are 1350 hot&loud songs in the dataset
There are 118 mellow&soft songs in the dataset
There are 555 plain songs in the dataset
