In [1]:
# Imports & Spark setup
from tools import setup_spark_config, read_parquet_files
from pyspark.ml.feature import QuantileDiscretizer

# Start Spark for this notebook. The helper's two return values are unpacked
# as `sc` and `spark` (presumably a SparkContext and a SparkSession -- confirm
# in tools.setup_spark_config); `spark` is what the later cells use to load
# the data and to run SQL queries.
sc, spark = setup_spark_config("Grouping Million Song Dataset")

In [2]:
# read songs data from parquet files
# `basedir` names the directory holding the parsed dataset; the project helper
# read_parquet_files loads it into the DataFrame `songs_df` that every later
# cell builds on.
basedir = 'parsed-MillionSongSubset'
songs_df = read_parquet_files(basedir, spark)

Reading songs from parquet files to DataFrame


In [3]:
# preview the first rows of the songs DataFrame to check its columns
# (danceability, energy, loudness, hotness, year)
songs_df.show()

+------------+------+--------+----------+----+
|danceability|energy|loudness|   hotness|year|
+------------+------+--------+----------+----+
|         0.0|   0.0|  -9.636|0.54795295|2008|
|         0.0|   0.0| -11.061|0.47563848|2004|
|         0.0|   0.0|  -4.264| 0.7883882|1982|
|         0.0|   0.0|  -4.707|  0.681092|2004|
|         0.0|   0.0|  -4.523|0.40148672|2005|
|         0.0|   0.0|  -4.076| 0.6878737|2004|
|         0.0|   0.0|  -3.312|0.35528553|2001|
|         0.0|   0.0| -25.651|0.21508032|1982|
|         0.0|   0.0|  -6.052|0.87222904|2000|
|         0.0|   0.0| -15.433| 0.5968407|1981|
|         0.0|   0.0|  -4.325| 0.6248335|2007|
|         0.0|   0.0|  -5.193|0.42744657|2008|
|         0.0|   0.0|  -6.712|       0.0|2004|
|         0.0|   0.0|   -4.13| 0.4871122|2007|
|         0.0|   0.0|  -7.687|0.28848165|1978|
|         0.0|   0.0|  -7.687| 0.5675917|1995|
|         0.0|   0.0|  -21.82|0.50403434|2000|
|         0.0|   0.0|  -5.548|    0.5764|2005|
|         0.0

In [4]:
# register the DataFrame as the temp view "songs" so it can be queried by name
# through spark.sql
songs_df.createOrReplaceTempView("songs")

In [5]:
# group songs by year: one output row per distinct year, holding the mean
# danceability, energy, loudness and hotness of that year's songs.
# The query reads from the "songs" temp view, so that view must be registered
# before this cell runs.
grouped_year_songs_df = spark.sql("SELECT AVG(danceability), \
                              AVG(energy), AVG(loudness), \
                              AVG(hotness), year FROM songs \
                              GROUP BY year")

In [6]:
# the per-year aggregate has one row per year, so its row count is the number
# of distinct year values present in the data
n_years = grouped_year_songs_df.count()
print("There are songs from %d different years in the dataset" % n_years)

There are songs from 55 different years in the dataset


In [7]:
# display the per-year feature averages, most recent year first
grouped_year_songs_df.orderBy("year", ascending=False).show()

+-----------------+-----------+-------------------+-------------------+----+
|avg(danceability)|avg(energy)|      avg(loudness)|       avg(hotness)|year|
+-----------------+-----------+-------------------+-------------------+----+
|              0.0|        0.0| -7.762981476607146|  0.563357169981356|2010|
|              0.0|        0.0| -8.098129057756035| 0.5018193852356685|2009|
|              0.0|        0.0| -7.666779771447182|  0.499955424418052|2008|
|              0.0|        0.0| -7.863054733371261|0.44247311116451055|2007|
|              0.0|        0.0| -7.882186937554974|0.43157015413602934|2006|
|              0.0|        0.0| -8.571008605689839|0.44272370174013337|2005|
|              0.0|        0.0| -8.169778960002096|0.45394635710277054|2004|
|              0.0|        0.0| -8.583950278508729|0.45536839657396244|2003|
|              0.0|        0.0| -8.481409092744192| 0.4328537909596255|2002|
|              0.0|        0.0| -8.155470170191865|0.46835622725202075|2001|

In [8]:
# Linear correlation between the release year and each song-level feature,
# scaled to a percentage (stat.corr returns a coefficient in [-1, 1]).
#
# NOTE: a `nan` result means the coefficient is undefined, which happens when a
# column has no variance. Every danceability/energy value displayed earlier in
# this notebook is 0.0 -- TODO confirm those two columns are constant (i.e. not
# populated in the parsed subset).
danceability_correlation = 100 * float(songs_df.stat.corr("danceability", "year"))
energy_correlation = 100 * float(songs_df.stat.corr("energy", "year"))
loudness_correlation = 100 * float(songs_df.stat.corr("loudness", "year"))
hotness_correlation = 100 * float(songs_df.stat.corr("hotness", "year"))

print("%0.2f%% correlation between danceability and year" % (danceability_correlation))
print("%0.2f%% correlation between energy and year" % (energy_correlation))
print("%0.2f%% correlation between loudness and year" % (loudness_correlation))
# Label corrected: the value printed here is the hotness/year correlation
# (it was previously reported as "year and year").
print("%0.2f%% correlation between hotness and year" % (hotness_correlation))

nan% correlation between danceability and year
nan% correlation between energy and year
28.33% correlation between loudness and year
7.22% correlation between year and year


In [9]:
# Linear correlation between the year and the per-year AVERAGE of each feature,
# scaled to a percentage. This runs on the grouped DataFrame (one row per
# year), so each year contributes a single point regardless of how many songs
# it contains.
#
# NOTE: `nan` means the coefficient is undefined, which happens when a column
# has no variance; the avg(danceability)/avg(energy) values displayed earlier
# are all 0.0 -- TODO confirm those columns are constant.
danceability_correlation = 100 * float(grouped_year_songs_df.stat.corr("avg(danceability)", "year"))
energy_correlation = 100 * float(grouped_year_songs_df.stat.corr("avg(energy)", "year"))
loudness_correlation = 100 * float(grouped_year_songs_df.stat.corr("avg(loudness)", "year"))
hotness_correlation = 100 * float(grouped_year_songs_df.stat.corr("avg(hotness)", "year"))

print("%0.2f%% correlation between avg(danceability) and year" % (danceability_correlation))
print("%0.2f%% correlation between avg(energy) and year" % (energy_correlation))
print("%0.2f%% correlation between avg(loudness) and year" % (loudness_correlation))
# Label corrected: the value printed here is the avg(hotness)/year correlation
# (it was previously reported as "avg(year) and year").
print("%0.2f%% correlation between avg(hotness) and year" % (hotness_correlation))

nan% correlation between avg(danceability) and year
nan% correlation between avg(energy) and year
67.24% correlation between avg(loudness) and year
44.95% correlation between avg(year) and year


In [10]:
# Bucket the continuous hotness score into quantile-based bins and store the
# bin index in a new "discrete_hotness" column.
# numBuckets=10 is an upper bound: the grouped output later in this notebook
# lists only 9 distinct bucket values.
discretizer = QuantileDiscretizer(
    numBuckets=10,
    inputCol="hotness",
    outputCol="discrete_hotness",
)
# fit() learns the bucket boundaries from songs_df; transform() then appends
# the bucket column to the same DataFrame
discretized_df = discretizer.fit(songs_df).transform(songs_df)

In [11]:
# preview the songs with the new discrete_hotness bucket column appended
discretized_df.show()

+------------+------+--------+----------+----+----------------+
|danceability|energy|loudness|   hotness|year|discrete_hotness|
+------------+------+--------+----------+----+----------------+
|         0.0|   0.0|  -9.636|0.54795295|2008|             6.0|
|         0.0|   0.0| -11.061|0.47563848|2004|             5.0|
|         0.0|   0.0|  -4.264| 0.7883882|1982|             9.0|
|         0.0|   0.0|  -4.707|  0.681092|2004|             8.0|
|         0.0|   0.0|  -4.523|0.40148672|2005|             3.0|
|         0.0|   0.0|  -4.076| 0.6878737|2004|             8.0|
|         0.0|   0.0|  -3.312|0.35528553|2001|             3.0|
|         0.0|   0.0| -25.651|0.21508032|1982|             1.0|
|         0.0|   0.0|  -6.052|0.87222904|2000|             9.0|
|         0.0|   0.0| -15.433| 0.5968407|1981|             7.0|
|         0.0|   0.0|  -4.325| 0.6248335|2007|             7.0|
|         0.0|   0.0|  -5.193|0.42744657|2008|             4.0|
|         0.0|   0.0|  -6.712|       0.0

In [12]:
# register the discretized DataFrame as the temp view "discrete_hotness_songs"
# so it can be queried by name through spark.sql
discretized_df.createOrReplaceTempView("discrete_hotness_songs")

In [13]:
# group songs by hotness bucket: one output row per discrete_hotness value,
# holding the mean danceability, energy and loudness of the songs in it.
# The query reads from the "discrete_hotness_songs" temp view, so that view
# must be registered before this cell runs.
grouped_hotness_songs_df = spark.sql("SELECT AVG(danceability), \
                              AVG(energy), AVG(loudness), \
                              discrete_hotness FROM discrete_hotness_songs \
                              GROUP BY discrete_hotness")

In [14]:
# display the per-bucket feature averages, highest hotness bucket first
grouped_hotness_songs_df.orderBy("discrete_hotness", ascending=False).show()

+-----------------+-----------+-------------------+----------------+
|avg(danceability)|avg(energy)|      avg(loudness)|discrete_hotness|
+-----------------+-----------+-------------------+----------------+
|              0.0|        0.0| -7.300693565799344|             9.0|
|              0.0|        0.0| -7.845457530878727|             8.0|
|              0.0|        0.0|  -8.58589251499611|             7.0|
|              0.0|        0.0| -8.910549040713342|             6.0|
|              0.0|        0.0| -8.918314111156342|             5.0|
|              0.0|        0.0| -9.681052802419504|             4.0|
|              0.0|        0.0|-10.038973874515957|             3.0|
|              0.0|        0.0|-10.630196536207475|             2.0|
|              0.0|        0.0|-10.960292253485868|             1.0|
+-----------------+-----------+-------------------+----------------+



In [15]:
# Linear correlation (as a percentage) between the hotness bucket and each of
# the remaining song-level columns.
danceability_correlation = 100 * float(
    discretized_df.stat.corr("danceability", "discrete_hotness")
)
energy_correlation = 100 * float(
    discretized_df.stat.corr("energy", "discrete_hotness")
)
loudness_correlation = 100 * float(
    discretized_df.stat.corr("loudness", "discrete_hotness")
)
year_correlation = 100 * float(
    discretized_df.stat.corr("year", "discrete_hotness")
)

# report each coefficient with two decimals
print("%0.2f%% correlation between danceability and discretized hotness" % danceability_correlation)
print("%0.2f%% correlation between energy and discretized hotness" % energy_correlation)
print("%0.2f%% correlation between loudness and discretized hotness" % loudness_correlation)
print("%0.2f%% correlation between year and discretized hotness" % year_correlation)

nan% correlation between danceability and discretized hotness
nan% correlation between energy and discretized hotness
24.16% correlation between loudness and discretized hotness
7.17% correlation between year and discretized hotness


In [16]:
# Linear correlation (as a percentage) between the hotness bucket and the
# per-bucket AVERAGE of each feature. This runs on the grouped DataFrame, so
# there is one data point per bucket.
danceability_correlation = 100 * float(
    grouped_hotness_songs_df.stat.corr("avg(danceability)", "discrete_hotness")
)
energy_correlation = 100 * float(
    grouped_hotness_songs_df.stat.corr("avg(energy)", "discrete_hotness")
)
loudness_correlation = 100 * float(
    grouped_hotness_songs_df.stat.corr("avg(loudness)", "discrete_hotness")
)

# report each coefficient with two decimals
print("%0.2f%% correlation between avg(danceability) and discretized hotness" % danceability_correlation)
print("%0.2f%% correlation between avg(energy) and discretized hotness" % energy_correlation)
print("%0.2f%% correlation between avg(loudness) and discretized hotness" % loudness_correlation)

nan% correlation between avg(danceability) and discretized hotness
nan% correlation between avg(energy) and discretized hotness
99.14% correlation between avg(loudness) and discretized hotness
