In [1]:
# Imports & Spark setup
from tools import setup_spark_config, read_parquet_files
from pyspark.ml.feature import QuantileDiscretizer

sc, spark = setup_spark_config("Grouping Million Song Dataset")

In [2]:
# read songs data from parquet files
basedir = 'parsed-MillionSongSubset'
songs_df = read_parquet_files(basedir, spark)

Reading songs from parquet files to DataFrame


In [3]:
songs_df.show()

+--------+------------+----+------------------+--------------+---+-------+
|loudness|song_hotness|year|artist_familiarity|artist_hotness|key|  tempo|
+--------+------------+----+------------------+--------------+---+-------+
|  -9.636|  0.54795295|2008|        0.55746025|    0.38615164|  0|124.059|
| -11.061|  0.47563848|2004|         0.6269577|     0.4348596|  1| 80.084|
|  -4.264|   0.7883882|1982|        0.73703754|     0.5392454| 10| 92.897|
|  -4.707|    0.681092|2004|         0.8218443|     0.5924395|  0|157.715|
|  -4.523|  0.40148672|2005|        0.49579692|    0.38949883|  0|146.331|
|  -4.076|   0.6878737|2004|        0.73343325|     0.4555588|  0| 84.992|
|  -3.312|  0.35528553|2001|        0.48433375|     0.3359355|  1| 99.959|
| -25.651|  0.21508032|1982|         0.5772761|    0.37693998|  1|104.989|
|  -6.052|  0.87222904|2000|         0.8873861|      0.791143|  4|105.095|
| -15.433|   0.5968407|1981|         0.6559214|     0.5783016|  5|100.042|
|  -4.325|   0.6248335|20

In [4]:
songs_df.createOrReplaceTempView("songs")

In [6]:
# group songs by year
grouped_year_songs_df = spark.sql("SELECT AVG(loudness), \
                              AVG(song_hotness), AVG(artist_familiarity), \
                              AVG(key), AVG(tempo), \
                              AVG(artist_hotness), year FROM songs \
                              GROUP BY year")

In [7]:
n_years = grouped_year_songs_df.count()
print("There are songs from %d different years in the dataset" % (n_years))

There are songs from 55 different years in the dataset


In [8]:
grouped_year_songs_df.orderBy("year", ascending=False).show()

+-------------------+-------------------+-----------------------+------------------+------------------+-------------------+----+
|      avg(loudness)|  avg(song_hotness)|avg(artist_familiarity)|          avg(key)|        avg(tempo)|avg(artist_hotness)|year|
+-------------------+-------------------+-----------------------+------------------+------------------+-------------------+----+
| -7.762981476607146|  0.563357169981356|     0.6528117033066573| 5.814814814814815|121.26220364040799| 0.4607585279477967|2010|
| -8.098129057756035| 0.5018193852356685|      0.661958682921625| 5.526881720430108|127.85901081946588| 0.4606414744930883|2009|
| -7.666779771447182|  0.499955424418052|      0.669840328484064| 5.119047619047619|129.57367274874733| 0.4657232601727758|2008|
| -7.863054733371261|0.44247311116451055|     0.6615233765312688|  5.36318407960199|127.57285067809755|0.45055698684940293|2007|
| -7.882186937554974|0.43157015413602934|     0.6377229931215956| 4.985981308411215|126.250869447

In [10]:
# see linear correlation between years and the other features
loudness_correlation = 100*float(songs_df.stat.corr("loudness", "year"))
song_hotness_correlation = 100*float(songs_df.stat.corr("song_hotness", "year"))
artist_familiarity_correlation = 100*float(songs_df.stat.corr("artist_familiarity", "year"))
key_correlation = 100*float(songs_df.stat.corr("key", "year"))
tempo_correlation = 100*float(songs_df.stat.corr("tempo", "year"))
artist_hotness_correlation = 100*float(songs_df.stat.corr("artist_hotness", "year"))

print("%0.2f%% correlation between loudness and year" % (loudness_correlation))
print("%0.2f%% correlation between song_hotness and year" % (song_hotness_correlation))
print("%0.2f%% correlation between artist_familiarity and year" % (artist_familiarity_correlation))
print("%0.2f%% correlation between key and year" % (key_correlation))
print("%0.2f%% correlation between tempo and year" % (tempo_correlation))
print("%0.2f%% correlation between artist_hotness and year" % (artist_hotness_correlation))

28.33% correlation between loudness and year
7.22% correlation between song_hotness and year
10.06% correlation between artist familiarity and year
2.56% correlation between key and year
1.02% correlation between tempo and year
6.42% correlation between artist_hotness and year


In [11]:
# see linear correlation between years and the average of other features
loudness_correlation = 100*float(grouped_year_songs_df.stat.corr("avg(loudness)", "year"))
song_hotness_correlation = 100*float(grouped_year_songs_df.stat.corr("avg(song_hotness)", "year"))
artist_familiarity_correlation = 100*float(grouped_year_songs_df.stat.corr("avg(artist_familiarity)", "year"))
key_correlation = 100*float(grouped_year_songs_df.stat.corr("avg(key)", "year"))
tempo_correlation = 100*float(grouped_year_songs_df.stat.corr("avg(tempo)", "year"))
artist_hotness_correlation = 100*float(grouped_year_songs_df.stat.corr("avg(artist_hotness)", "year"))

print("%0.2f%% correlation between avg(loudness) and year" % (loudness_correlation))
print("%0.2f%% correlation between avg(song_hotness) and year" % (song_hotness_correlation))
print("%0.2f%% correlation between avg(artist_familiarity) and year" % (artist_familiarity_correlation))
print("%0.2f%% correlation between avg(key) and year" % (key_correlation))
print("%0.2f%% correlation between avg(tempo) and year" % (tempo_correlation))
print("%0.2f%% correlation between avg(artist_hotness) and year" % (artist_hotness_correlation))

67.24% correlation between avg(loudness) and year
44.95% correlation between avg(song_hotness) and year
48.90% correlation between avg(artist_familiarity) and year
16.01% correlation between avg(key) and year
15.08% correlation between avg(tempo) and year
49.43% correlation between avg(artist_hotness) and year


In [12]:
# discretize hotness to 10 different values
discretizer = QuantileDiscretizer(numBuckets=10, inputCol="song_hotness", outputCol="discrete_song_hotness")
discretized_df = discretizer.fit(songs_df).transform(songs_df)

In [13]:
discretized_df.show()

+--------+------------+----+------------------+--------------+---+-------+---------------------+
|loudness|song_hotness|year|artist_familiarity|artist_hotness|key|  tempo|discrete_song_hotness|
+--------+------------+----+------------------+--------------+---+-------+---------------------+
|  -9.636|  0.54795295|2008|        0.55746025|    0.38615164|  0|124.059|                  6.0|
| -11.061|  0.47563848|2004|         0.6269577|     0.4348596|  1| 80.084|                  5.0|
|  -4.264|   0.7883882|1982|        0.73703754|     0.5392454| 10| 92.897|                  9.0|
|  -4.707|    0.681092|2004|         0.8218443|     0.5924395|  0|157.715|                  8.0|
|  -4.523|  0.40148672|2005|        0.49579692|    0.38949883|  0|146.331|                  3.0|
|  -4.076|   0.6878737|2004|        0.73343325|     0.4555588|  0| 84.992|                  8.0|
|  -3.312|  0.35528553|2001|        0.48433375|     0.3359355|  1| 99.959|                  3.0|
| -25.651|  0.21508032|1982|  

In [14]:
discretized_df.createOrReplaceTempView("discrete_hotness_songs")

In [17]:
# group songs by hotness
grouped_hotness_songs_df = spark.sql("SELECT AVG(loudness), \
                              AVG(artist_familiarity), \
                              AVG(key), AVG(tempo), \
                              AVG(artist_hotness), discrete_song_hotness FROM discrete_hotness_songs \
                              GROUP BY discrete_song_hotness")

In [19]:
grouped_hotness_songs_df.orderBy("discrete_song_hotness", ascending=False).show()

+-------------------+-----------------------+------------------+------------------+-------------------+---------------------+
|      avg(loudness)|avg(artist_familiarity)|          avg(key)|        avg(tempo)|avg(artist_hotness)|discrete_song_hotness|
+-------------------+-----------------------+------------------+------------------+-------------------+---------------------+
| -7.300693565799344|     0.7688744101793535| 5.067741935483871|129.60604507692398| 0.5577006425588361|                  9.0|
| -7.845457530878727|     0.7184116879319833| 5.428104575163399|128.08759440004437|  0.505188079684778|                  8.0|
|  -8.58589251499611|     0.6941242202303406| 5.361563517915309| 128.5459022149201| 0.4847280896156541|                  7.0|
| -8.910549040713342|     0.6565574244930853| 5.741830065359477|126.39117654787948|0.45452985907691756|                  6.0|
| -8.918314111156342|     0.6399761292462548| 5.201923076923077|125.29110584503564|0.44578717157053643|               

In [26]:
# see linear correlation between discrete_hotness and the other features
loudness_correlation = 100*float(discretized_df.stat.corr("loudness", "discrete_song_hotness"))
artist_hotness_correlation = 100*float(discretized_df.stat.corr("artist_hotness", "discrete_song_hotness"))
artist_familiarity_correlation = 100*float(discretized_df.stat.corr("artist_familiarity", "discrete_song_hotness"))
key_correlation = 100*float(discretized_df.stat.corr("key", "discrete_song_hotness"))
tempo_correlation = 100*float(discretized_df.stat.corr("tempo", "discrete_song_hotness"))
year_correlation = 100*float(discretized_df.stat.corr("year", "discrete_song_hotness"))

print("%0.2f%% correlation between loudness and discrete_song_hotness" % (loudness_correlation))
print("%0.2f%% correlation between artist_hotness and discrete_song_hotness" % (artist_hotness_correlation))
print("%0.2f%% correlation between artist_familiarity and discrete_song_hotness" % (artist_familiarity_correlation))
print("%0.2f%% correlation between key and discrete_song_hotness" % (key_correlation))
print("%0.2f%% correlation between tempo and discrete_song_hotness" % (tempo_correlation))
print("%0.2f%% correlation between year and discrete_song_hotness" % (artist_hotness_correlation))

24.16% correlation between loudness and discrete_song_hotness
48.78% correlation between artist_hotness and discrete_song_hotness
53.29% correlation between artist_familiarity and discrete_song_hotness
2.60% correlation between key and discrete_song_hotness
6.98% correlation between tempo and discrete_song_hotness
48.78% correlation between year and discrete_song_hotness


In [27]:
# see linear correlation between discrete_hotness and the average of the other features
loudness_correlation = 100*float(grouped_hotness_songs_df.stat.corr("avg(loudness)", "discrete_song_hotness"))
song_hotness_correlation = 100*float(grouped_hotness_songs_df.stat.corr("avg(artist_hotness)", "discrete_song_hotness"))
artist_familiarity_correlation = 100*float(grouped_hotness_songs_df.stat.corr("avg(artist_familiarity)", "discrete_song_hotness"))
key_correlation = 100*float(grouped_hotness_songs_df.stat.corr("avg(key)", "discrete_song_hotness"))
tempo_correlation = 100*float(grouped_hotness_songs_df.stat.corr("avg(tempo)", "discrete_song_hotness"))

print("%0.2f%% correlation between avg(loudness) and discrete_song_hotness" % (loudness_correlation))
print("%0.2f%% correlation between avg(artist_hotness) and discrete_song_hotness" % (song_hotness_correlation))
print("%0.2f%% correlation between avg(artist_familiarity) and discrete_song_hotness" % (artist_familiarity_correlation))
print("%0.2f%% correlation between avg(key) and discrete_song_hotness" % (key_correlation))
print("%0.2f%% correlation between avg(tempo) and discrete_song_hotness" % (tempo_correlation))

99.14% correlation between avg(loudness) and discrete_song_hotness
97.64% correlation between avg(artist_hotness) and discrete_song_hotness
98.65% correlation between avg(artist_familiarity) and discrete_song_hotness
28.38% correlation between avg(key) and discrete_song_hotness
88.52% correlation between avg(tempo) and discrete_song_hotness
