In [3]:
# Imports & Spark setup
import os
import shutil
import glob
import hdf5_getters
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType

from tools import setup_spark_config

# Obtain the `sc` / `spark` handles from the project helper.
sc, spark = setup_spark_config("Exploring Million Song Dataset")

# When the working directory contains 'ubuntu', register the local helper
# modules with Spark so that the workers can use them as well.
homedir = os.getcwd() + '/'
if 'ubuntu' in homedir:
    for _helper_path in ('/home/ubuntu/hdf5_getters.py', '/home/ubuntu/tools.py'):
        sc.addPyFile(_helper_path)

In [4]:
# Get all subdirs
def get_subdirs(basedir, homedir):
    """Return the immediate subdirectories of ``homedir + basedir``.

    Args:
        basedir: Directory path relative to ``homedir``.
        homedir: Prefix that is concatenated with ``basedir`` as-is, so it
            has to end with a path separator already.

    Returns:
        A list with one ``os.path.join(basedir, name)`` entry per direct
        subdirectory. The list is empty when there are no subdirectories or
        when the directory cannot be listed (e.g. it does not exist).
    """
    # os.walk() yields nothing for a missing/unreadable directory. The default
    # value keeps next() from raising StopIteration, which is particularly
    # harmful when this function is called from an iterator-based pipeline
    # such as an RDD map.
    _, dirnames, _ = next(os.walk(homedir + basedir), (None, [], []))
    return [os.path.join(basedir, dirname) for dirname in dirnames]

homedir = os.getcwd() + '/'
basedir = 'MillionSongSubset'

# level 1: listed directly, then distributed as an RDD
subdirs_rdd = sc.parallelize(get_subdirs(basedir, homedir))
# levels 2 and 3: every directory expands into its own subdirectories
subsubdirs_rdd = subdirs_rdd.flatMap(lambda parent: get_subdirs(parent, homedir))
subsubsubdirs_rdd = (
    subsubdirs_rdd
    .flatMap(lambda parent: get_subdirs(parent, homedir))
    .map(lambda relative_dir: homedir + relative_dir)  # prefix with homedir
    .cache()
)

In [5]:
# count the third-level directories, then preview the first five paths
print('%d dirs in dataset' % (subsubsubdirs_rdd.count()))
subsubsubdirs_rdd.take(5)

894 dirs in dataset


['/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/R',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/U',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/I',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/N',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/G']

In [6]:
# iterate & get all files (songs)
def count_and_get_files(basedir, ext='.h5'):
    """Recursively collect every path under ``basedir`` matching ``*<ext>``.

    Modified version of:
    https://labrosa.ee.columbia.edu/millionsong/pages/iterate-over-all-songs

    Args:
        basedir: Root directory to walk.
        ext: Suffix appended to the ``*`` glob wildcard (default ``'.h5'``).

    Returns:
        A ``(count, paths)`` tuple where ``paths`` is the list of matching
        paths and ``count`` is its length.
    """
    all_files = []
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so that glob metacharacters in a directory
        # name ('[', ']', '*', '?') are matched literally instead of being
        # interpreted as a pattern (which silently skipped such directories).
        pattern = os.path.join(glob.escape(root), '*' + ext)
        all_files.extend(glob.glob(pattern))
    return len(all_files), all_files

# Expand every third-level directory into the song files found below it.
file_names_rdd = (
    subsubsubdirs_rdd
    .flatMap(lambda song_dir: count_and_get_files(song_dir)[1])  # [1] = list of paths
    .cache()
)

In [7]:
# count the song files that were found, then preview the first five paths
print('%d files in dataset' % (file_names_rdd.count()))
file_names_rdd.take(5)

10000 files in dataset


['/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/R/TRARRZU128F4253CA2.h5',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/R/TRARRJL128F92DED0E.h5',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/R/TRARRUZ128F9307C57.h5',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/R/TRARRWA128F42A0195.h5',
 '/Users/kasper/Development/school/LDSA-project/MillionSongSubset/A/R/R/TRARRPG12903CD1DE9.h5']

In [8]:
# Inspect some sample data
h5 = hdf5_getters.open_h5_file_read(file_names_rdd.take(1)[0])
try:
    # Text fields come back in byte format --> decode to string.
    # The join separator keeps the six spaces that precede every line break
    # in the printed text, so the output stays exactly the same.
    print('      \n'.join(['Sample artist: %s, ',
                           'song: %s, ',
                           'artist familiarity: %0.2f, ',
                           'artist hotness: %0.2f, ',
                           'song hotness: %0.2f, ',
                           'key: %d, ',
                           'tempo: %0.2f ',
                           'year: %d']) %
          (hdf5_getters.get_artist_name(h5).decode('UTF-8'),
           hdf5_getters.get_title(h5).decode('UTF-8'),
           float(hdf5_getters.get_artist_familiarity(h5)),
           float(hdf5_getters.get_artist_hotttnesss(h5)),
           float(hdf5_getters.get_song_hotttnesss(h5)),
           int(hdf5_getters.get_key(h5)),
           float(hdf5_getters.get_tempo(h5)),
           int(hdf5_getters.get_year(h5))))
finally:
    # release the file handle even if a getter or the formatting fails
    h5.close()

Sample artist: Raphaël,       
song: Je Sais Que La Terre Est Plate,       
artist familiarity: 0.56,       
artist hotness: 0.39,       
song hotness: 0.55,       
key: 0,       
tempo: 124.06       
year: 2008


In [9]:
# get artist name for each song
def get_artist_name(filename):
    """Read the artist name stored in one song file.

    Args:
        filename: Path of the HDF5 song file to open.

    Returns:
        The artist name, decoded from UTF-8 bytes into ``str``.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        return hdf5_getters.get_artist_name(h5).decode('UTF-8')
    finally:
        # always release the handle, even when reading or decoding fails
        h5.close()

# one artist name per song file
artist_names_rdd = file_names_rdd.map(get_artist_name)

In [10]:
# preview the first five artist names
artist_names_rdd.take(5)

['Raphaël',
 'Julie Zenatti',
 'The Baltimore Consort',
 'I Hate Sally',
 'Orlando Pops Orchestra']

In [11]:
# get all artists and songs in format: (artist, song)
def get_artist_and_song(filename):
    """Read the artist name and the song title stored in one song file.

    Args:
        filename: Path of the HDF5 song file to open.

    Returns:
        An ``(artist, song)`` tuple, both decoded from UTF-8 bytes into ``str``.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        artist = hdf5_getters.get_artist_name(h5).decode('UTF-8')
        song = hdf5_getters.get_title(h5).decode('UTF-8')
        return artist, song
    finally:
        # always release the handle, even when reading or decoding fails
        h5.close()

# one (artist, song) pair per song file
artist_song_rdd = file_names_rdd.map(get_artist_and_song)

In [12]:
# Group songs by their artist in the format: (artist, [song1, song2, song3, ...]).
# groupByKey() is a very slow operation --> if the individual song titles are
# not needed (printing them is uninteresting), use reduceByKey() instead.
# mapValues(list) turns the grouped titles of each artist into a plain list.
grouped_artist_song_rdd = artist_song_rdd.groupByKey().mapValues(list)

In [13]:
# every key of the grouped RDD is one distinct artist
n_unique_artists = grouped_artist_song_rdd.count()
print('%d unique artists in dataset' % (n_unique_artists))

# list the song titles of five sample artists
for artist, artist_songs in grouped_artist_song_rdd.take(5):
    print('\n%s has %d songs:' % (artist, len(artist_songs)))
    for song in artist_songs:
        print('- %s' % (song))

4412 unique artists in dataset

The Baltimore Consort has 4 songs:
- Howells Delight
- The Beautiful Shepherdess of Arcadia
- You Lasses and Lads
- Howells Delight

Atreyu has 7 songs:
- You Eclipsed By Me (Album Version)
- Gallows (Album Version)
- Nevada's Grace (Album Version)
- Ex's And Oh's (Instrumental Version)
- The Remembrance Ballad (Album Version)
- Blood Children (an Introduction) (Album Version)
- Shameful (LP Version)

Mistress has 1 songs:
- Shovel

Linkin Park has 3 songs:
- Crawling (Album Version)
- Given Up (Album Version)
- Pushing Me Away (Album Version)

Jimmy Riley has 9 songs:
- Amaze
- Amaze
- Conversation
- Simple Communication
- The Love We Had Version
- Show Of Love
- Life
- Prophecy
- Watch This Sounds


In [14]:
# Sanity check: count the artists whose song list is empty -- shouldn't be
# possible, since every artist key comes from at least one (artist, song) pair.
print('%d artists have no songs' % (grouped_artist_song_rdd.filter(lambda x: len(x[1]) < 1).count()))

0 artists have no songs


In [15]:
# get song data to use for analysis
def get_song_data(filename):
    """Extract the numeric features of one song that are used for the analysis.

    The values are returned as read from the file; no rounding is applied.

    Args:
        filename: Path of the HDF5 song file to open.

    Returns:
        A tuple ``(loudness, song_hotness, year, artist_familiarity,
        artist_hotness, key, tempo)``. ``year`` and ``key`` are converted to
        ``int``, all other fields to ``float``.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        loudness = float(hdf5_getters.get_loudness(h5))
        song_hotness = float(hdf5_getters.get_song_hotttnesss(h5))
        year = int(hdf5_getters.get_year(h5))
        artist_familiarity = float(hdf5_getters.get_artist_familiarity(h5))
        artist_hotness = float(hdf5_getters.get_artist_hotttnesss(h5))
        key = int(hdf5_getters.get_key(h5))
        tempo = float(hdf5_getters.get_tempo(h5))
        return loudness, song_hotness, year, artist_familiarity, artist_hotness, key, tempo
    finally:
        # always release the handle, even when a getter or a conversion fails
        h5.close()

# one feature tuple per song file; cached because several later cells read it
songs_rdd = file_names_rdd.map(get_song_data).cache()

In [16]:
# preview the first five feature tuples
songs_rdd.take(5)

[(-9.636,
  0.5479529419800353,
  2008,
  0.5574602197393447,
  0.3861516314132549,
  0,
  124.059),
 (-11.061,
  0.47563846801023907,
  2004,
  0.6269577230052118,
  0.43485958934341257,
  1,
  80.084),
 (-24.14, nan, 0, 0.42572365804650586, 0.0, 3, 54.874),
 (-5.795, nan, 2007, 0.6114954183523941, 0.3345197638116389, 7, 77.15),
 (-16.477, nan, 0, 0.3672550107574772, 0.3116155449734521, 10, 120.382)]

In [17]:
# move song data from RDD to DF & table view for optimization & Spark-SQL queries
# The column order has to match the tuples returned by get_song_data.
fields = [
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("loudness", FloatType),
        ("song_hotness", FloatType),
        ("year", IntegerType),
        ("artist_familiarity", FloatType),
        ("artist_hotness", FloatType),
        ("key", IntegerType),
        ("tempo", FloatType),
    )
]
schema = StructType(fields)

# Build the DataFrame and expose it to Spark-SQL under the name "songs".
songs_df = spark.createDataFrame(songs_rdd, schema)
songs_df.createOrReplaceTempView("songs")

In [18]:
# preview the rows of the unfiltered song table
songs_df.show()

+--------+------------+----+------------------+--------------+---+-------+
|loudness|song_hotness|year|artist_familiarity|artist_hotness|key|  tempo|
+--------+------------+----+------------------+--------------+---+-------+
|  -9.636|  0.54795295|2008|        0.55746025|    0.38615164|  0|124.059|
| -11.061|  0.47563848|2004|         0.6269577|     0.4348596|  1| 80.084|
|  -24.14|         NaN|   0|        0.42572367|           0.0|  3| 54.874|
|  -5.795|         NaN|2007|        0.61149544|    0.33451977|  7|  77.15|
| -16.477|         NaN|   0|          0.367255|    0.31161556| 10|120.382|
| -12.474|  0.44545454|   0|         0.6013057|    0.36367568|  9| 99.024|
|  -4.393|  0.32773668|   0|        0.70901054|    0.55356616|  9|175.673|
|   -5.05|         NaN|   0|         0.5480224|     0.4401347|  1| 87.999|
|  -4.264|   0.7883882|1982|        0.73703754|     0.5392454| 10| 92.897|
| -13.885|         NaN|1998|        0.43591547|    0.35814852|  4| 86.981|
|  -4.707|    0.681092|20

In [19]:
# total number of songs before filtering (baseline for the percentage computed after filtering)
n_songs = file_names_rdd.count() # each file corresponds to one song
print("There are %d songs in total" % (n_songs))

There are 10000 songs in total


In [20]:
# Filter out songs with NaN values and songs without a year (year > 0 is
# required; missing years show up as 0 in the data previewed above).
# NOTE(review): `year` and `key` are declared as IntegerType in the schema, so
# their isNaN checks are presumably no-ops -- TODO confirm.
# The view name "songs" is re-registered below, so from here on SQL queries
# against "songs" see the filtered rows only.
filtered_songs_df = spark.sql("SELECT * FROM songs WHERE \
                                  isNaN(loudness) = false AND \
                                  isNaN(song_hotness) = false AND \
                                  isNaN(year) = false AND \
                                  year > 0 AND \
                                  isNaN(artist_familiarity) = false AND \
                                  isNaN(artist_hotness) = false AND \
                                  isNaN(key) = false AND \
                                  isNaN(tempo) = false")
filtered_songs_df.createOrReplaceTempView("songs")

In [21]:
# preview the rows that passed the filter
filtered_songs_df.show()

+--------+------------+----+------------------+--------------+---+-------+
|loudness|song_hotness|year|artist_familiarity|artist_hotness|key|  tempo|
+--------+------------+----+------------------+--------------+---+-------+
|  -9.636|  0.54795295|2008|        0.55746025|    0.38615164|  0|124.059|
| -11.061|  0.47563848|2004|         0.6269577|     0.4348596|  1| 80.084|
|  -4.264|   0.7883882|1982|        0.73703754|     0.5392454| 10| 92.897|
|  -4.707|    0.681092|2004|         0.8218443|     0.5924395|  0|157.715|
|  -4.523|  0.40148672|2005|        0.49579692|    0.38949883|  0|146.331|
|  -4.076|   0.6878737|2004|        0.73343325|     0.4555588|  0| 84.992|
|  -3.312|  0.35528553|2001|        0.48433375|     0.3359355|  1| 99.959|
| -25.651|  0.21508032|1982|         0.5772761|    0.37693998|  1|104.989|
|  -6.052|  0.87222904|2000|         0.8873861|      0.791143|  4|105.095|
| -15.433|   0.5968407|1981|         0.6559214|     0.5783016|  5|100.042|
|  -4.325|   0.6248335|20

In [22]:
# How many songs passed the filter, and which share of all songs that is.
n_songs_left = filtered_songs_df.count()
n_songs_left_frac = n_songs_left / n_songs * 100  # expressed in percent
print(
    "There are %d songs left after removing songs with NaN values, corresponding to %0.2f%% of the total amount of songs"
    % (n_songs_left, n_songs_left_frac)
)

There are 3064 songs left after removing songs with NaN values, corresponding to 30.64% of the total amount of songs


In [23]:
# write songs to parquet (better than CSV)
dst_dir = ''.join([homedir, 'parsed-', basedir])

# remove the output of a previous run so the write starts from a clean path
if os.path.isdir(dst_dir):
    shutil.rmtree(dst_dir)

filtered_songs_df.write.parquet(dst_dir)

In [24]:
# write songs to CSV (for comparison purposes)
# switch: set to True to additionally export the filtered songs as CSV
save_as_csv = False

if save_as_csv:
    dst_dir = ''.join([homedir, 'parsed-', basedir, '-csv'])

    # remove the output of a previous run so the write starts from a clean path
    if os.path.isdir(dst_dir):
        shutil.rmtree(dst_dir)

    filtered_songs_df.write.csv(dst_dir, header=True)