In [1]:
# Imports & Spark setup
import os
import glob
import hdf5_getters
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Build (or fetch) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Exploring Million Song Dataset")
    .getOrCreate()
)

# SparkContext handle; later cells use it to create RDDs.
sc = spark.sparkContext

In [2]:
# Inspect some sample data
h5 = hdf5_getters.open_h5_file_read('MillionSongSubset/A/A/A/TRAAAAW128F429D538.h5')
try:
    # The getters return bytes, so decode to str before formatting.
    sample_artist = hdf5_getters.get_artist_name(h5).decode('UTF-8')
    sample_title = hdf5_getters.get_title(h5).decode('UTF-8')
    print('Sample Artist: %s, Song: %s' % (sample_artist, sample_title))
finally:
    # Always release the HDF5 handle, even if a getter or decode fails.
    h5.close()

Sample Artist: Casual, Song: I Didn't Mean To


In [3]:
# Get all subdirs
def get_subdirs(basedir):
    """Return the paths of the immediate subdirectories of ``basedir``.

    Args:
        basedir: Directory to inspect (not searched recursively).

    Returns:
        A list with ``basedir`` joined to each direct subdirectory name, in
        the order reported by ``os.walk``. The list is empty when ``basedir``
        has no subdirectories, does not exist, or is not a directory.
    """
    # os.walk yields nothing for a path it cannot list; the default keeps
    # next() from raising a bare StopIteration, which is easy to misread as
    # "iterator finished" when this function runs inside an RDD map/flatMap.
    _, dirnames, _ = next(os.walk(basedir), (basedir, [], []))
    return [os.path.join(basedir, name) for name in dirnames]

basedir = 'MillionSongSubset'
# Descend three directory levels, one RDD per level. flatMap expands each
# directory straight into its children, so no separate map + flatten is needed.
subdirs_rdd = sc.parallelize(get_subdirs(basedir))
subsubdirs_rdd = subdirs_rdd.flatMap(get_subdirs)
subsubsubdirs_rdd = subsubdirs_rdd.flatMap(get_subdirs)
print('%d dirs in dataset' % (subsubsubdirs_rdd.count()))
subsubsubdirs_rdd.take(5)

894 dirs in dataset


['MillionSongSubset/A/R/R',
 'MillionSongSubset/A/R/U',
 'MillionSongSubset/A/R/I',
 'MillionSongSubset/A/R/N',
 'MillionSongSubset/A/R/G']

In [4]:
# iterate & get all files (songs)
def count_and_get_files(basedir, ext='.h5'):
    """Recursively collect the files under ``basedir`` whose names end in ``ext``.

    Args:
        basedir: Root directory of the search; every directory below it is
            visited via ``os.walk``.
        ext: Suffix appended to ``*`` to form the glob pattern applied in each
            visited directory (default ``'.h5'``).

    Returns:
        A ``(count, paths)`` tuple, where ``paths`` is the list of matching
        paths and ``count`` is its length. ``(0, [])`` when nothing matches.
    """
    # modified version of: https://labrosa.ee.columbia.edu/millionsong/pages/iterate-over-all-songs
    all_files = []
    for root, _dirnames, _filenames in os.walk(basedir):
        # Escape the directory part so glob metacharacters in a directory
        # name ('[', '*', '?') are matched literally rather than as a pattern.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        all_files.extend(glob.glob(pattern))
    return len(all_files), all_files

# Expand every leaf directory into the song files it contains. flatMap
# flattens the per-directory lists directly, so no identity flatMap is needed.
file_names_rdd = subsubsubdirs_rdd.flatMap(lambda subsubsubdir: count_and_get_files(subsubsubdir)[1])
print('%d files in dataset' % (file_names_rdd.count()))
file_names_rdd.take(5)

10000 files in dataset


['MillionSongSubset/A/R/R/TRARRZU128F4253CA2.h5',
 'MillionSongSubset/A/R/R/TRARRJL128F92DED0E.h5',
 'MillionSongSubset/A/R/R/TRARRUZ128F9307C57.h5',
 'MillionSongSubset/A/R/R/TRARRWA128F42A0195.h5',
 'MillionSongSubset/A/R/R/TRARRPG12903CD1DE9.h5']

In [5]:
# get artist name for each song
def get_artist_name(filename):
    """Read the artist name stored in one song's HDF5 file.

    Args:
        filename: Path to the song's ``.h5`` file.

    Returns:
        The artist name as ``str``, decoded from the UTF-8 bytes returned by
        ``hdf5_getters.get_artist_name``.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        return hdf5_getters.get_artist_name(h5).decode('UTF-8')
    finally:
        # This runs once per song; without the close every call would leave
        # an HDF5 file handle open.
        h5.close()

# Map every song file path to the artist name read from that file.
artist_names_rdd = file_names_rdd.map(get_artist_name)
# Preview the first five artist names.
artist_names_rdd.take(5)

['Raphaël',
 'Julie Zenatti',
 'The Baltimore Consort',
 'I Hate Sally',
 'Orlando Pops Orchestra']

In [6]:
# get all artists and songs in format: (artist, song)
def get_artist_and_song(filename):
    """Read the artist name and song title stored in one song's HDF5 file.

    Args:
        filename: Path to the song's ``.h5`` file.

    Returns:
        An ``(artist, song)`` tuple of ``str``, each decoded from the UTF-8
        bytes returned by the corresponding ``hdf5_getters`` function.
    """
    h5 = hdf5_getters.open_h5_file_read(filename)
    try:
        artist = hdf5_getters.get_artist_name(h5).decode('UTF-8')
        song = hdf5_getters.get_title(h5).decode('UTF-8')
    finally:
        # This runs once per song; without the close every call would leave
        # an HDF5 file handle open.
        h5.close()
    return artist, song

# Pass the function directly; wrapping it in a lambda adds nothing.
artist_song_rdd = file_names_rdd.map(get_artist_and_song)

In [7]:
# group songs by their artist in format: (artist, [song1, song2, song3...])
# Persist the grouped RDD: it feeds several actions (count, take, later
# filters), and without caching each action would recompute the whole
# lineage, re-reading every HDF5 file.
grouped_artist_song_rdd = artist_song_rdd.groupByKey().mapValues(list).cache()
n_unique_artists = grouped_artist_song_rdd.count()
print('%d unique artists in dataset' % (n_unique_artists))

# Show a small sample: each record is an (artist, [song, ...]) pair.
for artist, songs in grouped_artist_song_rdd.take(5):
    print('\n%s has %d songs:' % (artist, len(songs)))
    for song in songs:
        print('- %s' % (song))

4412 unique artists in dataset

Linkin Park has 3 songs:
- Crawling (Album Version)
- Given Up (Album Version)
- Pushing Me Away (Album Version)

Yank Rachell has 2 songs:
- My Mind Got Bad
- It Seems Like A Dream

Neil Diamond has 5 songs:
- Hey Louise
- Song Sung Blue
- Brooklyn On A Saturday Night
- Reminisce For A While (Sung With Raul Malo)
- Song Sung Blue

Pyrolator has 2 songs:
- Gold Und Silber
- Passage To Melilla

Audio Adrenaline has 4 songs:
- Some Kind Of Zombie (Criscoteque Remix)
- My God  (Audio Adrenaline Album Version)
- Will Not Fade  (Hit Parade Album Version (new Song))
- Gloria (In The Name Of Love Album Version)


In [9]:
# Sanity check: grouping should never yield an artist with an empty song list.
artists_without_songs_rdd = grouped_artist_song_rdd.filter(lambda pair: len(pair[1]) == 0)
print('%d artists have no songs' % (artists_without_songs_rdd.count()))

0 artists have no songs
