In [1]:
#Importing required modules.
import os
import sys
import time
import glob
import tables
import scipy.io as sio
import pandas as pd
from pandas import read_hdf

#Module providing the getter functions used to read the .h5 song files.
import hdf5_getters

In [2]:
#Functions.

#Aggregate all .h5 files.
def get_all_files(basedir, ext='.h5'):
    """
    From a root directory, go through all subdirectories and find all
    entries whose name ends with the given extension.

    PARAM
        basedir - root directory to walk
        ext     - suffix to match; it is appended to a '*' wildcard, so
                  glob metacharacters inside ``ext`` keep their meaning
    RETURN
        List of absolute paths in a deterministic order (directories
        are visited in sorted order, matches inside a directory are
        sorted). The list is empty when nothing matches or when
        ``basedir`` does not exist.
    """
    allfiles = []
    for root, dirs, _ in os.walk(basedir):
        # Sorting in place makes os.walk descend in a stable order, so
        # repeated runs return the paths in the same sequence.
        dirs.sort()
        # Escape the directory part of the pattern: a folder such as
        # 'a[1]' would otherwise be parsed as a glob character class
        # and its files would silently be skipped.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        allfiles.extend(os.path.abspath(match) for match in sorted(glob.glob(pattern)))
    return allfiles

#Load one HDF5 song file into a dict of fields (no matfile is written).
def transform(h5path, matpath=None, force=False):
    """
    Load an HDF5 song file (.h5) into a dict, one entry per getter
    found in hdf5_getters (the 'get_' prefix is stripped from the key).
    If there is more than one song in the HDF5 file, each field name
    gets a number appended: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path of the companion matfile, same as HDF5 path
                  with a '.mat' extension by default
        force   - if True, process the file even if matpath exists
    RETURN
        dict with the song data plus a 'transfer_note' entry, or
        False if h5path is not an existing '.h5' file, or if matpath
        exists and force is False.
        Exceptions raised by hdf5_getters are propagated; the HDF5
        file is closed first.
    NOTE
        No matfile is written by this function; matpath is only used
        for the "already exists" guard.
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    #Sanity checks
    if not os.path.isfile(h5path):
        print ('Path to .h5 files does not exist:', h5path)
        return False
    if not os.path.splitext(h5path)[1] == '.h5':
        print ('Expecting a .h5 extension for file:', h5path)
        return False

    #Check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath) and not force:
        print('matfile', matpath, 'already exists (delete or force):')
        return False

    # Get all getters! We assume that all we need is in hdf5_getters.py,
    # that the getters have the form get_blablabla, and that nothing else
    # in that module has that form.
    getters = [name for name in hdf5_getters.__dict__ if name.startswith('get_')]
    #Special case: get_num_songs takes no song index, it is called separately.
    getters.remove("get_num_songs")

    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    #Open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    try:
        # Everything that touches the open handle lives inside the try
        # block so the file is closed even if counting the songs fails.
        nSongs = hdf5_getters.get_num_songs(h5)
        #Iterate over songs
        for songidx in range(nSongs):
            # Field names are only numbered when the file holds several songs.
            suffix = str(songidx+1) if nSongs > 1 else ''
            #Iterate over getter
            for getter in getters:
                matdata[getter[4:] + suffix] = getattr(hdf5_getters, getter)(h5, songidx)
    except MemoryError:
        print('Insufficient Main Memory.')
        raise
    finally:
        #Close h5
        h5.close()
    return matdata

In [4]:
%%time
#Convert!

#Set input path.
h5s = get_all_files('../MillionSongSubset/data/')

#Creating the dataframe.
df = pd.DataFrame()
for file in h5s:
    xd = transform(file)
    df = df.append(pd.Series(xd), ignore_index=True)
    
#Storing the result.
df.to_csv('../MSD10k.csv')

Wall time: 23min 24s
