In [22]:
import os
import glob
import hdf5_getters
import aspectlib
import pprint
import pandas


# Shared pretty-printer (4-space indent) used further down to display a
# sample track record.
pp = pprint.PrettyPrinter(indent=4)

class CreateSongDataset:
    """Collect per-track records from a directory tree of HDF5 files.

    ``get_files`` scans the HDF5 tree and returns a dict keyed by track id.
    The three ``aspectlib`` aspects defined on this class are intended to be
    woven around ``get_files`` (see ``aspectlib.weave``): each one takes the
    dict returned by the wrapped call and enriches every record in place with
    song metadata, the matching MIDI files, or a lyrics-availability flag.
    """

    def __init__(self,
                 MXM_metadata_filepath='mxm_779k_matches.txt',
                 hdf5_dataset_path='../lmd_matched_h5',
                 hdf5_dataset_name='lmd_matched_h5',
                 midi_dataset_name='lmd_matched'):
        """Store the dataset locations and load the track ids from the MXM file.

        Args:
            MXM_metadata_filepath: Path of the ``<SEP>``-delimited matches
                file. It is read immediately, so it must exist.
            hdf5_dataset_path: Root directory walked by ``get_files``.
            hdf5_dataset_name: Substring of an HDF5 file path that is
                replaced by ``midi_dataset_name`` to locate the MIDI folder.
            midi_dataset_name: Replacement for ``hdf5_dataset_name``.
        """
        self.MXM_metadata_filepath = MXM_metadata_filepath
        self.hdf5_dataset_path = hdf5_dataset_path
        self.hdf5_dataset_name = hdf5_dataset_name
        self.midi_dataset_name = midi_dataset_name
        self.trackIDs_with_lyrics = self.get_track_ids_from_MXM_dataset()

    @aspectlib.Aspect
    def get_song_metadata(self, *args, **kwargs):
        """Aspect: add song/artist metadata read from each record's HDF5 file.

        Adds the keys ``song_title``, ``song_duration``, ``artist_name``,
        ``artist_genres``, ``artist_genres_weights`` and
        ``artist_genres_freq`` to every record of the wrapped call's result.
        """
        # Run the wrapped function first; its return value is the dict of
        # records keyed by track id.
        result = yield aspectlib.Proceed
        for record in result.values():
            h5 = hdf5_getters.open_h5_file_read(record['file_path'])
            try:
                record['song_title'] = hdf5_getters.get_title(h5)
                record['song_duration'] = hdf5_getters.get_duration(h5)
                record['artist_name'] = hdf5_getters.get_artist_name(h5)
                record['artist_genres'] = hdf5_getters.get_artist_terms(h5)
                record['artist_genres_weights'] = hdf5_getters.get_artist_terms_weight(h5)
                record['artist_genres_freq'] = hdf5_getters.get_artist_terms_freq(h5)
            finally:
                # One handle is opened per track; release it even when a
                # getter raises so a large scan does not leak open files.
                h5.close()
        yield aspectlib.Return(result)

    @aspectlib.Aspect
    def get_song_midi(self, *args, **kwargs):
        """Aspect: add the MIDI folder and its file names to every record.

        The MIDI folder is derived from the record's ``file_path`` by
        swapping ``hdf5_dataset_name`` for ``midi_dataset_name`` and dropping
        a trailing ``.h5``. Adds the keys ``midi_dir`` and ``midi_files``
        (names of the regular files directly inside that folder).

        Raises:
            OSError: Propagated from ``os.listdir`` when the derived folder
                cannot be listed (for example, it does not exist).
        """
        result = yield aspectlib.Proceed
        for record in result.values():
            midi_dir = record['file_path'].replace(
                self.hdf5_dataset_name, self.midi_dataset_name)
            # Strip only the file extension; a blanket replace would also
            # mangle any directory name that happens to contain '.h5'.
            if midi_dir.endswith('.h5'):
                midi_dir = midi_dir[:-len('.h5')]
            record['midi_dir'] = midi_dir
            record['midi_files'] = [
                name for name in os.listdir(midi_dir)
                if os.path.isfile(os.path.join(midi_dir, name))
            ]
        yield aspectlib.Return(result)

    @aspectlib.Aspect
    def set_have_lyrics(self, *args, **kwargs):
        """Aspect: flag every record with whether its id is in the MXM list.

        Adds the boolean key ``have_lyrics``, true when the record's track id
        appears among the values of ``self.trackIDs_with_lyrics``.
        """
        result = yield aspectlib.Proceed
        # Build the lookup set once: testing membership against the raw
        # array would rescan the whole id column for every single track.
        known_ids = set(self.trackIDs_with_lyrics.values)
        for track_id, record in result.items():
            record['have_lyrics'] = track_id in known_ids
        yield aspectlib.Return(result)

    def get_files(self, only=10, ext='.h5'):
        """Scan ``hdf5_dataset_path`` and return records for the files found.

        Args:
            only: Number of directories to visit before stopping. The
                counter advances once per directory yielded by ``os.walk``
                (whether or not it contains matching files), so this limits
                directories, not files. A value that is never reached
                (such as 0) walks the whole tree.
            ext: File-name suffix to match inside each directory.

        Returns:
            A dict mapping track id (file name without its extension) to a
            dict with the keys ``file_path`` and ``track_id``.
        """
        records = {}
        visited = 0
        for root, _dirs, _names in os.walk(self.hdf5_dataset_path):
            # Escape the directory part so characters such as '[' in a
            # folder name are matched literally instead of as a pattern.
            pattern = os.path.join(glob.escape(root), '*' + ext)
            for path in glob.glob(pattern):
                track_id = os.path.splitext(os.path.basename(path))[0]
                records[track_id] = {
                    'file_path': path,
                    'track_id': track_id
                }
            visited += 1
            if visited == only:
                break
        return records

    def get_track_ids_from_MXM_dataset(self):
        """Return the first column of the MXM matches file.

        The file at ``self.MXM_metadata_filepath`` is parsed with pandas
        using ``<SEP>`` as the field separator (python engine).

        Returns:
            The first column of the parsed table as a ``pandas.Series``.
        """
        df = pandas.read_csv(self.MXM_metadata_filepath, sep="<SEP>", engine='python')
        return df[df.columns[0]]
    
    

# Weave the three enrichment aspects around get_files for the duration of the
# with-block, so the get_files() call below returns fully populated records.
with aspectlib.weave(
        CreateSongDataset.get_files,
        [
            CreateSongDataset.get_song_metadata,
            CreateSongDataset.get_song_midi,
            CreateSongDataset.set_have_lyrics,
        ]):
    load_data = CreateSongDataset()
    files_dict = load_data.get_files()
    # Display one sample record. Supplying a default avoids an unhandled
    # StopIteration when the scan found no files at all.
    file_obj = next(iter(files_dict.values()), None)
    if file_obj is None:
        print('No .h5 files found under', load_data.hdf5_dataset_path)
    else:
        pp.pprint(file_obj)


{   'artist_genres': array([b'new wave', b'pop', b'rock', b'ballad', b'soundtrack', b'disco',
       b'funk', b'vocal', b'cover', b'female', b'classic', b'beautiful',
       b'guitar', b'punk', b'fusion', b'adult contemporary', b'solo',
       b'folk', b'soul'], dtype='|S256'),
    'artist_genres_freq': array([1.        , 0.92094389, 0.94960955, 0.48641094, 0.45221077,
       0.38553616, 0.43597355, 0.30914504, 0.21425438, 0.31512916,
       0.25442608, 0.24987674, 0.2665819 , 0.34956831, 0.20174956,
       0.05043739, 0.05043739, 0.27535949, 0.22153703]),
    'artist_genres_weights': array([1.        , 0.83623738, 0.8274089 , 0.67337814, 0.52509574,
       0.52256826, 0.49997867, 0.46185663, 0.45937543, 0.43120619,
       0.41927814, 0.40548363, 0.39721848, 0.37810396, 0.36306362,
       0.32238978, 0.32238978, 0.32126081, 0.32121325]),
    'artist_name': b'Cyndi Lauper',
    'file_path': '../lmd_matched_h5\\A\\A\\A\\TRAAAGR128F425B14B.h5',
    'have_lyrics': True,
    'midi_dir': '..