# Utility functions

# Load Settings

In [None]:
from os import environ
from dotenv import load_dotenv, find_dotenv

# Load the file located by find_dotenv() before the settings are read from
# `environ` below, so values defined in that file can be picked up.
load_dotenv(find_dotenv())

# Runtime configuration, read once from the environment.
settings = {
    # Base path of the dataset; loadFeatures() builds feature file paths from it.
    "path": environ.get("PATH_TO_DATASET"),
    # load_groundtruth() keeps a record when the first character of its id is in this value.
    "loaded_data": environ.get('LOADED_TRAINING_DATA'),

    # Limit the dataset to very few records, useful during development.
    # Environment values are always strings, so "False" or "0" would be truthy
    # if stored as-is. Only an unset/empty value or an explicit "off" spelling
    # disables the flag; any other value enables it.
    "very_few": environ.get('VERY_FEW_RECORDS', '').strip().lower()
                not in ('', '0', 'false', 'no', 'off'),
}

## Load groundtruth

In [None]:
import pandas as pd

def load_groundtruth(filename):
    '''Load groundtruth tsv and process it

    Only records whose 'recordingmbid' starts with a character contained in
    settings['loaded_data'] are kept.

    Input:
        filename  string  Groundtruth to load (ex. '/home/user/groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv')

    Return:
        DataFrame   With columns: 'recordingmbid', 'genre1', 'genre2', 'main_genres'.
                    'main_genres' holds, per record, the list of values from the
                    whole tsv row that do not contain "-"; missing values are
                    skipped. An empty DataFrame with these columns is returned
                    when no record matches.
    '''

    # Read tsv file into groundtruth
    groundtruth_raw = pd.read_table(filename)

    # Filter only records with a recordingmbid we have in our dataset.
    # Vectorised form of "recordingmbid[0] in settings['loaded_data']";
    # ids that are missing or empty simply do not match.
    first_characters = groundtruth_raw['recordingmbid'].str[0]
    is_available = first_characters.isin(list(settings['loaded_data']))
    groundtruth_available = groundtruth_raw[is_available]

    groundtruth = groundtruth_available[['recordingmbid', 'genre1', 'genre2']].copy()

    def extract_main_genres(record):
        # Values containing "-" are dropped (the MBID column does, and
        # presumably the sub-genre labels do too); na=True also drops
        # missing values.
        return record[~record.str.contains("-", na=True)].tolist()

    # Build the column as a plain list instead of a row-wise apply: apply on
    # an empty frame returns a DataFrame rather than a Series of lists, which
    # breaks the assignment when no record matches.
    groundtruth['main_genres'] = [
        extract_main_genres(record)
        for _, record in groundtruth_available.iterrows()
    ]

    return groundtruth


# Other

In [None]:
def loadFeatures(recordingmbid):
    '''Load raw feature file of a record into an object

    The file is read from
    <settings['path']>/acousticbrainz-mediaeval-train/<first two characters of the id>/<id>.json

    Input:
        recordingmbid  string  Id of the record whose feature file is loaded

    Return:
        The parsed JSON content of the feature file
    '''
    # Imported locally so this helper works even when the surrounding
    # notebook has not imported these modules.
    import json
    import os

    # os.path.join gives the same path whether or not settings['path'] ends
    # with a path separator.
    feature_file_path = os.path.join(
        settings['path'],
        'acousticbrainz-mediaeval-train',
        recordingmbid[0:2],
        '{id}.json'.format(id=recordingmbid))

    # Decode explicitly as UTF-8 instead of the platform's default encoding.
    with open(feature_file_path, encoding='utf-8') as feature_file:
        return json.load(feature_file)

def pettyPrintJSON(object_to_print):
    '''Print an object as JSON with sorted keys and an indentation of 4 spaces'''
    # Imported locally so this helper works even when the surrounding
    # notebook has not imported json.
    import json

    print(json.dumps(object_to_print, sort_keys=True, indent=4))
    

def getOnlyUsedFeatures(recordingmbid):
    '''Extract used features from the raw feature file

    Input:
        recordingmbid  string  Id of the record, passed on to loadFeatures

    Return:
        Series   The values of the used features, flattened in the order
                 the features are listed below
    '''
    all_features = loadFeatures(recordingmbid)

    # Dotted paths into the nested feature object
    used_features = (
        'lowlevel.mfcc.mean',
        'lowlevel.average_loudness',
        'lowlevel.spectral_energy.mean',
    )

    result_features = []

    for feature_name in used_features:
        # Walk down the nested object one key at a time
        feature_value = all_features
        for key in feature_name.split('.'):
            feature_value = feature_value[key]

        # A scalar contributes one entry. JSON parses a value written without
        # a fractional part (e.g. 1) as int, so accept int as well as float.
        # Anything else is treated as a sequence and contributes all its items.
        if isinstance(feature_value, (int, float)):
            result_features.append(feature_value)
        else:
            result_features.extend(feature_value)

    return pd.Series(result_features)