# Utility functions

## Load groundtruth

In [None]:
import pandas as pd

def load_groundtruth(filename):
    '''Load a groundtruth tsv and keep only the recordings of the loaded dataset

    Input:
        filename  string  Groundtruth to load (ex. '/home/user/groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv')

    Return:
        DataFrame   With columns: 'recordingmbid', 'genre1', 'main_genres'.
                    'main_genres' holds, for each kept row, the list of the
                    string cells of that row that do not contain "-".
                    When no row is kept the frame is empty but still has
                    these three columns.

    Note:
        Reads the module-level `settings` mapping (defined outside this
        function). A row is kept when the first character of its
        'recordingmbid' is found in settings['loaded_data'].
    '''

    # Read the whole tsv; the relevant columns are selected further down.
    groundtruth_raw = pd.read_table(filename)

    # NOTE(review): the exact type of settings['loaded_data'] is not visible
    # here; it only has to support `in` with a one-character string.
    loaded_data = settings['loaded_data']

    def is_in_loaded_dataset(mbid):
        # Missing (NaN) or blank ids cannot belong to the loaded dataset, so
        # they are rejected instead of raising on `mbid[0]`.
        return isinstance(mbid, str) and bool(mbid) and mbid[0] in loaded_data

    # Build the mask as an explicit boolean Series. A row-wise
    # DataFrame.apply returns a DataFrame (not a Series) for an empty frame,
    # which breaks the boolean selection below.
    available_mask = pd.Series(
        [is_in_loaded_dataset(mbid) for mbid in groundtruth_raw['recordingmbid']],
        index=groundtruth_raw.index,
        dtype=bool,
    )
    groundtruth_available = groundtruth_raw[available_mask]

    groundtruth = groundtruth_available[['recordingmbid', 'genre1']].copy()

    def extract_main_genres(cells):
        # Keep the string cells without "-"; non-string cells (e.g. NaN for
        # unused genre columns) are dropped.
        # NOTE(review): presumably this also discards the MBID columns, since
        # UUID-formatted ids contain "-" -- verify against the tsv schema.
        return [cell for cell in cells if isinstance(cell, str) and "-" not in cell]

    # Plain tuples per row are enough for the per-cell test above and avoid
    # building one Series per row.
    main_genres = [
        extract_main_genres(cells)
        for cells in groundtruth_available.itertuples(index=False, name=None)
    ]

    # dtype=object keeps each list as a single cell, and the explicit index
    # makes the assignment valid even when there are no rows at all.
    groundtruth['main_genres'] = pd.Series(
        main_genres, index=groundtruth.index, dtype=object
    )

    return groundtruth
