# AcousticBrainz Genre Task 2017: Content-based music genre recognition from multiple sources

In [None]:
# Nice graphs for high dpi screens: switch the inline backend's figure format to 'retina'
%config InlineBackend.figure_format = 'retina'

## Install and import packages

In [None]:
# Install or upgrade the third-party packages imported by this notebook:
# scikit-learn (sklearn.svm), python-dotenv (dotenv) and pandas.
!pip install -U scikit-learn[alldeps]
!pip install -U python-dotenv
!pip install -U pandas


In [None]:
from os import environ
from dotenv import load_dotenv, find_dotenv

# Load the variables of the .env file located by find_dotenv(), so that the
# settings below can read them through environ
load_dotenv(find_dotenv())

def readBoolSetting(name, default=False):
    '''Read an environment variable as a boolean flag.

    Environment values are always strings, so a plain truthiness check would
    treat "False" or "0" as enabled. An unset variable yields `default`; an
    empty value or one of 0/false/no/off (case-insensitive) yields False;
    any other value yields True.
    '''
    raw_value = environ.get(name)
    if raw_value is None:
        return default
    return raw_value.strip().lower() not in ('', '0', 'false', 'no', 'off')


settings = {
    # Base path of the dataset; it is used as a plain string prefix, so it needs a trailing separator
    "path": environ.get("PATH_TO_DATASET"),
    # Characters a recording id may start with to count as available in the loaded subset
    "loaded_data": environ.get('LOADED_TRAINING_DATA'),

    "very_few": readBoolSetting('VERY_FEW_RECORDS'),  # Limit the dataset to very few records, useful during development
}

You **must** restart the kernel after first installing or updating packages!

In [None]:
from sklearn import svm
import pandas as pd
import json


## Load groundtruth and filter available records
During development it is very likely that the notebook is executed with only a subset of the training data, because the full training data is very large (approx. 80 GiB). Therefore, we need to filter out any records we don't want to use in this notebook.

In [None]:
# Load the groundtruth tsv file and keep only the recording id and main genre columns
groundtruth_path = settings['path'] + 'groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv'
groundtruth_raw = pd.read_table(groundtruth_path)[['recordingmbid', 'genre1']]

# Predicate: does this recording belong to the part of the dataset that was loaded?
def isInLoadedDataset(id):
    '''Return True if the first character of the id occurs in settings['loaded_data']'''
    first_character = id[0]
    loaded_prefixes = settings['loaded_data']
    return first_character in loaded_prefixes
    
# Flag every record whose id belongs to the loaded subset. Mapping over the id
# column avoids constructing a row object per record, which a row-wise
# DataFrame.apply would do.
groundtruth_available = groundtruth_raw['recordingmbid'].map(isInLoadedDataset)
groundtruth = groundtruth_raw[groundtruth_available]

# print(...) with a single argument produces the same output on Python 2 and 3
print('Groundtruth raw: %d' % len(groundtruth_raw))
print('Groundtruth only available: %d' % len(groundtruth))


print('Found {} unique genres.'.format(len(groundtruth['genre1'].unique())))


In [None]:

# Optionally shrink the dataset to its first 10000 records for quick development runs
if settings['very_few']:
    groundtruth = groundtruth.iloc[:10000]


## Utility functions

In [None]:
def loadFeatures(recordingmbid):
    '''Read the JSON feature file of a recording and return its parsed content'''
    # Feature files live in a directory named after the first two characters of the id
    path_template = '{basepath}acousticbrainz-mediaeval-train/{id_prefix}/{id}.json'
    feature_file_path = path_template.format(
        basepath=settings['path'],
        id_prefix=recordingmbid[:2],
        id=recordingmbid,
    )

    with open(feature_file_path) as feature_file:
        return json.load(feature_file)

def pettyPrintJSON(object_to_print):
    '''Print an object as JSON text with sorted keys and 4-space indentation'''
    formatted = json.dumps(object_to_print, indent=4, sort_keys=True)
    print(formatted)
    

def getOnlyUsedFeatures(recordingmbid):
    '''Collect the selected features of a recording from its raw feature file.

    Each selected feature is addressed by a dotted path into the nested
    feature object; the values found there are concatenated, in order, into
    a single pandas Series.
    '''
    raw_features = loadFeatures(recordingmbid)

    # Dotted paths of the features that make up the feature vector
    selected_paths = ['lowlevel.mfcc.mean']

    feature_vector = []
    for dotted_path in selected_paths:
        # Walk down the nested object, one path component at a time
        node = raw_features
        for key in dotted_path.split('.'):
            node = node[key]

        feature_vector.extend(node)

    return pd.Series(feature_vector)
    

# Extract the used features of every available recording
training_data = groundtruth['recordingmbid'].apply(getOnlyUsedFeatures)

# print(...) with a single argument produces the same output on Python 2 and 3
print(training_data.head())

## Train SVM

In [None]:


# Fit the classifier on everything except the last two records
train_features = training_data.iloc[:-2]
train_labels = groundtruth['genre1'].iloc[:-2]

clf = svm.SVC()
clf.fit(train_features, train_labels)

In [None]:

# Predict the genre of the last record and print it after its groundtruth label.
# print(...) with a single argument produces the same output on Python 2 and 3.
test_features = training_data[-1:]
test_label = groundtruth['genre1'][-1:]
print(test_label)
print(clf.predict(test_features))

In [None]:
 # Dump the complete raw feature file of one hard-coded recording for inspection
 # NOTE(review): this only works if that recording's file is present in the local dataset — confirm
 pettyPrintJSON(loadFeatures('1a00a335-fead-46ec-8d4f-06e8341291ea'))