# AcousticBrainz Genre Task 2017: Content-based music genre recognition from multiple sources

In [None]:
# Nice graphs for high dpi screens:
# ask the inline plotting backend to emit figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'

## Install and import packages

In [None]:
!pip install -U scikit-learn[alldeps]
!pip install -U python-dotenv
!pip install -U pandas


In [None]:
from os import environ
from dotenv import load_dotenv, find_dotenv

# Load the variables of the .env file located by find_dotenv() into the environment.
load_dotenv(find_dotenv())

# Environment values are always strings, so a flag such as VERY_FEW_RECORDS=0 or
# VERY_FEW_RECORDS=false would be truthy if it were used directly. The spellings
# below (compared case-insensitively) mean "off"; any other value means "on".
_FALSE_FLAG_VALUES = ('', '0', 'false', 'no', 'off')

settings = {
    "path": environ.get("PATH_TO_DATASET"),
    "loaded_data": environ.get('LOADED_TRAINING_DATA'),

    # Limit the dataset to very few records, useful during development
    "very_few": environ.get('VERY_FEW_RECORDS', '').strip().lower() not in _FALSE_FLAG_VALUES,
}

You **must** restart the kernel after first installing or updating packages!

In [None]:
from sklearn import svm
import pandas as pd
import json


In [None]:
# Execute Utilities.ipynb inside this kernel so the names it defines become
# available here. NOTE(review): load_groundtruth, used below, is not defined in
# this notebook and presumably comes from Utilities.ipynb - confirm.
%run Utilities.ipynb

## Load groundtruth and filter available records
During development, the notebook is most likely run on a subset of the training data, because the full training set is very large (approx. 80 GiB). We therefore need to filter out any records we don't want to use in this notebook.

In [None]:
# Read tsv file into groundtruth and extract only id and main genre from it.
# The path is built by plain concatenation, so settings['path'] must end with a
# path separator.
groundtruth = load_groundtruth(settings['path'] + 'groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv')

# print() with a single argument behaves the same on Python 2 and Python 3.
print('Groundtruth size: %d' % len(groundtruth))
print('Found {} unique genres.'.format(len(groundtruth['genre1'].unique())))


In [None]:

# Maybe during development you want a really really small dataset ?
# When the 'very_few' setting is enabled, keep only the first 10000 groundtruth rows.
if settings['very_few']:
    print('Limit groundtruth to 10000 elements')
    groundtruth = groundtruth.head(10000)


## Utility functions

In [None]:
def loadFeatures(recordingmbid, basepath=None):
    '''Load the raw feature file of a recording into a Python object.

    The file is read from
    ``<basepath>/acousticbrainz-mediaeval-train/<first two characters of id>/<id>.json``.

    Parameters
    ----------
    recordingmbid : str
        Identifier of the recording; its first two characters name the
        sub-directory that holds the file.
    basepath : str, optional
        Root directory of the dataset. Defaults to ``settings['path']``.
        A trailing path separator is accepted but not required.

    Returns
    -------
    The decoded JSON document, i.e. whatever ``json.load`` produces for the file.

    Raises
    ------
    IOError / OSError
        If the feature file cannot be opened.
    ValueError
        If the file does not contain valid JSON.
    '''
    import os  # local import keeps this cell self-contained

    if basepath is None:
        basepath = settings['path']

    # os.path.join only inserts a separator where one is missing, so the
    # dataset path works with or without a trailing slash.
    feature_file_path = os.path.join(
        basepath,
        'acousticbrainz-mediaeval-train',
        recordingmbid[0:2],
        '{id}.json'.format(id=recordingmbid)
    )

    with open(feature_file_path) as feature_file:
        return json.load(feature_file)

def pettyPrintJSON(object_to_print):
    '''Print ``object_to_print`` as JSON, indented by four spaces with keys sorted.'''
    rendered = json.dumps(object_to_print, indent=4, sort_keys=True)
    print(rendered)
    

def getOnlyUsedFeatures(recordingmbid, used_features=('lowlevel.mfcc.mean',)):
    '''Extract the used features from the raw feature file of a recording.

    Parameters
    ----------
    recordingmbid : str
        Identifier of the recording, passed on to ``loadFeatures``.
    used_features : iterable of str, optional
        Dot-separated key paths into the nested feature document, e.g.
        ``'lowlevel.mfcc.mean'``. Defaults to that single path.

    Returns
    -------
    pandas.Series
        The values found at the requested paths, concatenated in the order of
        ``used_features``. A list value contributes one entry per element;
        any other value contributes a single entry.

    Raises
    ------
    KeyError
        If a path segment is missing from the feature document.
    '''
    all_features = loadFeatures(recordingmbid)

    result_features = []

    for feature_name in used_features:
        # Walk down the nested dictionaries one key at a time.
        value = all_features
        for key in feature_name.split('.'):
            value = value[key]

        # Vector-valued features are flattened into the result; a scalar
        # cannot be passed to extend(), so it is appended as a single entry.
        if isinstance(value, (list, tuple)):
            result_features.extend(value)
        else:
            result_features.append(value)

    return pd.Series(result_features)
    

# Load the selected features of every recording listed in the groundtruth.
# getOnlyUsedFeatures returns a Series per recording, which apply() assembles
# into one row per recording.
training_data = groundtruth['recordingmbid'].apply(getOnlyUsedFeatures)

print(training_data.head())

In [None]:
# Transform multi labels into matrix

from sklearn.preprocessing import MultiLabelBinarizer

# fit_transform learns the set of labels occurring in 'main_genres' and returns
# a binary indicator matrix with one column per label.
mlb = MultiLabelBinarizer()
genres_matrix = mlb.fit_transform(groundtruth['main_genres'])
print(list(mlb.classes_))


## Train SVM

In [None]:


# Fit a default-parameter support vector classifier on everything except the
# last 1000 rows, using each recording's 'genre1' value as the label.
train_features = training_data[0:-1000]
train_labels = groundtruth['genre1'][0:-1000]

clf = svm.SVC()
clf.fit(train_features, train_labels)

In [None]:

# Predict genres for the last 1000 rows, which were excluded when fitting clf.
test_data = training_data[-1000:]
test = pd.DataFrame()
test['truth'] = groundtruth['genre1'][-1000:]
test['prediction'] = clf.predict(test_data)


# Should be replaced with:
# correct_prediction = verify_predictions(test['truth'], test['prediction'])
# Element-wise comparison: True where the predicted genre equals the groundtruth genre.
correct_prediction = test['truth'] == test['prediction']


# Should be replaced by evaluate function
total = len(correct_prediction)
# Summing a boolean Series counts its True entries.
correct = int(correct_prediction.sum())

# print() with a single argument behaves the same on Python 2 and Python 3.
print('Total: %d' % total)
print('Correct: %d' % correct)
print('Correct %% %.2f' % (100.0 * correct / total))


print(test.head())
print(correct_prediction.head())
