# AcousticBrainz Genre Task 2017: Content-based music genre recognition from multiple sources

In [None]:
# Nice graphs for high dpi screens: select the 'retina' figure format for the
# inline plotting backend.
%config InlineBackend.figure_format = 'retina'

## Install and import packages

In [None]:
# Install (or upgrade, with -U) the packages used by this notebook. Each line
# starting with `!` is executed as a shell command.
!pip install -U scikit-learn[alldeps]
!pip install -U python-dotenv
!pip install -U pandas
!pip install nbstripout
!pip install nbformat
# NOTE(review): later cells import matplotlib and imblearn, which are not
# installed here -- confirm that the environment already provides them.


You **must** restart the kernel after first installing or updating packages!

In [None]:
from sklearn import svm
import pandas as pd
import numpy as np
import json


In [None]:
# Execute the helper notebooks in this kernel so the names they define become
# available here. Presumably these provide `settings`, `load_groundtruth`,
# `getOnlyUsedFeatures` and `evaluate`, which are used below but not defined
# in this notebook -- TODO confirm.
%run Utilities.ipynb
%run Evaluation.ipynb

## Load groundtruth and filter available records
During development the notebook is very likely executed with only a subset of the training data, because the full training set is very large (approx. 80 GiB). We therefore need to filter out any records that we do not want to use in this notebook.

In [None]:
# Read tsv file into groundtruth and extract only id and main genre from it.
# `load_groundtruth` and `settings` are not defined in this notebook
# (presumably they come from Utilities.ipynb).
groundtruth = load_groundtruth(settings['path'] + 'groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv')

# Single-argument print(...) produces the same output on Python 2 and 3 and
# matches the call style already used further down in this notebook.
print('Groundtruth size: %d' % len(groundtruth))
print('Found {} unique genres.'.format(len(groundtruth['genre1'].unique())))


### Only run for manual sampling method

# Create better balanced training data
By using at most a fixed number of recordings per genre for training, the classifier will be better balanced. This will hopefully reduce the bias towards genres that occur very often in the dataset.

In [None]:
import matplotlib.pyplot as plt


# Keep only recordings that have no second genre.
single_genre = groundtruth[groundtruth['genre2'].isnull()]
# Number of non-null `main_genres` entries per first genre.
count_per_genre = single_genre[['genre1', 'main_genres']].groupby(['genre1']).count()
# Genres with more recordings than the median count are downsampled to it.
sample_size = count_per_genre['main_genres'].quantile(0.5)

# Collect one frame per genre and concatenate once at the end; growing a
# DataFrame with .append inside the loop copies all rows on every iteration.
genre_frames = []

for genre, genre_count in count_per_genre['main_genres'].items():
    specific_genre_groundtruth = single_genre[single_genre['genre1'] == genre]

    if genre_count > sample_size:
        specific_genre_groundtruth = specific_genre_groundtruth.sample(n=int(sample_size))

    genre_frames.append(specific_genre_groundtruth)

# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the expected columns.
if genre_frames:
    balanced_groundtruth = pd.concat(genre_frames)
else:
    balanced_groundtruth = single_genre.iloc[0:0]

# Shuffle balanced_groundtruth so the positional train/test split used later
# does not hold out whole genres.
balanced_groundtruth = balanced_groundtruth.sample(frac=1)
# Load training data
balanced_training_data = balanced_groundtruth['recordingmbid'].apply(getOnlyUsedFeatures)

print('Groundtruth size: %d' % len(balanced_groundtruth))


In [None]:
# Bar chart with the number of recordings per genre in the balanced set.
(
    balanced_groundtruth[['genre1', 'main_genres']]
    .groupby(['genre1'])
    .count()
    .plot.bar()
)

plt.show()


In [None]:
# Upper bound on the number of recordings kept for each genre.
MAX_SAMPLES_PER_GENRE = 300

unique_genres = set(groundtruth['genre1'])

# Collect the per-genre frames and concatenate once instead of appending to a
# DataFrame inside the loop.
sampled_frames = []

for genre in unique_genres:
    sampling_groundtruth = groundtruth.loc[groundtruth['genre1'] == genre]
    # Genres with fewer recordings than the cap are kept in full.
    if len(sampling_groundtruth) >= MAX_SAMPLES_PER_GENRE:
        sampling_groundtruth = sampling_groundtruth.sample(n=MAX_SAMPLES_PER_GENRE)

    sampled_frames.append(sampling_groundtruth)

# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the groundtruth columns.
if sampled_frames:
    new_groundtruth = pd.concat(sampled_frames)
else:
    new_groundtruth = groundtruth.iloc[0:0]
    

In [None]:
# Report the size of the manually sampled groundtruth and preview its first
# rows. Only the last expression of a cell is rendered, so the preview has to
# come after the print.
print(len(new_groundtruth))
new_groundtruth.head()

In [None]:
# Maybe during development you want a really really small dataset ?
# `settings['very_few']` is compared against the string 'True'.
if settings['very_few'] == 'True':
    # Single source of truth for the limit so the message cannot drift from
    # the number of rows that are actually kept.
    very_few_limit = 1000
    print('Limit groundtruth to %d elements' % very_few_limit)
    groundtruth = groundtruth.head(very_few_limit)
    # NOTE(review): new_groundtruth is built genre by genre, so its first rows
    # presumably cover only a few genres -- confirm this is acceptable.
    new_groundtruth = new_groundtruth[0:very_few_limit]
 

In [None]:
# Compute one feature entry per recording id of the (possibly limited)
# groundtruth. `getOnlyUsedFeatures` is not defined in this notebook
# (presumably it comes from Utilities.ipynb).
training_data = groundtruth['recordingmbid'].apply(getOnlyUsedFeatures)

# Single-argument print(...) produces the same output on Python 2 and 3.
print(training_data.head())

## Train SVM

In [None]:
from sklearn.multiclass import OneVsRestClassifier


def do_experiment(classifier, data, labels, train_fraction=0.9):
    """Split ``data``/``labels`` by position and run a train/evaluate cycle.

    The first ``int(len(data) * train_fraction)`` entries are used for
    training and the remaining entries for testing. No shuffling happens
    here, so the caller is responsible for passing data in a suitable order.

    Parameters
    ----------
    classifier : object
        Passed through unchanged to ``do_experiment_separate_training_data``.
    data : sliceable sequence
        Feature rows; must support ``len`` and ``[start:stop]`` slicing.
    labels : sliceable sequence
        Labels aligned by position with ``data``.
    train_fraction : float, optional
        Fraction of the entries used for training. Defaults to ``0.9``,
        the split this function always used.

    Returns
    -------
    Whatever ``do_experiment_separate_training_data`` returns.
    """
    separation = int(len(data) * train_fraction)

    training_data, test_data = data[0:separation], data[separation:]
    training_labels, test_labels = labels[0:separation], labels[separation:]

    return do_experiment_separate_training_data(classifier,
                                                training_data, training_labels,
                                                test_data, test_labels)

def do_experiment_separate_training_data(classifier, training_data, training_labels, test_data, test_labels):
    """Fit ``classifier`` on the training split and evaluate it on the test split.

    The classifier is fitted in place, its predictions for ``test_data`` are
    passed to ``evaluate`` together with the test labels, and the first two
    entries of the evaluation result are displayed in the notebook.

    Returns
    -------
    tuple
        ``(evaluation_results, predictions)``.
    """
    classifier.fit(training_data, training_labels)
    predicted = classifier.predict(test_data)

    # Each true label is wrapped in a one-element list before evaluation
    # (presumably because `evaluate` expects a list of genres per recording).
    wrapped_truth = test_labels.apply(lambda label: [label])
    outcome = evaluate(predicted, wrapped_truth)

    # Render the first two parts of the evaluation result.
    for position in (0, 1):
        display(outcome[position])

    return outcome, predicted

In [None]:
# Evaluation.ipynb is executed again here (it is also run near the top of the
# notebook) -- presumably to pick up edits made to it; TODO confirm.
%run Evaluation.ipynb

# One-vs-rest wrapper around an RBF-kernel SVC with class_weight='balanced'.
clf = OneVsRestClassifier(svm.SVC(kernel='rbf', class_weight='balanced'))

# do_experiment returns (evaluation_results, predictions) for the balanced
# training set.
results_OvsR_SVC = do_experiment(clf, 
                                 balanced_training_data, 
                                 balanced_groundtruth['genre1'])


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings.
model_GaussianNB = GaussianNB()

# Same experiment as for the SVC, on the same balanced data and labels;
# do_experiment returns (evaluation_results, predictions).
results_GaussianNB = do_experiment(model_GaussianNB, 
                                   balanced_training_data, 
                                   balanced_groundtruth['genre1'])


## Resampling using the imbalanced-learn library

In [None]:
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import datasets, neighbors

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

In [None]:
# Fixed seed so the train/test split and the resampling are repeatable.
RANDOM_STATE = 50
X_train, X_test, y_train, y_test = train_test_split(training_data, groundtruth['genre1'],
                                                     random_state=RANDOM_STATE)

# Create a pipeline
# Alternative kept for reference: undersample with NearMiss and classify with
# a linear SVM.
# pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
#                          LinearSVC(random_state=RANDOM_STATE))
# Active variant: oversample with SMOTE, then classify with a 3-nearest-
# neighbours classifier.
pipeline = make_pipeline(SMOTE(random_state=RANDOM_STATE), neighbors.KNeighborsClassifier(3))
pipeline.fit(X_train, y_train)

# Classify and report the results on the held-out split only. Scoring on a
# slice of the full data would include rows the pipeline was fitted on,
# because train_test_split shuffles before splitting.
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))

In [None]:
# Predict with the fitted pipeline and wrap the predictions in a DataFrame so
# the notebook renders them as a table.
# NOTE(review): `test_features` is not defined anywhere in the visible part of
# this notebook -- confirm where it comes from (e.g. a %run notebook).
predict_value = pipeline.predict(test_features)
predict_value = pd.DataFrame(predict_value)
predict_value

In [None]:
 # Print the features of a single recording, identified by its id.
 # NOTE(review): `pettyPrintJSON` and `loadFeatures` are not defined in this
 # notebook; the first name looks like a typo of "prettyPrintJSON" -- confirm
 # against the helper notebook before renaming.
 pettyPrintJSON(loadFeatures('1a00a335-fead-46ec-8d4f-06e8341291ea'))

## Using Manual Sampling

In [None]:
# The features must be computed from the same rows as the labels. Pairing
# `training_data` (built from `groundtruth`) with labels from
# `new_groundtruth` misaligns rows and sample counts.
# `new_groundtruth` is assembled genre by genre, so shuffle it first:
# do_experiment splits by position and would otherwise hold out whole genres.
manual_groundtruth = new_groundtruth.sample(frac=1)
manual_training_data = manual_groundtruth['recordingmbid'].apply(getOnlyUsedFeatures)

clf = OneVsRestClassifier(svm.SVC(kernel='rbf', class_weight='balanced'))

# do_experiment fits the classifier itself on the training part only, so no
# separate fit on the full data is needed beforehand.
results = do_experiment(clf, manual_training_data, manual_groundtruth['genre1'])