# AcousticBrainz Genre Task 2017: Content-based music genre recognition from multiple sources

In [None]:
# Nice graphs for high dpi screens: set the inline backend's figure format to 'retina'
%config InlineBackend.figure_format = 'retina'

## Install and import packages

In [None]:
# Install (and with -U, upgrade) the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases.
# NOTE(review): `imblearn` is imported further down but is not installed here —
# confirm that the environment already provides it.
!pip install -U scikit-learn[alldeps]
!pip install -U python-dotenv
!pip install -U pandas
!pip install nbstripout
!pip install nbformat


You **must** restart the kernel after first installing or updating packages!

In [None]:
from sklearn import svm
import pandas as pd
import numpy as np
import json


In [None]:
# Run the shared Utilities notebook.
# NOTE(review): names used below but never defined in this notebook (settings,
# load_groundtruth, getOnlyUsedFeatures, loadFeatures, pettyPrintJSON) presumably
# come from it — confirm.
%run Utilities.ipynb

## Load groundtruth and filter available records
During development the notebook is most likely executed with only a subset of the training data, because the full training set is very large (approx. 80 GiB). We therefore need to filter out any records we don't want to use in this notebook.

In [None]:
# Read the tagtraum training ground truth (TSV) through the load_groundtruth helper.
# NOTE(review): the helper presumably keeps only the recording id and the main
# genre ('genre1') — confirm against its definition.
groundtruth = load_groundtruth(settings['path'] + 'groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv')

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Groundtruth size: %d' % len(groundtruth))
print('Found {} unique genres.'.format(len(groundtruth['genre1'].unique())))


### Only run for manual sampling method

# Create better balanced training data
Using at most a fixed number of recordings per genre for training gives the classifier a better-balanced training set. This will hopefully reduce the bias towards genres that occur very often in the dataset.

In [None]:
# Upper bound on the number of recordings kept per genre; genres with fewer
# recordings are kept in full.
SAMPLES_PER_GENRE = 300
# Fixed seed so that the balanced subset is identical on every run.
SAMPLING_SEED = 50

unique_genres = set(groundtruth['genre1'])

# Collect one (possibly down-sampled) frame per genre and concatenate once at the
# end; appending to a DataFrame inside the loop re-copies all accumulated rows on
# every iteration.
balanced_parts = []
for genre, genre_rows in groundtruth.groupby('genre1'):
    if len(genre_rows) > SAMPLES_PER_GENRE:
        genre_rows = genre_rows.sample(n=SAMPLES_PER_GENRE, random_state=SAMPLING_SEED)
    balanced_parts.append(genre_rows)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
new_groundtruth = pd.concat(balanced_parts) if balanced_parts else pd.DataFrame()
    

In [None]:
# Report the size of the balanced subset, then preview its first rows.
# A bare expression is only rendered when it is the last statement of the cell,
# so the preview has to come after the print.
print(len(new_groundtruth))
new_groundtruth.head()

In [None]:
# Development shortcut: when the 'very_few' setting equals the string 'True',
# keep only the first 10000 ground-truth rows.
if settings['very_few'] == 'True':
    print('Limit groundtruth to 10000 elements')
    groundtruth = groundtruth.iloc[:10000]
 

In [None]:
# Build the features: getOnlyUsedFeatures is applied to every recording MBID,
# so training_data keeps the same row order and index as groundtruth.
training_data = groundtruth.loc[:, 'recordingmbid'].apply(getOnlyUsedFeatures)

print(training_data.head(5))

## Train SVM

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wrapper around an RBF-kernel SVC with balanced class weights
clf = OneVsRestClassifier(estimator=svm.SVC(class_weight='balanced', kernel='rbf'))

In [None]:
# Fit on the first 1000 recordings only; features and labels are sliced by
# position in the same way. (A plain svm.SVC() was the alternative tried here.)
clf.fit(training_data.iloc[:1000], groundtruth['genre1'].iloc[:1000])

In [None]:
# Quick check: predict the last 100 recordings and print the predictions
# below the true labels.
test_features = training_data.iloc[-100:]
test_label = groundtruth['genre1'].iloc[-100:]
print(test_label)
print(clf.predict(test_features))

In [None]:
#Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Gaussian Naive Bayes classifier created with its default arguments
model = GaussianNB()

In [None]:
# Train the Naive Bayes model on the first 1000 recordings
model.fit(training_data.iloc[:1000], groundtruth['genre1'].iloc[:1000])

In [None]:
# Predict genres for the held-out features and print them above the true labels
predicted = model.predict(test_features)
print(predicted)
print(test_label)

## Undersampling using the imbalanced-learn (`imblearn`) library

In [None]:
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import datasets, neighbors

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

In [None]:
RANDOM_STATE = 50
# Split features and labels into a training and a held-out test partition
# (default split sizes, seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(training_data, groundtruth['genre1'],
                                                     random_state=RANDOM_STATE)

# Create a pipeline: SMOTE resampling followed by a 3-nearest-neighbours classifier.
# Disabled alternative: NearMiss under-sampling followed by a LinearSVC.
# pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
#                          LinearSVC(random_state=RANDOM_STATE))
pipeline = make_pipeline(SMOTE(random_state=RANDOM_STATE), neighbors.KNeighborsClassifier(3))
pipeline.fit(X_train, y_train)

# Classify and report the results on the held-out split only. Scoring a tail
# slice of the full data would include rows the pipeline was fitted on and
# therefore overstate the scores.
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))

In [None]:
# Predict the current test features with the fitted pipeline and show the
# predictions as a one-column table.
predict_value = pd.DataFrame(pipeline.predict(test_features))
predict_value

In [None]:
 # Load the features of one hard-coded recording MBID and hand them to
 # pettyPrintJSON (presumably a JSON pretty-printer from Utilities — confirm).
 pettyPrintJSON(loadFeatures('1a00a335-fead-46ec-8d4f-06e8341291ea'))

## Using Manual Sampling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

# Fresh one-vs-rest classifier around an RBF-kernel SVC with balanced class
# weights (a plain svm.SVC() was the alternative considered).
clf = OneVsRestClassifier(estimator=svm.SVC(class_weight='balanced', kernel='rbf'))

### Utility for sampling data

In [None]:
from collections import Counter

# Count the recordings per genre once; keys() and values() of the same Counter
# are in matching order. list(...) keeps this working where they are views.
genre_counts = Counter(new_groundtruth.genre1)
genre = list(genre_counts.keys())       # distinct genres
counter = list(genre_counts.values())   # number of recordings for each genre

# One row per genre, with explicit column names instead of two columns that are
# both labelled 0.
df_new = pd.DataFrame({'genre': genre, 'count': counter}, columns=['genre', 'count'])
df_new

In [None]:
# Train on the balanced subset. The features must be extracted for exactly the
# rows of new_groundtruth: training_data was built from the full groundtruth, so
# its rows (and its length) do not match the new_groundtruth labels.
balanced_features = new_groundtruth['recordingmbid'].apply(getOnlyUsedFeatures)
clf.fit(balanced_features, new_groundtruth['genre1'])

### Prepare test set

In [None]:
# Last 1000 feature rows with their labels. The labels are taken from
# `groundtruth`, the frame training_data was derived from, so that row i of
# test_features and test_label refer to the same recording.
test_features = training_data[-1000:]
test_label = groundtruth['genre1'][-1000:]

test_result = []

print(type(test_features))
print(type(test_label))


In [None]:
# Draw 100 random recordings as the test set. The seed is fixed so that the
# evaluation below is reproducible from run to run.
testing_set = groundtruth.sample(n=100, random_state=50)
test_label = testing_set['genre1']
test_features = testing_set['recordingmbid'].apply(getOnlyUsedFeatures)

In [None]:
# Preview the extracted test features. The name `testing_data` used here before
# is never defined, so the cell raised a NameError.
print(test_features.head())

In [None]:
# Give the labels a fresh 0..n-1 index; the previous index is kept as a column.
# NOTE(review): not idempotent — running this cell again adds another index column.
test_label = test_label.reset_index()

In [None]:
# Predict the test features and keep the predictions as a one-column DataFrame
predict_value = pd.DataFrame(clf.predict(test_features))

In [None]:
# Show the current type of test_label, then wrap it in a DataFrame
print(type(test_label))
test_label = pd.DataFrame(data=test_label)


In [None]:
# Start from an empty result frame and display it
test_result = pd.DataFrame()
test_result

In [None]:
# Start the result table from a copy of the labels. Plain assignment would only
# alias test_label, so columns added to test_result later would silently appear
# in test_label as well.
test_result = test_label.copy()
print(type(test_result))
test_result

In [None]:
# Attach the predictions as a new "predict" column and display the table.
# NOTE(review): predict_value is a DataFrame, not a Series; this presumably relies
# on both frames sharing the same 0..n-1 index — confirm.
test_result["predict"] = predict_value
test_result

In [None]:
# 1 where the predicted genre equals the true genre, 0 otherwise.
# Computed for all rows at once: the "checker" column did not exist before, so
# writing into it element by element raised a KeyError, and the loop was also
# tied to a hard-coded row count of 100.
test_result["checker"] = (test_result["genre1"] == test_result["predict"]).astype(int)

In [None]:
# Display the result table including the "checker" column
test_result

In [None]:
# Number of correct predictions: "checker" holds 1 for a match and 0 otherwise
test_result["checker"].sum()