# AcousticBrainz Genre Task 2017: Content-based music genre recognition from multiple sources

In [None]:
# Nice graphs for high dpi screens
%config InlineBackend.figure_format = 'retina'

## Install and import packages

In [None]:
!pip install -U scikit-learn[alldeps]
!pip install -U python-dotenv
!pip install -U pandas
!pip install nbstripout
!pip install nbformat


You **must** restart the kernel after first installing or updating packages!

In [None]:
from sklearn import svm
import pandas as pd
import numpy as np
import json


In [None]:
%run Utilities.ipynb

## Load groundtruth and filter available records
During development the notebook is very likely executed with only a subset of the training data, because the full training set is very large (approx. 80 GiB). Records that should not be used in this notebook therefore have to be filtered out.

In [None]:
# Read the tsv file into groundtruth. load_groundtruth and settings are not
# defined in this notebook (presumably they come from Utilities.ipynb); the
# loader is expected to keep only the id and main genre - TODO confirm.
groundtruth = load_groundtruth(settings['path'] + 'groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv')

print('Groundtruth size: %d' % len(groundtruth))
# Message typo fixed ("uniquse" -> "unique").
print('Found {} unique genres.'.format(len(groundtruth['genre1'].unique())))


### Only run for manual sampling method

In [None]:
# Cap every genre at SAMPLES_PER_GENRE recordings; smaller genres are kept whole.
SAMPLES_PER_GENRE = 300
SAMPLING_SEED = 50  # fixed seed so the sampled subset is reproducible

unique_genres = set(groundtruth['genre1'])

# Collect the per-genre pieces in a list and concatenate once: growing a
# DataFrame with .append() inside a loop copies it on every iteration and
# DataFrame.append no longer exists in pandas 2.
sampled_parts = []
for genre, sampling_groundtruth in groundtruth.groupby('genre1'):
    if len(sampling_groundtruth) > SAMPLES_PER_GENRE:
        sampling_groundtruth = sampling_groundtruth.sample(
            n=SAMPLES_PER_GENRE, random_state=SAMPLING_SEED)
    sampled_parts.append(sampling_groundtruth)

# pd.concat refuses an empty list, so fall back to an empty frame with the
# same columns when groundtruth has no rows.
new_groundtruth = pd.concat(sampled_parts) if sampled_parts else groundtruth.iloc[0:0]
    

In [None]:
# Report how many rows the per-genre sampling kept.
print(len(new_groundtruth))

In [None]:
# A really small dataset can be requested for development through the
# 'very_few' setting, which is compared against the string 'True'.
use_tiny_dataset = settings['very_few'] == 'True'
if use_tiny_dataset:
    print('Limit groundtruth to 10000 elements')
    groundtruth = groundtruth.head(10000)
 

## Data Understanding

In [None]:
from collections import Counter
# Tally the main genres once; labels and counts come from the same Counter,
# so position k in `genre` matches position k in `counter`.
genre_tally = Counter(groundtruth.genre1)
genre = genre_tally.keys()      # distinct genre labels
counter = genre_tally.values()  # how often each label occurs

# Show labels and counts side by side.
df = pd.DataFrame(genre)
df_2 = pd.DataFrame(counter)
df_new = pd.concat([df, df_2], axis=1)
df_new

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the number of recordings per main genre. The original note here
# read "show power distribution" - presumably the counts are expected to look
# power-law-like; confirm against the rendered plot.
genre_axes = groundtruth.genre1.value_counts().plot(kind='bar')
# Label the figure so it can be read on its own.
genre_axes.set_xlabel('genre1')
genre_axes.set_ylabel('number of recordings')
genre_axes.set_title('Recordings per main genre')
plt.show()

## Load training data

In [None]:
# Map every recording id to its feature row. getOnlyUsedFeatures is not defined
# in this notebook (presumably it comes from Utilities.ipynb).
recording_ids = groundtruth['recordingmbid']
training_data = recording_ids.apply(getOnlyUsedFeatures)

print(training_data.head())

### See distribution of average loudness of each genre music

In [None]:
# Distinct main-genre labels. A set has no defined iteration order, so loops
# over myset may visit the genres in a different order from run to run.
myset = set(groundtruth['genre1'])

In [None]:
%matplotlib inline 
import numpy as np

# Build one histogram row per genre from feature column 13 (average loudness
# according to this section's heading) and plot it.
LOUDNESS_COLUMN = 13

# Rows are collected in a list and stacked once at the end. The previous
# version seeded the matrix with np.arange(10), which stayed in the result as
# a bogus first row that does not belong to any genre.
loudness_rows = []

for i in myset:
    groundtruth_genre = groundtruth[groundtruth['genre1'] == i]
    training_data_genre = groundtruth_genre['recordingmbid'].apply(getOnlyUsedFeatures)
    print(i)
    # plt.hist both draws the histogram and returns the bin counts, so one
    # call is enough (it used to be drawn twice on the same axes).
    loud = plt.hist(training_data_genre[LOUDNESS_COLUMN])
    print(np.shape(loud[0]))
    loudness_rows.append(loud[0])
    plt.show()
    plt.clf()

# np.vstack rejects an empty list; plt.hist uses 10 bins by default.
hist_loudness = np.vstack(loudness_rows) if loudness_rows else np.empty((0, 10))

### See distribution of spectral energy mean of each genre

In [None]:
%matplotlib inline 
import numpy as np

# Build one histogram row per genre from feature column 14 (spectral energy
# mean according to this section's heading) and plot it.
ENERGY_COLUMN = 14

# Rows are collected in a list and stacked once at the end. The previous
# version seeded the matrix with np.arange(10), which stayed in the result as
# a bogus first row that does not belong to any genre.
energy_rows = []

for i in myset:
    groundtruth_genre = groundtruth[groundtruth['genre1'] == i]
    training_data_genre = groundtruth_genre['recordingmbid'].apply(getOnlyUsedFeatures)
    print(i)
    # plt.hist both draws the histogram and returns the bin counts, so one
    # call is enough (it used to be drawn twice on the same axes).
    energy = plt.hist(training_data_genre[ENERGY_COLUMN])
    # Report the shape of this cell's own counts; the previous version printed
    # the shape of `loud[0]`, a leftover from the loudness cell.
    print(np.shape(energy[0]))
    energy_rows.append(energy[0])
    plt.show()
    plt.clf()

# np.vstack rejects an empty list; plt.hist uses 10 bins by default.
hist_energy = np.vstack(energy_rows) if energy_rows else np.empty((0, 10))

## Normalize Value in Array

In [None]:
# Scale every histogram row so that its largest bin becomes 1.
genre_count = np.shape(hist_loudness)[0]

# Per-row maximum, kept as a column so it broadcasts across the bins.
row_peaks = np.amax(hist_loudness, axis=1, keepdims=True)
# An all-zero row has nothing to scale; dividing by 1 leaves it untouched
# instead of producing NaNs.
row_peaks = np.where(row_peaks == 0, 1, row_peaks)
# true_divide keeps the result floating point even for integer counts.
hist_loudness = np.true_divide(hist_loudness, row_peaks)
        

In [None]:
hist_loudness

## Calculate Similarity

In [None]:
def histograms_similarity(histogram0, histogram1):
    """Return the histogram intersection of two histograms.

    The intersection is the sum over all bins of the smaller of the two bin
    values, so identical histograms score their own total and histograms
    without any overlap score 0.

    Parameters
    ----------
    histogram0, histogram1 : array-like
        Histograms with the same shape (arrays, lists or tuples of numbers).

    Returns
    -------
    numpy.float64
        Sum of the element-wise minima; 0.0 for empty input.
    """
    first = np.asarray(histogram0, dtype=np.float64)
    second = np.asarray(histogram1, dtype=np.float64)
    # Compare in full precision: deciding the smaller bin on float32 copies
    # could select the larger value when two bins differ by less than float32
    # resolution.
    return np.minimum(first, second).sum()

In [None]:
# Example.
# NOTE(review): the label says "1 vs 2" but rows 0 and 5 are compared - confirm
# which rows were intended.
example_similarity = histograms_similarity(hist_loudness[0, :], hist_loudness[5, :])
print('1 vs 2: %.6f' % example_similarity)

In [None]:
def compute_self_similarity(feature_vector_matrix, similarity_function = histograms_similarity):
    """Compare every row of a matrix with every row, itself included.

    feature_vector_matrix is indexed as [row, :]; similarity_function is called
    with two rows and its result is stored at [row, column] of the output.
    Returns a square float matrix with one row and one column per input row.
    """
    row_total = np.shape(feature_vector_matrix)[0]
    sim_matrix = np.zeros((row_total, row_total))
    for row in range(row_total):
        # Fill one complete output row per pass.
        sim_matrix[row, :] = [
            similarity_function(feature_vector_matrix[row, :], feature_vector_matrix[col, :])
            for col in range(row_total)
        ]
    return sim_matrix

## Create Similarity Matrix

In [None]:
# Pairwise similarity between all rows of hist_loudness.
histograms_self_similarity = compute_self_similarity(hist_loudness, histograms_similarity)
similarity_shape = np.shape(histograms_self_similarity)
print('HS histograms self-similarity matrix size: %d x %d' % similarity_shape)

In [None]:
np.amax(histograms_self_similarity)

In [None]:
%matplotlib notebook

# Heat map of the self-similarity matrix; cell (r, c) compares histogram rows
# r and c.
plt.figure()
plt.imshow(histograms_self_similarity)
# A colour bar and title make the colour scale readable on its own.
plt.colorbar()
plt.title('Histogram self-similarity');

## Create Dissimilarity Matrix

In [None]:
def histograms_dissimilarity(histogram0, histogram1):
    """Return the L1 (sum of absolute differences) distance of two histograms.

    Parameters
    ----------
    histogram0, histogram1 : array-like
        Histograms with the same shape (arrays, lists or tuples of numbers).

    Returns
    -------
    numpy.float64
        Sum of the absolute bin-wise differences; 0.0 for identical or empty
        input.
    """
    first = np.asarray(histogram0, dtype=np.float64)
    second = np.asarray(histogram1, dtype=np.float64)
    # Subtract in full precision: casting to float32 first discards differences
    # smaller than float32 resolution.
    return np.abs(first - second).sum()

In [None]:
# Dissimilarity between histogram rows 0 and 1, shown as the cell's output.
histograms_dissimilarity(hist_loudness[0, :], hist_loudness[1, :])

In [None]:
# Example.
# NOTE(review): the label says "1 vs 2" but rows 0 and 5 are compared - confirm
# which rows were intended.
example_dissimilarity = histograms_dissimilarity(hist_loudness[0, :], hist_loudness[5, :])
print('1 vs 2: %.2f' % example_dissimilarity)

In [None]:
def compute_self_dissimilarity(feature_vector_matrix, dissimilarity_function = histograms_dissimilarity):
    """Measure the dissimilarity of every row of a matrix against every row.

    feature_vector_matrix is indexed as [row, :]; dissimilarity_function is
    called with two rows and its result is stored at [row, column] of the
    output. Returns a square float matrix with one row and one column per
    input row.
    """
    row_total = np.shape(feature_vector_matrix)[0]
    dsim_matrix = np.zeros((row_total, row_total))
    for row in range(row_total):
        # Fill one complete output row per pass.
        dsim_matrix[row, :] = [
            dissimilarity_function(feature_vector_matrix[row, :], feature_vector_matrix[col, :])
            for col in range(row_total)
        ]
    return dsim_matrix

In [None]:
# Pairwise dissimilarity between all rows of hist_loudness.
histograms_self_dissimilarity = compute_self_dissimilarity(hist_loudness, histograms_dissimilarity)
dissimilarity_shape = np.shape(histograms_self_dissimilarity)
print('HS histograms self-dissimilarity matrix size: %d x %d' % dissimilarity_shape)

In [None]:
%matplotlib notebook

# Heat map of the self-dissimilarity matrix; cell (r, c) compares histogram
# rows r and c.
plt.figure()
plt.imshow(histograms_self_dissimilarity)
# A colour bar and title make the colour scale readable on its own.
plt.colorbar()
plt.title('Histogram self-dissimilarity');

In [None]:
myset

## Train SVM

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wrapper around an RBF-kernel SVC with balanced class weights.
rbf_svc = svm.SVC(kernel='rbf', class_weight='balanced')
clf = OneVsRestClassifier(rbf_svc)

In [None]:
# Fit on the first 1000 rows of the features and their main-genre labels.
first_rows = slice(0, 1000)
clf.fit(training_data[first_rows], groundtruth['genre1'][first_rows])

In [None]:
# Use the last 100 rows as a test set and print the true labels followed by
# the classifier's predictions.
last_rows = slice(-100, None)
test_features = training_data[last_rows]
test_label = groundtruth['genre1'][last_rows]
print(test_label)
print(clf.predict(test_features))

In [None]:
#Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Gaussian Naive Bayes classifier created with its default arguments.
model = GaussianNB()

In [None]:
# Train the model on the first 1000 rows of the features and their labels.
first_rows = slice(0, 1000)
model.fit(training_data[first_rows], groundtruth['genre1'][first_rows])

In [None]:
# Predict the test rows, then print the predictions followed by the true labels.
predicted = model.predict(test_features)
print(predicted)
print(test_label)

## Under-/over-sampling using the imbalanced-learn library

In [None]:
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import datasets, neighbors

from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

In [None]:
RANDOM_STATE = 50
X_train, X_test, y_train, y_test = train_test_split(
    training_data, groundtruth['genre1'], random_state=RANDOM_STATE)

# Create a pipeline: SMOTE over-sampling followed by a 3-nearest-neighbour
# classifier. An alternative tried earlier was NearMiss(version=2)
# under-sampling followed by LinearSVC.
pipeline = make_pipeline(SMOTE(random_state=RANDOM_STATE), neighbors.KNeighborsClassifier(3))
pipeline.fit(X_train, y_train)

# Classify and report the results on the held-out split. The report used to be
# computed on the last 5000 rows of the full data, which include rows the
# pipeline was trained on, while X_test / y_test were never used.
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))

In [None]:
# Wrap the pipeline's predictions for test_features in a DataFrame and show it.
predict_value = pd.DataFrame(pipeline.predict(test_features))
predict_value

In [None]:
 # Show the loaded features of one recording, addressed by its id.
 # NOTE(review): pettyPrintJSON and loadFeatures are not defined in this
 # notebook (presumably Utilities.ipynb); "petty" may be a typo for "pretty" -
 # confirm against the helper's actual name.
 pettyPrintJSON(loadFeatures('1a00a335-fead-46ec-8d4f-06e8341291ea'))

## Using Manual Sampling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

# Fresh one-vs-rest classifier around an RBF-kernel SVC with balanced class
# weights for the manually sampled data.
rbf_svc = svm.SVC(kernel='rbf', class_weight='balanced')
clf = OneVsRestClassifier(rbf_svc)

### Utility for sampling data

In [None]:
from collections import Counter
# Tally the main genres of the sampled data once; labels and counts come from
# the same Counter, so position k in `genre` matches position k in `counter`.
sampled_genre_tally = Counter(new_groundtruth.genre1)
genre = sampled_genre_tally.keys()      # distinct genre labels
counter = sampled_genre_tally.values()  # how often each label occurs

# Show labels and counts side by side.
df = pd.DataFrame(genre)
df_2 = pd.DataFrame(counter)
df_new = pd.concat([df, df_2], axis=1)
df_new

In [None]:
# Features and labels must describe the same rows in the same order.
# training_data was built from groundtruth, not from the sampled
# new_groundtruth, so fitting it against new_groundtruth['genre1'] paired
# features with the wrong labels (or failed on differing lengths). Build the
# features from the sampled rows instead.
sampled_training_data = new_groundtruth['recordingmbid'].apply(getOnlyUsedFeatures)
clf.fit(sampled_training_data, new_groundtruth['genre1'])

### Prepare test set

In [None]:
# Provisional test set: the last 1000 feature rows and the last 1000 sampled labels.
# NOTE(review): training_data is derived from groundtruth while the labels come
# from new_groundtruth, so these rows may not correspond - confirm.
last_thousand = slice(-1000, None)
test_features = training_data[last_thousand]
test_label = new_groundtruth['genre1'][last_thousand]

test_result = []

print(type(test_features))
print(type(test_label))


In [None]:
# Draw the 100 test recordings only from rows that are not part of the sampled
# training subset; sampling from all of groundtruth could return recordings
# that were used for training. errors='ignore' tolerates sampled index labels
# that are no longer present in groundtruth.
held_out_groundtruth = groundtruth.drop(new_groundtruth.index, errors='ignore')
# Fixed seed so the evaluation sample is reproducible.
testing_set = held_out_groundtruth.sample(n=100, random_state=50)
test_label = testing_set['genre1']
test_features = testing_set['recordingmbid'].apply(getOnlyUsedFeatures)

In [None]:
# Preview the test features. The previous version printed `testing_data`,
# a name that is never defined in this notebook.
print(test_features.head())

In [None]:
# Renumber the rows from 0 and keep the previous row labels as a column.
# Assuming test_label is still the 'genre1' Series from the sampling cell, the
# result is a DataFrame. The cell rebinds its own input, so running it twice
# resets the index a second time.
test_label = test_label.reset_index()

In [None]:
# Predict the test features and keep the predictions as a DataFrame.
predict_value = pd.DataFrame(clf.predict(test_features))

In [None]:
# Print the current container type of the labels, then make sure they are held
# in a DataFrame.
label_type = type(test_label)
print(label_type)
test_label = pd.DataFrame(test_label)


In [None]:
# Start from an empty frame and display it.
# NOTE(review): a later cell rebinds test_result to test_label, so this empty
# frame appears to be unused - confirm before removing.
test_result = pd.DataFrame()
test_result

In [None]:
# test_result is another name for the test_label object (no copy is made), so
# columns added to one show up in the other.
test_result = test_label
result_type = type(test_result)
print(result_type)
test_result

In [None]:
# Attach the predictions as a "predict" column and display the table.
# Presumably the rows line up because both frames are numbered from 0 - verify
# that test_label was reset before this cell runs.
test_result["predict"] = predict_value
test_result

In [None]:
# Mark every row with 1 when the predicted genre equals the true genre and 0
# otherwise. The comparison runs on whole columns: the previous loop wrote
# into a "checker" column that had never been created (KeyError), used chained
# indexing for the assignment, and was hard-wired to exactly 100 rows.
test_result["checker"] = (test_result["genre1"] == test_result["predict"]).astype(int)

In [None]:
test_result

In [None]:
test_result["checker"].sum()