In [None]:
# Execute the shared Utilities notebook inside this kernel's namespace.
# NOTE(review): presumably this is what provides `settings`, `load_groundtruth`
# and `getOnlyUsedFeatures`, which are used below but never defined in this
# notebook — confirm against Utilities.ipynb.
%run Utilities.ipynb


In [None]:
# Read the tagtraum training ground truth (a .tsv file) into `groundtruth`.
# `load_groundtruth` and `settings` are defined outside this notebook; the
# cells below rely on the result exposing the 'recordingmbid' and 'genre1'
# columns.
groundtruth = load_groundtruth(settings['path'] + 'groundtruth/acousticbrainz-mediaeval2017-tagtraum-train.tsv')

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('Groundtruth size: %d' % len(groundtruth))
print('Found {} unique genres.'.format(len(groundtruth['genre1'].unique())))


## Data Understanding

In [None]:
from collections import Counter
import pandas as pd  # imported here because this cell runs before the notebook's own pandas import

# Count the recordings of every main genre once; keys() and values() of the
# same Counter are guaranteed to be in matching order.
genre_counts = Counter(groundtruth.genre1)
genre = list(genre_counts.keys())      # distinct genre labels
counter = list(genre_counts.values())  # number of recordings per label, aligned with `genre`

# One table with explicitly named columns (the previous concat produced two
# columns that were both labelled 0, which made column selection ambiguous).
df_new = pd.DataFrame({'genre': genre, 'count': counter}, columns=['genre', 'count'])
df_new

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the number of recordings per main genre. value_counts() sorts
# the counts in descending order, so the chart shows how unevenly the genres
# are represented (the "power distribution" this cell was written to show).
genre_frequencies = groundtruth['genre1'].value_counts()
genre_frequencies.plot(kind='bar')
plt.show()

## Load training data

In [None]:
# Map every recording id through getOnlyUsedFeatures (defined outside this
# notebook) and keep the result as the training data.
recording_ids = groundtruth['recordingmbid']
training_data = recording_ids.apply(getOnlyUsedFeatures)

# Preview the first rows only.
print(training_data.head())

### See distribution of average loudness for each genre

In [None]:
# Distinct main-genre labels. The histogram loops below iterate over this set,
# so its iteration order decides the order in which genres are processed.
myset = set(groundtruth.genre1)

In [None]:
%matplotlib inline 
import numpy as np

hist_loudness = np.arange(10)
j=0

for i in myset:    
    groundtruth_genre = groundtruth[groundtruth['genre1'] == i]
    training_data_genre = groundtruth_genre['recordingmbid'].apply(getOnlyUsedFeatures)
    print i
    loud = plt.hist(training_data_genre[13])
    print np.shape(loud[0])
    hist_loudness = np.vstack((hist_loudness, loud[0]))
    j = j+1
    plt.hist(training_data_genre[13])
    plt.show()
    plt.clf()

### See distribution of spectral energy mean of each genre

In [None]:
%matplotlib inline 
import numpy as np

hist_energy = np.arange(10)
j=0

for i in myset:    
    groundtruth_genre = groundtruth[groundtruth['genre1'] == i]
    training_data_genre = groundtruth_genre['recordingmbid'].apply(getOnlyUsedFeatures)
    print i
    energy = plt.hist(training_data_genre[14])
    print np.shape(loud[0])
    hist_energy = np.vstack((hist_energy, energy[0]))
    j = j+1
    plt.hist(training_data_genre[14])
    plt.show()
    plt.clf()

## Normalize Values in Array

In [None]:
# Number of rows in the histogram matrix.
genre_count = np.shape(hist_loudness)[0]

# Scale every row so that its largest bin becomes 1. Working on a float copy
# avoids integer truncation, and the result is bound to a fresh array instead
# of being patched element by element.
hist_loudness = np.asarray(hist_loudness, dtype=np.float64)
row_peaks = hist_loudness.max(axis=1, keepdims=True)
# A row made only of zeros has peak 0; divide it by 1 so it stays all-zero
# rather than turning into NaN.
row_peaks = np.where(row_peaks == 0, 1.0, row_peaks)
hist_loudness = hist_loudness / row_peaks
        

In [None]:
# Display the histogram matrix after the normalization cell above.
hist_loudness

## Calculate Similarity

In [None]:
def histograms_similarity(histogram0, histogram1):
  """Return the histogram intersection of two histograms.

  The intersection is the sum, over all bins, of the smaller of the two bin
  values. For histograms scaled to a common range, larger values mean more
  overlap.

  Parameters:
    histogram0, histogram1: array-likes of identical shape holding bin values.

  Returns:
    The sum of the element-wise minima (0 for empty input).

  Raises:
    ValueError: if the two histograms do not have the same shape.
  """
  histogram0 = np.asarray(histogram0)
  histogram1 = np.asarray(histogram1)
  if histogram0.shape != histogram1.shape:
    raise ValueError('histograms must have the same shape, got %s and %s'
                     % (histogram0.shape, histogram1.shape))
  # Compare at the inputs' own precision: the earlier float32 difference could
  # round two nearly equal bins to "equal" and then add the larger one.
  return np.minimum(histogram0, histogram1).sum()

In [None]:
# Example: similarity between two rows of the histogram matrix. The label is
# built from the row indices actually compared (it used to read "1 vs 2" while
# rows 0 and 5 were passed in).
row_a, row_b = 0, 5
print('%d vs %d: %.6f' % (
    row_a,
    row_b,
    histograms_similarity(hist_loudness[row_a, :], hist_loudness[row_b, :])))

In [None]:
def compute_self_similarity(feature_vector_matrix, similarity_function = histograms_similarity):
  """Build the square matrix of pairwise row similarities.

  Entry [r, c] of the result is similarity_function applied to row r and
  row c of feature_vector_matrix. Every ordered pair is evaluated, including
  the diagonal; no symmetry is assumed.

  Parameters:
    feature_vector_matrix: 2-D array, one feature vector per row.
    similarity_function: callable taking two rows and returning a scalar.

  Returns:
    A float array of shape (rows, rows).
  """
  row_count = np.shape(feature_vector_matrix)[0]
  sim_matrix = np.zeros((row_count, row_count))
  # np.ndindex walks (0, 0), (0, 1), ... in row-major order.
  for row, col in np.ndindex(row_count, row_count):
    sim_matrix[row, col] = similarity_function(feature_vector_matrix[row,:], feature_vector_matrix[col,:])
  return sim_matrix

## Create Similarity Matrix

In [None]:
# Similarity of every row of hist_loudness with every other row.
histograms_self_similarity = compute_self_similarity(hist_loudness, histograms_similarity)
similarity_shape = np.shape(histograms_self_similarity)
print('HS histograms self-similarity matrix size: %d x %d' % similarity_shape)

In [None]:
# Largest entry of the self-similarity matrix.
np.amax(histograms_self_similarity)

In [None]:
%matplotlib notebook

plt.figure()
plt.imshow(histograms_self_similarity)

## Create Dissimilarity Matrix

In [None]:
def histograms_dissimilarity(histogram0, histogram1):
  """Return the sum of absolute bin-wise differences of two histograms.

  Both inputs are converted to float32 before subtracting, and the absolute
  differences are added up one bin after another, starting from 0.

  Parameters:
    histogram0, histogram1: 1-D numpy arrays holding bin values.

  Returns:
    The accumulated absolute difference (0 for empty input).
  """
  first = histogram0.astype(np.float32)
  second = histogram1.astype(np.float32)
  hs_dsim = abs(first - second)
  # The built-in sum starts at 0 and adds the bins left to right.
  return sum(hs_dsim[k] for k in range(hs_dsim.size))

In [None]:
# Quick check: summed absolute bin difference between rows 0 and 1.
histograms_dissimilarity(hist_loudness[0, :], hist_loudness[1, :])

In [None]:
# Example: dissimilarity between two rows of the histogram matrix. The label is
# built from the row indices actually compared (it used to read "1 vs 2" while
# rows 0 and 5 were passed in).
row_a, row_b = 0, 5
print('%d vs %d: %.2f' % (
    row_a,
    row_b,
    histograms_dissimilarity(hist_loudness[row_a, :], hist_loudness[row_b, :])))

In [None]:
def compute_self_dissimilarity(feature_vector_matrix, dissimilarity_function = histograms_dissimilarity):
  """Build the square matrix of pairwise row dissimilarities.

  Entry [a, b] of the result is dissimilarity_function applied to row a and
  row b of feature_vector_matrix. Every ordered pair is evaluated, including
  the diagonal; no symmetry is assumed.

  Parameters:
    feature_vector_matrix: 2-D array, one feature vector per row.
    dissimilarity_function: callable taking two rows and returning a scalar.

  Returns:
    A float array of shape (rows, rows).
  """
  row_total = np.shape(feature_vector_matrix)[0]
  dsim_matrix = np.zeros((row_total, row_total))
  row_ids = range(row_total)
  for a in row_ids:
    # The left-hand row is the same for the whole inner loop.
    left_row = feature_vector_matrix[a,:]
    for b in row_ids:
      dsim_matrix[a, b] = dissimilarity_function(left_row, feature_vector_matrix[b,:])
  return dsim_matrix

In [None]:
# Dissimilarity of every row of hist_loudness with every other row.
histograms_self_dissimilarity = compute_self_dissimilarity(hist_loudness, histograms_dissimilarity)
dissimilarity_shape = np.shape(histograms_self_dissimilarity)
print('HS histograms self-dissimilarity matrix size: %d x %d' % dissimilarity_shape)

In [None]:
%matplotlib notebook

plt.figure()
plt.imshow(histograms_self_dissimilarity)

In [None]:
# Display the set of genre labels that the histogram loops above iterate over.
myset