### Step 1: Import necessary libraries

In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import json

2024-05-25 04:26:18.942313: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 04:26:18.979382: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Step 2: Download model and define paths

Download the model and move to 'model' directory. (skip this step if model is already downloaded)

In [3]:
!curl -SLO https://essentia.upf.edu/models/classifiers/genre_tzanetakis/genre_tzanetakis-musicnn-msd-1.json
!curl -SLO https://essentia.upf.edu/models/classifiers/genre_tzanetakis/genre_tzanetakis-musicnn-msd-1.pb

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3101  100  3101    0     0   2416      0  0:00:01  0:00:01 --:--:--  2416
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0^C


In [2]:
MUSICNN_SR = 16000
MODEL_PATH = 'model/genre_tzanetakis-musicnn-msd-1.pb'
DATASET_DIR = 'dataset/vocals'

In [3]:
MODEL_NAME = 'model/genre_tzanetakis-musicnn-msd-1'
MODEL_JSON = f'{MODEL_NAME}.json'
MODEL_PB = f'{MODEL_NAME}.pb'

musicnn_metadata = json.load(open(MODEL_JSON, 'r'))
for k, v in musicnn_metadata.items():
    print('{}: {}'.format(k , v))

name: genre GTZAN
type: multi-class classifier
link: https://essentia.upf.edu/models/classifiers/genre_tzanetakis/genre_tzanetakis-musicnn-msd-1.pb
version: 1
description: classification of music by genre
author: Pablo Alonso
email: pablo.alonso@upf.edu
release_date: 2020-03-31
framework: tensorflow
framework_version: 1.15.0
classes: ['blu', 'cla', 'cou', 'dis', 'hip', 'jaz', 'met', 'pop', 'reg', 'roc']
model_types: ['frozen_model']
dataset: {'name': 'the GTZAN Genre Collection', 'citation': '@article{tzanetakis2002musical,\n  title={Musical genre classification of audio signals},\n  author={Tzanetakis, George and Cook, Perry},\n  journal={IEEE Transactions on speech and audio processing},\n  volume={10},\n  number={5},\n  pages={293--302},\n  year={2002},\n  publisher={IEEE}\n}', 'size': '1000 track excerpts, 100 per genre', 'metrics': {'5-fold_cross_validation_normalized_accuracy': 0.83}}
schema: {'inputs': [{'name': 'model/Placeholder', 'type': 'float', 'shape': [187, 96]}], 'output

We can observe the output of the penultimate dense layer is proposed as embeddings. 
We will use it to extract songs embeddings from our dataset.

### Step 3: Define genres for each song

In [4]:
genres = [
    ['Romantic', 'Melody'], # Allantha Doorala 0
    ['Romantic', 'Melody'], # Pranavalaya 1
    ['Romantic', 'Sad'], # Adiga Adiga 2
    ['Romantic', 'Upbeat'], # Guruvaram 3
    ['Romantic', 'Folk'],   # Ghal Ghal 4
    ['Romantic', 'Melody'], # Ninnila 5
    ['Mass', 'Inspirational'], # Hukum 6
    ['Melody'], # Chinnari Thalli 7
    ['Romantic', 'Melody'], # O Cheliya 8
    ['Folk', 'Dance'],      # Naatu Naatu 9 
    ['Folk', 'Dance'],      # Kurchi Madathapetti 10
    ['Romantic', 'Upbeat'], # Urike Urike 11
    ['Inspirational', 'Mass'], # Saahore Baahubali 12
    ['Romantic', 'Melody'], # Nenani Neevani 13
    ['Romantic', 'Melody'], # Manasavacha 14
    ['Romantic', 'Melody'],       # Ammaadi 15
    ['Romantic', 'Melody'], # Dheera Dheera 16
    ['Melody'], # Pedave Palikina 17 
    ['Romantic', 'Sad'],   # Oosupodu 18
    ['Romantic', 'Melody'] # Sirivennela 19
]

### Step 4: Encode genres

In [5]:
mlb = MultiLabelBinarizer()
genre_labels = mlb.fit_transform(genres)

### Step 5: Functions for loading and processing audio

In [6]:
def load_model(pb_file_path):
    with tf.io.gfile.GFile(pb_file_path, "rb") as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.compat.v1.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="")
    return graph

def preprocess_audio(file_path, sample_rate=MUSICNN_SR):
    y, sr = librosa.load(file_path, sr=sample_rate)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=96, n_fft=2048, hop_length=512)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    if log_mel_spec.shape[1] > 187:
        log_mel_spec = log_mel_spec[:, :187]
    else:
        pad_width = 187 - log_mel_spec.shape[1]
        log_mel_spec = np.pad(log_mel_spec, ((0, 0), (0, pad_width)), mode='constant')
    return log_mel_spec.T[np.newaxis, :, :]

def run_model(graph, input_data):
    input_tensor = graph.get_tensor_by_name('model/Placeholder:0')
    output_tensor = graph.get_tensor_by_name('model/dense/BiasAdd:0')
    with tf.compat.v1.Session(graph=graph) as sess:
        embeddings = sess.run(output_tensor, feed_dict={input_tensor: input_data})
    return embeddings

def extract_mean_embedding(filename):
    audio_data = preprocess_audio(filename)
    graph = load_model(MODEL_PATH)
    embeddings = run_model(graph, audio_data)
    return embeddings

def process_dataset(dataset_dir):
    embeddings = []
    for root, dirs, files in os.walk(dataset_dir):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                mean_embedding = extract_mean_embedding(file_path)
                embeddings.append((file_path, mean_embedding))
    return embeddings

### Step 6: Process dataset and extract embeddings

In [7]:
file_embeddings = process_dataset(DATASET_DIR)
embeddings = []
for file_path, embedding in file_embeddings:
    print(f'File path: {file_path}')
    embeddings.append(embedding)

embeddings = np.array(embeddings)
embeddings_matrix = np.vstack([emb for emb in embeddings])  # to convert list of tuples to a numpy matrix
similarities = cosine_similarity(embeddings_matrix)

2024-05-25 04:26:30.595203: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-25 04:26:30.652487: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-05-25 04:26:30.658316: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled


File path: dataset/vocals/Nattu Nattu_vocals.wav
File path: dataset/vocals/Guruvaram_vocals.wav
File path: dataset/vocals/Oosupodu_vocals.wav
File path: dataset/vocals/Nenani Neevani_vocals.wav
File path: dataset/vocals/Saahore Baahubali_vocals.wav
File path: dataset/vocals/Adiga Adiga_vocals.wav
File path: dataset/vocals/Ammaadi_vocals.wav
File path: dataset/vocals/O Cheliya_vocals.wav
File path: dataset/vocals/Manasavacha_vocals.wav
File path: dataset/vocals/Pedave Palikina_vocals.wav
File path: dataset/vocals/Hukum_vocals.wav
File path: dataset/vocals/Dheera Dheera Dheera_vocals.wav
File path: dataset/vocals/Ninnila_vocals.wav
File path: dataset/vocals/Ghal Ghal Ghal Ghal_vocals.wav
File path: dataset/vocals/Kurchi Madathapetti_vocals.wav
File path: dataset/vocals/Urike Urike_vocals.wav
File path: dataset/vocals/Allantha Doorala_vocals.wav
File path: dataset/vocals/Chinnari Thalli_vocals.wav
File path: dataset/vocals/Pranavalaya_vocals.wav
File path: dataset/vocals/Sirivennela_vocal

In [8]:
print(similarities.shape)
print(f'Min value: {similarities.min()}')
print(f'Max value: {similarities.max()}')

fig = px.imshow(similarities, color_continuous_scale='RdYlGn')
fig.show()

(20, 20)
Min value: 0.9490228891372681
Max value: 1.000000238418579


### Step 7: Compute sorted indexes and similar songs matrix

In [9]:
sorted_indexes = np.argsort(similarities, axis=1)
sorted_indexes = np.fliplr(sorted_indexes)

def match_genres(genres_i, genres_j):
    return any(genre in genres_j for genre in genres_i)

similar_songs = np.zeros_like(sorted_indexes)

for i in range(sorted_indexes.shape[0]):
    genre_i = genres[i] 
    sorted_indexes_i = sorted_indexes[i, :]  
    similar_songs_i = np.array([1 if match_genres(genre_i, genres[j]) else 0 for j in sorted_indexes_i])
    similar_songs[i, :] = similar_songs_i


### Step 8: Evaluation metrics: AP@N and MAP@N

In [10]:
def precision_k(similar_items, k):
    similar_items_k = similar_items[:k]
    P = sum(similar_items_k) / k
    return P

def average_precision_N(similar_items, N):
    m = np.min([np.sum(similar_items), N])
    sum_vector = []
    for k in range(1, N+1):
        if k-1 < len(similar_items):
            sum_vector.append(precision_k(similar_items, k) * similar_items[k-1])
    AP = (1/m) * sum(sum_vector)
    return AP


### Step 9: Evaluate music similarity

In [11]:
N_range = [3, 5, 10, 13, 15]
AP_songs = []

for N in N_range:
    AP_by_N = []
    for i in range(similar_songs.shape[0]):
        AP_by_N.append(average_precision_N(similar_songs[i, :], N))
    AP_songs.append(AP_by_N)

genre_strings = [','.join(genre) for genre in genres]

df = pd.DataFrame(
    list(zip(
        [file_path for file_path in range(len(file_path))], 
        genre_strings,
        *AP_songs
    )),
    columns=['song_name', 'genre'] + [f'@{N}' for N in N_range]
)

df.head(20)

Unnamed: 0,song_name,genre,@3,@5,@10,@13,@15
0,0,"Romantic,Melody",0.555556,0.483333,0.61254,0.598224,0.624175
1,1,"Romantic,Melody",0.555556,0.483333,0.61254,0.593293,0.619902
2,2,"Romantic,Sad",0.666667,0.71,0.594048,0.512904,0.569805
3,3,"Romantic,Upbeat",1.0,0.6,0.55631,0.483874,0.542849
4,4,"Romantic,Folk",1.0,1.0,0.732778,0.755804,0.708363
5,5,"Romantic,Melody",1.0,1.0,0.8,0.742424,0.749149
6,6,"Mass,Inspirational",0.5,0.5,0.5,0.5,0.5
7,7,Melody,0.333333,0.42,0.399167,0.476674,0.595722
8,8,"Romantic,Melody",1.0,1.0,0.8,0.742424,0.749149
9,9,"Folk,Dance",0.333333,0.333333,0.333333,0.333333,0.333333


### Step 10: Plotting AP@N histograms

In [12]:
fig = make_subplots(rows=2, cols=3, subplot_titles=("AP@2", "AP@3", "AP@5", "AP@10", "AP@15"))

fig.add_trace(go.Histogram(x=AP_songs[0]), row=1, col=1)
fig.add_trace(go.Histogram(x=AP_songs[1]), row=1, col=2)
fig.add_trace(go.Histogram(x=AP_songs[2]), row=1, col=3)
fig.add_trace(go.Histogram(x=AP_songs[3]), row=2, col=1)
fig.add_trace(go.Histogram(x=AP_songs[4]), row=2, col=2)

fig.update_layout(height=500, 
                  width=1000, 
                  showlegend=False,
                  title_text="AP@N histograms")

fig.show()

### Step 11: Compute mAP@N for each genre and for the whole dataset

In [13]:
numeric_columns = [f'@{N}' for N in N_range]

mAP_by_genre = df.groupby('genre')[numeric_columns].mean()
mAP_dataset = df[numeric_columns].mean().to_frame().rename(columns={0: 'mAP'})

print('Mean average precision for each genre: ')
print(mAP_by_genre)
print('Mean average precision for entire dataset: ')
print(mAP_dataset)

fig = go.Figure()
for genre in df['genre'].unique():
    fig.add_trace(go.Scatter(y=mAP_by_genre.loc[genre], x=N_range, mode='lines+markers', name=genre))
fig.add_trace(go.Scatter(y=mAP_dataset['mAP'], x=N_range, mode='lines+markers', name='mean', line=dict(width=4, dash='dash')))

fig.update_layout(title='MAP@N by genre', xaxis_title='N', yaxis_title='MAP@N')
fig.show()

Mean average precision for each genre: 
                          @3        @5       @10       @13       @15
genre                                                               
Folk,Dance          0.333333  0.400000  0.400000  0.400000  0.400000
Inspirational,Mass  0.500000  0.500000  0.500000  0.576923  0.576923
Mass,Inspirational  0.500000  0.500000  0.500000  0.500000  0.500000
Melody              0.500000  0.485000  0.384306  0.457575  0.543073
Romantic,Folk       1.000000  1.000000  0.732778  0.755804  0.708363
Romantic,Melody     0.765432  0.699259  0.678959  0.684929  0.697591
Romantic,Sad        0.500000  0.565000  0.530179  0.463774  0.524184
Romantic,Upbeat     0.666667  0.510000  0.524702  0.488407  0.551991
Mean average precision for entire dataset: 
          mAP
@3   0.644444
@5   0.610667
@10  0.576089
@13  0.580830
@15  0.605105
