# Task 2 : Extend your framework with audio-based retrieval systems and with evaluation metrics

### Team E

**Don't forget to update the version number after making changes** 

Version 2.0.2
Date: 11/12/2023

## Import Libraries  

In [40]:
import numpy as np
import pandas as pd
import json 

In [41]:
# import utility functions 
from ret import read_data # utility func to load data
from ret import get_id_from_info # utility func to return id by entering song's info
from ret import display_res # utility func to display results 
from ret import get_genre #utility func to get the genres from a list of id´s
from ret import get_genre_from_query #utility func to get the id and genre from the query
from ret import get_genre_from_ids #utility func to get the id and genre from the retrieved results

# import the retrieval systems 
from ret import random_baseline # baseline retrieval system that returns random results 
from ret import text_based # modularized text based retrieval system
from ret import audio_based # modularized audio based retrieval system

# import wrapper function to calculate cosine similarity
from ret import cos_sim 

# import evaluation functions 
from ret import gen_cov_10
from ret import ndcg_score
from ret import gen_div_10
from ret import calculate_precision_at_k
from ret import calculate_recall_at_k_vectorized
from ret import plot_precision_recall_curve
from ret import get_avg_recall_at_k
from ret import get_avg_precision_at_k

In [None]:
#!pip install sklearn

## Load Data

In [42]:
"""
Please put data files in ""./data/" before use 
"""

info = read_data("information")
genres = read_data("genres")

# text embeddings 
tfidf = read_data("lyrics_tf-idf")
word2vec = read_data("lyrics_word2vec")
bert = read_data("lyrics_bert")

# audio embeddings
blf_correlation = read_data("blf_correlation")
ivec256 = read_data("ivec256")
mfcc_stats = read_data("mfcc_stats")
musicnn = read_data("musicnn")

In [43]:
blf_correlation.drop(7298, axis=0, inplace=True)
genres.drop(7298, axis=0, inplace=True)

In [44]:
# Konvertieren Sie den DataFrame in ein NumPy-ndarray
blf_correlation_array = blf_correlation.iloc[:, 2:].to_numpy()
# Konvertieren Sie das NumPy-ndarray in eine NumPy-Matrix
blf_correlation_matrix = np.matrix(blf_correlation_array)


# Transponieren Sie die Matrix mit der transpose() Methode
blf_correlation_transpose = blf_correlation_matrix.transpose()

# Überprüfen Sie die Form der transponierten Matrizen
print(blf_correlation_transpose.shape) # (1328, 10093)
#print(len(blf_correlation), len(blf_correlation_transpose_zip[0])) # (1328, 10093)
# Berechnen Sie die Kosinusähnlichkeitsmatrix
cos_sim_matrix = np.triu(blf_correlation_matrix.dot(blf_correlation_transpose) / (np.linalg.norm(blf_correlation_matrix, axis=1) * np.linalg.norm(blf_correlation_matrix, axis=1)[:, None]))
# Fill the lower triangular part with the same values as the upper triangular part
cos_sim_matrix = cos_sim_matrix + cos_sim_matrix.T - np.diag(cos_sim_matrix.diagonal())

# Create a dataframe from the numpy matrix
df = pd.DataFrame(cos_sim_matrix)

# Assign the ids to the index and columns
df.index = blf_correlation.iloc[:, :1]
#df.index = df.index.str.strip('(),')
df.columns = blf_correlation.iloc[:, :1]
#df.columns = df.columns.str.strip('(),')

cos_sim_matrix

(1325, 10093)


array([[1.        , 0.63489262, 0.6353494 , ..., 0.48916107, 0.53107856,
        0.64144186],
       [0.63489262, 1.        , 0.82856014, ..., 0.69606716, 0.85213451,
        0.821675  ],
       [0.6353494 , 0.82856014, 1.        , ..., 0.69512885, 0.73822328,
        0.80549606],
       ...,
       [0.48916107, 0.69606716, 0.69512885, ..., 1.        , 0.6878554 ,
        0.68902471],
       [0.53107856, 0.85213451, 0.73822328, ..., 0.6878554 , 1.        ,
        0.79572456],
       [0.64144186, 0.821675  , 0.80549606, ..., 0.68902471, 0.79572456,
        1.        ]])

In [45]:
df

Unnamed: 0,"(01Yfj2T3YTwJ1Yfy,)","(01gyRHLquwXDlhkO,)","(01rMxQv6vhyE1oQX,)","(02RGE9FNH65RtMS7,)","(02ZnlCGZEbkfCDxo,)","(04OjszRi9rC5BlHC,)","(04iitW3ffa0mhpx3,)","(04xUDjAYC14jsHyH,)","(06HvNTU9M9lnH71I,)","(06L9OJ5nRqKnO2q9,)",...,"(zxI0u8m0EPvVhNeD,)","(zxYtSeZzEVgPczJz,)","(zxlnGZoud2KCmSaw,)","(zyxOCKcXX1RmWpm0,)","(zyz0UbYN4n9rHXex,)","(zyzILCQvVeUFIINi,)","(zzgS4ZqyswamEWNj,)","(zzoFYDMlqU1X2zz1,)","(zzpkRCGA5ud8q4mv,)","(zzx8CWdM7qkxKQpC,)"
"(01Yfj2T3YTwJ1Yfy,)",1.000000,0.634893,0.635349,0.598153,0.576161,0.568466,0.629246,0.645637,0.587046,0.532837,...,0.621411,0.637629,0.654952,0.638904,0.560101,0.666608,0.611436,0.489161,0.531079,0.641442
"(01gyRHLquwXDlhkO,)",0.634893,1.000000,0.828560,0.851030,0.777809,0.686822,0.877492,0.708623,0.753095,0.847415,...,0.912093,0.823651,0.867349,0.869174,0.902670,0.652415,0.807221,0.696067,0.852135,0.821675
"(01rMxQv6vhyE1oQX,)",0.635349,0.828560,1.000000,0.773646,0.761471,0.722970,0.869319,0.759349,0.774074,0.819857,...,0.808650,0.797611,0.805425,0.815107,0.764606,0.701674,0.761847,0.695129,0.738223,0.805496
"(02RGE9FNH65RtMS7,)",0.598153,0.851030,0.773646,1.000000,0.785102,0.664442,0.832017,0.627340,0.719576,0.804217,...,0.858410,0.819604,0.820042,0.844789,0.847214,0.609150,0.787245,0.708434,0.835343,0.749437
"(02ZnlCGZEbkfCDxo,)",0.576161,0.777809,0.761471,0.785102,1.000000,0.680657,0.820742,0.648261,0.731629,0.761479,...,0.789689,0.790634,0.768481,0.838606,0.745495,0.597407,0.798870,0.711728,0.768490,0.776015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(zyzILCQvVeUFIINi,)",0.666608,0.652415,0.701674,0.609150,0.597407,0.567506,0.654330,0.653536,0.594052,0.551923,...,0.619223,0.636745,0.630093,0.634828,0.570979,1.000000,0.602570,0.503714,0.545268,0.670342
"(zzgS4ZqyswamEWNj,)",0.611436,0.807221,0.761847,0.787245,0.798870,0.669773,0.834442,0.660964,0.755063,0.795230,...,0.827598,0.769887,0.795303,0.855147,0.808174,0.602570,1.000000,0.749721,0.815573,0.818467
"(zzoFYDMlqU1X2zz1,)",0.489161,0.696067,0.695129,0.708434,0.711728,0.670488,0.741595,0.590277,0.728201,0.718227,...,0.703538,0.705924,0.681825,0.741768,0.665708,0.503714,0.749721,1.000000,0.687855,0.689025
"(zzpkRCGA5ud8q4mv,)",0.531079,0.852135,0.738223,0.835343,0.768490,0.612901,0.826779,0.599824,0.700431,0.828984,...,0.903774,0.756766,0.829616,0.846157,0.864387,0.545268,0.815573,0.687855,1.000000,0.795725


In [None]:
get_avg_recall_at_k(blf_correlation, 10, genres)

In [None]:
genres[genres["id"]== "02RGE9FNH65RtMS7"]


In [None]:
# Import numpy
import numpy as np

# Define a sample repr data set with 5 vectors, one of them having a zero norm
repr = blf_correlation
repr = repr.iloc[:, 2:].to_numpy()

# Calculate the norm of each vector along the rows
norms = np.linalg.norm(repr, axis=1)

# Create a boolean mask of the condition that the norm is zero
mask = np.isclose(norms, 0)

# Use numpy.nonzero to find the indices of the vectors that have a zero norm
indices = np.nonzero(mask)

# Use numpy indexing to get the problematic vectors
problematic_vectors = repr[indices]

# Print the indices and the problematic vectors
print("The indices of the problematic vectors are:", indices)
print("The problematic vectors are:", problematic_vectors)


In [None]:
"""
Initialize empty dictionary to store the results

User guide:
the dictionary has three dimensions:
1st dim: query tracks 
available tags ["track1", "track2", "track3"]
2nd dim: retrieval systems 
available tags ["base_line", "tfidf", "word2vec", "bert", "blf_correlation", "ivec256", "mfcc_stats", "musicnn"]
3rd dim: evaluation metrics and retrieved tracks 
available tags ["tracks", "precision@10", "recall@10", "genre_diversity@10", "genre_coverage@10", "ndcg"]

Example:
res["track1"]["base_line"]["tracks"]

"""
with open('result_mod.json', 'r') as f:
    res = json.load(f)

## Track 1 
"Love Me" "The 1975"

In [None]:
name = "Love Me"
artist = "The 1975"
id_track1 = get_id_from_info(name, artist, info)

In [None]:
# retrieve tracks with random baseline 
res["track1"]["base_line"]["tracks"] = random_baseline(id=id_track1, info=info, N=10)

In [None]:
# retrieve tracks with text based retrieval systems 
res["track1"]["tfidf"]["tracks"] = text_based(id=id_track1, repr=tfidf, N=10, sim_func=cos_sim)
res["track1"]["word2vec"]["tracks"] = text_based(id=id_track1, repr=word2vec, N=10, sim_func=cos_sim)
res["track1"]["bert"]["tracks"] = text_based(id=id_track1, repr=bert, N=10, sim_func=cos_sim)

In [None]:
# retrieve tracks with audio based retrieval systems 
res["track1"]["blf_correlation"]["tracks"] = audio_based(id=id_track1, repr=blf_correlation, N=10, sim_func=cos_sim)
print("blf correlation Results:")
display_res(res["track1"]["blf_correlation"]["tracks"], info)

res["track1"]["ivec256"]["tracks"] = audio_based(id=id_track1, repr=ivec256, N=10, sim_func=cos_sim)
print("ivec256 Results:")
display_res(res["track1"]["ivec256"]["tracks"], info)

res["track1"]["mfcc_stats"]["tracks"] = audio_based(id=id_track1, repr=mfcc_stats, N=10, sim_func=cos_sim)
print("MFCC Stats Results:")
display_res(res["track1"]["mfcc_stats"]["tracks"], info)

res["track1"]["musicnn"]["tracks"] = audio_based(id=id_track1, repr=musicnn, N=10, sim_func=cos_sim)
print("Musicnn Results:")
display_res(res["track1"]["musicnn"]["tracks"], info)

In [None]:
#retrieve genres of the results for precision@10 & recall@10 calculation
q1_genres_mfcc_stats = get_genre_from_ids(res["track1"]["mfcc_stats"]["tracks"],genres)
q1_genres_blf_correlation = get_genre_from_ids(res["track1"]["blf_correlation"]["tracks"], genres)
q1_genres_ivec256 = get_genre_from_ids(res["track1"]["ivec256"]["tracks"], genres)
q1_genres_musicnn = get_genre_from_ids(res["track1"]["musicnn"]["tracks"], genres)
q1_genres_tfidf = get_genre_from_ids(res["track1"]["tfidf"]["tracks"], genres)
q1_genres_word2vec = get_genre_from_ids(res["track1"]["word2vec"]["tracks"],genres)
q1_genres_bert =  get_genre_from_ids(res["track1"]["bert"]["tracks"],genres)
q1_genres_base_line = get_genre_from_ids(res["track1"]["base_line"]["tracks"],genres)

In [None]:
# calculate precision @10 for track1

#get query genre
query_genre1 = get_genre_from_query(id_track1, genres)

#query track 1 precision (k=10) for all 8 Retrieval Systems
res["track1"]["mfcc_stats"]["precision@10"] = calculate_precision_at_k(query_genre1,q1_genres_mfcc_stats,10)
precision = res["track1"]["mfcc_stats"]["precision@10"]
print(f"precision@10: {precision}")

res["track1"]["blf_correlation"]["precision@10"] = calculate_precision_at_k(query_genre1,q1_genres_blf_correlation,10)
precision = res["track1"]["blf_correlation"]["precision@10"]
print(f"precision@10: {precision}")

res["track1"]["ivec256"]["precision@10"] = calculate_precision_at_k(query_genre1,q1_genres_ivec256,10)
precision = res["track1"]["ivec256"]["precision@10"] 
print(f"precision@10: {precision}")

res["track1"]["musicnn"]["precision@10"] = calculate_precision_at_k(query_genre1,q1_genres_musicnn,10)
precision = res["track1"]["musicnn"]["precision@10"]
print(f"precision@10: {precision}")

res["track1"]["tfidf"]["precision@10"]= calculate_precision_at_k(query_genre1,q1_genres_tfidf,10)
precision = res["track1"]["tfidf"]["precision@10"]
print(f"precision@10: {precision}")

res["track1"]["word2vec"]["precision@10"] = calculate_precision_at_k(query_genre1,q1_genres_word2vec,10)
precision = res["track1"]["word2vec"]["precision@10"]
print(f"precision@10: {precision}")

res["track1"]["bert"]["precision@10"] = calculate_precision_at_k(query_genre1,q1_genres_bert,10)
precision = res["track1"]["bert"]["precision@10"] 
print(f"precision@10: {precision}")

res["track1"]["base_line"]["precision@10"] = calculate_precision_at_k(query_genre1,q1_genres_base_line,10)
precision = res["track1"]["base_line"]["precision@10"]
print(f"precision@10: {precision}")

In [None]:
genres_list = genres.values.tolist()
res["track1"]["mfcc_stats"]["recall@10"] = calculate_recall_at_k_vectorized(query_genre1, q1_genres_mfcc_stats,genres_list,10)
recall = res["track1"]["mfcc_stats"]["recall@10"]
print(f"recall@10: {recall}")

In [None]:
genres.values.tolist()


In [None]:
#query track 1 recall (k=10) for all 8 Retrieval Systems
#create genres dataset list for later use


res["track1"]["mfcc_stats"]["recall@10"] = calculate_recall_at_k(query_genre1, q1_genres_mfcc_stats,genres_list,10)
recall = res["track1"]["mfcc_stats"]["recall@10"]
print(f"recall@10: {recall}")

res["track1"]["blf_correlation"]["recall@10"] = calculate_recall_at_k(query_genre1, q1_genres_blf_correlation,genres_list,10)
recall = res["track1"]["blf_correlation"]["recall@10"]
print(f"recall@10: {recall}")

res["track1"]["ivec256"]["recall@10"] = calculate_recall_at_k(query_genre1, q1_genres_ivec256,genres_list,10)
recall = res["track1"]["ivec256"]["recall@10"]
print(f"recall@10: {recall}")

res["track1"]["musicnn"]["recall@10"] = calculate_recall_at_k(query_genre1, q1_genres_musicnn,genres_list,10)
recall = res["track1"]["musicnn"]["recall@10"]
print(f"recall@10: {recall}")

res["track1"]["tfidf"]["recall@10"] = calculate_recall_at_k(query_genre1, q1_genres_tfidf,genres_list,10)
recall = res["track1"]["tfidf"]["recall@10"]
print(f"recall@10: {recall}")

res["track1"]["word2vec"]["recall@10"]= calculate_recall_at_k(query_genre1, q1_genres_word2vec,genres_list,10)
recall = res["track1"]["word2vec"]["recall@10"]
print(f"recall@10: {recall}")

res["track1"]["bert"]["recall@10"]= calculate_recall_at_k(query_genre1,  q1_genres_bert,genres_list,10)
recall = res["track1"]["bert"]["recall@10"]
print(f"recall@10: {recall}")

res["track1"]["base_line"]["recall@10"] = calculate_recall_at_k(query_genre1, q1_genres_base_line,genres_list,10)
recall = res["track1"]["base_line"]["recall@10"]
print(f"recall@10: {recall}")

In [None]:
#retrieve 100 results for precision-recall plot (audio based) because k should be varied in the interval [1,100]
q1_100_mfcc_stats = get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=100, sim_func=cos_sim), genres)
q1_100_blf_correlation = get_genre_from_ids(audio_based(id=id_track1, repr=blf_correlation, N=100, sim_func=cos_sim), genres)
q1_100_ivec256 = get_genre_from_ids(audio_based(id=id_track1, repr=ivec256, N=100, sim_func=cos_sim), genres)
q1_100_musicnn = get_genre_from_ids(audio_based(id=id_track1, repr=musicnn, N=100, sim_func=cos_sim), genres)
#retrieve 100 text based results for precision recall plot because k should be varied in the interval [1,100]
q1_100_tfidf = get_genre_from_ids(text_based(id=id_track1, repr=tfidf, N=100, sim_func=cos_sim), genres)
q1_100_word2vec = get_genre_from_ids(text_based(id=id_track1, repr=word2vec, N=100, sim_func=cos_sim),genres)
q1_100_bert =  get_genre_from_ids(text_based(id=id_track1, repr=bert, N=100, sim_func=cos_sim),genres)
q1_100_base_line = get_genre_from_ids(random_baseline(id=id_track1, info=info, N=100),genres)


#plot precision recall curve for query track1 for all evaluated systems

system_data1 = {
    "audio mfcc stats": {"system_name": "Audio MFCC Stats","query_genre": query_genre1, "retrieved_genres": q1_100_mfcc_stats, "dataset_genres": genres_list},
    "audio blf correlation": {"system_name": "Audio BLF Correlation","query_genre": query_genre1, "retrieved_genres": q1_100_blf_correlation, "dataset_genres": genres_list},
    "audio ivec_256": {"system_name": "Audio iVec 256","query_genre": query_genre1, "retrieved_genres": q1_100_ivec256, "dataset_genres":genres_list},
    "audio muiscnn": {"system_name": "Audio Musicnn","query_genre": query_genre1, "retrieved_genres": q1_100_musicnn, "dataset_genres":genres_list},
    "text tf-idf": {"system_name": "Text TF-IDF","query_genre": query_genre1, "retrieved_genres": q1_100_tfidf, "dataset_genres":genres_list},
    "text word2vec": {"system_name": "Text Word2Vec","query_genre": query_genre1, "retrieved_genres": q1_100_word2vec, "dataset_genres":genres_list},
    "text bert": {"system_name": "Text BERT","query_genre": query_genre1, "retrieved_genres": q1_genres_bert, "dataset_genres":genres_list},
    "text random": {"system_name": "Text Random Baseline","query_genre": query_genre1, "retrieved_genres": q1_100_base_line, "dataset_genres":genres_list},
}

plot_precision_recall_curve(system_data1)

In [None]:
# calculate genre diversity@10
all_genres = list(set([item for id in genres['id'] for item in get_genre(id, genres)]))

#audio based
q1_genre_mfcc_stats = [get_genre(id, genres) for id in res["track1"]["mfcc_stats"]["tracks"]]
q1_genre_blf_correlation = [get_genre(id, genres) for id in res["track1"]["blf_correlation"]["tracks"]]
q1_genre_ivec256 = [get_genre(id, genres) for id in res["track1"]["ivec256"]["tracks"]]
q1_genre_musicnn = [get_genre(id, genres) for id in res["track1"]["musicnn"]["tracks"]]
#text based
q1_genre_tfidf = [get_genre(id, genres) for id in res["track1"]["tfidf"]["tracks"]]
q1_genre_word2vec = [get_genre(id, genres) for id in res["track1"]["word2vec"]["tracks"]]
q1_genre_bert = [get_genre(id, genres) for id in res["track1"]["bert"]["tracks"]]
q1_genre_base_line = [get_genre(id, genres) for id in res["track1"]["base_line"]["tracks"]]

#calculate genre diversity @10
res["track1"]["mfcc_stats"]["genre_diversity@10"] = gen_div_10(q1_genre_mfcc_stats, all_genres, N=10)
genre_div = res["track1"]["mfcc_stats"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track1"]["blf_correlation"]["genre_diversity@10"] = gen_div_10(q1_genre_blf_correlation,all_genres,N=10)
genre_div = res["track1"]["blf_correlation"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track1"]["ivec256"]["genre_diversity@10"] = gen_div_10(q1_genre_ivec256,all_genres,N=10)
genre_div = res["track1"]["ivec256"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track1"]["musicnn"]["genre_diversity@10"] = gen_div_10(q1_genre_musicnn,all_genres,N=10)
genre_div = res["track1"]["musicnn"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track1"]["tfidf"]["genre_diversity@10"] = gen_div_10(q1_genre_tfidf,all_genres,N=10)
genre_div = res["track1"]["tfidf"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track1"]["word2vec"]["genre_diversity@10"] = gen_div_10(q1_genre_word2vec,all_genres,N=10)
genre_div = res["track1"]["word2vec"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track1"]["bert"]["genre_diversity@10"] = gen_div_10(q1_genre_bert,all_genres,N=10)
genre_div = res["track1"]["bert"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track1"]["base_line"]["genre_diversity@10"] = gen_div_10(q1_genre_base_line,all_genres,N=10)
genre_div = res["track1"]["base_line"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")


In [None]:
print(query_genre1)

In [None]:
# calculate genre coverage@10 

res["track1"]["mfcc_stats"]["genre_coverage@10"] = gen_cov_10(res["track1"]["mfcc_stats"]["tracks"], genres)
genre_cov = res["track1"]["mfcc_stats"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track1"]["blf_correlation"]["genre_coverage@10"] = gen_cov_10(res["track1"]["blf_correlation"]["tracks"], genres)
genre_cov = res["track1"]["blf_correlation"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track1"]["ivec256"]["genre_coverage@10"] = gen_cov_10(res["track1"]["ivec256"]["tracks"], genres)
genre_cov = res["track1"]["ivec256"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track1"]["musicnn"]["genre_coverage@10"] = gen_cov_10(res["track1"]["musicnn"]["tracks"], genres)
genre_cov = res["track1"]["musicnn"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track1"]["tfidf"]["genre_coverage@10"] = gen_cov_10(res["track1"]["tfidf"]["tracks"], genres)
genre_cov = res["track1"]["tfidf"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track1"]["word2vec"]["genre_coverage@10"] = gen_cov_10(res["track1"]["word2vec"]["tracks"], genres)
genre_cov = res["track1"]["word2vec"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track1"]["bert"]["genre_coverage@10"] = gen_cov_10(res["track1"]["bert"]["tracks"], genres)
genre_cov = res["track1"]["bert"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track1"]["base_line"]["genre_coverage@10"] = gen_cov_10(res["track1"]["base_line"]["tracks"], genres)
genre_cov = res["track1"]["base_line"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

In [None]:
# calculate ndcg@10 

res["track1"]["mfcc_stats"]["ndcg"] = ndcg_score(id_track1, res["track1"]["mfcc_stats"]["tracks"], genres)
ndcg = res["track1"]["mfcc_stats"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track1"]["blf_correlation"]["ndcg"] = ndcg_score(id_track1, res["track1"]["blf_correlation"]["tracks"], genres)
ndcg = res["track1"]["blf_correlation"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track1"]["ivec256"]["ndcg"] = ndcg_score(id_track1, res["track1"]["ivec256"]["tracks"], genres)
ndcg = res["track1"]["ivec256"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track1"]["musicnn"]["ndcg"] = ndcg_score(id_track1, res["track1"]["musicnn"]["tracks"], genres)
ndcg = res["track1"]["musicnn"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track1"]["tfidf"]["ndcg"] = ndcg_score(id_track1, res["track1"]["tfidf"]["tracks"], genres)
ndcg = res["track1"]["tfidf"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track1"]["word2vec"]["ndcg"] = ndcg_score(id_track1, res["track1"]["word2vec"]["tracks"], genres)
ndcg = res["track1"]["word2vec"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track1"]["bert"]["ndcg"] = ndcg_score(id_track1, res["track1"]["bert"]["tracks"], genres)
ndcg = res["track1"]["bert"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track1"]["base_line"]["ndcg"] = ndcg_score(id_track1, res["track1"]["base_line"]["tracks"], genres)
ndcg = res["track1"]["base_line"]["ndcg"]
print(f"ndcg: {ndcg}")

## Track 2 
"One" "U2"

In [None]:
name = 'One'
artist = 'U2'
id_track2 = get_id_from_info(name, artist, info)

In [None]:
# retrieve tracks with random baseline 
res["track2"]["base_line"]["tracks"] = random_baseline(id=id_track2, info=info, N=10)

In [None]:
# retrieve tracks with text based retrieval systems 
res["track2"]["tfidf"]["tracks"] = text_based(id=id_track2, repr=tfidf, N=10, sim_func=cos_sim)
res["track2"]["word2vec"]["tracks"] = text_based(id=id_track2, repr=word2vec, N=10, sim_func=cos_sim)
res["track2"]["bert"]["tracks"] = text_based(id=id_track2, repr=bert, N=10, sim_func=cos_sim)

In [None]:
# retrieve tracks with audio based retrieval systems 
res["track2"]["blf_correlation"]["tracks"] = audio_based(id=id_track2, repr=blf_correlation, N=10, sim_func=cos_sim)
print("blf correlation Results:")
display_res(res["track2"]["blf_correlation"]["tracks"], info)

res["track2"]["ivec256"]["tracks"] = audio_based(id=id_track2, repr=ivec256, N=10, sim_func=cos_sim)
print("ivec 256 Results:")
display_res(res["track2"]["ivec256"]["tracks"], info)

res["track2"]["mfcc_stats"]["tracks"] = audio_based(id=id_track2, repr=mfcc_stats, N=10, sim_func=cos_sim)
print("mfcc_stats Results:")
display_res(res["track2"]["mfcc_stats"]["tracks"], info)

res["track2"]["musicnn"]["tracks"] = audio_based(id=id_track2, repr=musicnn, N=10, sim_func=cos_sim)
print("musicnn Results:")
display_res(res["track2"]["musicnn"]["tracks"], info)

In [None]:
#retrieve genres of the results for precision@10 & recall@10 calculation
q2_genres_mfcc_stats = get_genre_from_ids(res["track2"]["mfcc_stats"]["tracks"],genres)
q2_genres_blf_correlation = get_genre_from_ids(res["track2"]["blf_correlation"]["tracks"], genres)
q2_genres_ivec256 = get_genre_from_ids(res["track2"]["ivec256"]["tracks"], genres)
q2_genres_musicnn = get_genre_from_ids(res["track2"]["musicnn"]["tracks"], genres)
q2_genres_tfidf = get_genre_from_ids(res["track2"]["tfidf"]["tracks"], genres)
q2_genres_word2vec = get_genre_from_ids(res["track2"]["word2vec"]["tracks"],genres)
q2_genres_bert =  get_genre_from_ids(res["track2"]["bert"]["tracks"],genres)
q2_genres_base_line = get_genre_from_ids(res["track2"]["base_line"]["tracks"],genres)

In [None]:
# calculate precision @10 for track2
#get query genre
query_genre2 = get_genre_from_query(id_track2, genres)

#query track 1 precision (k=10) for all 8 Retrieval Systems
res["track2"]["mfcc_stats"]["precision@10"] = calculate_precision_at_k(query_genre2,q2_genres_mfcc_stats,10)
precision = res["track2"]["mfcc_stats"]["precision@10"]
print(f"precision@10: {precision}")

res["track2"]["blf_correlation"]["precision@10"] = calculate_precision_at_k(query_genre2,q2_genres_blf_correlation,10)
precision = res["track2"]["blf_correlation"]["precision@10"]
print(f"precision@10: {precision}")

res["track2"]["ivec256"]["precision@10"] = calculate_precision_at_k(query_genre2,q2_genres_ivec256,10)
precision = res["track2"]["ivec256"]["precision@10"] 
print(f"precision@10: {precision}")

res["track2"]["musicnn"]["precision@10"] = calculate_precision_at_k(query_genre2,q2_genres_musicnn,10)
precision = res["track2"]["musicnn"]["precision@10"]
print(f"precision@10: {precision}")

res["track2"]["tfidf"]["precision@10"]= calculate_precision_at_k(query_genre2,q2_genres_tfidf,10)
precision = res["track2"]["tfidf"]["precision@10"]
print(f"precision@10: {precision}")

res["track2"]["word2vec"]["precision@10"] = calculate_precision_at_k(query_genre2,q2_genres_word2vec,10)
precision = res["track2"]["word2vec"]["precision@10"]
print(f"precision@10: {precision}")

res["track2"]["bert"]["precision@10"] = calculate_precision_at_k(query_genre2,q2_genres_bert,10)
precision = res["track2"]["bert"]["precision@10"] 
print(f"precision@10: {precision}")

res["track2"]["base_line"]["precision@10"] = calculate_precision_at_k(query_genre2,q2_genres_base_line,10)
precision = res["track2"]["base_line"]["precision@10"]
print(f"precision@10: {precision}")


In [None]:
#query track 2 recall (k=10) for all 8 Retrieval Systems
#create genres dataset list for later use
genres_list = genres.values.tolist()

res["track2"]["mfcc_stats"]["recall@10"] = calculate_recall_at_k(query_genre2,q2_genres_mfcc_stats,genres_list,10)
recall = res["track2"]["mfcc_stats"]["recall@10"]
print(f"recall@10: {recall}")

res["track2"]["blf_correlation"]["recall@10"] = calculate_recall_at_k(query_genre2,q2_genres_blf_correlation,genres_list,10)
recall = res["track2"]["blf_correlation"]["recall@10"]
print(f"recall@10: {recall}")

res["track2"]["ivec256"]["recall@10"] = calculate_recall_at_k(query_genre2,q2_genres_ivec256,genres_list,10)
recall = res["track2"]["ivec256"]["recall@10"]
print(f"recall@10: {recall}")

res["track2"]["musicnn"]["recall@10"] = calculate_recall_at_k(query_genre2, q2_genres_musicnn,genres_list,10)
recall = res["track2"]["musicnn"]["recall@10"]
print(f"recall@10: {recall}")

res["track2"]["tfidf"]["recall@10"] = calculate_recall_at_k(query_genre2, q2_genres_tfidf,genres_list,10)
recall = res["track2"]["tfidf"]["recall@10"]
print(f"recall@10: {recall}")

res["track2"]["word2vec"]["recall@10"]= calculate_recall_at_k(query_genre2, q2_genres_word2vec,genres_list,10)
recall = res["track2"]["word2vec"]["recall@10"]
print(f"recall@10: {recall}")

res["track2"]["bert"]["recall@10"]= calculate_recall_at_k(query_genre2,q2_genres_bert,genres_list,10)
recall = res["track2"]["bert"]["recall@10"]
print(f"recall@10: {recall}")

res["track2"]["base_line"]["recall@10"] = calculate_recall_at_k(query_genre2,q2_genres_base_line,genres_list,10)
recall = res["track2"]["base_line"]["recall@10"]
print(f"recall@10: {recall}")

In [None]:
#retrieve 100 results for precision-recall plot (audio based) because k should be varied in the interval [1,100]
q2_100_mfcc_stats = get_genre_from_ids(audio_based(id=id_track2, repr=mfcc_stats, N=100, sim_func=cos_sim), genres)
q2_100_blf_correlation = get_genre_from_ids(audio_based(id=id_track2, repr=blf_correlation, N=100, sim_func=cos_sim), genres)
q2_100_ivec256 = get_genre_from_ids(audio_based(id=id_track2, repr=ivec256, N=100, sim_func=cos_sim), genres)
q2_100_musicnn = get_genre_from_ids(audio_based(id=id_track2, repr=musicnn, N=100, sim_func=cos_sim), genres)
q2_100_tfidf = get_genre_from_ids(text_based(id=id_track2, repr=tfidf, N=100, sim_func=cos_sim), genres)
q2_100_word2vec = get_genre_from_ids(text_based(id=id_track2, repr=word2vec, N=100, sim_func=cos_sim),genres)
q2_100_bert =  get_genre_from_ids(text_based(id=id_track2, repr=bert, N=100, sim_func=cos_sim),genres)
q2_100_base_line = get_genre_from_ids(random_baseline(id=id_track2, info=info, N=100),genres)

#plot precision recall curve for query track2 for all evaluated systems

system_data2 = {
    "audio mfcc stats": {"system_name": "Audio MFCC Stats","query_genre": query_genre2, "retrieved_genres": q2_100_mfcc_stats, "dataset_genres": genres_list},
    "audio blf correlation": {"system_name": "Audio BLF Correlation","query_genre": query_genre2, "retrieved_genres": q2_100_blf_correlation, "dataset_genres": genres_list},
    "audio ivec_256": {"system_name": "Audio iVec 256","query_genre": query_genre2, "retrieved_genres": q2_100_ivec256, "dataset_genres":genres_list},
    "audio muiscnn": {"system_name": "Audio Musicnn","query_genre": query_genre2, "retrieved_genres": q2_100_musicnn, "dataset_genres":genres_list},
    "text tf-idf": {"system_name": "Text TF-IDF","query_genre": query_genre2, "retrieved_genres": q2_100_tfidf, "dataset_genres":genres_list},
    "text word2vec": {"system_name": "Text Word2Vec","query_genre": query_genre2, "retrieved_genres": q2_100_word2vec, "dataset_genres":genres_list},
    "text bert": {"system_name": "Text BERT","query_genre": query_genre2, "retrieved_genres": q2_genres_bert, "dataset_genres":genres_list},
    "text random": {"system_name": "Text Random Baseline","query_genre": query_genre2, "retrieved_genres": q2_100_base_line, "dataset_genres":genres_list},
}

plot_precision_recall_curve(system_data2)

In [None]:
# calculate genre diversity@10
all_genres = list(set([item for id in genres['id'] for item in get_genre(id, genres)]))

#audio based
q2_genre_mfcc_stats = [get_genre(id, genres) for id in res["track2"]["mfcc_stats"]["tracks"]]
q2_genre_blf_correlation = [get_genre(id, genres) for id in res["track2"]["blf_correlation"]["tracks"]]
q2_genre_ivec256 = [get_genre(id, genres) for id in res["track2"]["ivec256"]["tracks"]]
q2_genre_musicnn = [get_genre(id, genres) for id in res["track2"]["musicnn"]["tracks"]]
#text based
q2_genre_tfidf = [get_genre(id, genres) for id in res["track2"]["tfidf"]["tracks"]]
q2_genre_word2vec = [get_genre(id, genres) for id in res["track2"]["word2vec"]["tracks"]]
q2_genre_bert = [get_genre(id, genres) for id in res["track2"]["bert"]["tracks"]]
q2_genre_random = [get_genre(id, genres) for id in res["track2"]["base_line"]["tracks"]]

#calculate genre diversity @10
res["track2"]["mfcc_stats"]["genre_diversity@10"] = gen_div_10(q2_genre_mfcc_stats, all_genres, N=10)
genre_div = res["track2"]["mfcc_stats"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track2"]["blf_correlation"]["genre_diversity@10"] = gen_div_10(q2_genre_blf_correlation,all_genres,N=10)
genre_div = res["track2"]["blf_correlation"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track2"]["ivec256"]["genre_diversity@10"] = gen_div_10(q2_genre_ivec256,all_genres,N=10)
genre_div = res["track2"]["ivec256"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track2"]["musicnn"]["genre_diversity@10"] = gen_div_10(q2_genre_musicnn,all_genres,N=10)
genre_div = res["track2"]["musicnn"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track2"]["tfidf"]["genre_diversity@10"] = gen_div_10(q2_genre_tfidf,all_genres,N=10)
genre_div = res["track2"]["tfidf"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track2"]["word2vec"]["genre_diversity@10"] = gen_div_10(q2_genre_word2vec,all_genres,N=10)
genre_div = res["track2"]["word2vec"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track2"]["bert"]["genre_diversity@10"] = gen_div_10(q2_genre_bert,all_genres,N=10)
genre_div = res["track2"]["bert"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track2"]["base_line"]["genre_diversity@10"] = gen_div_10(q2_genre_random,all_genres,N=10)
genre_div = res["track2"]["base_line"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")


In [None]:
# calculate genre coverage@10 
res["track2"]["mfcc_stats"]["genre_coverage@10"] = gen_cov_10(res["track2"]["mfcc_stats"]["tracks"], genres)
genre_cov = res["track2"]["mfcc_stats"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track2"]["blf_correlation"]["genre_coverage@10"] = gen_cov_10(res["track2"]["blf_correlation"]["tracks"], genres)
genre_cov = res["track2"]["blf_correlation"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track2"]["ivec256"]["genre_coverage@10"] = gen_cov_10(res["track2"]["ivec256"]["tracks"], genres)
genre_cov = res["track2"]["ivec256"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track2"]["musicnn"]["genre_coverage@10"] = gen_cov_10(res["track2"]["musicnn"]["tracks"], genres)
genre_cov = res["track2"]["musicnn"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track2"]["tfidf"]["genre_coverage@10"] = gen_cov_10(res["track2"]["tfidf"]["tracks"], genres)
genre_cov = res["track2"]["tfidf"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track2"]["word2vec"]["genre_coverage@10"] = gen_cov_10(res["track2"]["word2vec"]["tracks"], genres)
genre_cov = res["track2"]["word2vec"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track2"]["bert"]["genre_coverage@10"] = gen_cov_10(res["track2"]["bert"]["tracks"], genres)
genre_cov = res["track2"]["bert"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track2"]["base_line"]["genre_coverage@10"] = gen_cov_10(res["track2"]["base_line"]["tracks"], genres)
genre_cov = res["track2"]["base_line"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

In [None]:
# calculate ndcg@10 
res["track2"]["mfcc_stats"]["ndcg"] = ndcg_score(id_track2, res["track2"]["mfcc_stats"]["tracks"], genres)
ndcg = res["track2"]["mfcc_stats"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track2"]["blf_correlation"]["ndcg"] = ndcg_score(id_track2, res["track2"]["blf_correlation"]["tracks"], genres)
ndcg = res["track2"]["blf_correlation"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track2"]["ivec256"]["ndcg"] = ndcg_score(id_track2, res["track2"]["ivec256"]["tracks"], genres)
ndcg = res["track2"]["ivec256"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track2"]["musicnn"]["ndcg"] = ndcg_score(id_track2, res["track2"]["musicnn"]["tracks"], genres)
ndcg = res["track2"]["musicnn"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track2"]["tfidf"]["ndcg"] = ndcg_score(id_track2, res["track2"]["tfidf"]["tracks"], genres)
ndcg = res["track2"]["tfidf"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track2"]["word2vec"]["ndcg"] = ndcg_score(id_track2, res["track2"]["word2vec"]["tracks"], genres)
ndcg = res["track2"]["word2vec"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track2"]["bert"]["ndcg"] = ndcg_score(id_track2, res["track2"]["bert"]["tracks"], genres)
ndcg = res["track2"]["bert"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track2"]["base_line"]["ndcg"] = ndcg_score(id_track2, res["track2"]["base_line"]["tracks"], genres)
ndcg = res["track2"]["base_line"]["ndcg"]
print(f"ndcg: {ndcg}")


## Track 3
"Every Christmas" "Kelly Clarkson"

In [None]:
name = 'Every Christmas'
artist = 'Kelly Clarkson'
id_track3 = get_id_from_info(name, artist, info)

In [None]:
# retrieve tracks with random baseline 
res["track3"]["base_line"]["tracks"] = random_baseline(id=id_track3, info=info, N=10)

In [None]:
# retrieve tracks with text based retrieval systems 
res["track3"]["tfidf"]["tracks"] = text_based(id=id_track3, repr=tfidf, N=10, sim_func=cos_sim)
res["track3"]["word2vec"]["tracks"] = text_based(id=id_track3, repr=word2vec, N=10, sim_func=cos_sim)
res["track3"]["bert"]["tracks"] = text_based(id=id_track3, repr=bert, N=10, sim_func=cos_sim)

In [None]:
# retrieve tracks with audio based retrieval systems 
res["track3"]["blf_correlation"]["tracks"] = audio_based(id=id_track3, repr=blf_correlation, N=10, sim_func=cos_sim)
print("blf correlation Results:")
display_res(res["track3"]["blf_correlation"]["tracks"], info)

res["track3"]["ivec256"]["tracks"] = audio_based(id=id_track3, repr=ivec256, N=10, sim_func=cos_sim)
print("ivec256 Results:")
display_res(res["track3"]["ivec256"]["tracks"], info)

res["track3"]["mfcc_stats"]["tracks"] = audio_based(id=id_track3, repr=mfcc_stats, N=10, sim_func=cos_sim)
print("mfcc_stats Results:")
display_res(res["track3"]["mfcc_stats"]["tracks"], info)

res["track3"]["musicnn"]["tracks"] = audio_based(id=id_track3, repr=musicnn, N=10, sim_func=cos_sim)
print("musicnn Results:")
display_res(res["track3"]["musicnn"]["tracks"], info)


In [None]:
#retrieve genres of the results for precision@10 & recall@10 calculation
q3_genres_mfcc_stats = get_genre_from_ids(res["track3"]["mfcc_stats"]["tracks"],genres)
q3_genres_blf_correlation = get_genre_from_ids(res["track3"]["blf_correlation"]["tracks"], genres)
q3_genres_ivec256 = get_genre_from_ids(res["track3"]["ivec256"]["tracks"], genres)
q3_genres_musicnn = get_genre_from_ids(res["track3"]["musicnn"]["tracks"], genres)
q3_genres_tfidf = get_genre_from_ids(res["track3"]["tfidf"]["tracks"], genres)
q3_genres_word2vec = get_genre_from_ids(res["track3"]["word2vec"]["tracks"],genres)
q3_genres_bert =  get_genre_from_ids(res["track3"]["bert"]["tracks"],genres)
q3_genres_base_line = get_genre_from_ids(res["track3"]["base_line"]["tracks"],genres)

In [None]:
# calculate precision @10 for track3

#get query genre
query_genre3 = get_genre_from_query(id_track3, genres)

res["track3"]["mfcc_stats"]["precision@10"] = calculate_precision_at_k(query_genre3,q3_genres_mfcc_stats,10)
precision = res["track3"]["mfcc_stats"]["precision@10"]
print(f"precision@10: {precision}")

res["track3"]["blf_correlation"]["precision@10"] = calculate_precision_at_k(query_genre3,q3_genres_blf_correlation,10)
precision = res["track3"]["blf_correlation"]["precision@10"]
print(f"precision@10: {precision}")

res["track3"]["ivec256"]["precision@10"] = calculate_precision_at_k(query_genre3,q3_genres_ivec256,10)
precision = res["track3"]["ivec256"]["precision@10"] 
print(f"precision@10: {precision}")

res["track3"]["musicnn"]["precision@10"] = calculate_precision_at_k(query_genre3,q2_genres_musicnn,10)
precision = res["track3"]["musicnn"]["precision@10"]
print(f"precision@10: {precision}")

res["track3"]["tfidf"]["precision@10"]= calculate_precision_at_k(query_genre3,q3_genres_tfidf,10)
precision = res["track3"]["tfidf"]["precision@10"]
print(f"precision@10: {precision}")

res["track3"]["word2vec"]["precision@10"] = calculate_precision_at_k(query_genre3,q3_genres_word2vec,10)
precision = res["track3"]["word2vec"]["precision@10"]
print(f"precision@10: {precision}")

res["track3"]["bert"]["precision@10"] = calculate_precision_at_k(query_genre3,q3_genres_bert,10)
precision = res["track3"]["bert"]["precision@10"] 
print(f"precision@10: {precision}")

res["track3"]["base_line"]["precision@10"] = calculate_precision_at_k(query_genre3,q3_genres_base_line,10)
precision = res["track3"]["base_line"]["precision@10"]
print(f"precision@10: {precision}")

In [None]:
#query track 3 recall (k=10) for all 8 Retrieval Systems

res["track3"]["mfcc_stats"]["recall@10"] = calculate_recall_at_k(query_genre3, q3_genres_mfcc_stats,genres_list,10)
recall = res["track3"]["mfcc_stats"]["recall@10"]
print(f"recall@10: {recall}")

res["track3"]["blf_correlation"]["recall@10"] = calculate_recall_at_k(query_genre3, q3_genres_blf_correlation,genres_list,10)
recall = res["track3"]["blf_correlation"]["recall@10"]
print(f"recall@10: {recall}")

res["track3"]["ivec256"]["recall@10"] = calculate_recall_at_k(query_genre3, q3_genres_ivec256,genres_list,10)
recall = res["track3"]["ivec256"]["recall@10"]
print(f"recall@10: {recall}")

res["track3"]["musicnn"]["recall@10"] = calculate_recall_at_k(query_genre3, q3_genres_musicnn,genres_list,10)
recall = res["track3"]["musicnn"]["recall@10"]
print(f"recall@10: {recall}")

res["track3"]["tfidf"]["recall@10"] = calculate_recall_at_k(query_genre3, q3_genres_tfidf,genres_list,10)
recall = res["track3"]["tfidf"]["recall@10"]
print(f"recall@10: {recall}")

res["track3"]["word2vec"]["recall@10"]= calculate_recall_at_k(query_genre3, q3_genres_word2vec,genres_list,10)
recall = res["track3"]["word2vec"]["recall@10"]
print(f"recall@10: {recall}")

res["track3"]["bert"]["recall@10"]= calculate_recall_at_k(query_genre3,q3_genres_bert,genres_list,10)
recall = res["track3"]["bert"]["recall@10"]
print(f"recall@10: {recall}")

res["track3"]["base_line"]["recall@10"] = calculate_recall_at_k(query_genre3,q3_genres_base_line,genres_list,10)
recall = res["track3"]["base_line"]["recall@10"]
print(f"recall@10: {recall}")

In [None]:
#retrieve 100 results for precision-recall plot (audio based) because k should be varied in the interval [1,100]
q3_100_mfcc_stats = get_genre_from_ids(audio_based(id=id_track3, repr=mfcc_stats, N=100, sim_func=cos_sim), genres)
q3_100_blf_correlation = get_genre_from_ids(audio_based(id=id_track3, repr=blf_correlation, N=100, sim_func=cos_sim), genres)
q3_100_ivec256 = get_genre_from_ids(audio_based(id=id_track3, repr=ivec256, N=100, sim_func=cos_sim), genres)
q3_100_musicnn = get_genre_from_ids(audio_based(id=id_track3, repr=musicnn, N=100, sim_func=cos_sim), genres)
q3_100_tfidf = get_genre_from_ids(text_based(id=id_track3, repr=tfidf, N=100, sim_func=cos_sim), genres)
q3_100_word2vec = get_genre_from_ids(text_based(id=id_track3, repr=word2vec, N=100, sim_func=cos_sim),genres)
q3_100_bert =  get_genre_from_ids(text_based(id=id_track3, repr=bert, N=100, sim_func=cos_sim),genres)
q3_100_base_line = get_genre_from_ids(random_baseline(id=id_track3, info=info, N=100),genres)

#plot precision recall curve for query track3 for all evaluated systems

system_data3 = {
    "audio mfcc stats": {"system_name": "Audio MFCC Stats","query_genre": query_genre3, "retrieved_genres": q3_100_mfcc_stats, "dataset_genres": genres_list},
    "audio blf correlation": {"system_name": "Audio BLF Correlation","query_genre": query_genre3, "retrieved_genres": q3_100_blf_correlation, "dataset_genres": genres_list},
    "audio ivec_256": {"system_name": "Audio iVec 256","query_genre": query_genre3, "retrieved_genres": q3_100_ivec256, "dataset_genres":genres_list},
    "audio muiscnn": {"system_name": "Audio Musicnn","query_genre": query_genre3, "retrieved_genres": q3_100_musicnn, "dataset_genres":genres_list},
    "text tf-idf": {"system_name": "Text TF-IDF","query_genre": query_genre3, "retrieved_genres": q3_100_tfidf, "dataset_genres":genres_list},
    "text word2vec": {"system_name": "Text Word2Vec","query_genre": query_genre3, "retrieved_genres": q3_100_word2vec, "dataset_genres":genres_list},
    "text bert": {"system_name": "Text BERT","query_genre": query_genre3, "retrieved_genres": q3_100_bert, "dataset_genres":genres_list},
    "text random": {"system_name": "Text Random Baseline","query_genre": query_genre3, "retrieved_genres": q3_100_base_line, "dataset_genres":genres_list},
}

plot_precision_recall_curve(system_data3)

In [None]:
# calculate genre diversity@10
all_genres = list(set([item for id in genres['id'] for item in get_genre(id, genres)]))

#audio based
q3_genre_mfcc_stats = [get_genre(id, genres) for id in res["track3"]["mfcc_stats"]["tracks"]]
q3_genre_blf_correlation = [get_genre(id, genres) for id in res["track3"]["blf_correlation"]["tracks"]]
q3_genre_ivec256 = [get_genre(id, genres) for id in res["track3"]["ivec256"]["tracks"]]
q3_genre_musicnn = [get_genre(id, genres) for id in res["track3"]["musicnn"]["tracks"]]
#text based
q3_genre_tfidf = [get_genre(id, genres) for id in res["track3"]["tfidf"]["tracks"]]
q3_genre_word2vec = [get_genre(id, genres) for id in res["track3"]["word2vec"]["tracks"]]
q3_genre_bert = [get_genre(id, genres) for id in res["track3"]["bert"]["tracks"]]
q3_genre_random = [get_genre(id, genres) for id in res["track3"]["base_line"]["tracks"]]

#calculate genre diversity @10
res["track3"]["mfcc_stats"]["genre_diversity@10"] = gen_div_10(q3_genre_mfcc_stats, all_genres, N=10)
genre_div = res["track3"]["mfcc_stats"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track3"]["blf_correlation"]["genre_diversity@10"] = gen_div_10(q3_genre_blf_correlation,all_genres,N=10)
genre_div = res["track3"]["blf_correlation"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track3"]["ivec256"]["genre_diversity@10"] = gen_div_10(q3_genre_ivec256,all_genres,N=10)
genre_div = res["track3"]["ivec256"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track3"]["musicnn"]["genre_diversity@10"] = gen_div_10(q3_genre_musicnn,all_genres,N=10)
genre_div = res["track3"]["musicnn"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track3"]["tfidf"]["genre_diversity@10"] = gen_div_10(q3_genre_tfidf,all_genres,N=10)
genre_div = res["track3"]["tfidf"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track3"]["word2vec"]["genre_diversity@10"] = gen_div_10(q3_genre_word2vec,all_genres,N=10)
genre_div = res["track3"]["word2vec"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track3"]["bert"]["genre_diversity@10"] = gen_div_10(q3_genre_bert,all_genres,N=10)
genre_div = res["track3"]["bert"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")

res["track3"]["base_line"]["genre_diversity@10"] = gen_div_10(q3_genre_random,all_genres,N=10)
genre_div = res["track3"]["base_line"]["genre_diversity@10"]
print(f"genre diversity@10: {genre_div}")


In [None]:
# calculate genre coverage@10 
res["track3"]["mfcc_stats"]["genre_coverage@10"] = gen_cov_10(res["track3"]["mfcc_stats"]["tracks"], genres)
genre_cov = res["track3"]["mfcc_stats"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track3"]["blf_correlation"]["genre_coverage@10"] = gen_cov_10(res["track3"]["blf_correlation"]["tracks"], genres)
genre_cov = res["track3"]["blf_correlation"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track3"]["ivec256"]["genre_coverage@10"] = gen_cov_10(res["track3"]["ivec256"]["tracks"], genres)
genre_cov = res["track3"]["ivec256"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track3"]["musicnn"]["genre_coverage@10"] = gen_cov_10(res["track3"]["musicnn"]["tracks"], genres)
genre_cov = res["track3"]["musicnn"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track3"]["tfidf"]["genre_coverage@10"] = gen_cov_10(res["track3"]["tfidf"]["tracks"], genres)
genre_cov = res["track3"]["tfidf"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track3"]["word2vec"]["genre_coverage@10"] = gen_cov_10(res["track3"]["word2vec"]["tracks"], genres)
genre_cov = res["track3"]["word2vec"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track3"]["bert"]["genre_coverage@10"] = gen_cov_10(res["track3"]["bert"]["tracks"], genres)
genre_cov = res["track3"]["bert"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

res["track3"]["base_line"]["genre_coverage@10"] = gen_cov_10(res["track3"]["base_line"]["tracks"], genres)
genre_cov = res["track3"]["base_line"]["genre_coverage@10"]
print(f"genre coverage@10: {genre_cov}")

In [None]:
# calculate ndcg@10 
res["track3"]["mfcc_stats"]["ndcg"] = ndcg_score(id_track3, res["track3"]["mfcc_stats"]["tracks"], genres)
ndcg = res["track3"]["mfcc_stats"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track3"]["blf_correlation"]["ndcg"] = ndcg_score(id_track3, res["track3"]["blf_correlation"]["tracks"], genres)
ndcg = res["track3"]["blf_correlation"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track3"]["ivec256"]["ndcg"] = ndcg_score(id_track3, res["track3"]["ivec256"]["tracks"], genres)
ndcg = res["track3"]["ivec256"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track3"]["musicnn"]["ndcg"] = ndcg_score(id_track3, res["track3"]["musicnn"]["tracks"], genres)
ndcg = res["track3"]["musicnn"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track3"]["tfidf"]["ndcg"] = ndcg_score(id_track3, res["track3"]["tfidf"]["tracks"], genres)
ndcg = res["track3"]["tfidf"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track3"]["word2vec"]["ndcg"] = ndcg_score(id_track3, res["track3"]["word2vec"]["tracks"], genres)
ndcg = res["track3"]["word2vec"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track3"]["bert"]["ndcg"] = ndcg_score(id_track3, res["track3"]["bert"]["tracks"], genres)
ndcg = res["track3"]["bert"]["ndcg"]
print(f"ndcg: {ndcg}")

res["track3"]["base_line"]["ndcg"] = ndcg_score(id_track3, res["track3"]["base_line"]["tracks"], genres)
ndcg = res["track3"]["base_line"]["ndcg"]
print(f"ndcg: {ndcg}")

In [None]:
# Apply the vectorized function with k = 2
df["recall_at_2"] = df.apply(lambda row: calculate_recall_at_k_vectorized([row["query_genre"]], [row["retrieved_genres"]], [row["dataset_genres"]], 10), axis=1)

In [4]:
blf_correlation

Unnamed: 0,id,BLF_CORR0000,BLF_CORR0001,BLF_CORR0002,BLF_CORR0003,BLF_CORR0004,BLF_CORR0005,BLF_CORR0006,BLF_CORR0007,BLF_CORR0008,...,BLF_CORR1316,BLF_CORR1317,BLF_CORR1318,BLF_CORR1319,BLF_CORR1320,BLF_CORR1321,BLF_CORR1322,BLF_CORR1323,BLF_CORR1324,BLF_CORR1325
0,01Yfj2T3YTwJ1Yfy,0.000214,0.000067,0.000017,0.000087,0.000058,0.000025,0.000036,0.000041,0.000050,...,0.000580,0.000464,0.000425,0.000394,0.000613,0.000538,0.000502,0.000662,0.000628,0.000674
1,01gyRHLquwXDlhkO,0.000398,0.000209,0.000155,0.000209,0.000240,0.000224,0.000161,0.000173,0.000206,...,0.000691,0.000648,0.000647,0.000628,0.000697,0.000673,0.000655,0.000698,0.000683,0.000722
2,01rMxQv6vhyE1oQX,0.000298,0.000195,0.000171,0.000155,0.000150,0.000123,0.000126,0.000195,0.000123,...,0.000696,0.000651,0.000630,0.000555,0.000714,0.000685,0.000593,0.000717,0.000632,0.000654
3,02RGE9FNH65RtMS7,0.000341,0.000226,0.000144,0.000119,0.000119,0.000189,0.000193,0.000208,0.000214,...,0.000707,0.000706,0.000693,0.000689,0.000713,0.000690,0.000704,0.000702,0.000706,0.000705
4,02ZnlCGZEbkfCDxo,0.000315,0.000145,0.000116,0.000133,0.000091,0.000086,0.000052,0.000109,0.000145,...,0.000662,0.000594,0.000512,0.000437,0.000623,0.000587,0.000547,0.000627,0.000595,0.000632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10089,zyzILCQvVeUFIINi,0.000227,0.000170,0.000120,0.000184,0.000179,0.000144,0.000042,0.000008,0.000098,...,0.000631,0.000582,0.000537,0.000445,0.000644,0.000623,0.000559,0.000636,0.000556,0.000665
10090,zzgS4ZqyswamEWNj,0.000345,0.000124,0.000042,0.000042,0.000096,0.000096,0.000074,0.000052,0.000029,...,0.000671,0.000593,0.000522,0.000521,0.000642,0.000572,0.000539,0.000666,0.000613,0.000693
10091,zzoFYDMlqU1X2zz1,0.000455,0.000269,0.000183,0.000163,0.000120,0.000082,0.000088,0.000080,0.000038,...,0.000702,0.000630,0.000628,0.000533,0.000629,0.000616,0.000516,0.000681,0.000650,0.000631
10092,zzpkRCGA5ud8q4mv,0.000338,-0.000065,-0.000019,0.000036,0.000045,0.000002,-0.000031,-0.000049,0.000011,...,0.000698,0.000670,0.000665,0.000621,0.000706,0.000682,0.000643,0.000695,0.000660,0.000702


In [None]:
# Importieren Sie sklearn
from sklearn.metrics.pairwise import cosine_similarity
# Berechnen Sie die Kosinus-Ähnlichkeit aller IDs mit allen anderen IDs entlang der Reihen
similarity = cosine_similarity(blf_correlation[:][2:])

# Erhalten Sie nur die obere Dreiecksmatrix der Ähnlichkeitsmatrix
upper = np.triu(similarity)

In [33]:
df


Unnamed: 0,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,NaN.8,NaN.9,...,NaN.10,NaN.11,NaN.12,NaN.13,NaN.14,NaN.15,NaN.16,NaN.17,NaN.18,NaN.19
,1.000000,0.634893,0.635349,0.598153,0.576161,0.568466,0.629246,0.645637,0.587046,0.532837,...,0.621411,0.637629,0.654952,0.638904,0.560101,0.666608,0.611436,0.489161,0.531079,0.641442
,0.634893,1.000000,0.828560,0.851030,0.777809,0.686822,0.877492,0.708623,0.753095,0.847415,...,0.912093,0.823651,0.867349,0.869174,0.902670,0.652415,0.807221,0.696067,0.852135,0.821675
,0.635349,0.828560,1.000000,0.773646,0.761471,0.722970,0.869319,0.759349,0.774074,0.819857,...,0.808650,0.797611,0.805425,0.815107,0.764606,0.701674,0.761847,0.695129,0.738223,0.805496
,0.598153,0.851030,0.773646,1.000000,0.785102,0.664442,0.832017,0.627340,0.719576,0.804217,...,0.858410,0.819604,0.820042,0.844789,0.847214,0.609150,0.787245,0.708434,0.835343,0.749437
,0.576161,0.777809,0.761471,0.785102,1.000000,0.680657,0.820742,0.648261,0.731629,0.761479,...,0.789689,0.790634,0.768481,0.838606,0.745495,0.597407,0.798870,0.711728,0.768490,0.776015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,0.666608,0.652415,0.701674,0.609150,0.597407,0.567506,0.654330,0.653536,0.594052,0.551923,...,0.619223,0.636745,0.630093,0.634828,0.570979,1.000000,0.602570,0.503714,0.545268,0.670342
,0.611436,0.807221,0.761847,0.787245,0.798870,0.669773,0.834442,0.660964,0.755063,0.795230,...,0.827598,0.769887,0.795303,0.855147,0.808174,0.602570,1.000000,0.749721,0.815573,0.818467
,0.489161,0.696067,0.695129,0.708434,0.711728,0.670488,0.741595,0.590277,0.728201,0.718227,...,0.703538,0.705924,0.681825,0.741768,0.665708,0.503714,0.749721,1.000000,0.687855,0.689025
,0.531079,0.852135,0.738223,0.835343,0.768490,0.612901,0.826779,0.599824,0.700431,0.828984,...,0.903774,0.756766,0.829616,0.846157,0.864387,0.545268,0.815573,0.687855,1.000000,0.795725


In [19]:
df_merged = df.merge( genres,left_index=True, right_on='id',  how='inner')

In [20]:
df_merged

Unnamed: 0,"(01Yfj2T3YTwJ1Yfy,)","(01gyRHLquwXDlhkO,)","(01rMxQv6vhyE1oQX,)","(02RGE9FNH65RtMS7,)","(02ZnlCGZEbkfCDxo,)","(04OjszRi9rC5BlHC,)","(04iitW3ffa0mhpx3,)","(04xUDjAYC14jsHyH,)","(06HvNTU9M9lnH71I,)","(06L9OJ5nRqKnO2q9,)",...,"(zxlnGZoud2KCmSaw,)","(zyxOCKcXX1RmWpm0,)","(zyz0UbYN4n9rHXex,)","(zyzILCQvVeUFIINi,)","(zzgS4ZqyswamEWNj,)","(zzoFYDMlqU1X2zz1,)","(zzpkRCGA5ud8q4mv,)","(zzx8CWdM7qkxKQpC,)",id,genre


In [21]:
genres

Unnamed: 0,id,genre
0,01Yfj2T3YTwJ1Yfy,"['rock', 'christian rock']"
1,01gyRHLquwXDlhkO,"['hip hop', 'rap', 'grindcore', 'death metal']"
2,01rMxQv6vhyE1oQX,"['rock', 'pop punk']"
3,02RGE9FNH65RtMS7,"['trance', 'techno', 'progressive trance']"
4,02ZnlCGZEbkfCDxo,"['pop', 'italian pop', 'latin', 'europop', 'am..."
...,...,...
10089,zyzILCQvVeUFIINi,"['rock', 'pop', 'indie rock', 'folk rock', 'ne..."
10090,zzgS4ZqyswamEWNj,"['pop', 'rock', 'teen pop', 'soundtrack', 'sin..."
10091,zzoFYDMlqU1X2zz1,"['jazz', 'jazz fusion']"
10092,zzpkRCGA5ud8q4mv,"['soul', 'blues', 'r b', 'blues rock', 'southe..."


In [None]:
blf_correlation_matrix

In [None]:
blf_correlation.shape

In [None]:
 # Fill the lower triangular part with the same values as the upper triangular part
cos_sim_matrix = cos_sim_matrix + cos_sim_matrix.T - np.diag(cos_sim_matrix.diagonal())