In [None]:
##### Exercice 2 MMSR #####
# - basd audio retrieval system
# - evaluation metrics: nDCGN and genre diversity@N

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import math

In [2]:
# reading repr (representation/encoded) data ---> data: id_mfcc_stats_mmsr data
task2id_mfcc_stats_df = pd.read_csv('/content/id_mfcc_stats_mmsr.tsv' , sep='\t')
task2id_mfcc_stats_df.head()

Unnamed: 0,id,MFCC000,MFCC001,MFCC002,MFCC003,MFCC004,MFCC005,MFCC006,MFCC007,MFCC008,...,cov_9_9,cov_9_10,cov_9_11,cov_9_12,cov_10_10,cov_10_11,cov_10_12,cov_11_11,cov_11_12,cov_12_12
0,9ErLUJOzu2Lvqwbq,24.062305,-24.529358,0.992749,11.64836,9.184493,7.888308,3.549652,3.135545,-4.26757,...,93.488292,33.682528,-3.576354,-21.538019,76.832969,22.308741,-10.750334,64.227492,25.354757,75.786649
1,MTWv5ooA00iAD8Ms,24.377205,-9.601337,-0.591649,-1.563359,-3.887833,-5.740052,-3.548983,-4.365711,3.474253,...,143.537407,76.636133,33.551934,31.9129,146.343073,61.383011,33.601789,95.405675,40.896044,87.043994
2,mTwXhqc4op8iTl4j,23.252638,-14.410173,-9.571658,7.999164,0.506061,7.472953,2.90832,14.135953,-3.479433,...,90.249964,21.49207,11.622519,7.554839,91.891904,16.390867,7.394409,79.469247,23.994299,51.304981
3,MTxLXUkaW5ujpBH5,24.743526,-14.497754,-9.613932,-1.148391,-8.293142,-1.447457,-6.936134,5.588965,1.948418,...,62.023422,18.857,-5.454738,-6.255883,55.363001,13.154167,-2.587679,56.837088,21.24345,52.81625
4,MtxPSiYt0J3CTojA,22.420496,-10.768103,-16.124973,-0.544802,-8.816124,-0.328781,-0.759349,-7.752491,0.817031,...,97.63808,11.538308,-9.060791,-11.943329,116.217278,20.499887,-37.395791,91.527811,24.185012,123.732329


In [3]:
# reading information file
infos = pd.read_csv('/content/id_information_mmsr.tsv', sep='\t')
infos.head()

Unnamed: 0,id,artist,song,album_name
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition)
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002)
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te


In [4]:
def id_to_title_artist(id,info):
  'parameters:'
  'id: (str) the id string'
  'info: (df) the dataframe of the tracks/songs information'
  'returns: (tuple) the title and artist as a tuple'
  id_row = info.loc[info['id'] == id].squeeze()
  title = id_row['song']
  artists = id_row['artist']
  return title, artists


# test of the function id_to_title_artist
id_to_title_artist('01Yfj2T3YTwJ1Yfy',infos)

('Take The Bullets Away (feat. Lacey Sturm)', 'We As Human')

In [5]:
def title_artist_to_id(title,artist,info):
  'parameters:'
  'title: (str) the title of the track/song'
  'artist: (str) the artist name'
  'returns: (str) the id of the track/song given the title and artist'
  title_artist_row = info.loc[(info['song'] == title) & (info['artist'] == artist)].squeeze()
  id = title_artist_row['id']
  return id


# tset of the function title_artist_to_id
title = 'Take The Bullets Away (feat. Lacey Sturm)'
artist = 'We As Human'
query_id = title_artist_to_id(title,artist,infos)
query_id

'01Yfj2T3YTwJ1Yfy'

In [6]:
def retrive(title,artist,feature,N):
  'parameters:'
  'title: (str) the title of the track/song'
  'artist: (str) the artist name'
  'feature: (df) data representation'
  'N: (int) number of tracks/songs'
  'returns: (list) list of tuples of titles and artists of the retrived N tracks/songs'
  query_id = title_artist_to_id(title,artist,infos)
  query_feat = feature.loc[feature['id'] == query_id].drop('id', axis=1) # + dropping the col id
  idx = query_feat.index # getting the index of the query feature
  remaining_features = feature.drop(idx)



  # similarity
  similarities = []
  for i, rem_feat in enumerate(remaining_features.drop('id', axis=1).values):

    cosim = cosine_similarity(query_feat.values.reshape(1,-1),rem_feat.reshape(1,-1))
    similarities.append((remaining_features['id'].iloc[i],cosim))

  # sorting the list and retriving the  top N similar tracks
  similarities.sort(key=lambda x: x[1], reverse=True)
  most_similar_tracks = similarities[1:N+1]

  most_similar_tracks_ids = [id for id, _ in most_similar_tracks]

  # Top N similar songs (title + artist): manual extraction no function used
  # selected_cols = ['song', 'artist']
  # sim_N_songs = infos.loc[infos['id'].isin(most_similar_tracks_ids), selected_cols].to_records(index=False).tolist()

  # Top N similar songs (title + artist) using id_to_title_artist function
  sim_N_songs = [id_to_title_artist(id,infos) for id in most_similar_tracks_ids]
  return sim_N_songs


retrived = retrive(title,artist,task2id_mfcc_stats_df,10)
retrived

[('Cactus', 'David Bowie'),
 ('Local Man Ruins Everything', 'The Wonder Years'),
 ('Unknown Soldier', 'Breaking Benjamin'),
 ('Broken Promises', 'Element Eighty'),
 ('Geraldine', 'Glasvegas'),
 ('Call My Name', 'In Flames'),
 ('Start a Fire', 'Ryan Star'),
 ('Someone Who Does', 'Issues'),
 ('Green Man', 'Type O Negative'),
 ('My Revenge', 'Bury Tomorrow')]

In [None]:
###############################################################################################################
###################################### EVALUATION #############################################################
###############################################################################################################

In [None]:
### test with my retrieving function retrieve ###

In [None]:
############# nDCG ###############

In [8]:
### genres
genres = pd.read_csv('/content/id_genres_mmsr.tsv' , sep='\t')
genres.iloc[:10]

Unnamed: 0,id,genre
0,01Yfj2T3YTwJ1Yfy,"['rock', 'christian rock']"
1,01gyRHLquwXDlhkO,"['hip hop', 'rap', 'grindcore', 'death metal']"
2,01rMxQv6vhyE1oQX,"['rock', 'pop punk']"
3,02RGE9FNH65RtMS7,"['trance', 'techno', 'progressive trance']"
4,02ZnlCGZEbkfCDxo,"['pop', 'italian pop', 'latin', 'europop', 'am..."
5,04OjszRi9rC5BlHC,"['experimental', 'folk', 'lo fi', 'freak folk'..."
6,04iitW3ffa0mhpx3,"['pop', 'r b', 'hip hop', 'soul', 'rhythm and ..."
7,04xUDjAYC14jsHyH,"['punk', 'emo', 'post hardcore', 'post punk', ..."
8,06HvNTU9M9lnH71I,"['jazz', 'easy listening', 'swing', 'smooth ja..."
9,06L9OJ5nRqKnO2q9,"['smooth soul', 'sophisti pop']"


In [9]:
# get retrived_ids given title and artists
retrieved_ids = []
for title, artist in retrived:
  retrieved_ids.append(title_artist_to_id(title,artist,infos))
retrieved_ids

['15blZOCUg63HosU3',
 '3cMJTipuaJSlq27p',
 'oiJUeLdVwgBzhPiI',
 'eOvLEAOmwApxf5JQ',
 'vOGoMtp0LQ2fzS3F',
 'vai5vPlNfTBiu6Nj',
 'pM5Hf0ucqHSGr3jp',
 '4sUNaDw0evcjjBzv',
 'IvFtcAajHxKWNl7x',
 'i3Hq3mMS2pL8aWsU']

In [10]:
def get_genre(id,genres_df):
  'parametrs:'
  'id: (str) the string id of the track/song to retrive its genre'
  'genres_df: (df) a dataframe of the ids and their corresponding genres'
  'returns: (set) set of all genres of the given track/song id'
  # print(genres_df[genres_df['id'] == id ]['id'].values[0],'--->',id)
  return set(genres_df[genres_df['id'] == id ]['genre'].values[0].replace("[", "").replace("]", "").replace("'", "").split(', '))

# example with query id
query_genres = get_genre(query_id,genres)
query_genres

{'christian rock', 'rock'}

In [13]:
import numpy as np
def DCG(query_id,retrived_ids):
  'parameters:'
  'query_id: (str) the string id of the query track/song'
  'retrived_ids: (list) list of retrived tracks/songs ids'
  'returns: (float) the DCG'
  g_q =  get_genre(query_id,genres)
  g_is = [get_genre(id_i,genres) for id_i in retrived_ids]
  rel = [2 * len(g_q & g_i) / (len(g_q) + len(g_i)) for g_i in g_is]
  dcg = sum(np.array(rel) / np.array([math.log((i + 1), 2) for i in range(1,len(g_is)+1)]))
  return dcg

dcg = DCG(query_id,retrieved_ids)
print('DCG: ',dcg)

DCG:  0.5321190434141885


In [14]:
def iDCG(query_id,retrived_ids):
  'parameters:'
  'query_id: (str) the string id of the query track/song'
  'retrived_ids: (list) list of retrived tracks/songs ids'
  'returns: (float) the iDCG'
  g_q =  get_genre(query_id,genres)
  g_is = [get_genre(id_i,genres) for id_i in retrived_ids]
  rel = [2 * len(g_q & g_i) / (len(g_q) + len(g_i)) for g_i in g_is]
  # print(rel)
  sorted_rel = np.sort(rel)[::-1]
  idcg = sum(np.array(sorted_rel) / np.array([math.log((i + 1), 2) for i in range(1,len(g_is)+1)]))
  return idcg

In [16]:
idcg = iDCG(query_id,retrieved_ids)
print('iDCG: ', idcg)

iDCG:  0.8188327158418165


In [17]:
def nDCG(dcg,idcg):
  'parameters:'
  'dcg: (float) the DCG'
  'retrived_ids: (float) the iDCG'
  'returns: (float) the nDCG'
  return dcg/idcg
ndcg = nDCG(dcg,idcg)
print('nDCG: ',ndcg)

nDCG:  0.6498507364439309


In [None]:
##################### Genre diversity@10 #####################

In [18]:
# retriving all the unique genres in the whole datset
all_genres = list(set([item for id in genres['id'] for item in get_genre(id, genres)]))

In [19]:
print('nbr of genres:', len(all_genres))
print('all genres:', all_genres)

# retrive the genres of the retrived tracks/songs
genres_retrieved = [get_genre(id, genres) for id in retrieved_ids]
print('retrived genres:', genres_retrieved)

nbr of genres: 1112
all genres: ['riot grrrl', 'easy listening', 'nerdcore', 'funk pop', 'norwegian metal', 'rumba', 'christian hardcore', 'weightless', 'chill out', 'rock gaucho', 'dungeon synth', 'ambient trance', 'jazz piano', 'afrofuturism', 'j metal', 'accordeon', 'symphonic death metal', 'darkstep', 'melodic doom', 'karaoke', 'greek metal', 'pop', 'jungle', 'screamo', 'reggae rock', 'pop nacional', 'melodic groove metal', 'breakcore', 'klezmer', 'post rock', 'northern soul', 'pop soul', 'merseybeat', 'hungarian black metal', 'dark trap', 'psychedelic pop', 'canadian metal', 'j punk', 'afrobeat', 'abstract hip hop', 'hyperpop', 'protopunk', 'orgcore', 'hardcore', 'tech house', 'atmospheric black metal', 'baroque', 'idol', 'deathrock', 'bajki', 'dinner jazz', 'future bass', 'desert blues', 'massage', 'alternative hip hop', 'medieval rock', 'moog', 'liedermacher', 'synth funk', 'ritual ambient', 'experimental', 'melodic metalcore', 'tecnobrega', 'synth punk', 'nightcore', 'pop emo',

In [31]:
def diversity(genres_retrived,all_genres,N):
  'Paramters:'
  'genres_retrived: (list(sets)--> [{},{}...]) list of sets of the genres of the retrived tracks/songs '
  'all_genres: (list) of all unique genres in the whole dataset'
  'N: (int) the number of retrived tracks/songs'
  'returns: (float) the Genre diversity@N'
  zeros_vec = np.zeros(len(all_genres))
  for g in genres_retrived:
    leng_g = len(g)
    for g_i in g:
      # print(g_i)
      # print(all_genres)
      position = all_genres.index(g_i)
      g_i_contribution = 1/leng_g
      zeros_vec[position] += g_i_contribution

  result_vec = zeros_vec/N
  # Shannon's Entropy Calculation:
  diversity = 0
  for item in result_vec :
    if item != 0:
      diversity += item * math.log((item), 2)
  return - diversity

diversity_10 = diversity(genres_retrieved,all_genres,N=10)
print('Genre diversity@10: ', diversity_10 )

Genre diversity@10:  5.0479878179248105


In [None]:
#######################################################
##### Test with new_retriever_func (from the git) #####
#######################################################

In [32]:
from new_retriever_func import audio_based
from ret import cos_sim

# audio_based(id, repr, N, sim_func)

retrieve_tracks = audio_based(id=query_id, repr=task2id_mfcc_stats_df, N=10, sim_func=cos_sim)
print(retrieve_tracks)

['AiE9Cln05EjE1dOf' '15blZOCUg63HosU3' '3cMJTipuaJSlq27p'
 'oiJUeLdVwgBzhPiI' 'eOvLEAOmwApxf5JQ' 'vOGoMtp0LQ2fzS3F'
 'vai5vPlNfTBiu6Nj' 'pM5Hf0ucqHSGr3jp' '4sUNaDw0evcjjBzv'
 'i3Hq3mMS2pL8aWsU']


In [33]:
# this function is a colplementary function to the audiobased one it should return the titles and artists of the
# audio based retrieved tracks
def retrival_audio_based(info,retrieved_track_ids):
  'parameters:'
  'info: (df) dataframe of the tracks/songs information'
  'retrieved_track_ids: (list) list of the retrieved ids'
  'return: (list) list of tuples(title,artist) of the retrieved tracks/songs'
  return [id_to_title_artist(id,info) for id in retrieved_track_ids]
retrieved_songs = retrival_audio_based(infos,retrieve_tracks)
print('retrieved tracks/songs: ',retrieved_songs)

retrieved tracks/songs:  [('The Heart of Everything', 'Within Temptation'), ('Cactus', 'David Bowie'), ('Local Man Ruins Everything', 'The Wonder Years'), ('Unknown Soldier', 'Breaking Benjamin'), ('Broken Promises', 'Element Eighty'), ('Geraldine', 'Glasvegas'), ('Call My Name', 'In Flames'), ('Start a Fire', 'Ryan Star'), ('Someone Who Does', 'Issues'), ('My Revenge', 'Bury Tomorrow')]


In [34]:
### nDCG ###
new_idcg = iDCG(query_id,retrieve_tracks)
new_dcg = DCG(query_id,retrieve_tracks)

new_ndcg = nDCG(new_dcg,new_idcg)
new_ndcg
print('nDCG: ',new_ndcg)

nDCG:  0.6267649619762788


In [35]:
### Genre diversity@N ###
new_genres_retrieved = [get_genre(id, genres) for id in retrieve_tracks]
new_diversity_10 = diversity(new_genres_retrieved,all_genres,N=10)
print('Genre diversity@10: ',new_diversity_10 )

Genre diversity@10:  5.108144261249608
