In [88]:
# ! pip install librosa
# Uncomment the line above (and add any other missing packages) if the imports below fail.

In [89]:
import os
import warnings

import librosa
import numpy as np
import scipy
import scipy.stats


In [90]:
# Sampling rate passed as `sr` to the librosa loading / feature calls below.
SR = 22050
# Passed as `mono` to librosa.load.
MONO = True
# Number of distance matrices allocated by read_distance_mats.
NUM_MAT = 6
# Side length of each square distance matrix allocated by read_distance_mats
# (presumably the number of songs in dataset/allSongs -- TODO confirm).
N_SONGS = 900
# When True, print_debug forwards its arguments to print; otherwise it is silent.
DEBUG = False

In [91]:
def import_csv(filename:str) -> np.ndarray:
    """Load a comma-separated feature table and keep only its inner columns.

    The first row of the file is skipped as a header. The first and the last
    column of every remaining row are dropped (per the original note: the
    name and the quadrant), so only the feature values are returned.
    """
    table = np.genfromtxt(filename, delimiter=',', skip_header=1)
    feature_columns = slice(1, -1)  # everything except the first and last column
    return table[:, feature_columns]


def export_csv(filename:str,  array:np.ndarray, fmt='%.18e') -> None:
    """Write `array` to `filename` as comma-separated text.

    `fmt` is the per-value format string handed to np.savetxt.
    """
    np.savetxt(fname=filename, X=array, fmt=fmt, delimiter=',')



In [92]:
def print_debug(*args, **kwargs):
    """Forward the arguments to print, but only while DEBUG is truthy."""
    if not DEBUG:
        return
    print(*args, **kwargs)

In [93]:

def normalize_features(matrix: np.ndarray) -> np.ndarray:
    """Min-max normalize every column of `matrix` to the range [0, 1].

    Parameters
    ----------
    matrix : array-like
        One row per sample, one column per feature. The input is not modified.

    Returns
    -------
    np.ndarray
        Float array of the same shape where each column has been mapped with
        ``(x - min) / (max - min)``. A column whose values are all equal has no
        range to scale by and is returned as all zeros instead of NaN.
    """
    # Always compute in floating point: assigning the scaled values back into
    # an integer array would silently truncate them.
    values = np.array(matrix, dtype=float)
    mins = np.min(values, axis=0)
    spans = np.max(values, axis=0) - mins
    # Replace zero ranges by 1 so that constant columns become 0 rather than 0/0.
    safe_spans = np.where(spans == 0, 1.0, spans)
    return (values - mins) / safe_spans



In [94]:
def save_normalized_features() -> np.ndarray:
    """Normalize the features in dataset/top100_features.csv and save them.

    Reads the table with import_csv, rescales it with normalize_features,
    writes the result to dataset/normalized_features.csv and returns it.
    """
    normalized = normalize_features(import_csv("dataset/top100_features.csv"))
    print_debug(normalized)
    export_csv('dataset/normalized_features.csv', normalized)
    return normalized

In [95]:
def ex2_1():
    """Run the normalization step; the files on disk are the useful result."""
    _ = save_normalized_features()  # returned matrix is intentionally unused

In [96]:
def calculate_stats(array:np.ndarray):
    """Summarize a 1-D series with seven descriptive statistics.

    Parameters
    ----------
    array : np.ndarray
        Values of a single feature (features_librosa passes one row of the
        stacked feature matrix at a time via np.apply_along_axis).

    Returns
    -------
    np.ndarray
        ``[mean, std, median, skewness, kurtosis, maximum, minimum]`` in that
        order. Previously the statistics were computed but never returned, so
        callers received None.
    """
    # NOTE(review): the order follows the order in which the statistics were
    # computed originally; confirm it matches any reference feature file.
    mean = np.mean(array)
    std = np.std(array)
    median = np.median(array)
    skewness = scipy.stats.skew(array)
    kurtosis = scipy.stats.kurtosis(array)
    maximum = np.max(array)
    minimum = np.min(array)
    return np.array([mean, std, median, skewness, kurtosis, maximum, minimum])


def features_librosa(dirname:str) -> np.ndarray:
    """Extract audio features for every file in `dirname` and normalize them.

    For each file the function computes frame-wise MFCCs, spectral centroid,
    bandwidth, contrast, flatness and roll-off, the YIN fundamental frequency,
    RMS and zero-crossing rate. Every feature row is summarized with
    calculate_stats, the estimated tempo is appended, and the per-file vectors
    are stacked (one row per file, in os.listdir order) and passed through
    normalize_features.

    Parameters
    ----------
    dirname : str
        Directory whose entries are all loaded with librosa.load.

    Returns
    -------
    np.ndarray
        2-D matrix of normalized features, one row per file.

    Raises
    ------
    ValueError
        If `dirname` contains no files.
    """
    f0_max = 11025  # upper bound handed to YIN
    rows = []
    for filename in os.listdir(dirname):
        print_debug(filename)
        y, _ = librosa.load(os.path.join(dirname, filename), sr=SR, mono=MONO)

        # The audio signal is passed by keyword: recent librosa releases made
        # the arguments of the feature extractors keyword-only, so positional
        # calls raise TypeError there. Keyword calls work on old versions too.
        mfccs = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=13)
        spcentroid = librosa.feature.spectral_centroid(y=y, sr=SR)
        spband = librosa.feature.spectral_bandwidth(y=y, sr=SR)
        spcontrast = librosa.feature.spectral_contrast(y=y, sr=SR)
        spflatness = librosa.feature.spectral_flatness(y=y)
        sprolloff = librosa.feature.spectral_rolloff(y=y, sr=SR)
        rms = librosa.feature.rms(y=y)
        zcr = librosa.feature.zero_crossing_rate(y=y)
        f0 = librosa.yin(y, sr=SR, fmin=20, fmax=f0_max)
        # Frames reported exactly at the upper bound are zeroed out
        # (presumably they mark frames without a detectable pitch -- confirm).
        f0[f0 == f0_max] = 0

        frame_features = np.vstack(
            (mfccs, spcentroid, spband, spcontrast, spflatness, sprolloff, f0, rms, zcr)
        )
        all_stats = np.apply_along_axis(calculate_stats, 1, frame_features).flatten()

        tempo = librosa.beat.tempo(y=y, sr=SR)
        rows.append(np.append(all_stats, tempo))

    if not rows:
        raise ValueError(f"no audio files found in {dirname!r}")

    # Stack once at the end instead of re-allocating the matrix per file; this
    # also keeps the result 2-D when the directory holds a single file.
    return normalize_features(np.vstack(rows))

In [97]:
def ex2_2():
    """Extract normalized features for dataset/allSongs and save them as CSV."""
    export_csv('dataset/song_features.csv', features_librosa('dataset/allSongs'))

In [98]:
def euclidean_distance (vec1:np.ndarray, vec2:np.ndarray) -> float:
    """Return the norm (np.linalg.norm) of the difference of the two vectors."""
    difference = vec1 - vec2
    return np.linalg.norm(difference)

def manhattan_distance(vec1:np.ndarray, vec2:np.ndarray) -> float:
    """Return the sum of the absolute element-wise differences."""
    absolute_differences = np.abs(vec1 - vec2)
    return np.sum(absolute_differences)

def cosine_distance (vec1:np.ndarray, vec2:np.ndarray) -> float:
    """Return one minus the cosine similarity of the two vectors."""
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    similarity = np.dot(vec1, vec2) / norm_product
    return 1 - similarity


In [99]:
def distance_matrix(feature_matrix:np.ndarray, distance_function, filename:str):
    """Compute the symmetric pairwise distance matrix of the rows and save it.

    Parameters
    ----------
    feature_matrix : np.ndarray
        One row per item. NaN entries are treated as 0; the caller's array is
        left untouched (the replacement happens on a private copy).
    distance_function : callable
        Called as ``distance_function(row_i, row_j)`` and expected to return
        a scalar.
    filename : str
        Destination CSV, written with export_csv.

    Returns
    -------
    np.ndarray
        Square matrix with a zero diagonal; only the lower triangle is
        computed and then mirrored.
    """
    # Copy before cleaning so that the NaN replacement is not a side effect
    # on the array the caller passed in.
    features = np.array(feature_matrix, dtype=float)
    features[np.isnan(features)] = 0

    lines = len(features)
    distance_mat = np.zeros((lines, lines))
    for i in range(lines):
        for j in range(i):
            distance_mat[i, j] = distance_function(features[i], features[j])
            distance_mat[j, i] = distance_mat[i, j]
    export_csv(filename, distance_mat)
    return distance_mat

def get_distance_matrices():
    """Build and save every (distance function, feature set) distance matrix.

    Reads dataset/song_features.csv and dataset/normalized_features.csv (both
    without a header row), computes the euclidean, manhattan and cosine
    distance matrices for each of them and writes every matrix to
    ``dataset/results/<function>_<feature set>.csv``.

    Returns
    -------
    np.ndarray
        Array of shape (6, n_songs, n_songs) ordered as euclidean_song_features,
        euclidean_top100, manhattan_song_features, manhattan_top100,
        cosine_song_features, cosine_top100 -- the same order in which
        read_distance_mats loads the files, so both results are interchangeable.

    Raises
    ------
    ValueError
        If the two feature files do not have the same number of rows.
    """
    # The feature files have no header row, so skip_header must stay at zero.
    song_features = np.genfromtxt('dataset/song_features.csv', skip_header = 0, delimiter=',')
    top100 = np.genfromtxt('dataset/normalized_features.csv', skip_header = 0, delimiter=',')

    d_functions = [euclidean_distance, manhattan_distance, cosine_distance]
    function_names = ['euclidean', 'manhattan', 'cosine']
    # song_features first, to match the file order used by read_distance_mats.
    matrices = [song_features, top100]
    matrix_names = ['song_features', 'top100']

    n_mat = len(matrices)
    n_songs = len(song_features)
    if len(top100) != n_songs:
        # Fail before anything is written instead of half-way through.
        raise ValueError(
            f"feature sets differ in size: {n_songs} song_features rows "
            f"vs {len(top100)} top100 rows"
        )

    os.makedirs('dataset/results', exist_ok=True)
    arr = np.zeros((len(d_functions) * n_mat, n_songs, n_songs))
    for i, (d_function, function_name) in enumerate(zip(d_functions, function_names)):
        for j, (matrix, matrix_name) in enumerate(zip(matrices, matrix_names)):
            arr[i * n_mat + j] = distance_matrix(
                matrix, d_function, f"dataset/results/{function_name}_{matrix_name}.csv"
            )
    return arr







In [100]:
def read_distance_mats()->np.ndarray:
    """Load the six saved distance matrices from dataset/results.

    Returns an array of shape (NUM_MAT, N_SONGS, N_SONGS) whose slots follow
    the order of the file stems listed below.
    """
    stems = (
        'euclidean_song_features',
        'euclidean_top100',
        'manhattan_song_features',
        'manhattan_top100',
        'cosine_song_features',
        'cosine_top100',
    )
    stacked = np.zeros((NUM_MAT, N_SONGS, N_SONGS ))
    for slot, stem in enumerate(stems):
        stacked[slot] = np.genfromtxt(f'dataset/results/{stem}.csv', skip_header = 0, delimiter=',')
    return stacked

In [101]:
def get_query_ranking(filename, index, distance_matrices, all_songs, top_n=20)-> np.ndarray:
    """Return the closest songs to one query under every distance matrix.

    Parameters
    ----------
    filename :
        Name of the query; only used in debug output.
    index : int
        Row of the query inside each distance matrix.
    distance_matrices : np.ndarray
        Stack of square distance matrices, indexed as ``[metric, row]``.
    all_songs : list
        Song names, used to print the recommendations when DEBUG is set.
    top_n : int, optional
        Number of recommendations per matrix (default 20).

    Returns
    -------
    np.ndarray
        One row per distance matrix holding the indices of the `top_n` nearest
        songs, closest first. The query itself is never included.
    """
    print_debug( "Music: ", filename)
    rankings = []
    for metric in range(len(distance_matrices)):
        distances = distance_matrices[metric, index]
        order = np.argsort(distances)
        # Remove the query by index rather than assuming it sorts first: with
        # a tied distance (e.g. a duplicated song) another entry can take the
        # first position and the query would be recommended to itself.
        indices = order[order != index][:top_n]
        rankings.append(indices)
        if DEBUG:
            print_debug("According to metric ", metric)
            for j in indices:
                print_debug(all_songs[j])
            print_debug("------")
    return np.array(rankings)

def get_rankings(distance_matrices, all_songs, queries)->np.ndarray:
    """Collect the rankings of every query, grouped by distance matrix.

    Each query is looked up in `all_songs` and ranked with get_query_ranking.
    The per-query results are then rearranged so that the first axis of the
    returned float array is the distance matrix, the second the query and the
    third the recommended song indices.
    """
    per_query = np.array([
        get_query_ranking(query, all_songs.index(query), distance_matrices, all_songs)
        for query in queries
    ])
    n_queries, n_metrics, n_recs = per_query.shape
    per_metric = np.zeros((n_metrics, n_queries, n_recs))
    # Swap the query and metric axes while copying into the float result.
    per_metric[:] = per_query.transpose(1, 0, 2)
    return per_metric


In [102]:
def score_distance(md1:np.ndarray, md2:np.ndarray)->int:
    """Count how many metadata values two records have in common.

    Fields 1 and 3 score one point each when they are equal. Fields 9 and 11
    are split on "; " and every entry of `md1`'s list that also occurs in
    `md2`'s list scores one point.
    (Which metadata columns these indices denote is not visible here.)
    """
    exact_fields = (1, 3)
    list_fields = (9, 11)

    score = 0
    for field in exact_fields:
        score += 1 if md1[field] == md2[field] else 0

    for field in list_fields:
        entries1:list = md1[field].split("; ")
        entries2:list = md2[field].split("; ")
        print_debug("sp1 len is ", len(entries1), "and len sp2 is", len(entries2))
        score += sum(1 for entry in entries1 if entry in entries2)
    return score

def get_metadata_ranking(filename:str, index:int, all_songs:list, metadata_matrix:np.ndarray)->np.ndarray:
    """Rank all songs by metadata similarity to the song at `index`.

    Every song is scored against the query with score_distance; the query's
    own score is forced to -1 so it sorts behind all others. Returns the
    indices of the 20 best-scoring songs, highest score first.
    """
    print_debug("Music: ", filename)
    query_metadata = metadata_matrix[index]
    n_songs = len(all_songs)
    scores = np.zeros((n_songs))
    for position in range(n_songs):
        scores[position] = score_distance(metadata_matrix[position], query_metadata)
        print_debug("Score: ",scores[position])
    scores[index]=-1
    # Ascending argsort read backwards gives a descending ranking.
    best_first = np.argsort(scores)[::-1]
    return best_first[0:20]

def get_all_metadata_rankings(all_songs: list, metadata_matrix: np.ndarray, queries:list)->np.ndarray:
    """Return the metadata-based ranking of every query, one row per query."""
    rankings:list = [
        get_metadata_ranking(query, all_songs.index(query), all_songs, metadata_matrix)
        for query in queries
    ]
    return np.array(rankings)

In [107]:
def print_rankings_matrices(ranking_matrix:np.ndarray, all_songs:list, queries:list) -> None :
    """Print the recommendations of every ranking contained in `ranking_matrix`."""
    for position in range(len(ranking_matrix)):
        print_rankings(ranking_matrix[position], all_songs, queries)


def print_rankings(ranking_matrix:np.ndarray, all_songs:list, queries:list)->None:
    """Print, for each query, the names of the songs recommended for it.

    Row ``i`` of `ranking_matrix` holds the indices into `all_songs` of the
    recommendations for ``queries[i]``.
    """
    for row, query in enumerate(queries):
        print("Recommendation for music", query, ":")
        for song_index in ranking_matrix[row]:
            print(all_songs[song_index])
        print("--------")

def get_music_from_pos(music_array, all_songs):
    """Translate a 2-D array of song indices into nested lists of song names."""
    names = []
    for row in range(len(music_array)):
        row_names = []
        for col in range(len(music_array[row])):
            row_names.append(all_songs[music_array[row, col]])
        names.append(row_names)
    return names


def save_feature_ranks(all_ranks:np.ndarray, name_array: list, all_songs:list):
    """Save each ranking as a CSV of song names under dataset/results.

    Ranking ``i`` is written to
    ``dataset/results/ranking_features_metric_<name_array[i]>.csv``.
    """
    for position, ranks in enumerate(all_ranks):
        song_names = get_music_from_pos(ranks, all_songs)
        target = f"dataset/results/ranking_features_metric_{name_array[position]}.csv"
        export_csv(target, song_names, fmt="%s")

In [108]:
def comp_rank(metadata_rank, feature_ranks):
    """Measure how much each feature ranking overlaps the metadata ranking.

    For ranking ``k`` and query ``i`` the value is the fraction of the songs
    in ``feature_ranks[k][i]`` that also appear in ``metadata_rank[i]``. Every
    value is printed and stored in the returned array of shape
    (number of rankings, number of queries).
    """
    n_rankings = len(feature_ranks)
    n_queries = len(feature_ranks[0])
    ret = np.zeros((n_rankings, n_queries))
    for k, ranking in enumerate(feature_ranks):
        for i in range(len(ranking)):
            recommended = ranking[i]
            hits = sum(1 for song in recommended if song in metadata_rank[i])
            f = hits / len(recommended)
            print("Precision of rating ", k," and song ", i," is ", f )
            ret[k,i] = f
    return ret


In [109]:
# Driver cell: load the saved distance matrices, rank the query songs by audio
# features and by metadata, print and save the rankings, then score how much
# the feature rankings overlap the metadata ranking.

# Uncomment to (re)compute the distance matrices and write them to dataset/results:
# get_distance_matrices()
distance_matrices = read_distance_mats() # load the distance matrices saved earlier
# NOTE(review): song positions come from os.listdir here and in features_librosa;
# this assumes both calls return the same order, and that the top100 / metadata
# CSV rows follow that order too -- TODO confirm.
all_songs= os.listdir('dataset/allSongs')
queries = os.listdir('Queries')
distance_names = ['euclidean', 'manhattan', 'cosine']
distance_sources = ['calculated', 'top100']
# Labels for the rankings: metadata first, then one per (distance, source) pair.
func_names = ['metadata']+ [a+'_'+b for a in distance_names for b in distance_sources]
print(func_names)
all_feature_ranks = get_rankings(distance_matrices, all_songs, queries) # feature-based rankings, one per distance matrix
metadata_matrix = np.genfromtxt('dataset/panda_dataset_taffc_metadata.csv', delimiter=',',skip_header = 1, encoding = None, dtype=None)
metadata_rankings = get_all_metadata_rankings(all_songs, metadata_matrix, queries) # rankings based on metadata
# Add a leading axis so the metadata ranking can be stacked with the feature rankings.
metadata_rankings = metadata_rankings[np.newaxis, :,:]
# get_rankings returns floats; cast to int so the values can index all_songs.
all_rankings = np.vstack((metadata_rankings, all_feature_ranks)).astype(np.int32)
print_rankings_matrices(all_rankings, all_songs, queries)
save_feature_ranks(all_rankings, func_names, all_songs) # the rankings are now saved to disk
# Compare every feature ranking (slots 1..) against the metadata ranking (slot 0).
cr = comp_rank(all_rankings[0], all_rankings[1:])
print(cr.shape)
export_csv("dataset/results/rating_eval.csv", cr, '%f')




['metadata', 'euclidean_calculated', 'euclidean_top100', 'manhattan_calculated', 'manhattan_top100', 'cosine_calculated', 'cosine_top100']
Recommendation for music MT0000202045.mp3 :
MT0014475915.mp3
MT0012862507.mp3
MT0000888329.mp3
MT0007556029.mp3
MT0004867564.mp3
MT0031898123.mp3
MT0007453719.mp3
MT0034186620.mp3
MT0004850690.mp3
MT0001494812.mp3
MT0005285696.mp3
MT0002846256.mp3
MT0011938737.mp3
MT0011922905.mp3
MT0003025046.mp3
MT0030369896.mp3
MT0003022328.mp3
MT0009741521.mp3
MT0001058887.mp3
MT0007186714.mp3
--------
Recommendation for music MT0000379144.mp3 :
MT0011032905.mp3
MT0031951901.mp3
MT0005253065.mp3
MT0014584473.mp3
MT0011391187.mp3
MT0013416300.mp3
MT0007349999.mp3
MT0001810607.mp3
MT0008979040.mp3
MT0008716237.mp3
MT0026690204.mp3
MT0007652281.mp3
MT0001982863.mp3
MT0033159895.mp3
MT0026905937.mp3
MT0007099407.mp3
MT0007338724.mp3
MT0001526386.mp3
MT0006919273.mp3
MT0006410285.mp3
--------
Recommendation for music MT0000414517.mp3 :
MT0003949060.mp3
MT0027048677.m