In [1]:
import numpy as np
import os
import json
import pandas as pd
import time
from tqdm import tqdm
import gc
import mmap
from scipy import spatial
import heapq
from sklearn.metrics import ndcg_score

In [2]:
# !pip install import-ipynb
import import_ipynb
# random.seed(0)

In [3]:
from src.show import show_playlist
from get_similar_tracks import *
from Rerank_docs import Rerank
from Vocab_class import TermVocab
from generate_vocab import create_vocab
collection_dir = '/media/harsh/Common/IITD/COL764-IR/Project/spotify_million_playlist_dataset/data/'

importing Jupyter notebook from get_similar_tracks.ipynb
Creating Track Vocabulary
Reading the dataset
for file       : mpd.slice.324000-324999.json
total playlists: 1000
total time taken 0.2846109867095947
190
Total similar playlists 50
total tracks: 589
Total similar playlists 50
total tracks: 2457
importing Jupyter notebook from Rerank_docs.ipynb
importing Jupyter notebook from create_co_occour_mat.ipynb
Creating Track Vocabulary
Creating Track Vocabulary
Creating Track Vocabulary
Creating Track Vocabulary
Total similar playlists 100
total tracks: 3459
reccommended len (finally 500) 3459
relevent tracks size: 93
sampled relevent tracks 20


In [4]:
vocab, idf = create_vocab(collection_dir)

Creating Track Vocabulary


In [5]:
class Evaluate:
    def __init__(self, coll_dir, vocab, idf):
        self.coll_dir = coll_dir
        self.vocab = vocab
        self.idf = idf
        self.rerank = Rerank(self.coll_dir, TVocab= self.vocab, idf= self.idf, select_subset_of=20, apply_pca= True)
    
    def get_relevent_tracks(self, pid):
        actual_tracks = []
        playlist = show_playlist(self.coll_dir, pid, print_op=False)
        if playlist is None:
            return []
        for track in playlist['tracks']:
            actual_tracks.append(track['track_uri'])
        return actual_tracks[:500]
    
    def get_recommended_songs(self, pid):
        recommended = self.rerank.get_reranked_tracks(pid=pid)
        return recommended
    
    def r_precision(self, pid):
        recommended = self.get_recommended_songs(pid)
        relevent = self.get_relevent_tracks(pid)
        score = 0
        score = len(set(recommended) & set(relevent))
        score = score/len(relevent)
        #print('r_precision',score)
        return score
    
    def ndcg(self, recommended, relevent):
        score = 0
        #score = ndcg_score(relevent, recommended)
        if recommended[0] in relevent:
            score = 1
        index = 2
        idcg = 1
        for rec in recommended[1:]:
            rel_i = 1 if rec in relevent else 0
            score += rel_i/np.log2(index)
            idcg += 1/np.log2(index)
            index += 1
        score = score/idcg
        #print('ndcg', score)
        return score
        
    def click_at(self, pid):
        recommended = self.get_recommended_songs(pid)
        relevent = self.get_relevent_tracks(pid)
        #print('rec. tracks:',len(recommended))
        #print('rel. tracks:',len(relevent))
        click = 51
        for i,track in enumerate(recommended):
            if track in relevent:
                click = i/10
                break
        #print('click',click)
        return click
    
    def calc_metrices(self, pid):
        recommended = self.get_recommended_songs(pid)
        relevent = self.get_relevent_tracks(pid)
        print('rec. tracks:',len(recommended))
        print('rel. tracks:',len(relevent))
        click = 51
        for i,track in enumerate(recommended):
            if track in relevent:
                click = i/10
                break
        print('click      :',click)
        score = 0
        score = len(set(recommended) & set(relevent))
        score = score/len(relevent)
        print('r_precision:',score)
        ndcg_s = self.ndcg(recommended, relevent)
        print('ndcg       :',ndcg_s)
        return click, score, ndcg_s
    
    def avg_clicks(self, pids):
        avg_clicks = 0
        for pid in pids:
            avg_clicks += self.click_at(pid)
            #print(avg_clicks)
        avg_clicks /= len(pids)
        #print('avg Clicks:',avg_clicks)
        return avg_clicks
    
    def avg_metricess(self, pids):
        avg_clicks = 0
        avg_prec = 0
        avg_ndcg = 0
        for pid in pids:
            c, p, nd = self.calc_metrices(pid)
            avg_clicks += c
            avg_prec += p
            avg_ndcg += nd
            #print(avg_clicks)
        avg_clicks /= len(pids)
        avg_prec /= len(pids)
        avg_ndcg /= len(pids)
        print('avg Clicks   :',avg_clicks)
        print('avg Precision:',avg_prec)
        print('avg nDCG     :',avg_ndcg)
        return avg_clicks, avg_prec

In [6]:
evl = Evaluate(collection_dir, vocab=vocab, idf=idf)

In [7]:
evl.calc_metrices(324002)

Total similar playlists 100
total tracks: 2885
reccommended len (finally 500) 2885
relevent tracks size: 165
sampled relevent tracks 20
rec. tracks: 500
rel. tracks: 165
click      : 0.0
r_precision: 0.21818181818181817
ndcg       : 0.18278106057592614


(0.0, 0.21818181818181817, 0.18278106057592614)

In [8]:
v_pids =[]
with open('validation_pids_1.txt','r') as f:
    v_pids = f.read().split()
    v_pids = [int(pid) for pid in v_pids]
evl.avg_metricess(v_pids[:10])

Total similar playlists 100
total tracks: 5165
reccommended len (finally 500) 5165
relevent tracks size: 37
sampled relevent tracks 20
rec. tracks: 500
rel. tracks: 37
click      : 15.8
r_precision: 0.10810810810810811
ndcg       : 0.00704437747235646
Total similar playlists 100
total tracks: 1459
reccommended len (finally 500) 1459
relevent tracks size: 24
sampled relevent tracks 20
rec. tracks: 500
rel. tracks: 24
click      : 28.7
r_precision: 0.041666666666666664
ndcg       : 0.0017132007824331252
Total similar playlists 100
total tracks: 2885
reccommended len (finally 500) 2885
relevent tracks size: 165
sampled relevent tracks 20
rec. tracks: 500
rel. tracks: 165
click      : 0.0
r_precision: 0.21818181818181817
ndcg       : 0.18278106057592614
Total similar playlists 100
total tracks: 3459
reccommended len (finally 500) 3459
relevent tracks size: 93
sampled relevent tracks 20
rec. tracks: 500
rel. tracks: 93
click      : 9.2
r_precision: 0.03225806451612903
ndcg       : 0.0087727

(12.9, 0.18312745418607795)