In [1]:
import pandas as pd
import numpy as np
import scipy
import json
import os

from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split

In [2]:
def load_df():
    df = pd.DataFrame()
    with open('./mpd.slice.0-999.json') as data_file:
        data_string = data_file.read()
        try:
            data = json.loads(data_string)
        except ValueError:
            print('Failed:')
            print(repr(data_string))
    df = pd.concat([df, pd.DataFrame(data['playlists'])], ignore_index=True)
    return df

In [3]:
df = load_df()
df.drop(['description','name', 'pid', 'num_albums','num_artists', 
         'num_edits', 'num_followers', 'num_tracks', 'collaborative'], axis = 1, inplace = True) #dropping columns 
                                                                                                 #that we are not going
                                                                                                 #to use

artist_list = []
vocab_artist = set()

for row in tqdm_notebook(df.iterrows()): #iterating through df to get sequence of artists name 
                                         #that are contained in playlist
    artists = [x['artist_name'] for x in row[1]['tracks']] #getting artists from playlist(json type)
    for x in row[1]['tracks']:
        vocab_artist.add(x['artist_name']) #creating set with unique artists name
    artist_list.append(artists) 

df['artist_list'] = artist_list 

w2x_artist = {artist:i for i, artist in enumerate(vocab_artist)} #artist name to index
x2w_artist = {i:artist for i, artist in enumerate(vocab_artist)} #index to artist name

df['artist_idx'] = df['artist_list'].apply(lambda x: [w2x_artist[a] for a in x]) #converting sequence of artist name 
                                                                                 #to sequence of artists idx


df['train_seq_artist'] = df['artist_idx'].apply(lambda x: x[:-3]) #creating train sequence
df['target_val_artist'] = df['artist_idx'].apply(lambda x: x[-3:]) #creating validation sequence




In [4]:
inds = df['train_seq_artist']
playlist_artist = scipy.sparse.lil_matrix((df.shape[0], len(vocab_artist)), dtype=np.int8)
for i, row in tqdm_notebook(enumerate(inds)):
    playlist_artist[i, row] = 1 




In [5]:
precision = []
hr = []
sum_artists = np.asarray(np.sum(playlist_artist, axis = 0)).reshape((playlist_artist.shape[1], ))
preds = np.argsort(sum_artists)[-3:]
y_true = df['target_val_artist']
for y in y_true:
    score = len(set(preds) & set(y))
    precision.append(score/3)
    hr.append(int(score > 0))
print('Hit rate using most popular benchmark:', np.mean(hr))
print('Precision using most popular benchmark:', np.mean(precision))

Hit rate using most popular benchmark: 0.037
Precision using most popular benchmark: 0.013


In [6]:
from sklearn.neighbors import NearestNeighbors

In [8]:
precision = []
hr = []
nn = NearestNeighbors(n_jobs=32,n_neighbors=3)
nn.fit(playlist_artist.T)
distances = nn.kneighbors(playlist_artist.T)[1]
for row, y in tqdm_notebook(zip(playlist_artist, y_true)):
    last_listened = np.nonzero(row)[1]
    preds = distances[last_listened[-1]]
    score = len(set(preds) & set(y))
    precision.append(score/3)
    hr.append(int(score > 0))
print('Hit rate using item to item kNN:', np.mean(hr))
print('Precision using item to item kNN:', np.mean(precision))


Hit rate using item to item kNN: 0.077
Precision using item to item kNN: 0.026666666666666665
