In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import json
import os

from tqdm import tqdm_notebook

In [2]:
def load_df():
    df = pd.DataFrame()
    with open('./mpd.slice.0-999.json') as data_file:
        data_string = data_file.read()
        try:
            data = json.loads(data_string)
        except ValueError:
            print('Failed:')
            print(repr(data_string))
    df = pd.concat([df, pd.DataFrame(data['playlists'])], ignore_index=True)
    return df

In [3]:
df = load_df()
df.drop(['description','name', 'pid', 'num_albums','num_artists', 
         'num_edits', 'num_followers', 'num_tracks', 'collaborative'], axis = 1, inplace = True) #dropping columns 
                                                                                                 #that we are not going
                                                                                                 #to use

artist_list = []
vocab_artist = set()

for row in tqdm_notebook(df.iterrows()): #iterating through df to get sequence of artists name 
                                         #that are contained in playlist
    artists = [x['artist_name'] for x in row[1]['tracks']] #getting artists from playlist(json type)
    for x in row[1]['tracks']:
        vocab_artist.add(x['artist_name']) #creating set with unique artists name
    artist_list.append(artists) 

df['artist_list'] = artist_list 

w2x_artist = {artist:i for i, artist in enumerate(vocab_artist)} #artist name to index
x2w_artist = {i:artist for i, artist in enumerate(vocab_artist)} #index to artist name

df['artist_idx'] = df['artist_list'].apply(lambda x: [w2x_artist[a] for a in x]) #converting sequence of artist name 
                                                                                 #to sequence of artists idx


df['train_seq_artist'] = df['artist_idx'].apply(lambda x: x[:-3]) #creating train sequence
df['target_val_artist'] = df['artist_idx'].apply(lambda x: x[-3:]) #creating validation sequence




In [4]:
inds = df['train_seq_artist']
playlist_artist_train = scipy.sparse.lil_matrix((df.shape[0], len(vocab_artist)), dtype=np.int8) 
#creating binary playlist artist matrix for train
for i, row in tqdm_notebook(enumerate(inds)):
    playlist_artist_train[i, row] = 1 




In [5]:
inds = df['target_val_artist']
playlist_artist_val = scipy.sparse.lil_matrix((df.shape[0], len(vocab_artist)), dtype=np.int8)
#creating binary playlist artist matrix for validation
for i, row in tqdm_notebook(enumerate(inds)):
    playlist_artist_val[i, row] = 1 




In [6]:
precision = []
hr = []
sum_artists = np.asarray(np.sum(playlist_artist_train, axis = 0)).reshape((9722, ))
preds = np.argsort(sum_artists)[-3:]
y_true = df['target_val_artist']
for y in y_true:
    score = len(set(preds) & set(y))
    precision.append(score/3)
    hr.append(int(score > 0))
print('Hit rate using most popular benchmark:', np.mean(hr))
print('Precision using most popular benchmark:', np.mean(precision))

Hit rate using most popular benchmark: 0.037
Precision using most popular benchmark: 0.013


In [7]:
def get_neg_candidates_train(i):
    #getting negative candidates for supervised learning algorithm for train
    np.random.seed(42)
    neg = np.where(playlist_artist_train.getrow(i).toarray()[0] == 0)[0]
    ind = np.random.randint(0, neg.shape[0], size = 3).tolist()
    return neg[ind].tolist()

In [8]:
f = open('libfm-1.40.windows/train.txt', 'w')
n_users = 1000
for row in tqdm_notebook(enumerate(playlist_artist_train)): #converting train data for libfm format 
    for j in np.nonzero(row[1].toarray())[1]: #writing down positive candidates for playlist №row
        f.write(str(1) + ' ')
        f.write(str(row[0]) + ':' + '1 ') 
        f.write(str(n_users + j) + ':' + '1 ')
        f.write('\n')
    neg_candidates = get_neg_candidates_train(row[0]) #writing down negative candidates for playlist №row
    for j in neg_candidates:
        f.write(str(0) + ' ')
        f.write(str(row[0]) + ':' + '1 ')
        f.write(str(j) + ':' + '1 ')
        f.write('\n')
f.close()




In [9]:
def get_neg_candidates_val(i):
    #getting negative candidates for supervised learning algorithm for evaluating MF algorithm
    np.random.seed(42)
    neg = np.where(playlist_artist_val.getrow(i).toarray()[0] == 0)[0]
    ind = np.random.randint(0, neg.shape[0], size = 3).tolist()
    return neg[ind].tolist()

In [10]:
f = open('libfm-1.40.windows/val.txt', 'w')
n_users = 1000
answer_dict = {i:[] for i in range(playlist_artist_val.shape[0])}
for row in tqdm_notebook(enumerate(playlist_artist_val)): #converting train data for libfm format 
    positive_candidates = np.nonzero(row[1].toarray())[1] #writing down positive candidates for playlist №row
    for j in positive_candidates:
        f.write(str(1) + ' ')
        f.write(str(row[0]) + ':' + '1 ')
        f.write(str(n_users + j) + ':' + '1 ')
        f.write('\n')
    neg_candidates = get_neg_candidates_val(row[0])
    answer_dict[row[0]] += positive_candidates.tolist() + neg_candidates #using dict playlist : pos + neg candidates
    for j in neg_candidates: #writing down negative candidates for playlist №row
        f.write(str(0) + ' ')
        f.write(str(row[0]) + ':' + '1 ')
        f.write(str(n_users + j) + ':' + '1 ')
        f.write('\n')
f.close()




In [11]:
import subprocess

In [12]:
cmd = ' '.join(['libfm-1.40.windows/libFM', '-task', 'r', '-train', 'libfm-1.40.windows/train.txt', 
                         '-test', '../libfm-1.42.src/bin/val.txt', '-iter', '20', '-method', 'sgd',
                         '-regular', '’3,3,15’', '-dim', '’1,1,4’', '-init_stdev',
                         '0.1', '-out', 'output.txt', '-learn_rate', '0.001']) #hyperparameters (for mor info see 
                                                                               #manual attached to the course)
proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) #starting subprocess 
                                                                                                           #in console
for line in iter(proc.stdout.readline, ''): #evaluating libfm
    if line == b'':
        print('Finished training')
        break

Finished training


In [13]:
with open('./output.txt', 'r') as f:
    val_answers = [float(x.strip()) for x in f.readlines()] #opening file with answers

In [14]:
num_read = 0
precision = []
hr = []
for i in tqdm_notebook(range(playlist_artist_val.shape[0])): #calculating metric
    all_answers = np.asarray(answer_dict[i])
    y_true = all_answers[:-3] #true answers (first 3 elements in the array)
    mf_answers = val_answers[num_read:num_read + len(all_answers)] #answers from algorithm
    num_read += len(all_answers) #num of rows that were read from the val_answer
    y_pred_ind = np.argsort(mf_answers)[-3:] #top3 by probability
    y_pred = all_answers[y_pred_ind] #getting idx of these artists
    score = len(set(y_pred) & set(y_true)) #num of guessed artists
    precision.append(score/3)
    hr.append(int(score > 0))
print('MF HR@3 score:', np.mean(hr))
print('MF precision@3 score:', np.mean(precision))


MF HR@3 score: 0.939
MF precision@3 score: 0.569


In [15]:
answer_dict[0]

[1325, 3410, 9260, 7272, 860, 5392]