In [2]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import json
from sklearn.metrics import ndcg_score

In [None]:
def load_challenge_set():
    '''
    Load the challenge-set answers and shorten every track to a bare ID.

    Reads ``challenge_set_answers.json`` from the current working directory.
    For every playlist, the list of track objects under ``'tracks'`` is
    replaced by a list of strings: each track's ``'track_uri'`` with its
    first 14 characters removed (presumably the ``spotify:track:`` prefix,
    which is 14 characters long).

    Returns:
        The parsed JSON object, with each ``playlist['tracks']`` rewritten
        in place as a list of shortened URI strings.
    '''
    # `with` closes the handle even if reading or JSON parsing raises.
    with open('challenge_set_answers.json') as f:
        ans = json.load(f)
    for playlist in ans['playlists']:
        playlist['tracks'] = [track['track_uri'][14:] for track in playlist['tracks']]
    return ans

# Parsed challenge-set answers; each playlist's 'tracks' is a list of shortened track URIs.
challenge_set_answers = load_challenge_set()

['5IbCV9Icebx8rR6wAp5hhP', '0ABxAcsRWlqckkyONsfP67', '1iVkEiZS6ijZ76KeMsrXG6', '0yLjkpa51CN1uiryMDOrtt', '0hfFDkS845UKM3BIKXmFBo', '2JEQiyNx2BuERPuQG6qNT4', '4AONRuRx872zyuIK3wyf3P', '5TwMVNl4Uh5XyZILQbID8J', '07Ikb6oVQ5bddOktyquwFL', '5Kwv0VoK9SIdt6RMRusora', '7FzcQTmkQZdm0aN54xdj8c', '3Tcv2Ck5n0R7wrd4q7mB7K', '7zsw78LtXUD7JfEwH64HK2', '1YzCSIwQ1wlchlqoRrAh1c', '6oYkwjI1TKP9D0Y9II1GT7', '6kkFjjMbO9EL1YxNt8BKm3', '6oYkwjI1TKP9D0Y9II1GT7', '7tUSJY4nsDBJTjd1UXKRsT', '7zsw78LtXUD7JfEwH64HK2', '4HGIPyqDxSf863tPOwXiLJ', '3ChNbqbiPteCoYtufO79qw', '09HBvmj0dUeQYHGbKJefR5', '5DFqjT6TxAE5BdKb8YcPmZ', '3IGe9TnUiwWSWqOZi7XE38', '7p6p8OCAsGUFvtEklW93uo', '4G9yyEFrvfRpKaXXlajj0X', '2Y6XIn3zUdUYKLnKdtfNoA', '4dsoM6q2kDJhpYB4IOL0Od', '3Gbq2jcKXAOAPQTz06sNAi', '0M79KNBtkt77h5CzZoFRxn', '6V58LMEeGuOfLkCRJKnItb', '0loKEMa9DP6mLibdFMe6Uv', '51ywMlgjW0YZvAkhaH82zL', '6LBfozDVA7crIeuaPAVVlE', '1m1lMBFsQmFHNEceXYHL0K', '5a0TFx0126Z9KcIDgaj64f', '5SkliS5GQmtMHYqAmU74b0', '66GzaMmr8iiRLOfaejUzuY', '5Lli3VgydV

In [25]:
# Load tracks into a dictionary for fast lookup
tracks = pd.read_hdf('df_data/df_tracks.hdf')
track_dict = dict(zip(tracks['track_uri'], tracks['track_name']))
artist_dict = dict(zip(tracks['track_uri'], tracks['artist_name']))

# One list of predicted track IDs per submission line, in file order.
results = []

# Context managers close both files even if a line fails part-way through.
with open('submission.hdf') as fp, open('results.txt', 'w') as out:
    # next(fp)
    for line in fp:
        # The first comma-separated field is dropped; the rest are the predictions.
        track_ids = line.strip().split(',')[1:]
        # Append exactly once per line so that results[i] stays aligned with
        # playlist i of the answers (the raw fields used to be appended a second
        # time, doubling the list). The 15-character cut presumably removes
        # " spotify:track:" (leading space + prefix) -- TODO confirm file format.
        results.append([track[15:] for track in track_ids])
        # Dropping only the first character leaves the key used by track_dict.
        titles = [track_dict.get(track[1:], "Unknown") for track in track_ids]
        out.write(', '.join(titles) + '\n')

In [None]:
def get_ndcg_score(actual, preds):
    '''
    Normalised discounted cumulative gain of a ranked prediction list.

    Uses binary relevance: a prediction scores 1 the first time it names a
    track contained in ``actual`` and 0 otherwise (repeats of an already
    matched track earn nothing). The gain at 0-based rank ``i`` is discounted
    by ``log2(i + 2)``. The ideal ranking places ``min(len(set(actual)),
    len(preds))`` relevant tracks at the top.

    The previous implementation allocated ``(n_actual, n_actual)`` matrices
    but indexed rows by prediction rank, so it raised ``IndexError`` whenever
    there were more predictions than actual tracks.

    Args:
        actual: iterable of relevant (held-out) track identifiers.
        preds: sequence of predicted track identifiers, best first.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when either input is empty.
    '''
    relevant = set(actual)
    n_preds = len(preds)
    if not relevant or n_preds == 0:
        # Nothing to find or nothing predicted: define the score as 0.
        return 0.0

    gains = np.zeros(n_preds)
    already_matched = set()
    for rank, track in enumerate(preds):
        if track in relevant and track not in already_matched:
            gains[rank] = 1.0
            already_matched.add(track)

    # Discount for 0-based rank i is 1 / log2(i + 2).
    discounts = 1.0 / np.log2(np.arange(2, n_preds + 2))
    dcg = float(np.sum(gains * discounts))
    ideal_hits = min(len(relevant), n_preds)
    idcg = float(np.sum(discounts[:ideal_hits]))
    return dcg / idcg

# A "click" is one refresh of a page of 10 tracks.
def get_clicks(actual, preds):
    '''
    Count how many 10-track refreshes are needed before a relevant track shows up.

    Walks ``preds`` in order and returns the 0-based position of the first
    track that is contained in ``actual``, integer-divided by 10. Returns
    ``-1`` when no prediction is contained in ``actual``.
    '''
    pages_of_hits = (
        position // 10
        for position, candidate in enumerate(preds)
        if candidate in actual
    )
    # The generator is lazy, so only the first hit (if any) is evaluated.
    return next(pages_of_hits, -1)

def r_precision(actual, preds):
    '''
    Fraction of the distinct tracks in ``actual`` that also appear in ``preds``.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored.

    NOTE(review): every prediction is considered, not only the first
    ``len(actual)`` of them -- confirm this is the intended definition.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when ``actual`` is empty (this
        used to raise ``ZeroDivisionError``).
    '''
    actual = set(actual)
    if not actual:
        return 0.0
    preds = set(preds)
    return len(actual & preds) / len(actual)


# def get_submission_stats():
# Best r_precision seen so far; must exist before the first comparison below.
max_precision = 0.0
# `results` is a plain list with one entry per submission line, so it is paired
# positionally with the answer playlists (it has no 'playlists' key to index).
for predicted, playlist in zip(results, challenge_set_answers['playlists']):
    held_out = playlist['tracks']
    # ndcg = get_ndcg_score(held_out, predicted)
    precision = r_precision(held_out, predicted)
    if precision > max_precision:
        max_precision = precision

# Last expression of the cell, so the notebook displays the best score.
max_precision


1.0


In [None]:
def get_performance_results(submission_file):
    '''
    This function will take a CSV file, (submission file)
    and return the results how well the model performed on
    the challenge set.

    The first column of the CSV file should be the playlist id,
    which will be dropped. The rest should all be spotify
    track URIs. Each line should contain 500 URIs.

    Reads ``challenge_set_answers.json`` and ``challenge_set.json`` from the
    current working directory. Row ``i`` of the submission is scored against
    playlist ``i`` of the answers.

    Returns:
        A list with one dict per submission row, with the keys
        ``num_holdouts``, ``num_samples``, ``num_tracks``, ``num_predicted``,
        ``pct`` (``num_predicted / num_holdouts * 100``, or ``0.0`` when there
        are no holdouts), ``playlist_name``, ``includes_name``,
        ``includes_title`` and ``random``. ``includes_title`` carries the same
        value as ``includes_name``; it is the key ``interpret_results`` reads.
    '''
    # grab the results from the challenge set answers
    # (context managers so the file handles are closed deterministically)
    with open('challenge_set_answers.json') as answers_file:
        answers = json.load(answers_file)
    with open('challenge_set.json') as challenge_file:
        challenge_set = json.load(challenge_file)
    challenge_set_playlists = challenge_set['playlists']

    playlists = answers['playlists']
    # Column 0 is the playlist id; drop it without mutating in place.
    preds = pd.read_csv(submission_file, header=None).drop(columns=[0])

    results = []

    for i, row in preds.iterrows():
        playlist = playlists[i]

        # change playlist to a set of URIs (faster lookup, playlist will not have duplicates)
        tracks = {track['track_uri'] for track in playlist['tracks']}
        predicted = 0

        for item in row:
            # Short rows are padded with NaN (a float) by read_csv; skip anything
            # that is not a string instead of hiding every error behind `except:`.
            if not isinstance(item, str):
                continue

            # Drop the first character (presumably the space that follows each
            # comma in the submission file) before comparing.
            if item[1:] in tracks:
                predicted += 1

        num_holdouts = playlist['num_holdouts']
        # Guard against a playlist with no held-out tracks.
        pct = predicted / num_holdouts * 100 if num_holdouts else 0.0
        has_title = challenge_set_playlists[i]['name'] != ""
        results.append({
            "num_holdouts": num_holdouts,
            "num_samples": playlist['num_samples'],
            "num_tracks": playlist['num_tracks'],
            "num_predicted": predicted,
            "pct": pct,
            "playlist_name": playlist['name'],
            "includes_name": has_title,
            # Same flag under the key that interpret_results looks up.
            "includes_title": has_title,
            "random": playlist['random']
        })

    return results

In [4]:
def interpret_results(results):
    '''
    Take a "results" object and extract meaningful data from it

    Challenge set data rules are included for reference:

    (1) title only (2) title and the first track (3) title and the first 
    five tracks (4) the first five tracks (no title) 
    (5) title and the first 10 tracks (6) the first 10 tracks (no title) 
    (7) title and the first 25 tracks (8) title and 25 random tracks
    (9) title and the first 100 tracks (10) title and 100 random tracks. A
    final submission for this challenge should contain 500 tracks for
    each of the test playlists, ordered by relevance.

    Args:
        results: iterable of dicts with ``num_samples`` and ``num_tracks``,
            plus a title flag stored under ``includes_title`` or, failing
            that, ``includes_name`` (the key ``get_performance_results``
            has historically emitted).

    Returns:
        A list of 8 lists. Indices 0-5 correspond to rules (1)-(6); index 6
        holds every 25-sample playlist and index 7 every 100-sample playlist.
        Entries matching none of the sample counts are left out.
    '''

    def has_title(res):
        # Accept either spelling of the flag; only raise if both are missing.
        if 'includes_title' in res:
            return res['includes_title']
        return res['includes_name']

    # divide the playlists into buckets
    # we will combine buckets 7 & 8, and 9 & 10
    buckets = [[] for _ in range(8)]
    for res in results:
        if res['num_samples'] == 0 or res['num_tracks'] == 0:
            # title only: no sample tracks are given. The num_tracks test alone
            # let zero-sample playlists fall through every branch.
            buckets[0].append(res)
        elif res['num_samples'] == 1:
            # title and the first track
            buckets[1].append(res)
        elif res['num_samples'] == 5:
            if has_title(res):
                # title and the first five tracks
                buckets[2].append(res)
            else:
                # the first five tracks (no title)
                buckets[3].append(res)
        elif res['num_samples'] == 10:
            if has_title(res):
                # title and the first 10 tracks
                buckets[4].append(res)
            else:
                # the first 10 tracks (no title)
                buckets[5].append(res)
        elif res['num_samples'] == 25:
            buckets[6].append(res)
        elif res['num_samples'] == 100:
            buckets[7].append(res)
    return buckets

In [5]:
# results from the original model
# NOTE(review): despite its .hdf extension, get_performance_results parses this file with pd.read_csv.
original_results = get_performance_results('submission.hdf')