In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import json

In [2]:
# Load the track table and build URI-keyed dictionaries for fast lookup.
tracks = pd.read_hdf('df_data/df_tracks.hdf')
track_dict = dict(zip(tracks['track_uri'], tracks['track_name']))
artist_dict = dict(zip(tracks['track_uri'], tracks['artist_name']))

# One list of track titles per submission line, in file order.
results = []

# 'submission.hdf' is read here as plain comma-separated text, one playlist
# per line. Both files are managed by `with` so they are closed (and the
# output flushed) even if a line fails to parse.
with open('submission.hdf') as fp, open('results.txt', 'w') as out:
    # If the submission file starts with a header row, call next(fp) here
    # to skip it.
    for line in fp:
        # The first field is dropped; the remaining fields are track URIs.
        track_ids = line.strip().split(',')[1:]
        # track[1:] discards the first character of each field
        # (presumably the space that follows each comma -- TODO confirm).
        titles = [track_dict.get(track[1:], "Unknown") for track in track_ids]
        results.append(titles)
        out.write(', '.join(titles) + '\n')

In [3]:
def get_performance_results(submission_file):
    '''
    Score a submission file against the challenge set answers.

    Reads 'challenge_set_answers.json' and 'challenge_set.json' from the
    current working directory. Row i of the submission is compared with
    playlist i of the answers file (rows are matched by position, not by
    playlist id).

    Parameters
    ----------
    submission_file : str, path or file-like
        CSV file without a header row. The first column should be the
        playlist id, which is dropped. The rest should all be Spotify
        track URIs; each line should contain 500 URIs. The first character
        of every cell is discarded before comparison.

    Returns
    -------
    list of dict
        One dict per submission row with the keys:
        "num_holdouts", "num_samples", "num_tracks", "playlist_name"
        (copied from the answers playlist), "num_predicted" (number of
        submitted cells found in the answer tracks), "pct"
        (num_predicted / num_holdouts * 100), and "includes_name" /
        "includes_title" (both True when the challenge playlist has a
        non-empty name; the two keys always carry the same value).

    Raises
    ------
    ZeroDivisionError
        If an answers playlist has num_holdouts == 0.
    IndexError
        If the submission has more rows than the answers file has playlists.
    '''
    # grab the results from the challenge set answers; `with` guarantees the
    # file handles are released as soon as the JSON is parsed
    with open('challenge_set_answers.json') as answers_file:
        answers = json.load(answers_file)
    with open('challenge_set.json') as challenge_file:
        challenge_set = json.load(challenge_file)
    challenge_set_playlists = challenge_set['playlists']

    playlists = answers['playlists']
    preds = pd.read_csv(submission_file, header=None).drop(columns=[0])

    results = []

    for i, row in preds.iterrows():
        playlist = playlists[i]

        # a set of URIs gives constant-time membership tests
        answer_uris = {track['track_uri'] for track in playlist['tracks']}

        # Short rows are padded with NaN (a float) by read_csv, so only
        # string cells are considered. item[1:] discards the first character
        # of the cell (presumably the space after each comma -- TODO confirm).
        predicted = sum(
            1
            for item in row
            if isinstance(item, str) and item[1:] in answer_uris
        )

        pct = predicted / playlist['num_holdouts'] * 100

        # A challenge playlist with no 'name' key is treated as untitled.
        has_title = challenge_set_playlists[i].get('name', "") != ""

        results.append({
            "num_holdouts": playlist['num_holdouts'],
            "num_samples": playlist['num_samples'],
            "num_tracks": playlist['num_tracks'],
            "num_predicted": predicted,
            "pct": pct,
            "playlist_name": playlist['name'],
            "includes_name": has_title,
            # Same flag under the key that interpret_results reads.
            "includes_title": has_title,
        })

    return results

In [4]:
def interpret_results(results):
    '''
    Split a list of per-playlist result dicts into challenge-category buckets.

    Challenge set data rules are included for reference:

    (1) title only (2) title and the first track (3) title and the first
    five tracks (4) the first five tracks (no title)
    (5) title and the first 10 tracks (6) the first 10 tracks (no title)
    (7) title and the first 25 tracks (8) title and 25 random tracks
    (9) title and the first 100 tracks (10) title and 100 random tracks. A
    final submission for this challenge should contain 500 tracks for
    each of the test playlists, ordered by relevance.

    Parameters
    ----------
    results : iterable of dict
        Each dict must provide 'num_tracks' and 'num_samples'. Records with
        5 or 10 samples must also provide a title flag under
        'includes_title' or, failing that, 'includes_name'.

    Returns
    -------
    list of list
        Eight lists, indexed as follows:
        0 title only, 1 title + 1 track, 2 title + 5 tracks,
        3 5 tracks without title, 4 title + 10 tracks,
        5 10 tracks without title, 6 25 tracks (categories 7 & 8 combined),
        7 100 tracks (categories 9 & 10 combined).
        Records whose 'num_samples' matches none of these are left out.

    Raises
    ------
    KeyError
        If a required key is missing from a record.
    '''

    # divide the playlists into buckets
    # we will combine buckets 7 & 8, and 9 & 10
    buckets = [[] for _ in range(8)]
    for res in results:
        # "Title only" means no seed tracks were given, i.e. num_samples == 0.
        # The num_tracks == 0 test is kept so anything that was placed in
        # this bucket before still lands here.
        if res['num_tracks'] == 0 or res['num_samples'] == 0:
            # title only
            buckets[0].append(res)
        elif res['num_samples'] == 1:
            # title and the first track
            buckets[1].append(res)
        elif res['num_samples'] == 5:
            # with title -> bucket 2, without title -> bucket 3
            buckets[2 if _has_title(res) else 3].append(res)
        elif res['num_samples'] == 10:
            # with title -> bucket 4, without title -> bucket 5
            buckets[4 if _has_title(res) else 5].append(res)
        elif res['num_samples'] == 25:
            buckets[6].append(res)
        elif res['num_samples'] == 100:
            buckets[7].append(res)
    return buckets


def _has_title(res):
    '''Return the record's title flag, read from 'includes_title' or else 'includes_name'.'''
    if 'includes_title' in res:
        return res['includes_title']
    # Fallback key; raises KeyError when neither key is present.
    return res['includes_name']

In [5]:
# Evaluate the original model's submission file and keep the per-playlist results.
original_results = get_performance_results(submission_file='submission.hdf')