In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import random
from tqdm import tqdm

### Train and test data size and paths

In [None]:
# Number of playlists to use for the training data to determine known tracks (in thousands).
# get_nk_playlist_data reads one slice file per unit, and each slice file name spans 1000 pids.
N_TRAIN_K = 100

# pid of first playlist of the training data.
# Used verbatim as the start of the first slice file name (mpd.slice.<start>-<start + 999>.json),
# so it presumably has to be a multiple of 1000 to match a file on disk -- TODO confirm against the dataset.
TRAIN_STAGGER = 0

# Number of playlists to sample for the test data (in thousands).
# The sampling cell stops as soon as N_TEST_K * 1000 playlists have passed the thresholds.
N_TEST_K = 1

# Number of playlists to load as candidates for the final N_TEST_K stratified samples (in thousands).
# Should be comfortably larger than N_TEST_K, since candidates that fail the thresholds are discarded.
N_TEST_CANDIDATE_K = 10

# pid of first playlist of the test data.
# With the values in this cell the training slices cover pids 0-99999 and the candidate
# slices cover pids 900000-909999, so the two ranges do not overlap.
TEST_STAGGER = 900000

# Path to the folder containing the MPD slices.
# The slice file name is appended by plain string concatenation, so keep the trailing "/".
DATA_PATH = "Dataset/spotify_million_playlist_dataset/data/"

# Path to the folder in which the test dataset file is to be created.
# The output file name is appended by plain string concatenation, so keep the trailing "/".
TEST_DATA_PATH = "Dataset/"

### Thresholds for stratified sampling

In [None]:
############################################################################################################################
# Minimum number of known tracks for each sampled playlist
# (a track counts as "known" when its track_uri occurs in at least one training playlist)
# Keep this number greater than the number of tracks being used for predicting the remaining tracks
# If 25 tracks per playlist are being used for predicting the remaining tracks, keep this value greater than or equal to 26
############################################################################################################################
THRESHOLD_NUM = 2

############################################################################################################################
# Minimum percentage of known tracks for each sampled playlist
# Given as a percentage (0-100); the sampling cell divides it by 100 and compares it with (known tracks / num_tracks)
# Tracks that are not known are removed from every sampled playlist
# A large percentage value might cause popular songs to dominate the playlist
# A small percentage value might lead to insufficient representation of the true nature of the playlist
############################################################################################################################
THRESHOLD_PERC = 90

In [None]:
def get_nk_playlist_data(n = 10, stagger = 0):
    """Load `n` consecutive MPD slice files and return their playlists as one list.

    The i-th file read (i in range(n)) is
    ``DATA_PATH + "mpd.slice.<first>-<first + 999>.json"`` with
    ``<first> = stagger + i * 1000``.

    Parameters
    ----------
    n : int
        Number of slice files to read.
    stagger : int
        Value used as ``<first>`` for the first slice file.

    Returns
    -------
    list
        Every entry of each slice's ``"playlists"`` list, in file order.

    Raises
    ------
    FileNotFoundError
        If one of the expected slice files does not exist.
    """
    combined_data = []

    # The context manager closes the progress bar even if a slice fails to load.
    with tqdm(total = n, desc = "Progress", ncols = 100) as progress_bar:
        for i in range(n):
            first_pid = stagger + i * 1000
            last_pid = first_pid + 999
            filename = DATA_PATH + "mpd.slice." + str(first_pid) + "-" + str(last_pid) + ".json"

            # Open inside `with` so the handle is closed right after parsing
            # (the bare open() previously leaked one handle per slice), and pin
            # the encoding so the result does not depend on the platform default.
            with open(filename, encoding = "utf-8") as slice_file:
                data = json.load(slice_file)

            combined_data.extend(data["playlists"])
            progress_bar.update(1)

    return combined_data

In [None]:
# Load N_TRAIN_K slice files (1000 playlists each), starting with the slice whose first pid is TRAIN_STAGGER.
train_data = get_nk_playlist_data(n = N_TRAIN_K, stagger = TRAIN_STAGGER)

Progress: 100%|███████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.67it/s]


## Identifying the known tracks from the training data

In [None]:
# Map every distinct track URI found in the training playlists to a 1-based
# integer id, handed out in order of first appearance.
tracks_dict = {}
idx = 1

with tqdm(total = len(train_data), desc = "Progress", ncols = 100) as progress_bar:
    for playlist in train_data:
        for uri in (track['track_uri'] for track in playlist['tracks']):
            if uri in tracks_dict:
                # Already registered by an earlier playlist/track; keep its first id.
                continue
            tracks_dict[uri] = idx
            idx += 1
        progress_bar.update(1)

Progress: 100%|██████████████████████████████████████████| 100000/100000 [00:02<00:00, 42367.75it/s]


In [None]:
# Number of known tracks, i.e. distinct track URIs seen in the training playlists
print(len(tracks_dict))

681805


## Creating test dataset using stratified sampling

In [None]:
# Load N_TEST_CANDIDATE_K slice files starting at pid TEST_STAGGER; these are only candidates,
# the sampling cell keeps at most N_TEST_K * 1000 of them.
test_data = get_nk_playlist_data(n = N_TEST_CANDIDATE_K, stagger = TEST_STAGGER)

Progress: 100%|█████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.71it/s]


In [None]:
# Walk the candidate playlists in order and keep the first N_TEST_K * 1000 that
# have at least THRESHOLD_NUM known tracks making up at least THRESHOLD_PERC
# percent of the playlist. Unknown tracks are dropped from each kept playlist.
test_playlists = []
count = 0
n_test = N_TEST_K * 1000
min_known_fraction = THRESHOLD_PERC / 100

progress_bar = tqdm(total = n_test, desc = "Progress", ncols = 100)

for playlist in test_data:

    # Use the real track list instead of indexing up to playlist['num_tracks']:
    # the two are assumed to agree in the raw slices (TODO confirm), but relying
    # on the list keeps this loop safe if they ever differ.
    all_tracks = playlist['tracks']
    n_tracks = len(all_tracks)

    # Skip playlists that cannot reach the threshold; the explicit zero check
    # also protects the division below when THRESHOLD_NUM is 0.
    if n_tracks == 0 or n_tracks < THRESHOLD_NUM:
        continue

    known_tracks = [track for track in all_tracks if track['track_uri'] in tracks_dict]

    if len(known_tracks) >= THRESHOLD_NUM and len(known_tracks) / n_tracks >= min_known_fraction:
        # Store a shallow copy with the filtered tracks so that test_data is
        # left untouched and this cell gives the same result when re-run.
        # Other fields (e.g. 'num_tracks') are carried over unchanged and
        # therefore still describe the original, unfiltered playlist.
        sampled_playlist = dict(playlist)
        sampled_playlist['tracks'] = known_tracks
        test_playlists.append(sampled_playlist)

        count += 1
        progress_bar.update(1)
        if count >= n_test:
            break

progress_bar.close()

# Make a shortfall visible instead of silently producing a smaller test set.
if count < n_test:
    print("Warning: only " + str(count) + " of " + str(n_test) + " requested playlists met the thresholds; "
          "consider increasing N_TEST_CANDIDATE_K or relaxing the thresholds.")

Progress: 100%|██████████████████████████████████████████████| 1000/1000 [00:00<00:00, 21227.09it/s]


In [None]:
# Number of playlists obtained using stratified sampling.
# This is lower than N_TEST_K * 1000 when too few candidate playlists pass the thresholds.
print(len(test_playlists))

1000


In [None]:
# Write the sampled playlists to TEST_DATA_PATH as indented JSON. The file name
# records the train size, test size and both thresholds used to build the set.
name_parts = ["test_set_", str(N_TRAIN_K), "k_", str(N_TEST_K), "k_", str(THRESHOLD_NUM), "_", str(THRESHOLD_PERC)]
filename = TEST_DATA_PATH + "".join(name_parts)

test_dataset = {"playlists": test_playlists}
with open(filename, 'w') as json_file:
    json.dump(test_dataset, json_file, indent = 4)