In [None]:
import csv
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle
import random
from tqdm import tqdm

In [None]:
def get_nk_playlist_data(n = 10, stagger = 0):
    """Load `n` consecutive slice files of the playlist dataset into one list.

    Slice `i` (for i in range(n)) is read from
    "Dataset/spotify_million_playlist_dataset/data/mpd.slice.<first>-<last>.json"
    with first = stagger + i * 1000 and last = first + 999.

    Args:
        n: number of slice files to read.
        stagger: offset added to the first index of every slice file name.

    Returns:
        A list holding every entry of each file's "playlists" array, in the
        order the files are read.

    Raises:
        OSError: if a slice file cannot be opened (e.g. FileNotFoundError).
        json.JSONDecodeError: if a slice file is not valid JSON.
        KeyError: if a slice file has no top-level "playlists" key.
    """
    combined_data = []

    # The context manager closes the bar even when a slice fails to load
    with tqdm(total = n, desc = "Progress", ncols = 100) as progress_bar:
        for i in range(n):
            i1 = stagger + i * 1000
            i2 = i1 + 999
            filename = "Dataset/spotify_million_playlist_dataset/data/mpd.slice." + str(i1) + "-" + str(i2) + ".json"
            # Close each file as soon as it is parsed instead of leaking the handle;
            # JSON text is UTF-8, so do not depend on the platform default encoding
            with open(filename, encoding = "utf-8") as slice_file:
                data = json.load(slice_file)
            combined_data.extend(data["playlists"])
            progress_bar.update(1)

    return combined_data

In [None]:
# Number of slice files to load for training (each slice file name spans 1000 playlist indices)
TRAIN_N_K = 10
# Flat list of playlist entries gathered from the first TRAIN_N_K slice files
train_data = get_nk_playlist_data(n = TRAIN_N_K)

Progress: 100%|█████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.72it/s]


In [None]:
def increment_dict_val(D, k):
    """Bump the counter stored under `k` in `D`, starting at 1 for an unseen key.

    `D` is modified in place; nothing is returned.
    """
    D[k] = D.get(k, 0) + 1

In [None]:
# Create a transition dictionary using the training data
#
# Structures built from `train_data`:
#   tracks_dict          track_uri -> integer id (ids start at 1, assigned on first appearance)
#   transition           id -> list of (next_id, count) pairs, highest count first;
#                        key 0 holds the transitions out of the "start of playlist" state
#   tracks_fq            id -> number of occurrences, ordered most frequent first
#   reverse_tracks_dict  integer id -> track_uri
#   top_500_tracks       the (at most) 500 most frequent ids

tracks_dict = {}
transition = {}
tracks_fq = {}
idx = 1

# The context manager closes the bar even if a malformed playlist raises
with tqdm(total = len(train_data), desc = "Progress", ncols = 100) as progress_bar:

    for playlist in train_data:

        # 0 is never handed out as a track id (idx starts at 1), so it marks the playlist start
        prev_id = 0

        for track in playlist['tracks']:
            if track['track_uri'] not in tracks_dict:
                tracks_dict[track['track_uri']] = idx
                idx += 1

            track_id = tracks_dict[track['track_uri']]

            # Count the prev_id -> track_id transition with the same helper used for tracks_fq
            increment_dict_val(transition.setdefault(prev_id, {}), track_id)
            increment_dict_val(tracks_fq, track_id)

            prev_id = track_id

        progress_bar.update(1)

# Replace each successor-count dict by a list of (next_id, count) sorted by count, descending
for key in transition:
    transition[key] = sorted(transition[key].items(), key=lambda x: x[1], reverse=True)

tracks_fq = dict(sorted(tracks_fq.items(), key=lambda item: item[1], reverse=True))
reverse_tracks_dict = {v: k for k, v in tracks_dict.items()}
top_500_tracks = list(tracks_fq.keys())[: 500]

Progress: 100%|████████████████████████████████████████████| 10000/10000 [00:00<00:00, 15862.84it/s]


In [None]:
def get_stratified_test_set_data(filename):
    """Read a test-set JSON file and return its playlists as a list.

    Args:
        filename: path of a JSON file whose top-level object has a "playlists" array.

    Returns:
        A new list with the entries of the file's "playlists" array, in file order.

    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the top-level object has no "playlists" key.
    """
    # Close the file once parsed instead of leaking the handle;
    # JSON text is UTF-8, so do not depend on the platform default encoding
    with open(filename, encoding = "utf-8") as test_file:
        data = json.load(test_file)

    return list(data["playlists"])

In [None]:
# Playlists of the ordered test set, read from the "playlists" array of the JSON file below
ordered_data = get_stratified_test_set_data(filename = "Dataset/test_set_10k_1k_2_90_ordered.json")

In [None]:
# Stratified sampling data with 1000 playlists having 1 track provided
#
# For every test playlist, record its pid and translate its given tracks into the
# integer ids assigned while building the transition dictionary.

STRATIFIED_SIZE = 1000

ordered_tracks_x = []
stratified_pids = []
count = 0

progress_bar = tqdm(total = STRATIFIED_SIZE, desc = "Progress", ncols = 100)

for playlist in ordered_data:
    stratified_pids.append(playlist['pid'])

    # Per the original author's note the seed track should always be present in the
    # training data (THRESHOLD_PERC is 90 with only 1 track); URIs that are unknown
    # to tracks_dict are nevertheless skipped rather than failing.
    known_ids = [
        tracks_dict[seed['track_uri']]
        for seed in playlist['tracks']
        if seed['track_uri'] in tracks_dict
    ]
    ordered_tracks_x.append(known_ids)

    progress_bar.update(1)

progress_bar.close()

Progress: 100%|█████████████████████████████████████████████| 1000/1000 [00:00<00:00, 124726.54it/s]


In [None]:
# Make predictions on stratified sampled set
#
# For each playlist:
#   1. Walk the Markov chain greedily, starting at the first seed track: from the
#      current track take the most frequent successor that is neither a seed track
#      nor already recommended, and continue from that successor.
#   2. If the walk ends early (unknown state or no usable successor), pad the list
#      with the globally most frequent tracks that are not recommended yet.
#
# NOTE(review): the popularity padding does not exclude the seed tracks, unlike the
# walk above - confirm whether that is intended before changing it.

NUM_RECS = 500  # number of recommendations produced per playlist

ordered_tracks_y_pred = []

# The context manager closes the bar even if a playlist raises
with tqdm(total = len(ordered_tracks_x), desc = "Progress", ncols = 100) as progress_bar:

    for i in range(len(ordered_tracks_x)):
        x_list = ordered_tracks_x[i]
        x_set = set(x_list)
        # A playlist without any known seed track has nowhere to start the walk:
        # None is never a key of `transition`, so it goes straight to the padding step
        prev_id = x_list[0] if x_list else None
        top_500_recs = []
        recs_seen = set()  # mirrors top_500_recs for O(1) membership tests

        for inner_i in range(NUM_RECS):
            if prev_id not in transition:
                break
            for (k, v) in transition[prev_id]:
                if k not in x_set and k not in recs_seen:
                    top_500_recs.append(k)
                    recs_seen.add(k)
                    prev_id = k
                    break
            else:
                # Every successor is a seed or already recommended: the state cannot
                # change any more, so further iterations would only rescan this list
                break

        top_idx = 0
        # Stop when the popular list is exhausted instead of indexing past its end
        while len(top_500_recs) < NUM_RECS and top_idx < len(top_500_tracks):
            if top_500_tracks[top_idx] not in recs_seen:
                top_500_recs.append(top_500_tracks[top_idx])
                recs_seen.add(top_500_tracks[top_idx])
            top_idx += 1

        ordered_tracks_y_pred.append(top_500_recs)

        progress_bar.update(1)

Progress: 100%|████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 724.31it/s]


In [None]:
# Debug view: top_500_recs still holds the list built in the last iteration of the
# prediction loop, so this shows the recommendations of the final playlist only
print(top_500_recs)

[15965, 15966, 15967, 16403, 15964, 2224, 2220, 2221, 2222, 2223, 113491, 34413, 2026, 2024, 2025, 2467, 17396, 17411, 17412, 17397, 18914, 5917, 2861, 11576, 11561, 2334, 2858, 2860, 2857, 17342, 2169, 40420, 2168, 12388, 26960, 502, 11567, 11772, 2321, 2322, 2323, 2324, 22617, 6101, 6099, 33267, 11778, 11530, 11532, 13928, 23780, 11534, 12050, 48393, 15892, 15893, 15894, 15895, 12324, 12325, 12326, 12327, 2147, 8490, 2145, 2146, 17392, 17380, 67272, 67273, 67274, 24808, 1655, 3714, 2706, 5984, 2700, 11140, 16812, 8995, 22581, 24270, 24271, 22330, 18303, 24272, 18304, 22331, 24273, 24274, 14437, 2705, 1332, 4101, 10772, 2715, 28208, 87609, 44783, 94807, 23340, 61457, 73444, 16808, 48694, 34111, 4092, 565, 2014, 331, 1009, 328, 2368, 4066, 8288, 8499, 20222, 20223, 8005, 8517, 963, 7973, 2115, 2416, 4647, 708, 1300, 13331, 13328, 13329, 4288, 18159, 18826, 1669, 1665, 4291, 13762, 445, 46353, 23261, 23262, 23263, 23264, 23265, 23266, 23267, 23268, 23269, 7775, 7798, 3678, 7835, 3095, 2

In [None]:
# Write to json
#
# Convert the predicted integer ids back to track URIs and store them per playlist id.

ordered_tracks_y_pred_uris = [
    list(map(reverse_tracks_dict.__getitem__, inner_list))
    for inner_list in ordered_tracks_y_pred
]

# pid -> list of recommended track URIs, paired by position
ordered_dict = {
    stratified_pids[i]: ordered_tracks_y_pred_uris[i]
    for i in range(len(stratified_pids))
}

with open("pred_10k_1k_2_90_ordered_markov.json", 'w') as pred_file:
    json.dump(ordered_dict, pred_file, indent = 4)