In [None]:
import json
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

In [None]:
# Return the playlists stored in the first n MPD slice files (each slice file name covers 1000 playlist ids)

def getplaylist(n = 1):
    """Load the playlists stored in the first ``n`` MPD slice files.

    Slice ``i`` is read from
    ``Dataset/spotify_million_playlist_dataset/data/mpd.slice.<i*1000>-<i*1000+999>.json``
    (path relative to the current working directory).

    Parameters
    ----------
    n : int, default 1
        Number of consecutive slice files to read, starting with slice 0.

    Returns
    -------
    list
        Every entry of each file's ``"playlists"`` array, concatenated in
        slice order. Empty when ``n`` is 0.

    Raises
    ------
    FileNotFoundError
        If one of the expected slice files does not exist.
    """
    combinedData = []
    for slice_index in range(n):
        first_id = slice_index * 1000
        last_id = first_id + 999
        filename = "Dataset/spotify_million_playlist_dataset/data/mpd.slice."+str(first_id)+"-"+str(last_id)+".json"
        # Close every slice file deterministically instead of leaking one
        # handle per iteration; JSON text is UTF-8, so do not depend on the
        # platform's default encoding.
        with open(filename, encoding="utf-8") as slice_file:
            data = json.load(slice_file)
        combinedData.extend(data["playlists"])

    return combinedData

In [None]:
# Return 1000 test playlists

def gettestplaylist():
    """Load the playlists of the held-out test set.

    Reads ``Dataset/test_set_1k_1k_26_90_unordered.json`` (relative to the
    current working directory) and returns the entries of its ``"playlists"``
    array as a new list.

    Returns
    -------
    list
        The playlist entries in file order.

    Raises
    ------
    FileNotFoundError
        If the test-set file does not exist.
    """
    file_path = "Dataset/test_set_1k_1k_26_90_unordered.json"
    # Open the path defined above (the previous code referenced an undefined
    # ``filename``) and close the handle as soon as parsing finishes.
    with open(file_path, encoding="utf-8") as test_file:
        data = json.load(test_file)
    return list(data["playlists"])

In [None]:
# Collect the track URIs of each playlist and arrange them as a 2D binary (0/1) playlist-by-track DataFrame

def get_tracks(data):
    """Build a binary playlist-by-track membership matrix.

    Parameters
    ----------
    data : sequence of dict
        Playlist records. Each record must provide ``'pid'`` (convertible to
        ``int``) and ``'tracks'``, a sequence of dicts that each carry a
        ``'track_uri'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``int(pid)`` and one column per distinct track
        URI, with int64 cells: 1 when the playlist contains the track,
        0 otherwise. Rows follow the order of ``data``; columns follow the
        order in which each URI is first encountered, so the layout is
        reproducible across kernel restarts (iterating a ``set`` of strings
        is not). When a pid occurs more than once, the last record wins.
    """
    playlistdata = {}
    for playlist in data:
        uris = (track['track_uri'] for track in playlist['tracks'])
        # dict.fromkeys removes repeated URIs while keeping first-seen order.
        playlistdata[int(playlist['pid'])] = list(dict.fromkeys(uris))

    all_tracks = list(dict.fromkeys(
        uri for uris in playlistdata.values() for uri in uris
    ))
    column_of = {uri: position for position, uri in enumerate(all_tracks)}

    # Fill a numeric array directly instead of assigning row by row into an
    # object-dtype frame and patching the gaps with fillna afterwards.
    membership = np.zeros((len(playlistdata), len(all_tracks)), dtype=np.int64)
    for row, uris in enumerate(playlistdata.values()):
        if uris:
            membership[row, [column_of[uri] for uri in uris]] = 1

    return pd.DataFrame(membership, index=list(playlistdata), columns=all_tracks)

In [None]:
# Return recommended tracks corresponding to each playlist

def getsimilarplaylist(X_test, X_train, n_neighbors=100, n_recommendations=500):
    """Recommend unseen tracks for each test playlist from its nearest training playlists.

    For every row of ``X_test`` the ``n_neighbors`` training playlists with
    the highest cosine similarity are selected and their track columns are
    summed. A track is a candidate when the test playlist does not contain it
    and at least one selected neighbour does.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Playlist-by-track 0/1 matrix of the playlists to recommend for. It is
        not modified. Tracks that do not occur in ``X_train`` are ignored and
        tracks missing from ``X_test`` are treated as 0.
    X_train : pandas.DataFrame
        Playlist-by-track 0/1 matrix of the reference playlists. Its column
        order defines the order of the returned candidates, so sort the
        columns by popularity beforehand if the truncated list should hold
        the most popular candidates.
    n_neighbors : int, default 100
        Number of most similar training playlists to aggregate.
    n_recommendations : int, default 500
        Maximum number of tracks returned per test playlist.

    Returns
    -------
    list
        One entry per row of ``X_test`` (same order). Each entry is a
        one-element list wrapping a ``pandas.Index`` of recommended track
        labels.
    """
    track_labels = X_train.columns

    # Align the test matrix to the training columns (same tracks, same order)
    # on a copy. Writing the missing columns into X_test mutated the caller's
    # frame and still left both matrices with different column layouts.
    test_aligned = X_test.reindex(columns=track_labels, fill_value=0)
    train_matrix = X_train.to_numpy(dtype=float)
    test_matrix = test_aligned.to_numpy(dtype=float)

    if test_matrix.shape[0] == 0:
        return []
    if train_matrix.size == 0:
        # Nothing to learn from: every test playlist gets an empty result.
        return [[track_labels[:0]] for _ in range(test_matrix.shape[0])]

    # One call for all test playlists; entry [i, j] is the similarity between
    # test playlist i and training playlist j.
    similarities = cosine_similarity(test_matrix, train_matrix)

    recommend = []
    for test_row, similarity in zip(test_matrix, similarities):
        # Highest similarity first; the stable sort keeps tie handling
        # deterministic.
        neighbours = np.argsort(similarity, kind="stable")[::-1][:n_neighbors]
        aggregate_preferences = train_matrix[neighbours].sum(axis=0)
        # Column positions of tracks absent from the test playlist but
        # present in at least one neighbour.
        candidates = np.flatnonzero((test_row == 0) & (aggregate_preferences > 0))
        recommend.append([track_labels[candidates[:n_recommendations]]])

    return recommend

In [None]:
# Load the first MPD slice and convert it into a playlist-by-track matrix
data = getplaylist(n=1)
df = get_tracks(data)

# Reorder the track columns so the most frequently used tracks come first
track_counts = df.sum(axis=0)
popularity_order = track_counts.argsort()[::-1]
X_train = df.iloc[:, popularity_order]

In [None]:
# Load the test playlists and convert them into a playlist-by-track matrix
X_test_d = gettestplaylist()
X_test_s = get_tracks(X_test_d)

# Reorder the track columns so the most frequently used tracks come first
test_track_counts = X_test_s.sum(axis=0)
test_popularity_order = test_track_counts.argsort()[::-1]
X_test = X_test_s.iloc[:, test_popularity_order]

In [None]:
# Compute track recommendations for every test playlist from the training playlists.
# `playlist` receives one entry per row of X_test, in the same order; the next
# cell unwraps each entry into a plain list of track labels.
playlist = getsimilarplaylist(X_test, X_train)

In [None]:
# Map each test playlist id to the list of tracks recommended for it
recommendations_dict = {}

for position, wrapped_tracks in enumerate(playlist):
    # Entries of `playlist` line up positionally with the raw test records
    playlist_id = X_test_d[position]['pid']
    track_rows = pd.DataFrame(wrapped_tracks).values.tolist()
    recommendations_dict[playlist_id] = track_rows[0]

In [None]:
# Serialize the recommendations and store them as a JSON document
output_file_path = "pred_1k_1k_26_90_unordered_cf.json"
with open(output_file_path, "w") as json_file:
    json_file.write(json.dumps(recommendations_dict))

print("Recommendations saved to:", output_file_path)