In [1]:
import pandas as pd
import json
from sklearn import preprocessing

In [2]:
# Load JSON data from file
with open('../data/challenge_set.json', 'r') as file: # Replace with local dataset path
    data = json.load(file)

# Initialize an empty list to collect all track data
all_tracks = []

# Loop through each playlist in the dataset
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Add playlist-level information to each track record
        track_info = {
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

In [3]:
df_spotify = df_spotify.drop(columns=['playlist_name', 'playlist_num_tracks', 'artist_name', 'track_name', 'album_name'])

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from chunkdot import CosineSimilarityTopK

numeric_features = ['playlist_pid', 'track_pos', 'duration_ms']
categorical_features = ['track_uri', 'artist_uri', 'album_uri']

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

cos_sim = CosineSimilarityTopK(top_k=50)

cos_sim_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("cos_sim", cos_sim)])


In [5]:
cos_sim_pipeline

In [6]:
sim_matrix = cos_sim_pipeline.fit_transform(df_spotify)

In [7]:
# Convert csr.matrix to Dataframe
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix)

In [15]:
# Code from neuefische google colab; modified so it works with duplicates for track input
# Build index with track identifiers
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get track recommendations based on the cosine similarity 
def track_recommendations(track):

    #get the index of the track we put into the function
    idx = indices[track].iloc[0]

    #catch duplicates
    # duplicates = indices[track].iloc[1:]
    # dup_list = list(enumerate(sim_matrix_df[duplicates]))

    #calculate all cosine similarities to that track and store it in a list
    sim_scores = list(enumerate(sim_matrix_df[idx]))

    #remove duplicates from recommendation list
    """ remove = []
    for tup in sim_scores:
        for index in dup_list:
            if tup[0] == index[1]:
                remove.append(tup) 
    print(sorted(remove, key=lambda x: x[1], reverse=True)) #Test

    sim_scores = [x for x in sim_scores if x not in remove] """

    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores.drop_duplicates(keep='first')

    # get the similarities from 1:1001 (not starting with 0 because it is the same track)
    # We overshoot here on purpose so there is leeway to remove duplicates and still end up 
    # with the correct amount of predictions to return (this is a very lazy fix...)
    sim_scores = sim_scores[1:1001]

    #get the indeces of that 1000 tracks
    track_indices = [i[0] for i in sim_scores]

    # Remove duplicates from our selection of 1000 tracks and
    # return the track uris of a duplicate-free subset of 500 tracks
    recommended_tracks = track_uri.iloc[track_indices].drop_duplicates(keep='first').iloc[:500]
    return recommended_tracks


In [16]:
track_recommendations('spotify:track:7yyRTcZmCiyzzJlNzGC9Ol')

771      spotify:track:7yyRTcZmCiyzzJlNzGC9Ol
36517    spotify:track:0IQHBNc0nHjmpUusMHFrX2
910      spotify:track:6GG2tkV9PRYGN5rniXPFXZ
32287    spotify:track:44Xgp13T5ab99beJ0edP6b
2914     spotify:track:4XQQovRSltOGWevTCgacXY
                         ...                 
505      spotify:track:4y1LsJpmMti1PfRQV9AWWe
506      spotify:track:7uHO4AmKtyGa5v5fsElGoC
507      spotify:track:5ChkMS8OtdzJeqyybCc9R5
508      spotify:track:2tUBqZG2AbRi7Q0BIrVrEj
509      spotify:track:2FMcDUopGfjBh3xMsrm78S
Name: track_uri, Length: 500, dtype: object