In [73]:
import pandas as pd
import json
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from chunkdot import CosineSimilarityTopK

In [60]:
# Load JSON data from file
with open('data/challenge_set.json', 'r') as file: # Replace with local dataset path
    data = json.load(file)

# Initialize an empty list to collect all track data
all_tracks = []

# Loop through each playlist in the dataset
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Add playlist-level information to each track record
        track_info = {
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

In [61]:
df_spotify = df_spotify.drop(columns=['playlist_name', 'playlist_num_tracks', 'artist_name', 'track_name', 'album_name'])

In [66]:
df_sentiment = pd.read_csv('data/sentiment_data.csv')

In [67]:
df_spotify = df_spotify.merge(df_sentiment, on='track_uri', how='left')

In [68]:
df_spotify

Unnamed: 0,playlist_pid,track_pos,track_uri,artist_uri,album_uri,duration_ms,sentiment_score,sentiment_label
0,1000000,0,spotify:track:66U0ASk1VHZsqIkpMjKX3B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,163809,0.8633,positive
1,1000000,1,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:1qHVYbxQ6IS8YRviorKDJI,166848,0.7938,positive
2,1000000,2,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:4UEPxQx0cTcYNsE0n32MHV,232506,0.9830,positive
3,1000000,3,spotify:track:35kahykNu00FPysz3C2euR,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,216600,0.9978,positive
4,1000000,4,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,193058,0.8695,positive
...,...,...,...,...,...,...,...,...
280995,1006767,0,spotify:track:38griAVM808crjbFp9gcPD,spotify:artist:6nnspeopmJAG07xOxHmqTu,spotify:album:2QeEEn8jNy5SFx9coIzS3Z,339573,-0.9905,negative
280996,1006771,0,spotify:track:1JClFT74TYSXlzpagbmj0S,spotify:artist:1ZwdS5xdxEREPySFridCfh,spotify:album:3PO9OtQdvCDJN8zDLtZiYd,285026,,
280997,1006773,0,spotify:track:4InLm5a9Qtkru6YxEjM4Qc,spotify:artist:2Y9lO01ABSO8OkBU8FI1mp,spotify:album:5NjFyeZJkYAh5ri9eh8ZSO,279322,0.9956,positive
280998,1006775,0,spotify:track:4hdog9vyyqG9pcppG2Izek,spotify:artist:2cFrymmkijnjDg9SS92EPM,spotify:album:1TkwzY3l4LqAfrQwBAx45Q,223295,-0.9581,negative


In [69]:
df_spotify.dropna(inplace=True)

In [75]:
df_spotify[df_spotify['sentiment_score'].isna()]

Unnamed: 0,playlist_pid,track_pos,track_uri,artist_uri,album_uri,duration_ms,sentiment_score,sentiment_label


In [71]:
df_spotify.shape

(258678, 8)

In [74]:
#Drop the Label, keep sentiment score
df_spotify.drop(columns=['sentiment_label'])

Unnamed: 0,playlist_pid,track_pos,track_uri,artist_uri,album_uri,duration_ms,sentiment_score
0,1000000,0,spotify:track:66U0ASk1VHZsqIkpMjKX3B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,163809,0.8633
1,1000000,1,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:1qHVYbxQ6IS8YRviorKDJI,166848,0.7938
2,1000000,2,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:4UEPxQx0cTcYNsE0n32MHV,232506,0.9830
3,1000000,3,spotify:track:35kahykNu00FPysz3C2euR,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,216600,0.9978
4,1000000,4,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,193058,0.8695
...,...,...,...,...,...,...,...
280994,1006752,0,spotify:track:6FI3RJ58Ztl0X1VtA6pVs9,spotify:artist:09hVIj6vWgoCDtT03h8ZCa,spotify:album:4v5x3Oo3UjQ9YmF3hRAip5,208840,0.9996
280995,1006767,0,spotify:track:38griAVM808crjbFp9gcPD,spotify:artist:6nnspeopmJAG07xOxHmqTu,spotify:album:2QeEEn8jNy5SFx9coIzS3Z,339573,-0.9905
280997,1006773,0,spotify:track:4InLm5a9Qtkru6YxEjM4Qc,spotify:artist:2Y9lO01ABSO8OkBU8FI1mp,spotify:album:5NjFyeZJkYAh5ri9eh8ZSO,279322,0.9956
280998,1006775,0,spotify:track:4hdog9vyyqG9pcppG2Izek,spotify:artist:2cFrymmkijnjDg9SS92EPM,spotify:album:1TkwzY3l4LqAfrQwBAx45Q,223295,-0.9581


In [76]:
numeric_features = ['playlist_pid', 'track_pos', 'duration_ms', 'sentiment_score']
categorical_features = ['track_uri', 'artist_uri', 'album_uri']

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

cos_sim = CosineSimilarityTopK(top_k=50)

cos_sim_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("cos_sim", cos_sim)])


In [77]:
cos_sim_pipeline

In [78]:
sim_matrix = cos_sim_pipeline.fit_transform(df_spotify)

In [79]:
# Convert csr.matrix to Dataframe
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix)

In [80]:
# Code from neuefische google colab; modified so it works with duplicates for track input
# Build index with track identifiers
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get track recommendations based on the cosine similarity 
def track_recommendations(track):

    #get the index of the track we put into the function
    idx = indices[track].iloc[0]

    #catch duplicates
    # duplicates = indices[track].iloc[1:]
    # dup_list = list(enumerate(sim_matrix_df[duplicates]))

    #calculate all cosine similarities to that track and store it in a list
    sim_scores = list(enumerate(sim_matrix_df[idx]))

    #remove duplicates from recommendation list
    """ remove = []
    for tup in sim_scores:
        for index in dup_list:
            if tup[0] == index[1]:
                remove.append(tup) 
    print(sorted(remove, key=lambda x: x[1], reverse=True)) #Test

    sim_scores = [x for x in sim_scores if x not in remove] """

    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores.drop_duplicates(keep='first')

    # get the similarities from 1:1001 (not starting with 0 because it is the same track)
    # We overshoot here on purpose so there is leeway to remove duplicates and still end up 
    # with the correct amount of predictions to return (this is a very lazy fix...)
    sim_scores = sim_scores[1:1001]

    #get the indeces of that 1000 tracks
    track_indices = [i[0] for i in sim_scores]

    # Remove duplicates from our selection of 1000 tracks and
    # return the track uris of a duplicate-free subset of 500 tracks
    recommended_tracks = track_uri.iloc[track_indices].drop_duplicates(keep='first').iloc[:500]
    return recommended_tracks


In [83]:
lorde = track_recommendations('spotify:track:2UYJqglnOMTvRcqQLNcjjf')

In [84]:
df_lorde = lorde.to_frame()

In [85]:
df_lorde[df_lorde['track_uri'] == 'spotify:track:7yyRTcZmCiyzzJlNzGC9Ol']

Unnamed: 0,track_uri
