In [3]:
import pandas as pd
import json

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

import polars as po

In [5]:
# Load the playlist challenge set from disk.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (assumes the file is UTF-8, as is standard for interchanged JSON).
with open('../data/challenge_set.json', 'r', encoding='utf-8') as file: # Replace with local dataset path
    data = json.load(file)

# Flat list with one record per (playlist, track) pair.
all_tracks = []

# Walk every playlist and every track it contains; playlists with an empty
# 'tracks' list contribute no records.
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Playlist-level fields are repeated on each track record so the result
        # can live in a single table.
        track_info = {
            # 'name' is read with a fallback; every other key is required and a
            # missing one raises KeyError.
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Build the DataFrame once from the collected records (column order follows the
# key order of the dictionaries above).
df_spotify = pd.DataFrame(all_tracks)

In [6]:
# Convert the pandas frame into a polars DataFrame; the following cells work on
# the polars copy.
po_spotify = po.from_pandas(df_spotify)

- Remove 'playlist_num_tracks'
    - it doesn't contain relevant information
- Remove either the IDs or the real names to avoid correlated features (?) -> only necessary if the algorithm can't handle correlation
- Remove songs with a duration of 0
- Remove songs longer than 50 minutes
- Remove playlists that contain too many non-unique songs (see 'How many unique songs are in a playlist')
- Remove very long and very short playlists (see 'Length of playlists') (???)

In [9]:
# Keep only the URI/identifier and numeric columns; the human-readable names and
# the per-playlist track count are dropped.
# The column list is passed positionally: the `columns=` keyword form emitted a
# warning in the saved run (presumably polars deprecating that keyword — confirm
# against the installed version). A positional list is accepted either way.
po_spotify_features = po_spotify.drop(['playlist_name', 'playlist_num_tracks', 'artist_name', 'track_name', 'album_name'])


  po_spotify_features = po_spotify.drop(columns=['playlist_name', 'playlist_num_tracks', 'artist_name', 'track_name', 'album_name'])


In [10]:
# Bare last expression: renders the remaining feature columns and their dtypes.
po_spotify_features

playlist_pid,track_pos,track_uri,artist_uri,album_uri,duration_ms
i64,i64,str,str,str,i64
1000000,0,"""spotify:track:66U0ASk1VHZsqIkp…","""spotify:artist:5vCOdeiQt9LyzdI…","""spotify:album:4S5MLjwRSi0NJ5ni…",163809
1000000,1,"""spotify:track:5MhsZlmKJG6X5kTH…","""spotify:artist:5vCOdeiQt9LyzdI…","""spotify:album:1qHVYbxQ6IS8YRvi…",166848
1000000,2,"""spotify:track:0GZoB8h0kqXn7XFm…","""spotify:artist:163tK9Wjr9P9DmM…","""spotify:album:4UEPxQx0cTcYNsE0…",232506
1000000,3,"""spotify:track:35kahykNu00FPysz…","""spotify:artist:163tK9Wjr9P9DmM…","""spotify:album:0rmhjUgoVa17LZuS…",216600
1000000,4,"""spotify:track:3G6hD9B2ZHOsgf4W…","""spotify:artist:163tK9Wjr9P9DmM…","""spotify:album:0rmhjUgoVa17LZuS…",193058
…,…,…,…,…,…
1006767,0,"""spotify:track:38griAVM808crjbF…","""spotify:artist:6nnspeopmJAG07x…","""spotify:album:2QeEEn8jNy5SFx9c…",339573
1006771,0,"""spotify:track:1JClFT74TYSXlzpa…","""spotify:artist:1ZwdS5xdxEREPyS…","""spotify:album:3PO9OtQdvCDJN8zD…",285026
1006773,0,"""spotify:track:4InLm5a9Qtkru6Yx…","""spotify:artist:2Y9lO01ABSO8OkB…","""spotify:album:5NjFyeZJkYAh5ri9…",279322
1006775,0,"""spotify:track:4hdog9vyyqG9pcpp…","""spotify:artist:2cFrymmkijnjDg9…","""spotify:album:1TkwzY3l4LqAfrQw…",223295


In [12]:
# Split the feature frame by column type: the integer-valued columns are kept
# as they are, the string URI columns are the ones that get one-hot encoded.
numeric_columns = ['playlist_pid', 'track_pos', 'duration_ms']
uri_columns = ['track_uri', 'artist_uri', 'album_uri']

int_features = po_spotify_features.select(numeric_columns)
obj_features = po_spotify_features.select(uri_columns)

In [16]:
# One-hot encode the three URI columns. In the output below each distinct value
# becomes its own u8 indicator column named `<column>_<value>`.
df_vectorized = obj_features.to_dummies()
# Preview only the first rows; the encoded frame is extremely wide.
df_vectorized.head()

track_uri_spotify:track:000xQL6tZNLJzIrtIgxqSl,track_uri_spotify:track:0010mZpCCwlPwoBiBsjoac,track_uri_spotify:track:001m5KK2fu67yZ5ZW46LDZ,track_uri_spotify:track:0032P9X3AD4AQvN7yzpmTo,track_uri_spotify:track:004XT7kCZUEJkVIZjmBdDi,track_uri_spotify:track:004skCQeDn1iLntSom0rRr,track_uri_spotify:track:00598GIAcfDDzqccjZ4u44,track_uri_spotify:track:005GaX6hvgeTFnR9FvejTE,track_uri_spotify:track:005X0FmdtkM1kiutosXLTR,track_uri_spotify:track:0062tddXmdVQgX1mMjVKUR,track_uri_spotify:track:0069UNfEXsA1853CA0C6es,track_uri_spotify:track:006AVH7fq061voGXkUiII4,track_uri_spotify:track:006f4XC6oHnIKCYvlI5gbD,track_uri_spotify:track:006r3Kh3rjAW6WgkSPTMzC,track_uri_spotify:track:006yrnQMCZpiUgkR612gC8,track_uri_spotify:track:007SutdC0rVG2CSkuQMtJw,track_uri_spotify:track:007iY6FDvlAwN1DVlNo8rY,track_uri_spotify:track:009n0zqtbcRTL7T4TiGf41,track_uri_spotify:track:00ANnYctEGGhcmOJ5omaj8,track_uri_spotify:track:00AivYmu1UVmxM91uhR9lM,track_uri_spotify:track:00AxNl4D4jHL2AEf1W55j5,track_uri_spotify:track:00BHe2yBtdOzhKPmI7rpTE,track_uri_spotify:track:00BnfL75e8vHSGCmwUWbEk,track_uri_spotify:track:00BuKLSAFkaEkaVAgIMbeA,track_uri_spotify:track:00Bxugv8RqOqHRjOpQywzP,track_uri_spotify:track:00Ci0EXS4fNPnkTbS6wkOh,track_uri_spotify:track:00CmjeeHvAVKvx3tcIiZTy,track_uri_spotify:track:00CqEmnPLFKDhAb3cuu6Cs,track_uri_spotify:track:00CrtqaRkCyFjY1yiSYJWo,track_uri_spotify:track:00DYRuYJQzfI6dH4Adkimo,track_uri_spotify:track:00DdfpIoie5he0IVzNxUrh,track_uri_spotify:track:00FROhC5g4iJdax5US8jRr,track_uri_spotify:track:00FWuEBvnitgpbMSk0XIoN,track_uri_spotify:track:00FaBIFXUfzaaVcsQmTTJY,track_uri_spotify:track:00GbymcwhRn9Xgr6BZLy2C,track_uri_spotify:track:00Gk1t8qi6zcWNeCJgh7GO,track_uri_spotify:track:00Go3gWOIVO7blYdzztuzf,…,album_uri_spotify:album:7zSqj9Mp9kc38v3Fiynoog,album_uri_spotify:album:7zT0DG3Lp6XIKBq7IWD3Nx,album_uri_spotify:album:7zTgzvexSIq11gZdx7xf21,album_uri_spotify:album:7zTlfidBmhUVtUmLSKk7WF,album_uri_spotify:album:7zU9CV77oLw9bpNmTsK8J1,album_uri_spotify:album:
7zUBqTWtQ8TtKuXjVpLKvg,album_uri_spotify:album:7zURSbIVZ5vl3NIKkrVRvl,album_uri_spotify:album:7zVMOFzJOtIURZARFnVHFS,album_uri_spotify:album:7zW5IaveXyKkf5G8dnBgCW,album_uri_spotify:album:7zWf7SM34SbtjsGF8s5khN,album_uri_spotify:album:7za8qVGR3wUEJ01pLFcnPB,album_uri_spotify:album:7zbCgH5GJiEz29ZBzptDmL,album_uri_spotify:album:7zbMCm1kkmioa3Je5PGRnD,album_uri_spotify:album:7zbPztRhDnCJfShVXL2F8C,album_uri_spotify:album:7zcUlBQGskzXp3R7cWLfnt,album_uri_spotify:album:7zcYkWLSiRRuKHYY14xkGk,album_uri_spotify:album:7zcfIpDB1ScsABjule7P5f,album_uri_spotify:album:7zclfiIpH7aSuKrFrBvl1G,album_uri_spotify:album:7zdZNXoapFcOW663zgLdOE,album_uri_spotify:album:7zduRJgS6v79QmNUhKGozu,album_uri_spotify:album:7zesXMFikT4DdgkklIk3Jz,album_uri_spotify:album:7zf9kyxc3ZhJv79HrjTRir,album_uri_spotify:album:7zfyys4Or3HXPmGggCrDUY,album_uri_spotify:album:7zg9ZSOQtYLjWnvSawflg6,album_uri_spotify:album:7zgtqJ8N4kggXPQYglk3Ao,album_uri_spotify:album:7zjlLGWtmj6BCqwEyacb5z,album_uri_spotify:album:7zlMxh1NR0Shklu48L4e7x,album_uri_spotify:album:7zlwNuZ06nWHXvP22SOxMY,album_uri_spotify:album:7zmPJBzQw5k4tNN8Sxa0kp,album_uri_spotify:album:7zo6mfeFrwM2mMaV419UZr,album_uri_spotify:album:7zoQtXc7TUyHnmaC6LtjEe,album_uri_spotify:album:7zqKiMaDUYQLfoLg9uyTh6,album_uri_spotify:album:7zrqMfgjuapqMGQXUmToLW,album_uri_spotify:album:7zsKY6zTPig9e02bCn6uTY,album_uri_spotify:album:7zuqkqhGkTH3PSdywhLgY4,album_uri_spotify:album:7zxfQk44mX0eW4eWtdDlKI,album_uri_spotify:album:7zzDtMTKGpX0TrPQpLCLLr
u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# Put the integer columns in front of the one-hot indicator columns.
# This cell rebinds `df_vectorized` from its own previous value, so the numeric
# columns are filtered out of the right-hand side first. On a first run nothing
# is filtered; on a re-run the already-prepended numeric columns are not added
# a second time. Selecting existing columns by name does not recompute the
# encoding.
numeric_names = set(int_features.columns)
onehot_only = df_vectorized.select(
    [name for name in df_vectorized.columns if name not in numeric_names]
)
df_vectorized = po.concat([int_features, onehot_only], how='horizontal')
df_vectorized.head()

playlist_pid,track_pos,duration_ms,track_uri_spotify:track:000xQL6tZNLJzIrtIgxqSl,track_uri_spotify:track:0010mZpCCwlPwoBiBsjoac,track_uri_spotify:track:001m5KK2fu67yZ5ZW46LDZ,track_uri_spotify:track:0032P9X3AD4AQvN7yzpmTo,track_uri_spotify:track:004XT7kCZUEJkVIZjmBdDi,track_uri_spotify:track:004skCQeDn1iLntSom0rRr,track_uri_spotify:track:00598GIAcfDDzqccjZ4u44,track_uri_spotify:track:005GaX6hvgeTFnR9FvejTE,track_uri_spotify:track:005X0FmdtkM1kiutosXLTR,track_uri_spotify:track:0062tddXmdVQgX1mMjVKUR,track_uri_spotify:track:0069UNfEXsA1853CA0C6es,track_uri_spotify:track:006AVH7fq061voGXkUiII4,track_uri_spotify:track:006f4XC6oHnIKCYvlI5gbD,track_uri_spotify:track:006r3Kh3rjAW6WgkSPTMzC,track_uri_spotify:track:006yrnQMCZpiUgkR612gC8,track_uri_spotify:track:007SutdC0rVG2CSkuQMtJw,track_uri_spotify:track:007iY6FDvlAwN1DVlNo8rY,track_uri_spotify:track:009n0zqtbcRTL7T4TiGf41,track_uri_spotify:track:00ANnYctEGGhcmOJ5omaj8,track_uri_spotify:track:00AivYmu1UVmxM91uhR9lM,track_uri_spotify:track:00AxNl4D4jHL2AEf1W55j5,track_uri_spotify:track:00BHe2yBtdOzhKPmI7rpTE,track_uri_spotify:track:00BnfL75e8vHSGCmwUWbEk,track_uri_spotify:track:00BuKLSAFkaEkaVAgIMbeA,track_uri_spotify:track:00Bxugv8RqOqHRjOpQywzP,track_uri_spotify:track:00Ci0EXS4fNPnkTbS6wkOh,track_uri_spotify:track:00CmjeeHvAVKvx3tcIiZTy,track_uri_spotify:track:00CqEmnPLFKDhAb3cuu6Cs,track_uri_spotify:track:00CrtqaRkCyFjY1yiSYJWo,track_uri_spotify:track:00DYRuYJQzfI6dH4Adkimo,track_uri_spotify:track:00DdfpIoie5he0IVzNxUrh,track_uri_spotify:track:00FROhC5g4iJdax5US8jRr,track_uri_spotify:track:00FWuEBvnitgpbMSk0XIoN,track_uri_spotify:track:00FaBIFXUfzaaVcsQmTTJY,…,album_uri_spotify:album:7zSqj9Mp9kc38v3Fiynoog,album_uri_spotify:album:7zT0DG3Lp6XIKBq7IWD3Nx,album_uri_spotify:album:7zTgzvexSIq11gZdx7xf21,album_uri_spotify:album:7zTlfidBmhUVtUmLSKk7WF,album_uri_spotify:album:7zU9CV77oLw9bpNmTsK8J1,album_uri_spotify:album:7zUBqTWtQ8TtKuXjVpLKvg,album_uri_spotify:album:7zURSbIVZ5vl3NIKkrVRvl,album_uri_spotify:album:7zVMOFzJOtIU
RZARFnVHFS,album_uri_spotify:album:7zW5IaveXyKkf5G8dnBgCW,album_uri_spotify:album:7zWf7SM34SbtjsGF8s5khN,album_uri_spotify:album:7za8qVGR3wUEJ01pLFcnPB,album_uri_spotify:album:7zbCgH5GJiEz29ZBzptDmL,album_uri_spotify:album:7zbMCm1kkmioa3Je5PGRnD,album_uri_spotify:album:7zbPztRhDnCJfShVXL2F8C,album_uri_spotify:album:7zcUlBQGskzXp3R7cWLfnt,album_uri_spotify:album:7zcYkWLSiRRuKHYY14xkGk,album_uri_spotify:album:7zcfIpDB1ScsABjule7P5f,album_uri_spotify:album:7zclfiIpH7aSuKrFrBvl1G,album_uri_spotify:album:7zdZNXoapFcOW663zgLdOE,album_uri_spotify:album:7zduRJgS6v79QmNUhKGozu,album_uri_spotify:album:7zesXMFikT4DdgkklIk3Jz,album_uri_spotify:album:7zf9kyxc3ZhJv79HrjTRir,album_uri_spotify:album:7zfyys4Or3HXPmGggCrDUY,album_uri_spotify:album:7zg9ZSOQtYLjWnvSawflg6,album_uri_spotify:album:7zgtqJ8N4kggXPQYglk3Ao,album_uri_spotify:album:7zjlLGWtmj6BCqwEyacb5z,album_uri_spotify:album:7zlMxh1NR0Shklu48L4e7x,album_uri_spotify:album:7zlwNuZ06nWHXvP22SOxMY,album_uri_spotify:album:7zmPJBzQw5k4tNN8Sxa0kp,album_uri_spotify:album:7zo6mfeFrwM2mMaV419UZr,album_uri_spotify:album:7zoQtXc7TUyHnmaC6LtjEe,album_uri_spotify:album:7zqKiMaDUYQLfoLg9uyTh6,album_uri_spotify:album:7zrqMfgjuapqMGQXUmToLW,album_uri_spotify:album:7zsKY6zTPig9e02bCn6uTY,album_uri_spotify:album:7zuqkqhGkTH3PSdywhLgY4,album_uri_spotify:album:7zxfQk44mX0eW4eWtdDlKI,album_uri_spotify:album:7zzDtMTKGpX0TrPQpLCLLr
i64,i64,i64,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
1000000,0,163809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000000,1,166848,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000000,2,232506,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000000,3,216600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000000,4,193058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
#cosine_sim_sk = pd.DataFrame(cosine_similarity(df_vectorized))

In [None]:
# Pairwise cosine similarity between all rows of the combined feature frame,
# wrapped back into a polars DataFrame.
# NOTE(review): the raw `playlist_pid`, `track_pos` and `duration_ms` values sit
# next to 0/1 indicator columns without any scaling, so they presumably dominate
# the similarity — confirm whether scaling (or dropping the identifier) is intended.
# NOTE(review): the result has one row and one column per input row; with one
# input row per track this is likely too large to materialise in memory —
# TODO confirm, or compute on a subsample / per playlist instead.
cosine_sim_sk = po.DataFrame(cosine_similarity(df_vectorized))

: 