In [25]:
import os
import json
import pandas as pd

### Preprocessing playlists dataset

In [None]:
# Load the preprocessed song features and give them a fresh 0..n-1 index,
# so that row positions and index labels coincide.
features = pd.read_pickle("features_preprocessed.pkl").reset_index(drop=True)

# Lookup table: track_id -> row position in `features`.
# If a track_id occurs more than once, the last occurrence wins.
track_id_to_index = dict(zip(features["track_id"], range(len(features))))

In [64]:
def song_match(playlist_ids, track_id_to_index):
    """Return the playlist track IDs that are present in ``track_id_to_index``.

    Args:
        playlist_ids: Iterable of track IDs taken from one playlist.
        track_id_to_index: Container of known track IDs. Membership is tested
            with ``in``, so a dict keyed by track ID or a set both work.

    Returns:
        list: The IDs from ``playlist_ids`` that were found, in their original
        order and with duplicates preserved.
    """
    # Test membership on the container itself (no redundant ``.keys()`` call),
    # and name the loop variable ``track_id`` so the builtin ``id`` is not shadowed.
    return [track_id for track_id in playlist_ids if track_id in track_id_to_index]


In [65]:
def get_playlist(directory, track_id_to_index, song_match, min_playlist_length, verbose=True):
    """Collect playlists from the JSON files in ``directory`` that have enough known tracks.

    Every regular file directly inside ``directory`` is parsed as JSON and must
    contain a top-level ``"playlists"`` list. Each playlist entry is read for
    its ``"pid"``, ``"name"`` and ``"tracks"`` keys, and each track for its
    ``"track_uri"``; the text after the last ``":"`` of the URI is used as the
    track ID.

    Args:
        directory: Path of the folder holding the playlist JSON files.
        track_id_to_index: Lookup of known track IDs, passed through to
            ``song_match`` unchanged.
        song_match: Callable ``(track_ids, track_id_to_index) -> list`` that
            returns the track IDs to keep for a playlist.
        min_playlist_length: Minimum number of kept tracks a playlist needs in
            order to be included in the result.
        verbose: When true (the default), print one line per playlist added.
            Pass ``False`` to avoid flooding the output on large datasets.

    Returns:
        dict: ``{pid: {"name": <playlist name>, "tracks": <kept track IDs>}}``.
        Files are visited in sorted name order, so insertion order is
        reproducible across runs and platforms.

    Raises:
        json.JSONDecodeError: If a file in ``directory`` is not valid JSON.
        KeyError: If a file, playlist or track lacks one of the keys read above.
    """
    playlists = {}
    # os.listdir() returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(directory)):
        file = os.path.join(directory, filename)
        # Sub-directories (and anything else that is not a regular file) are skipped.
        if not os.path.isfile(file):
            continue
        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding, and close the file as soon as it has been parsed.
        with open(file, "r", encoding="utf-8") as f:
            content = json.load(f)
        for playlist in content["playlists"]:
            # "spotify:track:<id>" -> "<id>"
            track_ids = [track["track_uri"].split(":")[-1] for track in playlist["tracks"]]
            playlist_ids_matching_feature_ids = song_match(track_ids, track_id_to_index)
            # Keep only playlists with at least `min_playlist_length` matches.
            if len(playlist_ids_matching_feature_ids) >= min_playlist_length:
                playlists[playlist["pid"]] = {
                    "name": playlist["name"],
                    "tracks": playlist_ids_matching_feature_ids,
                }
                if verbose:
                    print(f"playlist {playlist['pid']} has been added")
    return playlists

# Build the cleaned playlist collection: a playlist is kept only when at
# least 30 of its tracks are returned by song_match.
clean_playlists = get_playlist(
    "playlists_subset",
    track_id_to_index,
    song_match,
    min_playlist_length=30,
)
# Number of playlists that passed the filter.
print(len(clean_playlists))

playlist 980023 has been added
playlist 980037 has been added
playlist 980055 has been added
playlist 980063 has been added
playlist 980077 has been added
playlist 980109 has been added
playlist 980125 has been added
playlist 980132 has been added
playlist 980281 has been added
playlist 980283 has been added
playlist 980285 has been added
playlist 980291 has been added
playlist 980317 has been added
playlist 980325 has been added
playlist 980337 has been added
playlist 980372 has been added
playlist 980378 has been added
playlist 980407 has been added
playlist 980447 has been added
playlist 980456 has been added
playlist 980514 has been added
playlist 980605 has been added
playlist 980700 has been added
playlist 980712 has been added
playlist 980736 has been added
playlist 980800 has been added
playlist 980818 has been added
playlist 980825 has been added
playlist 980829 has been added
playlist 980897 has been added
playlist 980917 has been added
playlist 980949 has been added
playlist

In [66]:
# Persist the filtered playlists as pretty-printed JSON (4-space indent).
# Note: JSON object keys are always strings, so the pid keys are written as strings.
with open("clean_playlists.json", "w") as out_file:
    json.dump(clean_playlists, fp=out_file, indent=4)