In [8]:
import json
import os
from os import path, listdir
from sys import getsizeof
import spotipy
from spotipy import SpotifyClientCredentials
import pickle
from tqdm.notebook import tqdm
import pandas as pd

### Initialize the Spotify client with the client ID and secret

In [60]:
# Read the Spotify API credentials from the local JSON config file.
with open("configs/spotify_config.json", "r") as f:
    SPOTIFY_CONFIG = json.loads(f.read())

client_id = SPOTIFY_CONFIG['client_id']
client_secret = SPOTIFY_CONFIG['client_secret']

# Build the API client, authenticating through SpotifyClientCredentials.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
)

### Process genres


#### First: collect tracks from playlists

In [10]:
# `genres` is later indexed by genre name and each value is passed to embed_to_url().
with open("filtered_genres_playlist_fin.json", "r") as f:
    genres = json.loads(f.read())

In [11]:
def embed_to_url(embed_url):
    """Return the text that follows the first ``"playlist:"`` marker in ``embed_url``.

    Despite its name, the function returns the bare playlist id, not a URL.
    If a second ``"playlist:"`` marker is present, the result stops right before it.

    Raises:
        IndexError: if ``embed_url`` does not contain ``"playlist:"``.
    """
    pieces = embed_url.split("playlist:")
    return pieces[1]


In [12]:
# Accumulators filled by the download loop:
#   to_playlist_db: genre name  -> playlist-level info
#   to_tracks_db:   playlist id -> {track id -> track-level info}
to_playlist_db, to_tracks_db = {}, {}

In [28]:
# Resumable download of playlist and track metadata.
#
# "genres_tracker.pickle" stores the genres that still have to be fetched, so an
# interrupted run continues where it stopped instead of starting over.
# NOTE: pickle.load must only be used on files written by this notebook.
if path.exists("genres_tracker.pickle"):
    with open("genres_tracker.pickle", "rb") as f:
        genres_to_download = pickle.load(f)
    # When resuming, also restore what earlier runs collected. Otherwise the
    # checkpoints written below would overwrite the previous results with only
    # the genres downloaded in this session.
    for db, db_file in ((to_playlist_db, "to_playlist_db.pickle"),
                        (to_tracks_db, "to_tracks_db.pickle")):
        if path.exists(db_file):
            with open(db_file, "rb") as f:
                db.update(pickle.load(f))
else:
    genres_to_download = list(genres.keys())

# Separator used to flatten lists of artist names / ids into a single string.
ARTIST_SEPARATOR = ':artist_custom_separator:'

# Iterate over a copy: `genres_to_download` is shrunk while looping.
genres_iter = genres_to_download.copy()
for genre_to_download in tqdm(genres_iter):

    # playlist-level info
    playlist_id = embed_to_url(genres[genre_to_download])
    pl = sp.playlist(playlist_id)
    total_followers = pl['followers']['total']
    total_tracks = pl['tracks']['total']
    to_playlist_dict = {"playlist_id": playlist_id,
                        "genre": genre_to_download,
                        "total_followers": total_followers,
                        "total_tracks": total_tracks}

    # The playlist payload only carries the first page of tracks; follow the
    # paging links so that long playlists are not silently truncated.
    page = pl['tracks']
    tracks_boxes = list(page['items'])
    while page.get('next'):
        page = sp.next(page)
        tracks_boxes.extend(page['items'])

    # track-level info
    playlist_tracks = {}
    for track_box in tracks_boxes:
        track = track_box['track']
        if track is None:
            print("Found non-track")
            continue
        track_id = track['id']
        album_box = track['album']

        playlist_tracks[track_id] = {
            # album info
            "album_type": album_box['album_type'],
            "album_id": album_box['id'],
            "album_name": album_box['name'],
            # key spelling ("data") kept as-is: it becomes a column name downstream
            "album_release_data": album_box['release_date'],
            "album_artists_names": ARTIST_SEPARATOR.join(
                [i['name'] for i in album_box['artists']]
            ),
            "album_artists_ids": ARTIST_SEPARATOR.join(
                [i['id'] for i in album_box['artists']]
            ),
            "album_total_tracks": album_box['total_tracks'],
            # artist info
            "track_artists_names": ARTIST_SEPARATOR.join(
                [i['name'] for i in track['artists']]
            ),
            "track_artists_ids": ARTIST_SEPARATOR.join(
                [i['id'] for i in track['artists']]
            ),
            # track info
            "track_name": track['name'],
            "track_id": track['id'],
            "track_type": track['type'],
            "track_duration_ms": track['duration_ms'],
            "track_explicit": track['explicit'],
            "track_popularity": track['popularity'],
        }

    to_tracks_dict = {playlist_id: playlist_tracks}
    to_playlist_db[genre_to_download] = to_playlist_dict
    to_tracks_db.update(to_tracks_dict)
    # `genres_iter` is a copy of `genres_to_download`, so the genre that was
    # just processed is always the first remaining element.
    genres_to_download.pop(0)

    # Checkpoint after every genre. The data is saved before the tracker so a
    # crash between the writes can only cause a re-download, never a genre that
    # is marked as done but missing from the saved data.
    with open("to_playlist_db.pickle", "wb") as f:
        pickle.dump(to_playlist_db, f)

    with open("to_tracks_db.pickle", "wb") as f:
        pickle.dump(to_tracks_db, f)

    with open("genres_tracker.pickle", "wb") as f:
        pickle.dump(genres_to_download, f)

        
    

  0%|          | 0/1430 [00:00<?, ?it/s]

Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track
Found non-track


#### Second: collect more track-level info

In [29]:
# Reload the track metadata checkpoint written by the playlist download loop.
# NOTE: only unpickle files produced by this notebook.
with open("to_tracks_db.pickle", "rb") as f:
    to_tracks_db = pickle.loads(f.read())

In [30]:
# Start from the features saved by a previous run when that file exists in the
# working directory; otherwise begin with an empty mapping.
if "to_tracks_db_second.pickle" not in listdir():
    to_tracks_db_second = {}
else:
    with open("to_tracks_db_second.pickle", "rb") as f:
        to_tracks_db_second = pickle.load(f)

In [31]:
def split_into_chunks(iterable_, size):
    """Split a sliceable sequence into consecutive chunks of at most ``size`` items.

    Args:
        iterable_: a sequence supporting ``len()`` and slicing (e.g. a list).
        size: maximum number of items per chunk; must be positive.

    Returns:
        A list of slices of ``iterable_`` in the original order. Every chunk has
        ``size`` items except possibly the last one. A sequence shorter than
        ``size`` yields a single chunk and an empty sequence yields ``[]``
        (previously ``None`` was returned in both cases, which dropped the data
        and broke callers that iterate over the result).

    Raises:
        ValueError: if ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    return [iterable_[start:start + size] for start in range(0, len(iterable_), size)]

In [69]:
# Resumable download of audio features, 100 tracks per request.
#
# "feature_chunks.pickle" stores the chunks that still have to be fetched, so
# the cell can simply be re-run after an interruption (e.g. an HTTP 429).
# NOTE: pickle.load must only be used on files written by this notebook.
if path.exists("feature_chunks.pickle"):
    with open("feature_chunks.pickle", "rb") as f:
        chunks = pickle.load(f)
else:
    to_download = [
        [playlist_id, track_id]
        for playlist_id, playlist_tracks in to_tracks_db.items()
        for track_id in playlist_tracks
    ]
    # Fall back to a single chunk (or none) if the helper returns nothing for
    # inputs shorter than the chunk size.
    chunks = split_into_chunks(to_download, 100) or ([to_download] if to_download else [])

# Iterate over `chunks` while shrinking the copy that is checkpointed.
chunks_ready = chunks.copy()

for chunk in tqdm(chunks):
    # Ask only for ids that can be queried and are not known yet: the same
    # track may appear in several playlists, and every avoided request helps
    # to stay below the API rate limit.
    track_ids = [
        track_id
        for track_id in dict.fromkeys(pair[1] for pair in chunk)
        if track_id and track_id not in to_tracks_db_second
    ]
    features_data = sp.audio_features(track_ids) if track_ids else []

    # The API answers with None for tracks that have no audio features.
    for feature_set in features_data or []:
        if feature_set is None:
            continue
        to_tracks_db_second[feature_set['id']] = feature_set
    chunks_ready.pop(0)

    # Save the data before the tracker so that a crash between the two writes
    # can only cause a chunk to be fetched again, never to be lost.
    with open("to_tracks_db_second.pickle", "wb") as f:
        pickle.dump(to_tracks_db_second, f)

    with open("feature_chunks.pickle", "wb") as f:
        pickle.dump(chunks_ready, f)

  0%|          | 0/939 [00:00<?, ?it/s]

Max Retries reached


SpotifyException: http status: 429, code:-1 - /v1/audio-features/?ids=4JNyYp4SxcUAaB8XeVAeHM,0ci44NVZuqzQkjEuJU59lI,0oAR9pJWkiF0bR7PIYd2es,6444OoiysqF9oMUDqeNuT1,7xGIDrber9QNV6sGkXjbU0,56zmbTpGNq5xEFS78Mdubg,26Kf69sxU5WEI9OHLqTrJV,2bh5oo3PkIyd1MVK75dT37,4T4SRHYqyQYo5r00F8Q6pO,3yKQMKSulgVjQv0k7s6XyG,2KMtjCV6Cn9ssl5qOPgOet,6vgbEyGrXdvgQlXdLRKB2B,5FVgyqg76qynbtdVZcTGPS,6JM9oYKfqsv97YiT2HEWBO,49QNhxYpRLNzlYZJROTj2x,7umLQxLHvNgepDa9f6IwUv,5SPn4sYvYjpGQbLSFBR7vr,2qV5SPdswb3sXUOhaRJBNv,5P15cm1CgK7tz9mA66SXaT,5KOvIodYN0RbZxsT9gtHtw,1XcM9HxA8vcYipa3bmwUrP,5IxMjIjxa91QF8pyCxAJvI,0W9ZHmgWTYZyBp5Z3w0Wan,5BQDxbhpYU5b5cdYkWdRFQ,73owgYvspkDWiLwLDxloxn,7FKH8cRBqFjgxfArnJLqaW,20haAkyAekQ96roINFIANm,7LDq2cE0TQV66BXorbV8FD,0MXZ9WpcuHxmwlH5qD4MaM,1Y1JmZx6PxjGwTA2WihtJS,2YwJMorvqqGcVWwjbvl6jN,4EAOZzQ8FAwlMHpRmI1Um3,0FerzWnhUhZdFLnrpsqAiY,6LMjSt9vwT4Q827avd4y8E,1csCF4ujk4DN4swpMxZPdG,04DVF065NOx5wRtYeOP2OH,36vDa8TbcpXawdaYh1IWVm,3B0hzwc1e8AYOytj9hZS2I,0nn1gOjGZjObCNySWVY7fZ,6YLWkUzdBzCtXIGngc1ROj,7Jj9ygPtg5IzRzX9cfeI80,7yLzGp7vC3qg5FlBmUa8Rg,2ejS1kysr5TPQNGSOgzkdA,4rk0u76lg4D0vVLSeTooNJ,3CmrB7T7Lc9crsdvS8q19s,0P6soDF1syQ592sUCseXHe,26TzNAUDOizTv0EJMDpUWS,5K9ka2I5o2T0tmFSpysNqW,0knAHKgoYTNdA6rJTvc4sg,3IIyTdTmKfL4b3U1qOrBTo,4laidm7QyhsId0XX3RQ8Hk,3Cv5sKQDMRfx1Py7gcT2z1,6Dh0TW3rcPn4UD6F7VBgnO,2dxwOjX0uo5gPG7JTUKZW4,5F0SeFoEvGi64888rJz0HK,5XKvea6ua8cPNxSAFJLB7e,7nulQV4GAPHJmqWzEGDdHd,1GVA3pLyw3CuOBDuFO5YNI,5yA7lbHGaER16zBVxyD5ET,6RyIiYvJLW4ak6IXN1eEQp,24wPUgmnaMWgGLz6XhS60g,1mD0LxzCdfathf6j7xMp7b,0uKx0kvuosOccVNVnar1Ka,3eKoLbxkNhAINXr22jjv7E,2LfwFEFF0FqKXFtX0GHXcH,2CLWYxRZ6wSitCaLCM15CF,656fu7zieCVPjpYFA7Hcqj,0ihD0DZaMuLHP3fMZBmGt1,4gVdmXgR8Lp56NonH5LCY6,1zz2Q0OgNChEJX10pkuMzf,1XzGZGwTh8KmFDwexdfdqw,0qRr1jzv5QbUktvWeZUuvs,3ADiFCMXs91ZyEBSWIiAK6,5UrCArd6UDDR2EPFOeWCxb,7IlvADoEl6VZwvyxPRg3GL,78WZ1kSG1kSWEURNhTGzWt,5Sa9ZJCkpSYgLcCYHOCKpN,0Fq3H0W5mxnSEeWDkCpJlP,3Xfg7AegXaDLoD5GOUMf2e,6ldfbljsXS4kSuGxSvJTm1,2YsLvdm387xBKdmVMVZ1q2,5dsu6BWxTobtUuwmyJg07w,2OUBPoxJ91IIdpPZiVW5X8,3LQNvkeBZlYiXEbzhW6QM
M,6zY7DGYzGrkniOU1PxaK4h,6DwogjaSBKSMcd1ahmAUSW,5Jaf4NzAvm7SkvOZKNOZRl,2aTuHt6TPwZkT0079MzPHV,0J9xyNtSafUCjLksYrwFqI,3lEzTG0DV4pKjA5ZZiY0C9,4sDf4MGHF1GF76LkmLn2HV,4qJrDTPDsQqaYGKMj0Thjk,08aqoT82lbhlpCqnMWYb0y,6suOpuXcFt3jte5LsRP6J6,3ESt9OYwANn87h5C6sIFDU,3OOIKPMVqlCALvphXtAPkp,2Zg6z0XddeBbb7ceZBcPdR,0ByC7DPj6qJK3FfcpNoWWg,7seuJCMBqkg1ayg1S7b5hE,46XKwgC2uZZWRrIdUcVpGg:
 Max Retries, reason: too many 429 error responses

### Merge the collected data into a single table

In [64]:
# Build one flat record per (playlist, track) pair that has audio features.
to_df = []

# Position of every track in `to_tracks_db_second`; used to emit the rows of a
# playlist in the same order as a full scan of `to_tracks_db_second` would.
feature_rank = {track_id: pos for pos, track_id in enumerate(to_tracks_db_second)}

for genre_name, playlist_info in to_playlist_db.items():
    playlist_id = playlist_info['playlist_id']
    playlist_tracks = to_tracks_db.get(playlist_id)
    if playlist_tracks is None:
        # no track data was collected for this playlist
        continue

    # Only look at the tracks of this playlist instead of scanning every known
    # track per genre and relying on exceptions to filter out the misses.
    matched_ids = sorted(
        (track_id for track_id in playlist_tracks if track_id in feature_rank),
        key=feature_rank.__getitem__,
    )
    for track_id in matched_ids:
        features = to_tracks_db_second[track_id]
        if features is None:
            continue
        # Later sources win on duplicate keys: track info < features < playlist info.
        to_df.append({**playlist_tracks[track_id], **features, **playlist_info})
    

In [65]:
# One row per record collected in `to_df`.
df = pd.DataFrame(data=to_df)

In [68]:
# Persist the final DataFrame next to the notebook.
with open("tracks_df.pickle", "wb") as f:
    f.write(pickle.dumps(df))