In [1]:

import os, json
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import sqlite3
import time
import pickle


In [2]:
# oauth = SpotifyOAuth(client_id="", client_secret="", scope ="user-modify-playback-state", redirect_uri="http://localhost:8080")
# sp = spotipy.Spotify(auth_manager=oauth)

# Spotify client shared by every Web API call in this notebook.
# NOTE(review): client_id / client_secret are blank here; spotipy presumably falls
# back to the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment variables when
# they are empty -- confirm, and keep real credentials out of the notebook.
oauth = SpotifyOAuth(client_id="", client_secret="", scope ="user-modify-playback-state", redirect_uri="http://localhost:8080")
sp = spotipy.Spotify(auth_manager=oauth)
# Raw string: "\s" and "\d" are not valid escape sequences, so the plain literal
# only produced the intended backslashes by accident and warns on newer Pythons.
# The resulting value is unchanged.
data_path = r"..\spotify_million_playlist_dataset\data"

In [4]:
# # DO NOT TOUCH THIS CELL OR YOU MIGHT CLEAR THE WHOLE DATABASE
# con = sqlite3.connect("playlistDB2.db")
# cur = con.cursor()
# cur.execute("DROP TABLE IF EXISTS song")
# cur.execute("DROP TABLE IF EXISTS playlist")
# cur.execute("DROP TABLE IF EXISTS playlistsongs")
# cur.execute("CREATE TABLE song(danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, type, id, uri, track_href, analysis_url, duration_ms, time_signature)")
# cur.execute("CREATE TABLE playlist(pid, name, num_tracks)")
# cur.execute("CREATE TABLE playlistsongs(pid, uri, pos)")
# con.commit()

In [4]:
# URIs of tracks whose audio features have already been fetched.
song_cache = set()


def process_songs(to_process, playlist):
    """Fetch audio features for a batch of tracks.

    Args:
        to_process: list of track URIs handed to ``sp.audio_features``.
        playlist: unused; kept so existing callers keep working.

    Returns:
        A list with one tuple per track that came back with features. Each
        tuple holds the values of the feature dict in the order the API
        returned them. Tracks for which the API returned ``None`` are skipped.

    Side effects:
        Adds the ``uri`` of every returned track to ``song_cache``.
    """
    # Nothing to fetch: avoid sending a request with an empty id list.
    if not to_process:
        return []
    rows = []
    for features in sp.audio_features(to_process):
        if features is None:
            continue
        song_cache.add(features["uri"])
        rows.append(tuple(features.values()))
    return rows

def process_artists(to_process):
    """Fetch artist objects for a batch of artists.

    Args:
        to_process: list of artist URIs handed to ``sp.artists``.

    Returns:
        A list with one tuple per artist that was found. Each tuple holds the
        values of the artist dict in the order the API returned them. ``None``
        entries in the response are skipped.

    Side effects:
        Adds each returned artist's ``uri`` to ``song_cache`` (kept from the
        original implementation).
    """
    # Nothing to fetch: avoid sending a request with an empty id list.
    if not to_process:
        return []
    rows = []
    # sp.artists returns a dict whose "artists" entry holds the artist objects
    # (the same shape process_chunk_get_artist indexes into). Iterating the
    # dict itself would only yield its keys.
    for artist in sp.artists(to_process)["artists"]:
        if artist is None:
            continue
        song_cache.add(artist["uri"])
        rows.append(tuple(artist.values()))
    return rows
        

def process_chunk(chunk):
    """Write one dataset slice into the ``song``, ``playlist`` and ``playlistsongs`` tables.

    For every playlist in ``chunk["playlists"]`` this records the playlist row
    and one (pid, track uri, position) row per track. Tracks that are not yet
    in ``song_cache`` are sent to ``process_songs`` in batches and the returned
    feature tuples are inserted into ``song``.

    Args:
        chunk: parsed slice dict with a ``"playlists"`` list. Each playlist
            provides ``pid``, ``name``, ``num_tracks`` and ``tracks``; each
            track provides ``track_uri`` and ``pos``.

    Side effects:
        Calls ``process_songs`` (which calls the Spotify API and updates
        ``song_cache``), then inserts through the module-level cursor ``cur``
        and commits on the module-level connection ``con``. Both must already
        exist when this function is called.
    """
    batch_size = 100  # tracks handed to process_songs per call
    pending = []      # track URIs waiting for the next process_songs call
    queued = set()    # every URI queued in this chunk, so repeats are not fetched twice
    playlists = []
    songs = []
    playlist_songs = []
    playlist = None
    for playlist in chunk["playlists"]:
        playlists.append((playlist["pid"], playlist["name"], playlist["num_tracks"]))
        for song in playlist["tracks"]:
            track_uri = song["track_uri"]
            playlist_songs.append((playlist["pid"], track_uri, song["pos"]))
            # song_cache only learns about a track after its batch has been
            # processed, so tracks still waiting in the batch are checked too;
            # otherwise a repeated track would be inserted into `song` twice.
            if track_uri in song_cache or track_uri in queued:
                continue
            queued.add(track_uri)
            pending.append(track_uri)
            if len(pending) == batch_size:
                songs.extend(process_songs(pending, playlist))
                pending = []

    # Flush the final partial batch only when there is something in it.
    if pending:
        songs.extend(process_songs(pending, playlist))

    cur.executemany("INSERT INTO song VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", songs)
    cur.executemany("INSERT INTO playlist VALUES(?, ?, ?)", playlists)
    cur.executemany("INSERT INTO playlistsongs VALUES(?, ?, ?)", playlist_songs)
    con.commit()


def process_chunk_get_artist(chunk):
    """Record the artist of every new track in a slice and fetch unknown artists.

    Each track not yet present in ``song_artist_map`` is mapped to its
    ``artist_uri``. Artists that are neither in ``artist_map`` nor already
    queued for this chunk are requested from ``sp.artists`` in batches, and the
    returned artist objects are stored in ``artist_map`` keyed by their ``uri``.

    Args:
        chunk: parsed slice dict with a ``"playlists"`` list. Each playlist
            has ``tracks``; each track provides ``track_uri`` and ``artist_uri``.

    Side effects:
        Mutates the module-level dicts ``song_artist_map`` and ``artist_map``,
        which must exist before the call, and calls the Spotify API via ``sp``.
    """
    batch_size = 50   # artists requested per sp.artists call
    pending = []      # artist URIs waiting for the next request
    queued = set()    # artists already queued while walking this chunk
    fetched = []      # artist objects returned so far
    for playlist in chunk["playlists"]:
        for song in playlist["tracks"]:
            artist_uri = song["artist_uri"]
            track_uri = song["track_uri"]
            if track_uri in song_artist_map:
                continue
            song_artist_map[track_uri] = artist_uri
            if artist_uri in artist_map or artist_uri in queued:
                continue
            pending.append(artist_uri)
            queued.add(artist_uri)
            if len(pending) == batch_size:
                fetched.extend(sp.artists(pending)["artists"])
                pending = []
    if pending:
        fetched.extend(sp.artists(pending)["artists"])
    for artist in fetched:
        # Skip null entries, as process_songs does for audio features, so one
        # unknown artist does not discard everything fetched for the chunk.
        if artist is None:
            continue
        artist_map[artist["uri"]] = artist

def process_chunk_by_file_path(file):
    """Load one slice file from ``data_path`` and collect its artists.

    Args:
        file: file name of the slice, relative to the module-level ``data_path``.

    Side effects:
        Prints a start and a finish line and passes the parsed JSON to
        ``process_chunk_get_artist``.
    """
    print(f"starting chunk {file}")
    # The context manager closes the handle as soon as the JSON is parsed.
    with open(f"{data_path}/{file}", "r") as handle:
        chunk = json.load(handle)
    process_chunk_get_artist(chunk)
    print(f"finished chunk {file}")

# Notes for processing Artists

295860 artists in the dataset
295860 artists / 50 artists per API call = 5918 API calls (rounded up)

If we make an API call every 15 seconds: about 25 hours to process all artists
If we make an API call every 20 seconds: about 33 hours to process all artists

In [5]:
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` by writing a temporary file and renaming it over the target.

    An interruption while writing then leaves the previous checkpoint intact
    instead of a truncated file.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as handle:
        pickle.dump(obj, handle)
    os.replace(tmp_path, path)


# Walk every slice file, collect its artists and checkpoint both maps after each slice.
i = 0
time_start = time.time()
print(f"Loading in data from {data_path}, starting at 2AM")
# NOTE: pickle.load can execute arbitrary code; only load checkpoint files that
# this notebook wrote itself.
with open("data/song_artists.dat", "rb") as handle:
    song_artist_map = pickle.load(handle)
with open("data/artists_info.dat", "rb") as handle:
    artist_map = pickle.load(handle)
# sorted() gives a stable order across runs; os.listdir order is arbitrary.
# Counting from 1 makes the progress line report the chunks actually finished.
for i, file in enumerate(sorted(os.listdir(data_path)), start=1):
    with open(f"{data_path}/{file}", "r") as handle:
        chunk = json.load(handle)
    print(f"starting on slice {chunk['info']['slice']}")
    process_chunk_get_artist(chunk)
    del chunk  # release the parsed slice before pickling the maps
    _dump_pickle(artist_map, "data/artists_info.dat")
    _dump_pickle(song_artist_map, "data/song_artists.dat")

    print(f"Completed {i} chunks")
time_done = time.time()
print(f"done creating database in {(time_done - time_start)/60} minutes")
os.system(f'send_message "done creating database in {(time_done - time_start)/60} minutes"')

Loading in data from ..\spotify_million_playlist_dataset\data, starting at 2AM
starting on slice 0-999


In [None]:
# with ThreadPool(mp.cpu_count()) as p:
#     p.map(process_chunk_by_file_path, os.listdir(data_path))

# import pickle
# to_dump = {song: artist_map[artist] for song, artist in song_artist_map.items()}
# pickle.dump(to_dump, open("data/artists.dat", "wb"))