# Using Spotify API to get top tracks of an artist using the Spotipy library

In [1]:
import os
import time
from datetime import datetime
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

In [3]:
load_dotenv() # load environment variables

client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# spotipy setup
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Check and expand data from million song subset

In [4]:
million_songs_df = pd.read_csv('../data/clean/million_songs_df.csv')

BATCH_SIZE = 50
SLEEP_TIME = 30

# get the last processed index from existing backup file
backup_file = '../data/local/spotify_million_tracks.csv'
if os.path.exists(backup_file):
    processed_df = pd.read_csv(backup_file)
    start_index = len(processed_df)
    print(f'Resuming from index {start_index}')
else:
    start_index = 0
    print('Starting new processing')

# process in batches
for start_idx in range(start_index, len(million_songs_df), BATCH_SIZE):
    batch = million_songs_df.iloc[start_idx:start_idx + BATCH_SIZE]
    tracks_data = []
    
    print(f'\nProcessing batch {start_idx//BATCH_SIZE + 1} of {len(million_songs_df)//BATCH_SIZE + 1}')
    print(f'Processing rows {start_idx} to {min(start_idx + BATCH_SIZE, len(million_songs_df))}')
    
    for i, row in batch.iterrows():
        try:
            # search track
            results = sp.search(q=f'track:{row["song_title"]} artist:{row["artist"]}', type='track', limit=1)
            
            if results['tracks']['items']:
                track = results['tracks']['items'][0]
                
                # get genres
                artist_id = track['artists'][0]['id']
                artist_info = sp.artist(artist_id)
                genres = artist_info['genres']
                
                tracks_data.append({
                    'original_title': row['song_title'],
                    'original_artist': row['artist'],
                    'spotify_title': track['name'],
                    'spotify_artist': track['artists'][0]['name'],
                    'album': track['album']['name'],
                    'release_date': track['album']['release_date'],
                    'popularity': track['popularity'],
                    'duration_ms': track['duration_ms'],
                    'explicit': track['explicit'],
                    'album_cover': track['album']['images'][0]['url'] if track['album']['images'] else None,
                    'genres': genres if genres else None
                })
                
        except Exception as e:
            print(f'Error processing track {row["song_title"]}: {str(e)}')
            continue
    
    # save batch results
    if tracks_data:
        temp_df = pd.DataFrame(tracks_data)
        temp_df.to_csv(backup_file, 
                      mode='a', 
                      header=not os.path.exists(backup_file), 
                      index=False)
        
        timestamp = datetime.now().strftime('%H:%M:%S')
        print(f'Batch completed at {timestamp}')
        print(f'Processed {len(tracks_data)} tracks in this batch')
        print(f'Total tracks processed: {start_idx + len(tracks_data)}')
        print(f'Last track processed: {row["song_title"]}')
        print(f'Last track genres: {genres if genres else "No genres found"}')
        
        if start_idx + BATCH_SIZE < len(million_songs_df):  # don't sleep after the last batch
            print(f'Sleeping for {SLEEP_TIME} seconds...')
            print('-' * 50)
            time.sleep(SLEEP_TIME)

print(f'\nAll batches processed!')
print(f'Final total tracks processed: {start_idx + len(tracks_data)}')

Resuming from index 4915

Processing batch 99 of 201
Processing rows 4915 to 4965
Batch completed at 11:17:56
Processed 42 tracks in this batch
Total tracks processed: 4957
Last track processed: Nada Mas
Last track genres: ['big band']
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 100 of 201
Processing rows 4965 to 5015
Batch completed at 11:18:53
Processed 41 tracks in this batch
Total tracks processed: 5006
Last track processed: Impromptu
Last track genres: No genres found
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 101 of 201
Processing rows 5015 to 5065
Batch completed at 11:19:51
Processed 42 tracks in this batch
Total tracks processed: 5057
Last track processed: Buried Alive In The Blues
Last track genres: ['blues', 'classic blues', 'blues rock']
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 102 of 201
Processing rows 5065 to 5115

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:Turandot _ Act III - Scene I: Nessum dorma! artist:Maria Callas/Eugenio Fernandi/Elisabeth Schwarzkopf/Giuseppe Nessi/Nicola Zaccaria/Mario Borriello/Renato Ercolani/Piero de Palma/Giulio Mauri/Elisabetta Fusco/Pinuccia Perotti/Coro del Teatro alla Scala_ Milano/Orchestra del Teatro alla Scala_ Milano/Tul', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Query exceeds maximum length of 250 characters


Error processing track Turandot _ Act III - Scene I: Nessum dorma!: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=track%3ATurandot+_+Act+III+-+Scene+I%3A+Nessum+dorma%21+artist%3AMaria+Callas%2FEugenio+Fernandi%2FElisabeth+Schwarzkopf%2FGiuseppe+Nessi%2FNicola+Zaccaria%2FMario+Borriello%2FRenato+Ercolani%2FPiero+de+Palma%2FGiulio+Mauri%2FElisabetta+Fusco%2FPinuccia+Perotti%2FCoro+del+Teatro+alla+Scala_+Milano%2FOrchestra+del+Teatro+alla+Scala_+Milano%2FTul&limit=1&offset=0&type=track:
 Query exceeds maximum length of 250 characters, reason: None
Batch completed at 12:52:45
Processed 30 tracks in this batch
Total tracks processed: 9995
Last track processed: Souffle 2
Last track genres: No genres found

All batches processed!
Final total tracks processed: 9995
