# Using Spotify API to get top tracks of an artist using the Spotipy library

In [2]:
import os
import time
from datetime import datetime
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

In [6]:
load_dotenv() # load environment variables

client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# spotipy setup
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Check and expand data from million song subset

In [None]:
million_songs_df = pd.read_csv('../data/clean/million_songs_df.csv')

BATCH_SIZE = 50
SLEEP_TIME = 30

# get the last processed index from existing backup file
backup_file = '../data/local/spotify_million_tracks.csv'
if os.path.exists(backup_file):
    processed_df = pd.read_csv(backup_file)
    start_index = len(processed_df)
    print(f'Resuming from index {start_index}')
else:
    start_index = 0
    print('Starting new processing')

# process in batches
for start_idx in range(start_index, len(million_songs_df), BATCH_SIZE):
    batch = million_songs_df.iloc[start_idx:start_idx + BATCH_SIZE]
    tracks_data = []
    
    print(f'\nProcessing batch {start_idx//BATCH_SIZE + 1} of {len(million_songs_df)//BATCH_SIZE + 1}')
    print(f'Processing rows {start_idx} to {min(start_idx + BATCH_SIZE, len(million_songs_df))}')
    
    for i, row in batch.iterrows():
        try:
            # search track
            results = sp.search(q=f'track:{row["song_title"]} artist:{row["artist"]}', type='track', limit=1)
            
            if results['tracks']['items']:
                track = results['tracks']['items'][0]
                
                # get genres
                artist_id = track['artists'][0]['id']
                artist_info = sp.artist(artist_id)
                genres = artist_info['genres']
                
                tracks_data.append({
                    'original_title': row['song_title'],
                    'original_artist': row['artist'],
                    'spotify_title': track['name'],
                    'spotify_artist': track['artists'][0]['name'],
                    'album': track['album']['name'],
                    'release_date': track['album']['release_date'],
                    'popularity': track['popularity'],
                    'duration_ms': track['duration_ms'],
                    'explicit': track['explicit'],
                    'album_cover': track['album']['images'][0]['url'] if track['album']['images'] else None,
                    'genres': genres if genres else None
                })
                
        except Exception as e:
            print(f'Error processing track {row["song_title"]}: {str(e)}')
            continue
    
    # save batch results
    if tracks_data:
        temp_df = pd.DataFrame(tracks_data)
        temp_df.to_csv(backup_file, 
                      mode='a', 
                      header=not os.path.exists(backup_file), 
                      index=False)
        
        timestamp = datetime.now().strftime('%H:%M:%S')
        print(f'Batch completed at {timestamp}')
        print(f'Processed {len(tracks_data)} tracks in this batch')
        print(f'Total tracks processed: {start_idx + len(tracks_data)}')
        print(f'Last track processed: {row["song_title"]}')
        print(f'Last track genres: {genres if genres else "No genres found"}')
        
        if start_idx + BATCH_SIZE < len(million_songs_df):  # don't sleep after the last batch
            print(f'Sleeping for {SLEEP_TIME} seconds...')
            print('-' * 50)
            time.sleep(SLEEP_TIME)

print(f'\nAll batches processed!')
print(f'Final total tracks processed: {start_idx + len(tracks_data)}')

Resuming from index 1940

Processing batch 39 of 201
Processing rows 1940 to 1990
Batch completed at 12:26:11
Processed 41 tracks in this batch
Total tracks processed: 1981
Last track processed: The Imperial March from The Empire Strikes Back
Last track genres: ['soundtrack']
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 40 of 201
Processing rows 1990 to 2040


HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "track:Dialogue et final si j'avais su artist:Maurice Chevalier / Françoise Dorin / Marina Hottine / Suzanne Gabriello / Jo Charrier / Marcel Carpentier / Perrette Souplex / Andree Grandjean / Cadet Rivers / Raymond Girerd / Orchestre De Jacques-Henri Rys / Jacques Henri Rys", 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Query exceeds maximum length of 250 characters


Error processing track Dialogue et final si j'avais su: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=track%3ADialogue+et+final+si+j%27avais+su+artist%3AMaurice+Chevalier+%2F+Fran%C3%A7oise+Dorin+%2F+Marina+Hottine+%2F+Suzanne+Gabriello+%2F+Jo+Charrier+%2F+Marcel+Carpentier+%2F+Perrette+Souplex+%2F+Andree+Grandjean+%2F+Cadet+Rivers+%2F+Raymond+Girerd+%2F+Orchestre+De+Jacques-Henri+Rys+%2F+Jacques+Henri+Rys&limit=1&offset=0&type=track:
 Query exceeds maximum length of 250 characters, reason: None
Batch completed at 12:27:01
Processed 42 tracks in this batch
Total tracks processed: 2032
Last track processed: Unholy Outburst #3
Last track genres: ['minimal techno', 'electronica']
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 41 of 201
Processing rows 2040 to 2090
Batch completed at 12:27:50
Processed 40 tracks in this batch
Total tracks processed: 2080
Last track processed: Midnight
Last track genres: ['rockabilly', 'dans

