# Using Spotify API to get top tracks of an artist using the Spotipy library

In [6]:
import os
import time
from datetime import datetime
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

In [7]:
load_dotenv() # load environment variables

client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# spotipy setup
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Check and expand data from million song subset

In [None]:
million_songs_df = pd.read_csv('../data/clean/million_songs_df.csv')

BATCH_SIZE = 50
SLEEP_TIME = 30

# get the last processed index from existing backup file
backup_file = '../data/local/spotify_million_tracks.csv'
if os.path.exists(backup_file):
    processed_df = pd.read_csv(backup_file)
    start_index = len(processed_df)
    print(f'Resuming from index {start_index}')
else:
    start_index = 0
    print('Starting new processing')

# process in batches
for start_idx in range(start_index, len(million_songs_df), BATCH_SIZE):
    batch = million_songs_df.iloc[start_idx:start_idx + BATCH_SIZE]
    tracks_data = []
    
    print(f'\nProcessing batch {start_idx//BATCH_SIZE + 1} of {len(million_songs_df)//BATCH_SIZE + 1}')
    print(f'Processing rows {start_idx} to {min(start_idx + BATCH_SIZE, len(million_songs_df))}')
    
    for i, row in batch.iterrows():
        try:
            # search track
            results = sp.search(q=f'track:{row["song_title"]} artist:{row["artist"]}', type='track', limit=1)
            
            if results['tracks']['items']:
                track = results['tracks']['items'][0]
                
                # get genres
                artist_id = track['artists'][0]['id']
                artist_info = sp.artist(artist_id)
                genres = artist_info['genres']
                
                tracks_data.append({
                    'original_title': row['song_title'],
                    'original_artist': row['artist'],
                    'spotify_title': track['name'],
                    'spotify_artist': track['artists'][0]['name'],
                    'album': track['album']['name'],
                    'release_date': track['album']['release_date'],
                    'popularity': track['popularity'],
                    'duration_ms': track['duration_ms'],
                    'explicit': track['explicit'],
                    'album_cover': track['album']['images'][0]['url'] if track['album']['images'] else None,
                    'genres': genres if genres else None
                })
                
        except Exception as e:
            print(f'Error processing track {row["song_title"]}: {str(e)}')
            continue
    
    # save batch results
    if tracks_data:
        temp_df = pd.DataFrame(tracks_data)
        temp_df.to_csv(backup_file, 
                      mode='a', 
                      header=not os.path.exists(backup_file), 
                      index=False)
        
        timestamp = datetime.now().strftime('%H:%M:%S')
        print(f'Batch completed at {timestamp}')
        print(f'Processed {len(tracks_data)} tracks in this batch')
        print(f'Total tracks processed: {start_idx + len(tracks_data)}')
        print(f'Last track processed: {row["song_title"]}')
        print(f'Last track genres: {genres if genres else "No genres found"}')
        
        if start_idx + BATCH_SIZE < len(million_songs_df):  # don't sleep after the last batch
            print(f'Sleeping for {SLEEP_TIME} seconds...')
            print('-' * 50)
            time.sleep(SLEEP_TIME)

print(f'\nAll batches processed!')
print(f'Final total tracks processed: {start_idx + len(tracks_data)}')

Starting new processing

Processing batch 1 of 201
Processing rows 0 to 50
Batch completed at 10:59:08
Processed 46 tracks in this batch
Total tracks processed: 46
Last track processed: Washington Post 
Last track genres: No genres found
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 2 of 201
Processing rows 50 to 100
Batch completed at 10:59:59
Processed 45 tracks in this batch
Total tracks processed: 95
Last track processed: From My Hands
Last track genres: ['deathcore', 'death metal', 'metalcore']
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 3 of 201
Processing rows 100 to 150
Batch completed at 11:00:50
Processed 42 tracks in this batch
Total tracks processed: 142
Last track processed: Bubblin' 
Last track genres: ['east coast hip hop', 'jazz rap']
Sleeping for 30 seconds...
--------------------------------------------------

Processing batch 4 of 201
Processing rows 150 to 200
Batch