In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere beneath the Kaggle
# input directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install tqdm


In [None]:
import requests
import pandas as pd
import json
from tqdm import tqdm
import time

def get_spotify_token(client_id, client_secret, timeout=30, session=None):
    """Request an access token via Spotify's client-credentials flow.

    Args:
        client_id: Spotify application client ID (HTTP basic-auth user).
        client_secret: Spotify application client secret (HTTP basic-auth
            password).
        timeout: Seconds to wait for the token endpoint before giving up, so
            a stalled connection cannot hang the caller indefinitely.
        session: Optional object exposing a ``requests``-compatible
            ``post`` method (e.g. a ``requests.Session``). Defaults to the
            ``requests`` module itself.

    Returns:
        The ``access_token`` string from the JSON response.

    Raises:
        RuntimeError: If the response carries no ``access_token``.
        Exception: Whatever ``raise_for_status()`` raises for an HTTP error
            response (``requests.HTTPError`` when using ``requests``).
    """
    http = requests if session is None else session
    url = "https://accounts.spotify.com/api/token"
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {'grant_type': 'client_credentials'}
    response = http.post(
        url,
        headers=headers,
        data=payload,
        auth=(client_id, client_secret),
        timeout=timeout,
    )
    # Surface bad credentials / server errors here instead of handing back
    # None, which would otherwise be sent later as "Bearer None".
    response.raise_for_status()
    token = response.json().get('access_token')
    if not token:
        raise RuntimeError("Spotify token response did not contain an access_token")
    return token

def safe_request(url, headers, max_retries=5, retry_delay=1, timeout=30, session=None):
    """GET ``url`` and return its decoded JSON body, retrying on HTTP 429.

    Args:
        url: Address to request.
        headers: Mapping of HTTP headers sent with every attempt.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait after a 429 that carries no usable
            ``Retry-After`` header; doubled after each rate-limited attempt.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot block forever.
        session: Optional object exposing a ``requests``-compatible ``get``
            method (e.g. a ``requests.Session``). Defaults to the
            ``requests`` module itself.

    Returns:
        The parsed JSON body of the first 200 response.

    Raises:
        RuntimeError: If every attempt was rate limited, or the server
            answered with a non-error status other than 200.
        Exception: Whatever ``raise_for_status()`` raises for a 4xx/5xx
            response other than 429 (``requests.HTTPError`` with
            ``requests``).
    """
    http = requests if session is None else session
    for attempt in range(max_retries):
        response = http.get(url, headers=headers, timeout=timeout)
        status = response.status_code
        if status == 200:
            return response.json()
        if status != 429:
            response.raise_for_status()
            # raise_for_status() is silent for 1xx-3xx codes; fail loudly
            # rather than re-requesting until the retry budget runs out.
            raise RuntimeError(f"Unexpected status code {status} for {url}")

        # Rate limited. Sleeping after the final attempt would only delay
        # the error below, so stop here.
        if attempt == max_retries - 1:
            break
        try:
            retry_after = int(response.headers.get('Retry-After', retry_delay))
        except (TypeError, ValueError):
            # Retry-After may also be an HTTP date; fall back to our backoff.
            retry_after = retry_delay
        retry_after = max(retry_after, 0)  # time.sleep rejects negatives
        print(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
        if retry_after > 3600:  # If the wait time is longer than an hour
            print(f"Long wait detected. Pausing and resuming after {retry_after} seconds.")
        time.sleep(retry_after)
        retry_delay *= 2  # Back off further if the next 429 lacks Retry-After
    raise RuntimeError("Max retries exceeded for API request")

def get_artist_details(artist_id, token):
    """Look up an artist on the Spotify Web API and summarise their genres.

    Args:
        artist_id: Spotify artist ID interpolated into the ``/v1/artists/``
            URL.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        A one-key dict ``{'artist_genre': ...}``. The value is the artist's
        genres joined with ``', '``, or ``'No Genre'`` when the response has
        no non-empty ``genres`` entry or the lookup raised.
    """
    endpoint = f"https://api.spotify.com/v1/artists/{artist_id}"
    auth_headers = {'Authorization': f'Bearer {token}'}
    try:
        payload = safe_request(endpoint, auth_headers)
        genres = payload['genres'] if 'genres' in payload else None
        genre_label = ', '.join(genres) if genres else 'No Genre'
    except Exception as err:
        print(f"Error fetching artist details: {err}")
        # Best-effort lookup: any failure degrades to the placeholder genre.
        genre_label = 'No Genre'
    return {'artist_genre': genre_label}

def get_track_details(track_id, token):
    """Fetch a track from the Spotify Web API and flatten it into one record.

    The genre of the track's first listed artist is obtained through
    ``get_artist_details`` and merged into the record.

    Args:
        track_id: Spotify track ID interpolated into the ``/v1/tracks/`` URL.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        A dict with the keys ``song_title``, ``song_id``,
        ``track_popularity``, ``track_duration``, ``explicit``,
        ``artist_name``, ``artist_id``, ``artist_genre``, ``album_id``,
        ``album_type`` and ``release_date``; or an empty dict if anything
        raised while fetching or unpacking the response.
    """
    endpoint = f"https://api.spotify.com/v1/tracks/{track_id}"
    auth_headers = {'Authorization': f'Bearer {token}'}
    try:
        track = safe_request(endpoint, auth_headers)
        # Only the first listed artist is used for name, ID and genre.
        lead_artist = track['artists'][0]
        genre_info = get_artist_details(lead_artist['id'], token)
        details = {
            'song_title': track['name'],
            'song_id': track['id'],
            'track_popularity': track['popularity'],
            'track_duration': track['duration_ms'],
            'explicit': track['explicit'],
            'artist_name': lead_artist['name'],
            'artist_id': lead_artist['id'],
            'artist_genre': genre_info['artist_genre'],
        }
        album = track['album']
        details['album_id'] = album['id']
        details['album_type'] = album['album_type']
        details['release_date'] = album['release_date']
    except Exception as err:
        print(f"Error fetching track details: {err}")
        return {}  # Callers treat an empty dict as "no data for this track"
    return details

def get_audio_features(track_id, token):
    """Fetch the audio-features record for a track from the Spotify Web API.

    Args:
        track_id: Spotify track ID interpolated into the
            ``/v1/audio-features/`` URL.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        Whatever ``safe_request`` returns for the endpoint (the decoded JSON
        body), or an empty dict if the request raised.
    """
    endpoint = f"https://api.spotify.com/v1/audio-features/{track_id}"
    auth_headers = {'Authorization': f'Bearer {token}'}
    try:
        return safe_request(endpoint, auth_headers)
    except Exception as err:
        print(f"Error fetching audio features: {err}")
    # Reached only after a failure: signal "no data" with an empty dict.
    return {}

import os

# Spotify API credentials. Values from the SPOTIFY_CLIENT_ID /
# SPOTIFY_CLIENT_SECRET environment variables take precedence.
# SECURITY NOTE(review): the literal fallbacks below are secrets committed to
# the notebook. Rotate them, supply the credentials through the environment
# (or Kaggle Secrets), and then delete the fallbacks.
client_id = os.environ.get('SPOTIFY_CLIENT_ID', '80d18beea941408b84e4040548a13895')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET', '46ee063bd0f244ec99e6639500e842f2')

# Client-credentials tokens are short-lived (one hour per Spotify's docs), and
# a full run over many batches lasts longer, so the token is renewed before
# it can expire mid-run.
TOKEN_REFRESH_SECONDS = 50 * 60

token = get_spotify_token(client_id, client_secret)
if not token:
    # Without this check every later request would be sent as "Bearer None"
    # and each track would be silently reported as missing data.
    raise RuntimeError("Could not obtain a Spotify access token; check the client credentials.")
token_fetched_at = time.monotonic()

# Load track IDs from JSON file
with open('/kaggle/input/spotify-songs-id/songs.json', encoding='utf-8') as file:
    track_ids = json.load(file)

BATCH_SIZE = 500      # tracks per output CSV
START_INDEX = 20000   # first track index to process; presumably a resume point
                      # from an earlier partial run -- confirm before reuse

# Process tracks in batches, writing one CSV per batch
for i in range(START_INDEX, len(track_ids), BATCH_SIZE):
    batch = track_ids[i:i + BATCH_SIZE]
    batch_number = i // BATCH_SIZE + 1
    batch_data = []
    print(f"Starting batch {batch_number}...")
    for index, tid in enumerate(tqdm(batch, desc=f"Fetching batch {batch_number}"), start=1):
        # Renew the token once it is close to expiring.
        if time.monotonic() - token_fetched_at >= TOKEN_REFRESH_SECONDS:
            try:
                refreshed_token = get_spotify_token(client_id, client_secret)
            except Exception as e:
                # Keep the old token so data already collected for this batch
                # is not lost to a transient auth failure.
                print(f"Warning: could not refresh Spotify token: {e}")
                refreshed_token = None
            if refreshed_token:
                token = refreshed_token
                token_fetched_at = time.monotonic()

        track_details = get_track_details(tid, token)
        audio_features = get_audio_features(tid, token)
        if track_details and audio_features:
            # Audio-feature keys win on any name clash with track details.
            combined_data = {**track_details, **audio_features}
            batch_data.append(combined_data)
        else:
            print(f"Warning: Missing data for track ID {tid}")

        if index % 20 == 0:  # After every 20 songs, add a delay
            time.sleep(1)  # Sleep for 1 second (adjust based on your needs)

    if batch_data:
        df = pd.DataFrame(batch_data)
        csv_filename = f'spotify_data_{batch_number}.csv'
        df.to_csv(csv_filename, index=False)
        print(f"Batch {batch_number} complete and saved to {csv_filename}.")
    else:
        print(f"No data to save for batch {batch_number}.")
