In [56]:
import base64
import copy
import glob
import json
import os
import time
from collections import defaultdict

import pandas as pd
import requests
import spotipy
import tqdm
from bs4 import BeautifulSoup
from requests import post, get
from spotipy.oauth2 import SpotifyClientCredentials

In [57]:
# Spotify API credentials.
# They are read from the environment (same variable names spotipy uses) so the
# secrets never have to be typed into, or saved with, the notebook. When a
# variable is not set the value falls back to an empty string.

# Client ID
cid = os.environ.get("SPOTIPY_CLIENT_ID", "")

# Client secret
secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "")

In [58]:
# Collect the names of the JSON files scraped from Rate Your Music,
# grouped by chart category: all_paths[category] -> list of file paths
rating = ['top','bottom','popular','esoteric','diverse']
path = "data/rate_your_music_data/{}/"
all_paths = {}
for order in rating:
    # Folder holding this category's files
    temp = path.format(order)
    # Every *.json file in that folder, in the order glob yields them
    all_paths[order] = list(glob.glob(f"{temp}*.json"))

In [59]:
# Getting the authorization token
def get_token():
    """Request an access token from Spotify using the client-credentials grant.

    The module-level ``cid`` and ``secret`` are joined as ``cid:secret``,
    base64-encoded and sent as HTTP Basic authorization to the token endpoint.

    Returns:
        str: the ``access_token`` field of the JSON response.

    Raises:
        requests.HTTPError: if the token endpoint answers with an error status
            (for example when the credentials are empty or wrong).
        requests.Timeout: if the endpoint does not answer within 30 seconds.
    """
    auth_string = cid + ":" + secret
    auth_base64 = base64.b64encode(auth_string.encode("utf-8")).decode("utf-8")

    url = "https://accounts.spotify.com/api/token"

    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {"grant_type":"client_credentials"}

    # A timeout keeps a stalled connection from hanging the whole run
    result = post(url, headers = headers, data = data, timeout = 30)

    # Surface authentication/server errors here instead of as an opaque
    # KeyError on 'access_token' below
    result.raise_for_status()

    res = json.loads(result.content)
    return res["access_token"]

In [60]:
# Storing the token in the module-level `token` that the api_* functions read
token = get_token()

In [61]:
# Function to get the authentication headers
def get_auth_header(token):
    """Build the request header dict that carries a bearer token.

    Args:
        token: access token string.

    Returns:
        dict: a single ``Authorization`` entry whose value is ``"Bearer "``
        followed by the token.
    """
    bearer_value = "Bearer " + token
    return {"Authorization": bearer_value}

In [62]:
# Function to get the URIs of an album and its artist
def api_get_uri(album, artist):
    """Search Spotify for an album and return its album and artist URIs.

    Uses the module-level ``token`` for authorization and asks the search
    endpoint for at most one album matching ``album:<album> artist:<artist>``.

    Args:
        album: album title to search for.
        artist: artist name to search for.

    Returns:
        tuple: ``(album_uri, artist_uri)`` taken from the first search hit
        (the artist URI is that of the hit's first listed artist);
        ``(None, None)`` when the search returns no album;
        ``(-1, -1)`` when Spotify answers 429 (rate limit reached).

    Raises:
        requests.HTTPError: for any other error status (e.g. an expired token).
    """
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)

    # Let requests build and percent-encode the query string: names containing
    # characters such as '&', '#', '+' or '%' would otherwise corrupt the URL
    params = {
        "q": f"album:{album} artist:{artist}",
        "type": "album",
        "limit": 1,
    }
    result = get(url, headers = headers, params = params)

    # Returns -1 if rate limit of spotify is hit
    if result.status_code == 429:
        print("Limit Reached")
        return -1, -1

    # Any other failure is raised explicitly rather than surfacing as a
    # KeyError on the missing 'albums' field
    result.raise_for_status()

    items = json.loads(result.content)['albums']['items']
    if not items:
        return None, None

    first_hit = items[0]
    return first_hit['uri'], first_hit['artists'][0]['uri']
    
    
    

In [63]:
# Function to get artist data
def api_get_artist_data(uri):
    """Fetch the Spotify artist record for an artist URI.

    The artist id is the part of ``uri`` after the last ``:``. The request is
    authorized with the module-level ``token``.

    Args:
        uri: artist URI string.

    Returns:
        The decoded JSON body of the response, or ``-1`` when Spotify answers
        429 (rate limit reached).
    """
    artist_id = uri.rsplit(":", 1)[-1]
    endpoint = "https://api.spotify.com/v1/artists/" + artist_id
    response = get(endpoint, headers = get_auth_header(token))

    # Rate limit hit: signal it to the caller with the -1 sentinel
    if response.status_code == 429:
        print("Limit Reached")
        return -1

    return json.loads(response.content)

    

In [64]:
# Function to get track features
def api_get_audio_feat(uri):
    """Fetch the audio-features record for a track URI.

    Sleeps for ``sleeper['feat']`` seconds before the request; when Spotify
    answers 429 that delay is raised by 0.5 for subsequent calls.

    Args:
        uri: track URI string; the track id is the part after the last ``:``.

    Returns:
        The decoded JSON body of the response, or ``-1`` when Spotify answers
        429 (rate limit reached).
    """
    track_id = uri.rsplit(":", 1)[-1]
    endpoint = "https://api.spotify.com/v1/audio-features/" + track_id
    auth = get_auth_header(token)

    # Throttle before hitting the endpoint
    time.sleep(sleeper['feat'])
    response = get(endpoint, headers = auth)

    if response.status_code == 429:
        print("Limit Reached")
        # Back off a little more on the next call
        sleeper['feat'] += 0.5
        return -1

    return json.loads(response.content)

In [65]:
# Function to get track data
def api_get_track_data(uri):
    """Fetch the Spotify track record for a track URI.

    Sleeps for ``sleeper['tracks']`` seconds before the request; when Spotify
    answers 429 that delay is raised by 0.5 for subsequent calls.

    Args:
        uri: track URI string; the track id is the part after the last ``:``.

    Returns:
        The decoded JSON body of the response, or ``-1`` when Spotify answers
        429 (rate limit reached).
    """
    track_id = uri.rsplit(":", 1)[-1]
    endpoint = "https://api.spotify.com/v1/tracks/" + track_id
    auth = get_auth_header(token)

    # Throttle before hitting the endpoint
    time.sleep(sleeper['tracks'])
    response = get(endpoint, headers = auth)

    if response.status_code == 429:
        print("Limit Reached")
        # Back off a little more on the next call
        sleeper['tracks'] += 0.5
        return -1

    return json.loads(response.content)

In [66]:
# Function to get tracks of each album and its features
def api_get_album_tracks(uri):
    """Return one dict per track of an album, enriched with track data and audio features.

    The album's track listing is requested 50 tracks at a time and the paging
    ``next`` link is followed until it is exhausted, so albums with more than
    50 tracks are returned in full. For every track, ``api_get_track_data`` and
    ``api_get_audio_feat`` are called to add popularity, duration and audio
    features.

    Args:
        uri: album URI string; the album id is the part after the last ``:``.

    Returns:
        list[dict]: one dict per track with the keys
        ``'spotify track uri'``, ``'spotify track name'``,
        ``'spotify track number'``, ``'spotify disc number'``,
        ``'spotify track popularity'``, ``'spotify track duration'`` and
        ``'spotify track features'``. Popularity, duration and features are
        ``None`` when the corresponding call was rate limited or did not
        return the expected fields.
        ``-1`` is returned instead when the track-listing request itself
        answers 429 (rate limit reached).

    Raises:
        KeyError: if the track-listing response has no ``'items'`` field
            (for example an error payload).
    """
    feature_keys = (
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'tempo',
        'valence',
    )

    headers = get_auth_header(token)
    id_ = uri.split(":")[-1]
    next_url = "https://api.spotify.com/v1" + "/albums/{}/tracks?offset=0&limit=50".format(id_)

    tracks = []
    while next_url:
        result = get(next_url, headers = headers)

        if result.status_code == 429:
            print("Limit Reached")
            return -1

        page = json.loads(result.content)

        # Iterate the decoded JSON directly; no DataFrame is needed for this
        for x in page['items']:
            track = {
                'spotify track uri': x['uri'],
                'spotify track name': x['name'],
                'spotify track number': x['track_number'],
                'spotify disc number': x['disc_number'],
            }

            track_data = api_get_track_data(x['uri'])
            # -1 means rate limited; an error payload is a dict without these
            # fields. Either way store None instead of failing the whole album.
            if isinstance(track_data, dict):
                track['spotify track popularity'] = track_data.get('popularity')
                track['spotify track duration'] = track_data.get('duration_ms')
            else:
                track['spotify track popularity'] = None
                track['spotify track duration'] = None

            features = api_get_audio_feat(x['uri'])
            # Only keep the features when the response really carries all of them
            if isinstance(features, dict) and all(key in features for key in feature_keys):
                track['spotify track features'] = {key: features[key] for key in feature_keys}
            else:
                track['spotify track features'] = None

            tracks.append(track)

        # The paging object's 'next' is the URL of the following page, or null on the last one
        next_url = page.get('next')

    return tracks
    
    

In [None]:
# Loading data and adding tracks for each album

# Per-endpoint delay (in seconds) slept before each API call. Every key starts
# at 0.5 and is raised by 0.5 whenever that endpoint reports a rate limit.
sleeper = defaultdict(lambda: 0.5)

# Path to store file names, album and artist if the data was not pulled
t = int(time.time()*100)
path_for_no_data = f"data/spotify_song_data/no_data/no_data_{t}.json"

# Counter used to decide when to checkpoint the data and refresh the token
counter = 1

# Storing the API calls that did not work
not_work = []

# Restrict the run to these chart categories and years
categories_to_run = ['top']
years_to_run = [2018]


def _record_failure(j_file, entry):
    """Remember an entry whose data could not be pulled and rewrite the no-data file."""
    not_work.append({"file":j_file,"album":entry['Album'],"artist":entry['Artist Name']})
    with open(path_for_no_data, 'w') as file:
        json.dump(not_work, file)


def _store_results(order, year, new_vals):
    """Write the rows collected so far to the output file of this category and year."""
    store_path = "data/spotify_song_data/{}/{}".format(order,year+".json")
    with open(store_path, 'w') as file:
        json.dump(new_vals, file)


for order in all_paths.keys():

    # Search for a specific category
    if order not in categories_to_run:
        continue

    # Iterating through all the file paths in that category
    for j_file in all_paths[order]:

        # Getting the year from the file path (the four characters before ".json")
        year = j_file[-9:-5]

        # Look for a specific year
        if int(year) not in years_to_run:
            continue

        # Opening a file
        with open(j_file) as f:
            data_dict = json.load(f)

        # Appending the new dictionaries with added information into new_vals
        new_vals = []

        # Iterating through each entry in the original dictionary
        for entry in tqdm.tqdm(data_dict):

            # Sleeper to reduce timeout
            time.sleep(sleeper['search'])

            # Checking for errors other than the limit reached.
            # `except Exception` (not a bare except) so that interrupting the
            # kernel still stops the run instead of being logged as a failure.
            try:
                # Getting the URIs of the album and artist
                entry["spotify album uri"],entry["spotify artist uri"] = api_get_uri(entry['Album'],entry['Artist Name'])
            except Exception:
                # If the info is not retrieved save the file name, album name and artist name to a file
                _record_failure(j_file, entry)
                continue

            # Checking to see whether limit has reached (if limit has reached all functions return -1)
            if entry["spotify album uri"] == -1:

                # Increase the sleep time if limit has reached
                sleeper['search']+=0.5

                # Recording the file name, album and artist since the data retrieval did not work
                _record_failure(j_file, entry)
                continue

            # To store artist data - we run through the same steps as for uri retrieval
            artist_data = {}
            if entry['spotify artist uri'] is not None:

                time.sleep(sleeper['artist'])

                try:
                    artist = api_get_artist_data(entry['spotify artist uri'])
                    if artist == -1:
                        sleeper['artist']+=0.5
                    else:
                        artist_data = {
                            'spotify artist name': artist['name'],
                            'spotify artist popularity': artist['popularity'],
                            'spotify artist followers': artist['followers']['total'],
                            'spotify artist genres': artist['genres']}
                except Exception:
                    # Artist data is optional: log the failure and carry on with the album
                    _record_failure(j_file, entry)

            # To get album track data - we run through the same steps as for uri retrieval
            if entry['spotify album uri'] is not None:

                time.sleep(sleeper['album'])
                try:
                    # Getting the tracks, track data and features
                    result = api_get_album_tracks(entry['spotify album uri'])

                    if result == -1:
                        sleeper['album']+=0.5
                        _record_failure(j_file, entry)
                        continue

                    # Checking to see if the number of tracks returned is not 0
                    if len(result)!=0:
                        for res in result:
                            # Creating a new dictionary with added information
                            dup_entry = copy.deepcopy(entry)
                            dup_entry.update(res)
                            dup_entry.update(artist_data)

                            # Appending to the main list
                            new_vals.append(dup_entry)
                    else:
                        _record_failure(j_file, entry)
                except Exception:
                    _record_failure(j_file, entry)

                # For every 10 counts we save the data - to prevent losing all information in case of rate limit reached
                if counter%10 == 0:
                    _store_results(order, year, new_vals)

                # Getting a new token every 40 iterations
                if counter%40 == 0:
                    token = get_token()
                counter+=1
            else:
                _record_failure(j_file, entry)

        # Store all the data into the year for the particular chart if all iterations are completed
        _store_results(order, year, new_vals)
                

  6%|█████                                                                         | 67/1040 [16:35<4:44:36, 17.55s/it]