We start by importing the required libraries and connecting to Spotify

In [None]:
import spotipy
import spotipy.util as util
import pickle
import sys
import requests
from tqdm import tqdm_notebook as tqdm
import os
import pandas as pd
import numpy as np
from multiprocessing import Pool
from fuzzywuzzy import fuzz
import string
import json
import unidecode

In [None]:
# Load the two pickled inputs used by the rest of the notebook.
# NOTE(review): pickle executes arbitrary code on load -- only open files
# that were produced locally / by a trusted source.
with open('hot-100_data.pkl', 'rb') as data_file:
    df = pickle.load(data_file)
with open('hot-100_chart.pkl', 'rb') as chart_file:
    chart = pickle.load(chart_file)

In [None]:


# Spotify credentials and authorisation.
# NOTE(review): the client id / secret below are committed in plain text.
# They should be rotated and supplied through the environment only; until
# then the SPOTIPY_* environment variables, when set, take precedence over
# the embedded fallback values.
scope = 'user-library-read'
username = "ilias.miraoui@gmail.com"

token = util.prompt_for_user_token(
    username,
    scope,
    client_id=os.environ.get('SPOTIPY_CLIENT_ID', '6d6b00a4a500471e94c5a3f1559e8cc6'),
    client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET', '500950b79db94ac88f6859d24e4e6033'),
    redirect_uri=os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:8888/callback'),
)
if token:
    # Authenticated client used by every helper defined below.
    spotify = spotipy.Spotify(auth=token)
else:
    print("Connexion to spotify API failed.")
    # Exit with a non-zero status so the failure is not reported as success.
    sys.exit(1)

    


We create a function to compare strings and make sure that we are fetching the right songs

In [None]:
def simplify_string(s):
    """Return a reduced form of ``s`` used for fuzzy comparison.

    The string is lower-cased, transliterated with ``unidecode``, and then
    stripped of every occurrence of ``'the'`` (anywhere, including inside
    longer words), of all spaces, and of all ASCII punctuation.
    """
    ascii_lower = unidecode.unidecode(s.lower())
    without_the = ascii_lower.replace('the', ' ')
    compact = without_the.replace(' ', '')
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return compact.translate(punctuation_remover)

def similar(a, b, threshold=70):
    """Tell whether ``a`` and ``b`` are close enough to be considered equal.

    Both strings are passed through ``simplify_string`` and scored with
    ``fuzz.partial_ratio``; the result is ``True`` when the score is at
    least ``threshold``.
    """
    score = fuzz.partial_ratio(simplify_string(a), simplify_string(b))
    return score >= threshold

We define a few functions to get the required data from Spotify.

In [None]:
def get_song_details(artist,track):
    """Search Spotify for ``track`` by ``artist`` and collect its metadata.

    A ``track:`` / ``artist:`` query is sent to ``spotify.search`` and the
    first returned item whose name is ``similar`` to ``track`` (threshold 80)
    is kept. Only the track title is compared; the artist name of the
    returned item is not checked.

    Returns:
        * ``{track_id: {field: value, ...}}`` for the matching item,
        * an empty dict when the search reports no results,
        * ``None`` when results exist but no item name is similar enough.

    Side effect:
        On a match, the number of artists of the item is written to the
        ``"num_artists"`` column of the module-level ``song_df`` at row
        ``song``. Both names are read from the global namespace, so this
        function only works when called from a loop that defines ``song``.
    """
    results = spotify.search(q=" track:" + track + ' artist:' + artist, type='track')
    if results['tracks']['total'] <= 0:
        return dict()

    # First item with a sufficiently similar title, or None.
    match = next(
        (item for item in results['tracks']['items'] if similar(track, item['name'], 80)),
        None,
    )
    if match is None:
        return None

    album = match["album"]
    song_details = {
        match["id"]: {
            "explicit": match["explicit"],
            "duration_ms": match["duration_ms"],
            "disc_number": match["disc_number"],
            "track_number": match["track_number"],
            "album_id": album["id"],
            "album_release_date": album["release_date"],
            "album_release_date_precision": album["release_date_precision"],
        }
    }
    # NOTE(review): hidden coupling to the caller's globals, kept so the
    # "num_artists" column is still populated exactly as before.
    song_df.loc[song,"num_artists"] = len(match["artists"])
    return song_details


    

In [None]:
def get_song_features(_id):
    """Fetch the audio features of one track from Spotify.

    Calls ``spotify.audio_features(_id)`` and uses the first entry of the
    response.

    Returns:
        A dict with the keys listed in ``feature_names`` below, or ``None``
        when the response is empty/missing or its first entry is ``None``.
    """
    feature_names = (
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "time_signature",
    )
    results = spotify.audio_features(_id)
    # Guard against an empty / missing response before indexing into it.
    if not results or results[0] is None:
        return None
    features = results[0]
    return {name: features[name] for name in feature_names}

In [None]:
def get_label(album_id):
    """Return the ``"label"`` field of the album fetched via ``spotify.album``."""
    album = spotify.album(album_id)
    return album["label"]

def get_album_type(album_id):
    """Return the ``"album_type"`` field of the album fetched via ``spotify.album``."""
    album = spotify.album(album_id)
    return album["album_type"]

We only keep one line per song and we start fetching the data via the Spotify API

In [None]:
# Keep a single row per (artist, title) pair. The explicit copy makes
# song_df an independent frame, so the column assignments below (and in the
# following cells) do not trigger pandas' chained-assignment warning.
song_df = df.drop_duplicates(["artist","title"]).copy()
# Placeholder column: rows whose value is still missing have not been
# matched to a Spotify track yet.
song_df["spotify_id"] = None

In [None]:
# First pass: look up every song that has no Spotify id yet using the raw
# artist / title, and copy the returned fields into song_df.
for song in song_df.index:
    if not pd.isna(song_df.loc[song,"spotify_id"]):
        continue
    title = song_df.loc[song,"title"]
    artist = song_df.loc[song,"artist"]
    song_details = get_song_details(artist,title)
    # get_song_details returns None or an empty dict when nothing matched.
    if not isinstance(song_details, dict) or not song_details:
        continue
    # The dict holds a single entry: {spotify_id: {field: value, ...}}.
    matched_id, details = next(iter(song_details.items()))
    song_df.loc[song,"spotify_id"] = matched_id
    for feature, value in details.items():
        song_df.loc[song,"spotify_"+feature] = value
    album_id = song_df.loc[song,"spotify_album_id"]
    song_df.loc[song,"album_label"] = get_label(album_id)
    song_df.loc[song,"album_type"] = get_album_type(album_id)

Unfortunately, the Spotify Search API has very specific requirements, and almost half of the songs are not matched directly. Sometimes the way multiple artists are listed trips up the Spotify API. It could also be the way the track title has been modified (because of censored explicit words or because of special characters). We create a new function to adjust for these problems and run the following to complete our dataset.

In [None]:
def get_song_details_multiple(artist,track):
    """Retry the Spotify lookup with a simplified artist and track title.

    The artist string is cut at the first occurrence of each collaboration
    separator (``" & "``, ``" Featuring "``, ...), keeping only the leading
    part. In the track title, runs of asterisks, ``":"`` and ``"$"`` are
    replaced before searching. The first returned item whose cleaned name is
    ``similar`` to the cleaned title (threshold 60) is kept.

    Returns:
        * ``{track_id: {field: value, ...}}`` for the matching item,
        * an empty dict when the search reports no results,
        * ``None`` when results exist but no item name is similar enough.
    """
    # Keep only the text before each separator; applied in this order.
    artist_separators = (
        " & ", " x ", " + ", ", ", " From ", "(", " Presents ",
        " featuring ", " Featuring ", " Feauring ", " Or ", " And ",
    )
    for separator in artist_separators:
        artist = artist.split(separator)[0]

    # Longest asterisk run first so shorter patterns do not pre-empt it.
    track_replacements = (
        ("****", "igga"),
        ("***", "igg"),
        ("**", "uc"),
        (":", "."),
        ("$", "s"),
    )
    for old, new in track_replacements:
        track = track.replace(old, new)

    results = spotify.search(q=" track:" + track + ' artist:' + artist, type='track')
    if results['tracks']['total'] <= 0:
        return dict()

    match = None
    for item in results['tracks']['items']:
        # Drop both a parenthesised suffix and a " - ..." suffix. The
        # original code overwrote the first split with the second, so the
        # "(" split never took effect.
        spotify_track = item['name'].split("(")[0].split(" - ")[0]
        if not spotify_track.strip():
            # Name starts with "(" or " - ": compare the full name instead
            # of an empty string.
            spotify_track = item['name']
        print(spotify_track)
        print(track)
        if similar(track,spotify_track, 60):
            match = item
            break
    if match is None:
        return None

    album = match["album"]
    return {
        match["id"]: {
            "explicit": match["explicit"],
            "duration_ms": match["duration_ms"],
            "disc_number": match["disc_number"],
            "track_number": match["track_number"],
            "album_id": album["id"],
            "album_release_date": album["release_date"],
            "album_release_date_precision": album["release_date_precision"],
        }
    }

# Second pass: retry the songs that are still unmatched using the more
# tolerant lookup, and copy the returned fields into song_df.
pbar = tqdm(song_df.index)
for song in pbar:
    if not pd.isna(song_df.loc[song,"spotify_id"]):
        continue
    title = song_df.loc[song,"title"]
    artist = song_df.loc[song,"artist"]
    song_details = get_song_details_multiple(artist,title)
    # The lookup returns None or an empty dict when nothing matched.
    if not isinstance(song_details, dict) or not song_details:
        continue
    # The dict holds a single entry: {spotify_id: {field: value, ...}}.
    matched_id, details = next(iter(song_details.items()))
    song_df.loc[song,"spotify_id"] = matched_id
    for feature, value in details.items():
        song_df.loc[song,"spotify_"+feature] = value
    album_id = song_df.loc[song,"spotify_album_id"]
    song_df.loc[song,"album_label"] = get_label(album_id)
    # Also record the album type, as the first pass does, so songs matched
    # here are not left without it.
    song_df.loc[song,"album_type"] = get_album_type(album_id)

In [None]:
# Fetch audio features for every matched song that does not have them yet.
# "spotify_key" is used as the marker column; it is only created by this
# loop, so make sure it exists before reading it (otherwise the first
# .loc lookup raises KeyError).
if "spotify_key" not in song_df.columns:
    song_df["spotify_key"] = np.nan
for song in song_df[song_df["spotify_id"].notnull()].index:
    spotify_key = song_df.loc[song,"spotify_key"]
    spotify_id = song_df.loc[song,"spotify_id"]
    if not pd.isna(spotify_key):
        continue
    song_features = get_song_features(spotify_id)
    if song_features is None:
        continue
    for feature, value in song_features.items():
        song_df.loc[song,"spotify_"+feature] = value

We are missing just a bit more than 200 lines, which is already a very good result (most of the missing songs are very old and do not exist on Spotify, given that Spotify is a relatively recent streaming website). So we save the file.

In [None]:
# Persist the enriched song table so later steps can reload it.
with open('song_df.pkl', 'wb') as output_file:
    pickle.dump(song_df, output_file)