### Spotify - grab songs and features

#### Basic Initial imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import random
from random import randint
from time import sleep
import re

#### Creating the Spotify connection with spotipy object

In [2]:
secrets_file = open("spotkey.txt","r")
string = secrets_file.read()

secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        secrets_dict[line.split(':')[0]]=line.split(':')[1]

#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['cid'],
                                                           client_secret=secrets_dict['csecret']))

#### Scraping for playlist URI's

In [3]:
url = "https://audiohype.io/resources/the-best-spotify-playlists/"

response = requests.get(url)
response.status_code

200

Creating soup to extract the playlist URI's

In [4]:
soup = BeautifulSoup(response.content, "html.parser")
data = soup.select('.rank-math-link')

Utility funtions

In [5]:
def flatten(input_list):
    return [item for sublist in input_list for item in sublist]

def reg_replace(x):
    return re.findall(r'\bhttps?:\/\/open.spotify\.com\/playlist\/([^\s?]+)',x)

Creating the playlists dataset and adding nanually 2 rows with big playlists

In [12]:
# name = [x.get_text() for x in data]
# uri = flatten([reg_replace(x['href']) for x in data])
# name.pop(0)

# playlist_uri = pd.DataFrame({"name": name, "uri":uri})

playlist_uri.loc[len(playlist_uri.index)] = ['Really big one', '4rnleEAOdmFAbRcNCgZMpY']
playlist_uri.loc[len(playlist_uri.index)] = ['The longest playlist Official', '5S8SJdl1BDc0ugpkEvFsIL']

#### Creating a dataframe to host the results from the API

In [None]:
spotify_df = pd.DataFrame()
playlist_lst = ['track_id','artist','track_name','danceability','energy','key','loudness',
                'mode', 'speechiness','instrumentalness','liveness','valence','tempo',
                'duration_ms','time_signature']


for i in playlist_uri['uri'][28:30]:
    try:
        playlist = sp.playlist_items(i)
        
        while playlist['next']!=None:
            # Create empty dictionary to be added to the dataframe
            playlist_features = {}

            # Get metadata for eache track
            playlist_features['artist'] = [x['track']['album']['artists'][0]['name'] for x in playlist['items']]  
            playlist_features['track_name'] = [x['track']['name'] for x in playlist['items']]
            playlist_features['track_id'] = [x['track']['id'] for x in playlist['items']]


            # Get audio features and adding them to the respective columns
            audio_features = [sp.audio_features(x['track']['id'])[0] for x in playlist['items']]

            for feature in playlist_lst[3:]:
                playlist_features[feature] = [x[feature] for x in audio_features]

            track_df = pd.DataFrame(playlist_features)
            spotify_df = pd.concat([spotify_df, track_df], axis = 0)
            print(i, 'done')
            
            playlist = sp.next(playlist)
            # respectful nap
            wait_time = random.randint(1,3)
            print('Napping for ',wait_time,'s')
            sleep(wait_time)
    except:
        print(i, 'invalid')


#### Adding few more songs from kaggle dataset

In [30]:
# Loading auxiliary dataset 
aux_data = pd.read_csv('Data/SpotifyAudioFeaturesApril2019.csv')
aux_data.rename(columns={'artist_name': 'artist'}, inplace=True)
aux_data = aux_data[spotify_df.columns]

# Concatenating datasets into one big final dataset
spotify_df = pd.concat([spotify_df, aux_data], axis = 0)

spotify_df = spotify_df.drop_duplicates().reset_index(drop=True)
spotify_df

Unnamed: 0,artist,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Hozier,Take Me To Church,7dS5EaCoMnN7DzlpT6aRn2,0.566,0.664,4,-5.303,0,0.0464,0.000000,0.1160,0.4370,128.945,241688,4
1,Mike Posner,Cooler Than Me - Single Mix,2V4bv1fNWfTcyRJKmej6Sj,0.768,0.820,7,-4.630,0,0.0474,0.000000,0.6890,0.6250,129.965,213293,4
2,"Tyler, The Creator",See You Again (feat. Kali Uchis),7KA4W4McWYRpgf0fWsJZWB,0.558,0.559,6,-9.222,1,0.0959,0.000007,0.1090,0.6200,78.558,180387,4
3,Bastille,Pompeii,3gbBpTdY8lnQwqxNCcf795,0.679,0.715,9,-6.383,1,0.0407,0.000000,0.2710,0.5710,127.435,214148,4
4,Shakira,Hips Don't Lie (feat. Wyclef Jean),3ZFTkvIE7kyPt6Nu3PEa7V,0.778,0.824,10,-5.892,0,0.0707,0.000000,0.4050,0.7580,100.024,218093,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139443,Calum Scott,Come Back Home,0cvfSKcm9VeduwyYPrxtLx,0.601,0.801,11,-5.174,1,0.0323,0.000000,0.0991,0.2890,131.049,190539,4
139444,Saint Claire,Enough for You,43MP9F7UzvfilSrw2SqZGJ,0.387,0.249,9,-13.233,1,0.0437,0.000000,0.1030,0.3460,94.039,194583,4
139445,Mike Stud,Do It,4TWlUuFk81NGUNKwndyS5Q,0.717,0.532,8,-8.351,0,0.2060,0.000000,0.0997,0.5460,156.977,139191,4
139446,D Savage,No Smoke,5iGBXzOoRo4sBTy8wdzMyK,0.772,0.510,4,-9.670,0,0.1200,0.000000,0.1310,0.0755,120.049,180013,4


#### Saving our tedious work to a csv file for further work.

In [31]:
spotify_df.to_csv('Data/spotify_df.csv',index=False)