### Spotify - grab songs and features

#### Basic Initial imports

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import random
from random import randint
from time import sleep
import re

#### Creating the Spotify connection with spotipy object

In [1]:
# Load the Spotify API credentials from a local "key:value" text file
# (one pair per line; the keys read below are 'cid' and 'csecret').
secrets_dict = {}
with open("spotkey.txt", "r") as secrets_file:  # context manager closes the handle
    for line in secrets_file:
        # Split on the FIRST colon only so a value that itself contains ':' is
        # kept whole; lines without any colon (blank lines, notes) are skipped.
        key, separator, value = line.partition(':')
        if not separator:
            continue
        # Strip surrounding whitespace / '\r' so the credentials are sent clean.
        secrets_dict[key.strip()] = value.strip()

# Initialize SpotiPy with the client credentials read above
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=secrets_dict['cid'],
        client_secret=secrets_dict['csecret'],
    )
)

#### Scraping for playlist URIs

In [2]:
# Web page whose links are scraped for playlist URLs in the following cells.
url = "https://audiohype.io/resources/the-best-spotify-playlists/"

# Pass an explicit timeout: without one, requests waits indefinitely if the
# server stalls, which would hang the whole notebook run.
response = requests.get(url, timeout=30)
# Last expression of the cell, so the HTTP status code is displayed as output.
response.status_code

200

Creating soup to extract the playlist URIs

In [3]:
# Parse the raw bytes of the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

# Keep every element carrying the CSS class "rank-math-link"; later cells read
# the text and the href attribute of each of these elements.
data = soup.select('.rank-math-link')

Utility functions

In [4]:
def flatten(input_list):
    """Flatten one level of nesting.

    Takes an iterable of iterables and returns a single new list holding the
    inner items in their original order. Only the first level is unpacked.
    """
    flattened = []
    for inner in input_list:
        for element in inner:
            flattened.append(element)
    return flattened

def reg_replace(x):
    """Extract Spotify playlist ids from a string.

    Parameters
    ----------
    x : str
        Text (typically a link's href) that may contain URLs of the form
        ``http(s)://open.spotify.com/playlist/<id>``.

    Returns
    -------
    list of str
        The ``<id>`` part of every matching URL, in order of appearance; the id
        stops at the first whitespace or '?' (so query strings are dropped).
        An empty list is returned when nothing matches.
    """
    # Both dots of the host name are escaped: an unescaped '.' matches ANY
    # character, which would also accept hosts such as "openxspotify.com".
    return re.findall(r'\bhttps?://open\.spotify\.com/playlist/([^\s?]+)', x)

Creating the playlists dataset and manually adding 2 rows with big playlists

In [5]:
# Pair each link's text with the playlist id found in THAT SAME link. Building
# the names and ids as two independent lists (and trimming one of them by a
# fixed amount) only lines up when exactly one link is not a playlist link and
# it happens to be the last one; pairing per link is correct for any page.
playlist_rows = [
    (link.get_text(), playlist_id)
    for link in data
    # Links without an href, or whose href is not a playlist URL, yield no rows.
    for playlist_id in reg_replace(link.get('href', ''))
]

# Kept as separate, index-aligned lists for any later cell that reads them.
name = [row_name for row_name, _ in playlist_rows]
uri = [row_uri for _, row_uri in playlist_rows]

# Two large playlists added by hand on top of the scraped ones.
extra_playlists = [
    ('Really big one', '4rnleEAOdmFAbRcNCgZMpY'),
    ('The longest playlist Official', '5S8SJdl1BDc0ugpkEvFsIL'),
]

playlist_df = pd.DataFrame(playlist_rows + extra_playlists, columns=["name", "uri"])

#### Main function to extract songs and their features

In [6]:
def playlist_dff(playlist_id):
    """Collect the tracks of one Spotify playlist together with their audio features.

    Parameters
    ----------
    playlist_id : str
        Playlist id passed to ``sp.playlist_items``.

    Returns
    -------
    pandas.DataFrame
        One row per usable track, with the columns ``track_id``, ``artist``,
        ``track_name`` followed by the audio-feature columns. The frame is
        empty (but still has these columns) when nothing could be retrieved.

    Notes
    -----
    Best effort: any error while talking to the API is reported with a printed
    message instead of being raised, and the rows gathered so far are returned.
    Uses the module-level spotipy client ``sp``.
    """
    # Output columns; everything from index 3 onwards is an audio feature
    playlist_lst = ['track_id','artist','track_name','danceability','energy','key','loudness',
                    'mode', 'speechiness','instrumentalness','liveness','valence','tempo',
                    'duration_ms','time_signature']
    feature_columns = playlist_lst[3:]

    # Rows are accumulated as plain dicts and turned into a DataFrame once at
    # the end; concatenating a one-row frame per track is quadratic.
    records = []

    # Some of the ids could be invalid, so API failures must not stop the run.
    # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
    try:
        playlist = sp.playlist_items(playlist_id)

        # Process the CURRENT page first and only then look at 'next', so that
        # a playlist that fits on a single page is not skipped entirely.
        while playlist is not None:
            # Entries can have no track object or no id (e.g. removed or local
            # files); those cannot be looked up and are skipped.
            page_tracks = [item['track'] for item in playlist['items']
                           if item.get('track') and item['track'].get('id')]

            if page_tracks:
                # One request for the whole page instead of one per track.
                # NOTE(review): relies on a page holding at most 100 tracks,
                # the documented maximum for audio_features — confirm.
                page_features = sp.audio_features([track['id'] for track in page_tracks])

                for track, audio_features in zip(page_tracks, page_features):
                    # The API answers None for tracks without audio features
                    if audio_features is None:
                        continue

                    album_artists = (track.get('album') or {}).get('artists') or []

                    playlist_features = {
                        'track_id': track['id'],
                        'artist': album_artists[0]['name'] if album_artists else None,
                        'track_name': track['name'],
                    }
                    for feature in feature_columns:
                        playlist_features[feature] = audio_features[feature]
                    records.append(playlist_features)

            if playlist['next'] is None:
                break

            # respectful nap before asking for the next page
            wait_time = random.randint(1,3)
            print('Napping for ',wait_time,'s')
            sleep(wait_time)
            playlist = sp.next(playlist)

        print(playlist_id, 'Success!')
    except Exception as err:
        print(playlist_id, 'Invalid playlist', '-', repr(err))

    return pd.DataFrame(records, columns=playlist_lst)

#### Creating a dataframe to host the results from the API

In [None]:
# Fetch every playlist first, then concatenate ONCE: re-concatenating the
# growing frame inside the loop copies all previous rows on each iteration.
playlist_frames = [playlist_dff(playlist_uri) for playlist_uri in playlist_df['uri']]

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no playlists. ignore_index gives the combined frame unique row labels
# instead of repeating 0..n-1 for every playlist.
spotify_df = pd.concat(playlist_frames, ignore_index=True) if playlist_frames else pd.DataFrame()


#### Saving our tedious work to a csv file for further work.

In [13]:
# Write the collected tracks to disk without the row index; rows that are
# exact duplicates of an earlier row are dropped first.
(
    spotify_df
    .drop_duplicates()
    .to_csv('spotify_df.csv', index=False)
)