## Lab | Extending the internal databases with audio features

In [1]:
import sys
from config import *
import pandas as pd
import numpy as np
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials
import time
from bs4 import BeautifulSoup
import requests

In [2]:
hot_100_songs = pd.read_csv('hot_100_songs.csv')
not_hot_songs = pd.read_csv('not_hot_songs.csv')

In [3]:
hot_100_songs.head()
hot_100_songs = hot_100_songs.drop(columns='Unnamed: 0')

In [4]:
not_hot_songs.head()
columns_to_drop = ['Unnamed: 0', 'index']
not_hot_songs= not_hot_songs.drop(columns=columns_to_drop)

In [21]:
# a function to extract songs id from spotify API
def search_song(title, artist, limit):
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=Client_ID,
                                                           client_secret=Client_Secret))
    try:
        results = sp.search(q=f"{title} {artist}", type='track', limit=limit)
        song_id = results['tracks']['items'][0]['id']
        return song_id
    except IndexError:
        print("Song not found!")
        return "None"
    
    
# a function to extract the audio features of songs from spotify using the songs id

def get_audio_features(list_of_song_ids):
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=Client_ID,
                                                           client_secret=Client_Secret))

    audio_features_dict = {'id': [], 'danceability': [], 'energy': [],
                           'key': [], 'loudness': [], 'mode': [],
                           'speechiness': [], 'acousticness': [],
                           'instrumentalness': [], 'liveness': [],
                           'valence': [], 'tempo': []}
    
    chunks = np.array_split(list_of_song_ids, len(list_of_song_ids) // 50 + 1)
    
    for i, chunk in enumerate(chunks):
        print("Collecting IDs for chunk...", i+1)        
        features = sp.audio_features(chunk)
        for feature in features:
            for key, value in feature.items():
                if key in audio_features_dict:
                    audio_features_dict[key].append(value)
        time.sleep(20)
    audio_features_df = pd.DataFrame(audio_features_dict)
    return audio_features_df   
    
# a function to concat a given dataframe with the audio features dataframe and return the extended data frame  
def add_audio_features(df, audio_features_df):
  
    merged_df = pd.merge(df, audio_features_df, on='id', how='inner')
    return merged_df    


In [13]:
# getting the songs id for the hot_100_songs
song_ids = []
for row in range(hot_100_songs.shape[0]):
    print("Getting the id for song: ", row)
    song_id = search_song(hot_100_songs.iloc[row,1], hot_100_songs.iloc[row,0],1)
    song_ids.append(song_id)
    if row%50 == 0:
        print("Sleeping for 20 seconds before asking for the next 50 songs")
        time.sleep(20)

song_ids[:10]

Getting the id for song:  0
Sleeping for 20 seconds before asking for the next 50 songs
Getting the id for song:  1
Getting the id for song:  2
Getting the id for song:  3
Getting the id for song:  4
Getting the id for song:  5
Getting the id for song:  6
Getting the id for song:  7
Getting the id for song:  8
Getting the id for song:  9
Getting the id for song:  10
Getting the id for song:  11
Getting the id for song:  12
Getting the id for song:  13
Getting the id for song:  14
Getting the id for song:  15
Getting the id for song:  16
Getting the id for song:  17
Getting the id for song:  18
Getting the id for song:  19
Getting the id for song:  20
Getting the id for song:  21
Getting the id for song:  22
Getting the id for song:  23
Getting the id for song:  24
Getting the id for song:  25
Getting the id for song:  26
Getting the id for song:  27
Getting the id for song:  28
Getting the id for song:  29
Getting the id for song:  30
Getting the id for song:  31
Getting the id for son

['2EjXfH91m7f8HiJN1yQg97',
 '7iKRL1F3m4t4dkMponnD0P',
 '7vQbuQcyTflfCIOu3Uzzya',
 '2FRnf9qhLbvw8fu4IBXx78',
 '77khP2fIVhSW23NwxrRluh',
 '5hslUAKq9I9CG2bAulFkHN',
 '2uFaJJtFpPDc5Pa95XzTvg',
 '0oPdaY4dXtc3ZsaG17V972',
 '4xhsWYTOGcal8zt0J161CU',
 '5ASM6Qjiav2xPe7gRkQMsQ']

In [15]:
# saving the new dataframe with song ids
#hot_100_songs['id'] = song_ids
#hot_100_songs.head()
#hot_100_songs.to_csv("./hot_100_songs_with_id.csv", index=False)


In [6]:
# Getting the songs ID for the not_hot_songs
not_hot_song_ids = []
for row in range(not_hot_songs.shape[0]):
    print("Getting the id for song: ", row+1)
    song_id = search_song(not_hot_songs.iloc[row,1], not_hot_songs.iloc[row,0],1)
    not_hot_song_ids.append(song_id)
    if row%50 == 0:
        print("Sleeping for 20 seconds before asking for the next 50 songs")
        time.sleep(20)

not_hot_song_ids[:10]

Getting the id for song:  1
Sleeping for 20 seconds before asking for the next 50 songs
Getting the id for song:  2
Getting the id for song:  3
Getting the id for song:  4
Getting the id for song:  5
Getting the id for song:  6
Getting the id for song:  7
Getting the id for song:  8
Getting the id for song:  9
Getting the id for song:  10
Getting the id for song:  11
Getting the id for song:  12
Getting the id for song:  13
Getting the id for song:  14
Getting the id for song:  15
Getting the id for song:  16
Getting the id for song:  17
Getting the id for song:  18
Getting the id for song:  19
Getting the id for song:  20
Getting the id for song:  21
Getting the id for song:  22
Getting the id for song:  23
Getting the id for song:  24
Getting the id for song:  25
Getting the id for song:  26
Getting the id for song:  27
Getting the id for song:  28
Getting the id for song:  29
Getting the id for song:  30
Getting the id for song:  31
Getting the id for song:  32
Getting the id for so

['2lCkncy6bIB0LTMT7kvrD1',
 '1hlwA8qvXx0Bl6iYuDbIuD',
 '0idjTHGhGRBdWTmsalq3tF',
 '0PAcdVzhPO4gq1Iym9ESnK',
 '507tQXutCPYNFkpOq8tmKF',
 '6l0Ky2PycJV1Bz88kFeVGL',
 '77p4fRpoeiZMtUy3P9kN21',
 '4Sh8bgQTdqKXnSl73SX6BE',
 '1nGoy0cEj0extAwXYNYTWd',
 '54o0kiaqwj3yBKYHw4E7FR']

In [8]:
#not_hot_song_ids.index ("None")
#not_hot_songs['id'] = not_hot_song_ids
#not_hot_songs = not_hot_songs[not_hot_songs['id'] != "None"]
#not_hot_songs.head()
#not_hot_songs.to_csv("./not_hot_songs_with_id.csv", index=False)

In [22]:
# Getting the audio features for hot 100 songs
audio_features_hot = get_audio_features(song_ids)

Collecting IDs for chunk... 1
Collecting IDs for chunk... 2
Collecting IDs for chunk... 3


In [23]:
audio_features_hot

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2EjXfH91m7f8HiJN1yQg97,0.589,0.472,8,-8.749,1,0.0502,0.61400,0.000000,0.5050,0.898,67.196
1,7iKRL1F3m4t4dkMponnD0P,0.548,0.871,7,-4.253,1,0.0708,0.00113,0.000000,0.7650,0.640,128.136
2,7vQbuQcyTflfCIOu3Uzzya,0.754,0.424,2,-8.463,1,0.0363,0.64300,0.000000,0.0652,0.806,119.705
3,2FRnf9qhLbvw8fu4IBXx78,0.735,0.478,2,-12.472,1,0.0293,0.18900,0.000002,0.3550,0.947,107.682
4,77khP2fIVhSW23NwxrRluh,0.683,0.375,0,-13.056,1,0.0303,0.57900,0.000000,0.0760,0.888,140.467
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0O3U5iwTbiXCREMkvotJuN,0.616,0.834,9,-3.069,1,0.0664,0.06000,0.000000,0.3380,0.746,151.701
96,2KslE17cAJNHTsI2MI0jb2,0.711,0.809,2,-4.389,0,0.0955,0.04470,0.000000,0.3390,0.816,106.017
97,73zawW1ttszLRgT9By826D,0.498,0.764,4,-5.006,1,0.0309,0.12300,0.000031,0.1190,0.489,147.984
98,2Sy3dDqPwjnTO3PnommJPe,0.844,0.637,10,-6.072,0,0.0325,0.22000,0.000000,0.0459,0.960,125.012


In [24]:
# a new dataframe with the audo features for hot 100 songs
hot_100_songs_feat = add_audio_features(hot_100_songs, audio_features_hot)

In [25]:
# getting the audio features and creating a new dataframe with the audo features for not hot songs
audio_features_not_hot = get_audio_features(not_hot_song_ids)
not_hot_songs_feat = add_audio_features(not_hot_songs, audio_features_not_hot)

Collecting IDs for chunk... 1
Collecting IDs for chunk... 2
Collecting IDs for chunk... 3
Collecting IDs for chunk... 4
Collecting IDs for chunk... 5
Collecting IDs for chunk... 6
Collecting IDs for chunk... 7
Collecting IDs for chunk... 8
Collecting IDs for chunk... 9
Collecting IDs for chunk... 10
Collecting IDs for chunk... 11
Collecting IDs for chunk... 12
Collecting IDs for chunk... 13
Collecting IDs for chunk... 14
Collecting IDs for chunk... 15
Collecting IDs for chunk... 16
Collecting IDs for chunk... 17
Collecting IDs for chunk... 18
Collecting IDs for chunk... 19
Collecting IDs for chunk... 20
Collecting IDs for chunk... 21
Collecting IDs for chunk... 22
Collecting IDs for chunk... 23
Collecting IDs for chunk... 24
Collecting IDs for chunk... 25
Collecting IDs for chunk... 26
Collecting IDs for chunk... 27
Collecting IDs for chunk... 28
Collecting IDs for chunk... 29
Collecting IDs for chunk... 30
Collecting IDs for chunk... 31
Collecting IDs for chunk... 32
Collecting IDs fo

In [26]:
not_hot_songs_feat


Unnamed: 0,Artist Name,Title of Song,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,J Balvin,Azul,2lCkncy6bIB0LTMT7kvrD1,0.843,0.836,11,-2.474,0,0.0695,0.08160,0.001380,0.0532,0.6500,94.018
1,Omarion,I'm Up,1hlwA8qvXx0Bl6iYuDbIuD,0.793,0.505,6,-7.061,0,0.1380,0.20400,0.000000,0.0413,0.7040,97.998
2,Sylvan Esso,Radio,0idjTHGhGRBdWTmsalq3tF,0.557,0.895,0,-6.547,1,0.0329,0.64200,0.018600,0.2540,0.5910,119.993
3,Morgan Wallen,One Thing At A Time,0PAcdVzhPO4gq1Iym9ESnK,0.656,0.757,3,-5.775,0,0.0308,0.49200,0.000000,0.1170,0.4290,139.971
4,Zara Larsson,Don't Worry Bout Me,507tQXutCPYNFkpOq8tmKF,0.752,0.706,5,-6.274,1,0.0636,0.07970,0.000752,0.1290,0.3300,125.041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,Ariana Grande,motive,5GkQIP5mWPi4KZLLXeuFTT,0.789,0.742,10,-5.664,0,0.0652,0.06810,0.000000,0.1660,0.6610,116.965
2514,Joji,I'LL SEE YOU IN 40,606F3qdYCXtDVtKN53YsuW,0.411,0.377,3,-9.553,1,0.0296,0.34700,0.765000,0.2220,0.0321,139.881
2515,Jonas Brothers,Used To Be,2n6red10oCd6YhlEUp6jXy,0.710,0.517,1,-6.850,1,0.0448,0.17200,0.000000,0.0786,0.1760,145.946
2516,Gryffin,I Want Love,2ybp2K0VO9sbtGFCfO8tp1,0.586,0.764,2,-6.488,1,0.0345,0.00467,0.012500,0.1620,0.1660,120.996


In [30]:
not_hot_songs_feat = not_hot_songs_feat.drop_duplicates(subset=['id'])

In [32]:
# saving the new dataframes with audio features
hot_100_songs_feat.to_csv('hot_100_songs_with_audio.csv', index=False)
not_hot_songs_feat.to_csv('not_hot_songs_with_audio.csv', index=False)