## 1. SPOTIFY API

In our database, we do not have every track on Earth, so we need to get information about some tracks that users of our bot like.

This can be done using the Spotify API.

In [1]:
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Credentials are read from the environment so that real secrets are never saved
# inside the notebook. The placeholders are only used when the variables are unset;
# replace them (or export SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET) before running.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'SECRET')

# Authentication
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

Let's try to find a track by its title

In [2]:
# Search by title using Spotify's `track:` field filter
track_title = 'FUKUROU'
results = sp.search(q=f'track:{track_title}', type='track')

# Report the top hit, or say so when the search came back empty
if not results['tracks']['items']:
    print("No tracks found.")
else:
    track = results['tracks']['items'][0]
    print(
        f"Track Name: {track['name']}",
        f"Artist: {track['artists'][0]['name']}",
        f"Album: {track['album']['name']}",
        f"Spotify URI: {track['uri']}",
        sep='\n',
    )

Track Name: FUKUROU
Artist: CASH GROWE
Album: FUKUROU
Spotify URI: spotify:track:0jrtuaOgtbwBDH3r6UQbSc


Now let's extend the functionality with query-based retrieval (the query might be '{artist} - {title}', just '{title}', or something else)

In [21]:
def search_track(query, limit=1):
    """Return the top Spotify track matching ``query``.

    Args:
        query: Search query passed to the API as-is, e.g. ``'track:FUKUROU'``.
        limit: Number of results requested from the API. Only the first
            result is returned regardless of this value.

    Returns:
        dict: The first track item of the search response.

    Raises:
        IndexError: If the search returns no tracks.
    """
    # we take {limit} tracks from the search results
    results = sp.search(q=query, type='track', limit=limit)
    items = results['tracks']['items']
    if not items:
        # Same exception type as a bare items[0] lookup, but the message names the query
        raise IndexError(f'No tracks found for query: {query!r}')
    return items[0]


# Fetch the top hit for the demo title; `res` is inspected in the following cells
res = search_track('track:FUKUROU')

In [22]:
# Dump every top-level field of the track object returned by the search
for elem, value in res.items():
    print(f'{elem}:\t{value}')

album:	{'album_type': 'single', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/7eejBhyXLSFgUMq7DFQO3V'}, 'href': 'https://api.spotify.com/v1/artists/7eejBhyXLSFgUMq7DFQO3V', 'id': '7eejBhyXLSFgUMq7DFQO3V', 'name': 'CASH GROWE', 'type': 'artist', 'uri': 'spotify:artist:7eejBhyXLSFgUMq7DFQO3V'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 

From the code above, we can get the 'popularity' of the track, which could potentially be used for clustering later.

Let's get other features of a track, such as tempo, key, and energy (genres are not part of this response; they are retrieved from the artists in section 2)

In [23]:
def get_audio_features(track_id):
    """Fetch the audio-features record for a single track.

    The endpoint is queried with a one-element list of ids, and the first
    (only) entry of the response is returned.
    """
    return sp.audio_features([track_id])[0]

# Audio features of the track found above, printed one field per line
id = res['id']
feature = get_audio_features(id)
for elem, value in feature.items():
    print(f'{elem}:\t{value}')

danceability:	0.396
energy:	0.703
key:	5
loudness:	-5.013
mode:	0
speechiness:	0.0899
acousticness:	0.0957
instrumentalness:	0.00386
liveness:	0.167
valence:	0.251
tempo:	85.614
type:	audio_features
id:	0jrtuaOgtbwBDH3r6UQbSc
uri:	spotify:track:0jrtuaOgtbwBDH3r6UQbSc
track_href:	https://api.spotify.com/v1/tracks/0jrtuaOgtbwBDH3r6UQbSc
analysis_url:	https://api.spotify.com/v1/audio-analysis/0jrtuaOgtbwBDH3r6UQbSc
duration_ms:	162462
time_signature:	4


In [24]:
def find_track_features(title, artist=None):
    """Look up a track by title (optionally by artist too) and merge in its audio features.

    Args:
        title: Track title, used with the ``track:`` search filter.
        artist: Optional artist name; when given, an ``artist:`` filter is
            appended to the query.

    Returns:
        dict: The fields of the top search hit combined with the fields
        returned by ``get_audio_features``. On key collisions the
        audio-feature value wins.
    """
    query = f'track:{title}'
    if artist:
        query += f' artist:{artist}'

    track = search_track(query)
    # NOTE(review): the audio-features lookup can presumably come back as None when
    # Spotify has no analysis for the track; fall back to the plain track fields
    # instead of failing on `dict | None`.
    audio = get_audio_features(track['id']) or {}
    return track | audio

In [25]:
# Title + artist lookup; `info` holds the merged track and audio-feature fields
info = find_track_features('SWITCHING LANES', 'HXVRMXN')

In [28]:
# Show every field of the merged track / audio-features dict
for f, value in info.items():
    print(f'{f}:\t{value}')

album:	{'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2kxLXJ17WUKNIwyIfeDigH'}, 'href': 'https://api.spotify.com/v1/artists/2kxLXJ17WUKNIwyIfeDigH', 'id': '2kxLXJ17WUKNIwyIfeDigH', 'name': 'HXVRMXN', 'type': 'artist', 'uri': 'spotify:artist:2kxLXJ17WUKNIwyIfeDigH'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM'

Now, let's change the structure of the dict a little bit

In [29]:
def format_track(info):
    """Flatten a merged track / audio-features dict into one flat record.

    Args:
        info: Dict with the track fields and audio-feature fields at the top
            level, plus nested ``'album'`` (dict) and ``'artists'`` (list of
            dicts with ``'name'`` and ``'id'``) entries.

    Returns:
        dict: Flat record. Scalar fields missing from ``info`` become
        ``None``; a missing ``'artists'`` entry yields empty lists.
        ``'year'`` is the text before the first ``'-'`` of the album release
        date (kept as a string).
    """
    # Resolve the nested containers once; a missing one behaves like an empty one
    album = info.get('album') or {}
    release_date = album.get('release_date')
    artists = info.get('artists') or []

    features = {
        'id': info.get('id'),
        'name': info.get('name'),
        'album': album.get('name'),
        'album_id': album.get('id'),
        'artists': [artist['name'] for artist in artists],
        'artist_ids': [artist['id'] for artist in artists],
        'track_number': info.get('track_number'),
        'disc_number': info.get('disc_number'),
        'explicit': info.get('explicit'),
        'danceability': info.get('danceability'),
        'energy': info.get('energy'),
        'key': info.get('key'),
        'loudness': info.get('loudness'),
        'mode': info.get('mode'),
        'speechiness': info.get('speechiness'),
        'acousticness': info.get('acousticness'),
        'instrumentalness': info.get('instrumentalness'),
        'liveness': info.get('liveness'),
        'valence': info.get('valence'),
        'tempo': info.get('tempo'),
        'duration_ms': info.get('duration_ms'),
        'time_signature': info.get('time_signature'),
        'year': release_date.split('-')[0] if release_date else None,
        'release_date': release_date,

        # new fields
        'popularity': info.get('popularity'),
    }
    return features

In [30]:
# Flatten the merged lookup result and list the resulting fields
feature = format_track(info)
for elem, value in feature.items():
    print(f'{elem}:\t{value}')

id:	5WMzWvcNbVMUDrOIRcZAVa
name:	SWITCHING LANES
album:	SWITCHING LANES
album_id:	4BiEgXM0oeleWNNEevB16J
artists:	['HXVRMXN', 'SLVG', 'XHNORT']
artist_ids:	['2kxLXJ17WUKNIwyIfeDigH', '779QJkvhTmrxtUGaMo8QbK', '23vwxJA0kYYvK0iwe06MAd']
track_number:	2
disc_number:	1
explicit:	True
danceability:	0.675
energy:	0.789
key:	7
loudness:	-10.197
mode:	0
speechiness:	0.035
acousticness:	0.0452
instrumentalness:	0.917
liveness:	0.165
valence:	0.106
tempo:	117.984
duration_ms:	162908
time_signature:	4
year:	2021
release_date:	2021-10-08
popularity:	37


## 2. LIMITATIONS

The code below was supposed to extend our data frame with new features (popularity and genres), but the Spotify API limits both the request rate and the number of searches, so it is not possible to extend the features of 1M+ records

So this code will not be present in the final project; it was only for testing

In [11]:
def get_audio_features(track_id):
    """Collect track metadata, audio features and artist genres for one track.

    This redefinition replaces the earlier ``get_audio_features`` in this
    notebook: besides the audio features it also returns the track object's
    own fields and a ``'genres'`` entry.

    Args:
        track_id: Spotify id of the track.

    Returns:
        dict: The fields of ``sp.track`` updated with the audio-feature fields
        (audio-feature values win on key collisions), plus ``'genres'``: the
        sorted union of the genres of all the track's artists.
    """
    summary = dict(sp.track(track_id))
    # NOTE(review): the audio-features entry is presumably None when Spotify has no
    # analysis for the track; keep the plain track fields in that case.
    audio_info = sp.audio_features([track_id])[0]
    if audio_info:
        summary.update(audio_info)

    artist_ids = [artist['id'] for artist in summary['artists']]
    artist_ids = [artist_id for artist_id in artist_ids if artist_id]

    genres = set()
    # One batched request per 50 artists (the endpoint's id limit) instead of one
    # request per artist, to spend fewer calls against the API rate limit.
    for start in range(0, len(artist_ids), 50):
        batch = sp.artists(artist_ids[start:start + 50])['artists']
        for artist_info in batch:
            if artist_info:
                genres.update(artist_info.get('genres') or [])

    # Sorted so repeated runs produce the same order (set order is arbitrary)
    summary['genres'] = sorted(genres)

    return summary

# search_track already returns the top track item (a dict), so its id is read
# directly; indexing the result with [0] would raise KeyError.
id = search_track('SWITCHING LANES HXVRMXN', 1)['id']
feature = get_audio_features(id)
for elem in feature:
    print(f'{elem}:\t{feature[elem]}')

album:	{'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2kxLXJ17WUKNIwyIfeDigH'}, 'href': 'https://api.spotify.com/v1/artists/2kxLXJ17WUKNIwyIfeDigH', 'id': '2kxLXJ17WUKNIwyIfeDigH', 'name': 'HXVRMXN', 'type': 'artist', 'uri': 'spotify:artist:2kxLXJ17WUKNIwyIfeDigH'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM'

Let's see what information we could get from the 'album' field

In [13]:
# Iterate over key/value pairs so the f-string never has to reuse its own quote
# character inside the braces (a SyntaxError before Python 3.12).
for f, value in feature['album'].items():
    print(f'{f}:\t{value}')

album_type:	album
artists:	[{'external_urls': {'spotify': 'https://open.spotify.com/artist/2kxLXJ17WUKNIwyIfeDigH'}, 'href': 'https://api.spotify.com/v1/artists/2kxLXJ17WUKNIwyIfeDigH', 'id': '2kxLXJ17WUKNIwyIfeDigH', 'name': 'HXVRMXN', 'type': 'artist', 'uri': 'spotify:artist:2kxLXJ17WUKNIwyIfeDigH'}]
available_markets:	['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM', 'FJ', 'GM', 'GE'

Now we are able to collect all the features from the dataframe's feature list and even more (the genres associated with the artists and the popularity of the track), so we can make our search output a little more readable

In [14]:
def format_track(info):
    """Flatten an extended track summary into one flat record.

    Args:
        info: Dict with the track fields, audio-feature fields and ``'genres'``
            at the top level, plus nested ``'album'`` (dict) and ``'artists'``
            (list of dicts with ``'name'`` and ``'id'``) entries.

    Returns:
        dict: Flat record. Scalar fields missing from ``info`` become
        ``None``; a missing ``'artists'`` entry yields empty lists.
        ``'year'`` is the text before the first ``'-'`` of the album release
        date (kept as a string).
    """
    # Resolve the nested containers once; a missing one behaves like an empty one
    album = info.get('album') or {}
    release_date = album.get('release_date')
    artists = info.get('artists') or []

    features = {
        'id': info.get('id'),
        'name': info.get('name'),
        'album': album.get('name'),
        'album_id': album.get('id'),
        'artists': [artist['name'] for artist in artists],
        'artist_ids': [artist['id'] for artist in artists],
        'track_number': info.get('track_number'),
        'disc_number': info.get('disc_number'),
        'explicit': info.get('explicit'),
        'danceability': info.get('danceability'),
        'energy': info.get('energy'),
        'key': info.get('key'),
        'loudness': info.get('loudness'),
        'mode': info.get('mode'),
        'speechiness': info.get('speechiness'),
        'acousticness': info.get('acousticness'),
        'instrumentalness': info.get('instrumentalness'),
        'liveness': info.get('liveness'),
        'valence': info.get('valence'),
        'tempo': info.get('tempo'),
        'duration_ms': info.get('duration_ms'),
        'time_signature': info.get('time_signature'),
        'year': release_date.split('-')[0] if release_date else None,
        'release_date': release_date,

        # new fields
        'popularity': info.get('popularity'),
        'genres': info.get('genres'),
    }
    return features

# NOTE: `feature` is rebound from the raw summary to its flattened form, so this
# cell is not idempotent - re-run the cell that builds `feature` before re-running it.
feature = format_track(feature)
for elem, value in feature.items():
    print(f'{elem}:\t{value}')

id:	5WMzWvcNbVMUDrOIRcZAVa
name:	SWITCHING LANES
album:	SWITCHING LANES
album_id:	4BiEgXM0oeleWNNEevB16J
artists:	['HXVRMXN', 'SLVG', 'XHNORT']
artist_ids:	['2kxLXJ17WUKNIwyIfeDigH', '779QJkvhTmrxtUGaMo8QbK', '23vwxJA0kYYvK0iwe06MAd']
track_number:	2
disc_number:	1
explicit:	True
danceability:	0.675
energy:	0.789
key:	7
loudness:	-10.197
mode:	0
speechiness:	0.035
acousticness:	0.0452
instrumentalness:	0.917
liveness:	0.165
valence:	0.106
tempo:	117.984
duration_ms:	162908
time_signature:	4
year:	2021
release_date:	2021-10-08
popularity:	37
genres:	['drift phonk']


## 3. DATAFRAME EXTENSION

Now we can test what it will look like...

In [17]:
import pandas as pd

df = pd.read_csv("../data/tracks_features.csv")

# Take an explicit copy: a later cell adds and overwrites columns on this preview,
# and writing into a bare slice of `df` triggers pandas' SettingWithCopyWarning.
df_mini = df.iloc[0:5].copy()

df_mini

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02


Let's add popularity and genres of tracks

In [None]:
# Detach from `df` so the assignments below write into an independent frame
df_mini = df_mini.copy()
df_mini['popularity'] = ''
df_mini['genres'] = ''

# Address each row by its index label instead of rebuilding an id mask per track
for row_label, track_id in df_mini['id'].items():
    features = format_track(get_audio_features(track_id))
    df_mini.loc[row_label, 'popularity'] = features['popularity']
    # str() of a list gives the "['a', 'b']" layout seen in the CSV's list columns,
    # renders an empty list as "[]" (not "['']") and stays parseable when a name
    # contains a quote.
    df_mini.loc[row_label, 'genres'] = str(features['genres'])
    df_mini.loc[row_label, 'artists'] = str(features['artists'])

And do it for the whole dataframe...

In [None]:
import pandas as pd
from IPython.display import clear_output

df = pd.read_csv("../data/tracks_features.csv")

df['popularity'] = ''
df['genres'] = ''

errors = []          # ids of the tracks whose lookup failed
error_details = {}   # track id -> repr of the exception, so failures can be diagnosed

for position, (row_label, track_id) in enumerate(df['id'].items()):
    # Progress counter; wait=True swaps the old number for the new one without flicker
    clear_output(wait=True)
    print(position)
    try:
        features = format_track(get_audio_features(track_id))
        # Write by index label: comparing the whole `id` column against every
        # track id makes the loop quadratic in the number of rows.
        df.loc[row_label, 'popularity'] = features['popularity']
        # str() renders an empty genre list as "[]" rather than "['']"
        df.loc[row_label, 'genres'] = str(features['genres'])
    except Exception as exc:
        # Best-effort enrichment: remember the failure and continue with the next track
        errors.append(track_id)
        error_details[track_id] = repr(exc)

671
