In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# Raise pandas' display limits (rows, columns, console width) for this notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Connect to the spotify API
# The credentials are read from the environment rather than written into the
# notebook, so they are not stored in version control or in shared copies.
# Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel;
# a missing variable fails immediately with a KeyError naming it.
# SECURITY: any client id/secret that was ever committed in this notebook
# should be treated as leaked and rotated in the Spotify developer dashboard.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

<h4>Functions</h4>

In [4]:
def artist_tracks(artist_id):
    """
    Expects: The unique spotify id for artists
    Return: dataframe with one row per track found on the artist's albums,
            with the columns 'arist_id', 'artist_name', 'track_name', 'track_id'

    Notes:
    - The column name 'arist_id' (sic) is kept as-is so that existing callers
      and already exported CSV files keep working.
    - The artist columns hold the first artist listed on each track; the code
      does not check that this is the artist that was asked for.
    - Only the page returned by a single sp.artist_albums / sp.album_tracks
      call is read (no pagination), exactly as before.
    - An artist without any albums gives an empty dataframe that still has
      the four columns.
    """
    columns = ['arist_id', 'artist_name', 'track_name', 'track_id']

    # Get all of the works from the artist
    artist_uri = sp.artist(artist_id)['uri']
    artist_album = sp.artist_albums(artist_uri)['items']

    rows = []
    for album in artist_album:
        # One request per album. The previous version re-downloaded the same
        # track list twice for every single track and then issued one more
        # request per track; the entries of the album track listing already
        # carry the artists, name and id that are needed here.
        for track in sp.album_tracks(album['id'])['items']:
            first_artist = track['artists'][0]
            rows.append((
                first_artist['id'],
                first_artist['name'],
                track['name'],
                track['id'],
            ))

    # Convert the collected rows into a pandas dataframe
    return pd.DataFrame(rows, columns=columns)

In [5]:
# Get all the tracks from a list of artists
def get_tracks(artist_id):
    """
    Expects: List of unique spotify ids for artists
    Return: Dataframe with all the artists' tracks and features, i.e. the
            per-artist frames from artist_tracks stacked in input order with
            a fresh 0..n-1 index. An empty list gives an empty dataframe.
    """
    # Build every per-artist frame first and concatenate once: calling
    # pd.concat inside the loop copies all previously collected rows again
    # on every iteration.
    frames = [artist_tracks(artist) for artist in artist_id]
    if not frames:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [6]:
def playlist_artists(playlist_id):

    """
    Expects: Unique spotify playlist_id
    Return: Dataframe with all the artists_name and artist_id in playlist
            (columns 'artist' and 'id', one row per distinct id, sorted by
            artist name, index 0..n-1). The artist recorded for a track is
            the first artist of the track's album.
    """

    # sp.playlist only embeds one page of tracks; keep following the paging
    # object's 'next' link so that long playlists are not silently truncated.
    page = sp.playlist(playlist_id)['tracks']

    # Collect artist name and id
    artists = []
    artist_id = []
    while page:
        for item in page['items']:
            track = item.get('track')
            if not track:
                # Entries can come back without a track object; skip them
                # instead of failing on the subscript below.
                continue
            album_artists = (track.get('album') or {}).get('artists')
            if not album_artists:
                # Nothing to record when the album lists no artists.
                continue
            artists.append(album_artists[0]['name'])
            artist_id.append(album_artists[0]['id'])
        page = sp.next(page) if page.get('next') else None

    # Create dataframe
    df_artist = (
        pd.DataFrame({'artist': artists, 'id': artist_id})
        .drop_duplicates(subset='id', keep='first')
        .sort_values('artist')
        .reset_index(drop=True)
    )
    return df_artist

In [7]:
# Get all the artists from a list of playlists
def get_artists(playlist_id):

    """
    Expects: List of unique spotify playlist_id
    Return: Dataframe with all the artists_name and artist_id in playlists
            (columns 'artist' and 'id', one row per distinct id, sorted by
            artist name, index 0..n-1). An empty list gives an empty
            dataframe.
    """

    # Fetch every playlist first, then combine, de-duplicate and sort once
    # instead of redoing all three steps on the growing frame per playlist.
    frames = [playlist_artists(playlist) for playlist in playlist_id]
    if not frames:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return (
        pd.concat(frames)
        # keep='first' keeps the row from the earliest playlist in the list
        .drop_duplicates(subset='id', keep='first')
        .sort_values('artist')
        .reset_index(drop=True)
    )

In [8]:
# Add tracks info to existing dataframe
def add_tracks(df_artist, df_old):

    """
    Expects: Dataframe with artist_ids (column 'id') and Dataframe to append to
    Modifies: Nothing; neither argument is changed in place, so the caller
              must use the returned dataframe
    Return: Dataframe with all the artists' tracks and features: the tracks
            of the artists in df_artist (last artist first) followed by the
            rows of df_old, with a fresh 0..n-1 index. When df_artist has no
            rows, df_old itself is returned.
    """

    # Fetch everything first and concatenate once instead of re-copying the
    # growing frame for every artist.
    frames = [artist_tracks(artist_id) for artist_id in df_artist['id']]
    if not frames:
        return df_old
    # Each new frame used to be placed in front of the accumulated result, so
    # the most recently fetched artist comes first and df_old comes last.
    return pd.concat(frames[::-1] + [df_old], ignore_index=True)

#### Get Artists

Playlists
- Dancehall Official - https://open.spotify.com/playlist/37i9dQZF1DXan38dNVDdl4?si=796f8dcd18504250
- Dancehall Throwback - https://open.spotify.com/playlist/37i9dQZF1DWWDLvD3P5wOG?si=9cfe16982d4a4619
- Dancehall for the Ladies - https://open.spotify.com/playlist/0Dyqw9ctRDr99alJMSs9hg?si=6b89f30cbfce4e27
- Dancehall 2022 VP records - https://open.spotify.com/playlist/4Fz74ZmK4kIPlRAZrpS4qr?si=5d90c766cd144074
- Reggae Mix - https://open.spotify.com/playlist/37i9dQZF1EQpjs4F0vUZ1x
- Chilled Reggae - https://open.spotify.com/playlist/37i9dQZF1DWYtKpmml7moA
- Island Reggae - https://open.spotify.com/playlist/37i9dQZF1DX3mvx0imueXo
- One Love - https://open.spotify.com/playlist/37i9dQZF1DWSiyIBdVQrkk
- Reggae Classics - https://open.spotify.com/playlist/37i9dQZF1DXbSbnqxMTGx9
- Gully vs Gaza - https://open.spotify.com/playlist/5aq6JG4RPxrZVKwSGXIpe9 
- Dancehall 2006~2017 - https://open.spotify.com/playlist/65H35PVeMgbHpLBaJ8dG8t
- https://open.spotify.com/playlist/1fHhCYhV1qTDbG7AlksF9E
- Y.t Mix Dancehall  2006 -2012 - https://open.spotify.com/playlist/3XyCpoHT3dmVK1ynfTB95k
- 90s Dancehall - https://open.spotify.com/playlist/3dEL2fn2HqnnIpSlearSJE  
- 80s/90s Dancehall (Old Dub) - https://open.spotify.com/playlist/3NiL4K6dkIjXWghSteDwXM
- 80s Dancehall - https://open.spotify.com/playlist/3wHBhXPY0XBc3bIscoJ0pI
- 90s Dancehall [1995-1999] - https://open.spotify.com/playlist/0hSaGasIKyAd0mjaQmiicP

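The playlist IDs (the last path segment of each URL above) are collected in a list. Note that the unnamed playlist `1fHhCYhV1qTDbG7AlksF9E` does not appear in the list below.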

In [11]:
# Spotify playlist ids to scan for artists, one per line; the trailing
# comment gives the playlist's name as listed in the notes above.
playlist_id = [
    '37i9dQZF1DXan38dNVDdl4',  # Dancehall Official
    '37i9dQZF1DWWDLvD3P5wOG',  # Dancehall Throwback
    '0Dyqw9ctRDr99alJMSs9hg',  # Dancehall for the Ladies
    '4Fz74ZmK4kIPlRAZrpS4qr',  # Dancehall 2022 VP records
    '37i9dQZF1EQpjs4F0vUZ1x',  # Reggae Mix
    '37i9dQZF1DWYtKpmml7moA',  # Chilled Reggae
    '37i9dQZF1DX3mvx0imueXo',  # Island Reggae
    '37i9dQZF1DWSiyIBdVQrkk',  # One Love
    '37i9dQZF1DXbSbnqxMTGx9',  # Reggae Classics
    '5aq6JG4RPxrZVKwSGXIpe9',  # Gully vs Gaza
    '65H35PVeMgbHpLBaJ8dG8t',  # Dancehall 2006~2017
    '3XyCpoHT3dmVK1ynfTB95k',  # Y.t Mix Dancehall 2006 -2012
    '3dEL2fn2HqnnIpSlearSJE',  # 90s Dancehall
    '3NiL4K6dkIjXWghSteDwXM',  # 80s/90s Dancehall (Old Dub)
    '3wHBhXPY0XBc3bIscoJ0pI',  # 80s Dancehall
    '0hSaGasIKyAd0mjaQmiicP',  # 90s Dancehall [1995-1999]
]

Get all the unique artists from all the playlists and return a dataframe with the artist name and artist id.

In [12]:
# df_artist = get_artists(playlist_id)

Use industry knowledge to remove unnecessary or irrelevant artists from the dataframe, i.e. artists whose primary genre isn't Jamaican Music.

In [14]:
# drop = ['The Yutes','¡MAYDAY!','Taste of Pluto','echoBeats','MXSSIVH','H.E.R.','Haleek Maul','Safaree','Active Wingz','Cruel Santino','Kevin Lyttle',
#         'Father Philis','Various Artists','Salaam Remi','Mink Jo','Total Recall Vol. 5','Yerry Solis','The FaNaTiX','Reggae RoastInner Circle', 'Dj Jahmar',
#         'Soul Jazz Records Presents','Rihanna','Marlon Asher','Chris Brown','Notch','Ape Drums','Juls','Mura Masa','Reggae Gold','Playing For Change',
#         'JR Kenna','Chaka Demus & Pliers','WSTRN','Frighty','Capella Grey','Puzzle XIII','Shan Chanai','Verse Simmonds'
# ]

In [16]:
# for artist in drop:
#     if artist in list(df_artist['artist']):
#         df_artist.drop(df_artist[df_artist['artist'] == artist].index, axis=0, inplace=True)
#         print(artist)
# df_artist.head()

Unnamed: 0,artist,id
0,10Tik,6I1j34QzSTWe6u4qQWKYJe
1,1Biggs Don,6NuqWCp8VQhokoHpiUY1mS
2,450,2v6V75NbousiJwy2HV44VL
3,A92,5uWT1NONby2BqNCu42fdDc
5,Admiral Bailey,1zYGgHaZVz4Q2NrPngXiCx


Create csv file from dataframe

In [17]:
# df_artist.to_csv('artists.csv', index=False)

#### Get Tracks for each artist

Create empty dataframe to store track information

In [19]:
# df_tracks = pd.DataFrame()

Due to an unavoidable timeout error from the Spotify API, the df_artist dataframe needs to be split into chunks and each chunk run under supervision.
Timeout errors may still occur; the chunks that fail are recorded manually as failures, concatenated, and then split again, until the track information for all artists has been recorded.

In [18]:
# df_split = np.array_split(df_artist, 50)

Manually iterate through the splits and manually record failures. If there were a way to avoid the timeout error from the API, this step would not be necessary.

In [22]:
# %%time
# df_tracks = add_tracks(df_split[0], df_tracks)

CPU times: total: 3 s
Wall time: 1min 49s


In [16]:
# %%time
# df_tracks = add_tracks(df_split[0], df_tracks)

In [17]:
# %%time
# df_tracks = add_tracks(df_split[2], df_tracks)

- Spotify also restricts the number of requests made but does not state the number. When this happens the api will fail to launch and a new app may need to be created.
- Max runtime before error was observed to be 15 minutes

Store the first dataframe as a CSV file before working on the failures.

In [18]:
# df_tracks.to_csv('tracks.csv', index=False)

Failures
- [0] - timeout
- [3] - timeout

- [10] - timeout
- [11] - timeout
- [12] - timeout
- [15] - timeout
- [18] - timeout
- [19] - timeout

- [20] - timeout
- [24] - timeout
- [26] - timeout

- [37] - timeout
- [38] - timeout
- [42] - timeout

Concatenate all the failures and repeat the process.

In [19]:
# df_split = pd.concat([
#     df_split[0],
#     df_split[3],
#     df_split[10],
#     df_split[11],
#     df_split[12],
#     df_split[15],
#     df_split[18],
#     df_split[19],
#     df_split[20],
#     df_split[24],
#     df_split[26],
#     df_split[37],
#     df_split[38],
#     df_split[42],
# ], ignore_index=True)

In [20]:
# df_split = np.array_split(df_split, 20)

In [21]:
# df_tracks_2 = pd.DataFrame()

In [22]:
# %%time
# df_tracks_2 = add_tracks(df_split[0], df_tracks_2)

In [23]:
# df_tracks_2.to_csv('tracks_2.csv', index=False)

Concatenate all the track dataframes and store them as one final CSV file.

In [24]:
# df_track = pd.read_csv('tracks.csv')
# df_track_2 = pd.read_csv('tracks_2.csv')

In [25]:
# df_tracks = pd.concat([
#     df_track,
#     df_track_2
# ], ignore_index=True).sort_values('artist_name').reset_index(drop=True)

In [26]:
# df_tracks.to_csv('tracks.csv', index=False)

<h4>Extras</h4>

In [43]:
# df = pd.read_csv('tracks.csv')

In [44]:
# track_id = df['track_id'].tolist()

In [62]:
# track_id_split = np.array_split(track_id, 460)

In [90]:
# %%time
# mode = []
# for i in range(100):
#     mode.append(sp.audio_features(track_id_split[i]))

CPU times: user 455 ms, sys: 34.2 ms, total: 489 ms
Wall time: 36.9 s


In [102]:
# %%time
# mode_2 = []
# for i in range(100, 200):
#     mode_2.append(sp.audio_features(track_id_split[i]))

CPU times: user 452 ms, sys: 34.6 ms, total: 486 ms
Wall time: 41.6 s


In [103]:
# %%time
# mode_3 = []
# for i in range(200, 300):
#     mode_3.append(sp.audio_features(track_id_split[i]))

CPU times: user 476 ms, sys: 35 ms, total: 511 ms
Wall time: 42.3 s


In [104]:
# %%time
# mode_4 = []
# for i in range(300, 400):
#     mode_4.append(sp.audio_features(track_id_split[i]))

CPU times: user 453 ms, sys: 33.6 ms, total: 486 ms
Wall time: 47 s


In [105]:
# %%time
# mode_5 = []
# for i in range(400, 460):
#     mode_5.append(sp.audio_features(track_id_split[i]))

CPU times: user 276 ms, sys: 23.3 ms, total: 299 ms
Wall time: 29.8 s


In [119]:
# track_features = []
# for i in mode:
#     for j in i:
#          track_features.append(j)

In [122]:
# for i in mode_2:
#     for j in i:
#          track_features.append(j)

In [123]:
# for i in mode_3:
#     for j in i:
#          track_features.append(j)

In [124]:
# for i in mode_4:
#     for j in i:
#          track_features.append(j)

In [125]:
# for i in mode_5:
#     for j in i:
#          track_features.append(j)

In [126]:
# len(track_features)

45484

In [156]:
# df_track_features = pd.DataFrame(track_features)

In [157]:
# df_track_features.to_csv('track_features.csv')

In [165]:
# df['mode'] = df_track_features['mode']

In [166]:
# df.to_csv('tracks.csv', index=False)