In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from config import cid, secret
import pandas as pd
import timeit

auth_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In a separate .py file, set your own credentials (CID and secret). I think using the Web API requires a paid subscription to Spotify. Log into developer.spotify.com, navigate to Dashboards and create an app (give an app name, description, accept the ToS) to generate your credentials.

#### Example list of genres we're interested in:

In [2]:
genre_list = [
    'soundtrack','indie','jazz','pop','electronic','folk','hip-hop','rock','alternative','classical','rap','world',
    'soul','blues','R&B','reggae','ska','dance','country','opera'
]
print(f'We have {len(genre_list)} genres.')

We have 20 genres.


## Get the track/artist/album data

In [3]:
# Start timer
start = timeit.default_timer()

# Set empty lists for each feature
track_name = []
popularity = []
track_id = []
artist_name = []
artist_id = []
artist_genre = []
album_name = []
album_id = []
release_date = []
followers = []

# Loop through the genre list and use each genre as a parameter when running the track-type search 
for genre in genre_list:
    # save track-type search in variable
    track_looped = sp.search(q='genre:'+genre, type='track',limit=3) # set limit to number of songs to grab per genre
    # loop through saved search results, append according to their index/position within the nested levels of dictionaries/lists
    for i, t in enumerate(track_looped['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        artist_id.append(t['artists'][0]['id'])
        album_name.append(t['album']['name'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
        artist_genre.append(genre)

# Using the album names collected from the first search, conduct album-type search to retrieve release date (only found with album search)
for album in album_name:
    album_looped = sp.search(q='album:'+album, type='album',limit=1)
    # similarly, append appropriate metrics found within the nests
    for i, a in enumerate(album_looped['albums']['items']):
        album_id.append(a['id'])
        release_date.append(a['release_date'])

# Lastly, using the artist names collected from first search, conduct artist-type search to retrieve number of followers per artist (only found with artist search)
for name in artist_name: 
    artist_looped = sp.search(q='artist:'+name, type='artist',limit=1)
    for i, n in enumerate(artist_looped['artists']['items']):
        followers.append(n['followers']['total'])

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

This took 11.761856000000002 seconds to complete.


#### Checking total tracks gathered:

In [4]:
print(f'Total number of track IDs: {len(track_id)}')

Total number of track IDs: 60


#### Load the lists as a DataFrame:

In [5]:
df_tracks = pd.DataFrame({
    'genre':artist_genre,
    'artist_name':artist_name, 
    'artist_id':artist_id,
    'followers':followers,
    'track_name':track_name, 
    'track_id':track_id,
    'popularity':popularity,
    'album_name':album_name,
    'album_id':album_id,
    'release_date':release_date
})
df_tracks.head()

Unnamed: 0,genre,artist_name,artist_id,followers,track_name,track_id,popularity,album_name,album_id,release_date
0,soundtrack,Toby Fox,57DlMWmbVIf2ssJ8QBpBau,711972,Fallen Down,1jDMi92a9zNQuPD3uPMkla,77,UNDERTALE Soundtrack,2M2Ae2SvZe3fmzUtlVOV5Z,2015-09-15
1,soundtrack,Michael Giacchino,4kLvhMAuCloLxoP1aVM7Lr,253169,The Batman,1NkI8DtCnjcWVCVLF0gB71,74,The Batman (Original Motion Picture Soundtrack),18nTX27XXEYARGmWMTgD19,2022-02-24
2,soundtrack,Hans Zimmer,0YC192cP3KPCRWx8zr8MfZ,2924485,Cornfield Chase,6pWgRkpqVfxnj3WuIcJ7WP,78,Interstellar (Original Motion Picture Soundtra...,3B61kSKTxlY36cYgzvf3cP,2014-11-18
3,indie,The Walters,027TpXKGwdXP7iwbjUSpV8,324716,I Love You So,4SqWKzw0CbA05TGszDgMlc,96,I Love You So,7ucm85tRsWk6EyVHaYAxe9,2014-11-28
4,indie,Yot Club,6FugQjLquBF4JzATRN70bR,150080,YKWIM?,2vWBUC9djv6BtiGlmKiQaH,88,Bipolar,60jUlxAOAcsiQUEW0XLroT,2019-05-31


In [6]:
group_track_artist_id = df_tracks.sort_values(by=['artist_name','track_name'])
group_track_artist_id
# concatenate artist_id and track_id and assess unique values
# one to many relationship between track and genre
# store all songs in table PK=track_id, another table with just genre, artist table
# if spotify classifies multiple genres for a song, it could 
# aggregate popular songs to genres they're associated to

Unnamed: 0,genre,artist_name,artist_id,followers,track_name,track_id,popularity,album_name,album_id,release_date
37,soul,Adele,4dpARuHxo51G3z768sgnrY,36021792,Easy On Me,0gplL1WMoJ6iYaPgMCL0gX,95,Easy On Me,224jZ4sUX7OhAuMwaxp86S,2021-10-14
38,soul,Adele,4dpARuHxo51G3z768sgnrY,36021792,Oh My God,3Kkjo3cT83cw09VJyrLNwX,92,30,21jF5jlMtzo94wbxmJ18aa,2021-11-19
59,opera,Andrea Bocelli,3EA9hVIzKfFiQI0Kikz2wo,2081771,Con te partirò,7zrpoAJte9o12TzawqgdD0,69,Bocelli (Remastered),3uARqNN4bYqts3Ltg5Jku3,1995-01-01
5,indie,Arctic Monkeys,7Ln80lUS6He07XvHI8qqHH,14112457,505,58ge6dfP91o9oXMzq3XkIS,86,Favourite Worst Nightmare (Standard Version),6rsQnwaoJHxXJRCDBPkBRw,2007-04-24
23,rock,Arctic Monkeys,7Ln80lUS6He07XvHI8qqHH,14112457,505,58ge6dfP91o9oXMzq3XkIS,86,Favourite Worst Nightmare (Standard Version),6rsQnwaoJHxXJRCDBPkBRw,2007-04-24
46,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,45908897,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04
45,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,11164712,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10
53,dance,Becky G,4obzFoKoKRHIphyHzJ35G3,11164712,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10
39,blues,Cage The Elephant,26T3LtbuGT1Fu9m0eRq5X3,2838652,Cigarette Daydreams,2tznHmp70DxMyr2XhWLOW0,83,Melophobia,4EK8gtQfdVsmDTji7gBFlz,2013-10-08
41,blues,Cage The Elephant,26T3LtbuGT1Fu9m0eRq5X3,2838652,Come a Little Closer,4sebUbjqbcgDSwG6PbSGI0,80,Melophobia,4EK8gtQfdVsmDTji7gBFlz,2013-10-08


In [7]:
df_tracks.shape

(60, 10)

In [9]:
df_tracks.drop_duplicates(inplace=True)
df_tracks.shape

(60, 10)

#### Checking for track or artist duplicates

In [10]:
grouped = df_tracks.groupby(['artist_name','track_name'], as_index=True).size()
f'This df_tracks has {grouped[grouped > 1].count()} duplicates to be addressed.'

'This df_tracks has 6 duplicates to be addressed.'

In [11]:
grouped

artist_name            track_name                                                 
Adele                  Easy On Me                                                     1
                       Oh My God                                                      1
Andrea Bocelli         Con te partirò                                                 1
Arctic Monkeys         505                                                            2
Bad Bunny              Yonaguni                                                       1
Becky G                MAMIII                                                         2
Cage The Elephant      Cigarette Daydreams                                            1
                       Come a Little Closer                                           1
Cory Asbury            Reckless Love                                                  1
DJ Snake               Let Me Love You                                                1
                       U Are My High 

Duplicates will probably occur frequently since its an issue with how Spotify classifies multiple genres per artist and how our provided genre parameters are purposely broad. E.g. Earth, Wind & Fire is classified as Jazz and Soul (among other things), so may appear in our dataset multiple times if the dataset is large enough.

#### Drop duplicates

In [None]:
print(f'We start with a DataFrame with shape {df_tracks.shape}.')

In [None]:
df_tracks.drop_duplicates(subset=['artist_name','track_name'], inplace=True)

In [None]:
grouped_after_drop = df_tracks.groupby(['artist_name','track_name'], as_index=True).size()
print(f'This df_tracks has {grouped_after_drop[grouped_after_drop > 1].count()} duplicates to be addressed.')

In [None]:
print(f'DataFrame has shape {df_tracks.shape} after addressing any duplicates.')

## Get the audio features per track

In [None]:
# The audio features search has a limit of 100 track IDs that can be submitted per query.
start = timeit.default_timer()

# empty list for rows, define the batchsize as the limit per query, zeroing the None_counter
rows = []
batchsize = 100
None_counter = 0

for i in range(0, len(df_tracks['track_id']), batchsize):
    batch = df_tracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter +1
        else:
            rows.append(t)

print(f'Number of tracks where no audio features were available:', None_counter)

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

#### Inspect the audio features dataset

In [None]:
print(f'Number of elements in audio features dataset: {len(rows)}')

In [None]:
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
df_audio_features.head()

In [None]:
df_audio_features[df_audio_features.duplicated(subset=['id'],keep=False)]

In [None]:
df_audio_features.columns

In [None]:
df_tracks.columns

#### Transform audio features DataFrame

In [None]:
df_audio_features.drop(['track_href','analysis_url','uri','type'], axis=1, inplace=True)
df_audio_features.rename(columns={'id':'track_id'}, inplace=True)

### Generate the dataframes in the format of tables in ERD

In [None]:
df_all = pd.merge(df_tracks, df_audio_features, on='track_id', how='inner')
df_all

In [None]:
df_all.shape

In [None]:
df_all.columns

In [None]:
artist_data = df_all[['artist_name','artist_id','followers']]
album_data = df_all[['album_name','album_id','release_date']]
track_features = df_all[[
    'track_name',
    'track_id',
    'artist_id',
    'album_id',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'genre',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]]

In [None]:
print(f'DataFrames artist_data has shape {artist_data.shape}, album_data has shape {album_data.shape} and track_features has shape {track_features.shape}')

In [None]:
# # Imports
# from sqlalchemy import create_engine
# import psycopg2 
# from config import db_password

# # Create connection to database (endpoint to be decided)
# db_string = f"postgresql://postgres:{db_password}@{endpoint}"

# # instantiate engine
# engine = create_engine(db_string)

# artist_data.to_sql(name='artist_data', con=engine, if_exists='replace', index=False)
# album_data.to_sql(name='album_data', con=engine, if_exists='replace', index=False)
# track_features.to_sql(name='track_features', con=engine, if_exists='replace', index=False)