In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from config import cid, secret
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

import timeit

auth_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In a separate .py file, set your own credentials (CID and secret). I think using the Web API requires a paid subscription to Spotify. Log into developer.spotify.com, navigate to Dashboards and create an app (give an app name, description, accept the ToS) to generate your credentials.

#### Example list of genres we're interested in:

In [3]:
genre_list = [
    'soundtrack','indie','jazz','pop','electronic','folk','hip-hop','rock','alternative','classical','rap','world',
    'soul','blues','R&B','reggae','ska','dance','country','opera'
]
print(f'We have {len(genre_list)} genres.')

We have 20 genres.


## Get the track/artist/album data

In [4]:
# Start timer
start = timeit.default_timer()

# Set empty lists for each feature
track_name = []
popularity = []
track_id = []
artist_name = []
artist_id = []
artist_genre = []
album_name = []
album_id = []
release_date = []
followers = []

# Loop through the genre list and use each genre as a parameter when running the track-type search 

for genre in genre_list:
    # save track-type search in variable
    track_looped = sp.search(q='genre:'+genre, type='track',limit=50) # set limit to number of songs to grab per genre.
    time.sleep(90)        

    # loop through saved search results, append according to their index/position within the nested levels of dictionaries/lists
    for i, t in enumerate(track_looped['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        artist_id.append(t['artists'][0]['id'])
        album_name.append(t['album']['name'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
        artist_genre.append(genre)

# Using the album names collected from the first search, conduct album-type search to retrieve release date (only found with album search)
for album in album_name:
    album_looped = sp.search(q='album:'+album, type='album',limit=1)
    # similarly, append appropriate metrics found within the nests
    for i, a in enumerate(album_looped['albums']['items']):
        album_id.append(a['id'])
        release_date.append(a['release_date'])

# Lastly, using the artist names collected from first search, conduct artist-type search to retrieve number of followers per artist (only found with artist search)
for name in artist_name: 
    artist_looped = sp.search(q='artist:'+name, type='artist',limit=1)
    for i, n in enumerate(artist_looped['artists']['items']):
        followers.append(n['followers']['total'])

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

This took 1999.4626278 seconds to complete.


#### Checking total tracks gathered:

In [5]:
print(f'Total number of track IDs: {len(track_id)}')

Total number of track IDs: 1000


In [7]:
print(
    len(artist_genre),
    len(artist_name),
    len(artist_id),
    len(followers),
    len(track_name),
    len(track_id),
    len(popularity),
    len(album_name),
    len(album_id),
    len(release_date)
     )

1000 1000 1000 999 1000 1000 1000 1000 995 995


In [13]:
release_date

['2015-09-15',
 '2022-02-24',
 '2014-11-18',
 '2022-03-11',
 '2010-07-09',
 '2005-01-26',
 '2003-07-22',
 '1999-05-04',
 '2015-09-15',
 '2004-11-19',
 '2018-11-09',
 '2009-01-01',
 '2010-01-01',
 '2003-05-20',
 '2022-01-21',
 '2021-12-17',
 '2022-02-24',
 '2019-11-12',
 '2001-07-18',
 '2021-09-17',
 '2002-08-20',
 '2015-09-15',
 '2012-01-01',
 '2010-01-01',
 '2013-01-31',
 '2001-04-23',
 '2009-01-01',
 '2019-11-20',
 '2017-06-07',
 '1993-05-25',
 '2014-11-18',
 '2005-09-19',
 '2015-01-01',
 '2014-11-24',
 '2015-09-15',
 '2018-01-10',
 '2021-09-17',
 '2020-12-11',
 '2004-11-19',
 '1969',
 '2001-11-19',
 '2020-06-05',
 '2017-06-07',
 '2014-11-18',
 '1980-01-01',
 '2010-01-01',
 '2014-11-18',
 '2015-09-15',
 '2022-02-24',
 '2014-11-28',
 '2019-05-31',
 '2007-04-24',
 '2016-05-13',
 '2020-08-07',
 '2015-08-28',
 '2021-12-02',
 '2021-05-15',
 '2011-09-02',
 '2015-07-24',
 '2018-02-09',
 '2019-11-22',
 '2020-06-29',
 '2020-08-06',
 '2021-09-10',
 '2016-01-15',
 '2021-09-03',
 '2017-05-05',
 

#### Load the lists as a DataFrame:

In [6]:
df_tracks = pd.DataFrame({
    'genre':artist_genre,
    'artist_name':artist_name, 
    'artist_id':artist_id,
    'followers':followers,
    'track_name':track_name, 
    'track_id':track_id,
    'popularity':popularity,
    'album_name':album_name,
    'album_id':album_id,
    'release_date':release_date
})
df_tracks

ValueError: arrays must all be same length

In [None]:
group_track_artist_id = df_tracks.sort_values(by=['artist_name','track_name'])
group_track_artist_id
# concatenate artist_id and track_id and assess unique values
# one to many relationship between track and genre
# store all songs in table PK=track_id, another table with just genre, artist table
# if spotify classifies multiple genres for a song, it could 
# aggregate popular songs to genres they're associated to

In [None]:
df_tracks.shape

In [None]:
df_tracks.drop_duplicates(inplace=True)
df_tracks.shape

#### Create natural key 

In [None]:
df_tracks['natural_key'] = df_tracks.artist_name.map(str) + "_" + df_tracks.track_name

df_tracks

#### Are there duplicates?

In [None]:
duplicates = df_tracks[df_tracks.duplicated('natural_key')]

duplicates

## Get the audio features per track

In [None]:
# The audio features search has a limit of 100 track IDs that can be submitted per query.
start = timeit.default_timer()

# empty list for rows, define the batchsize as the limit per query, zeroing the None_counter
rows = []
batchsize = 100
None_counter = 0

for i in range(0, len(df_tracks['track_id']), batchsize):
    if (i % 10 == 0 and i >= 10):
        set_count += 1
        record_count = 1
        
    print(f'Processing record {record_count} of Set {set_count} | {genre}')
    record_count += 1
    batch = df_tracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    time.sleep(60)
    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter +1
        else:
            rows.append(t)

print(f'Number of tracks where no audio features were available:', None_counter)

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

#### Inspect the audio features dataset

In [None]:
print(f'Number of elements in audio features dataset: {len(rows)}')

In [None]:
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
df_audio_features.head()

In [None]:
df_audio_features.columns

In [None]:
df_tracks.columns

#### Transform audio features DataFrame

In [None]:
df_audio_features.drop(['track_href','analysis_url','uri','type'], axis=1, inplace=True)
df_audio_features.rename(columns={'id':'track_id'}, inplace=True)

### Generate the dataframes in the format of tables in ERD

In [None]:
df_all = pd.merge(df_tracks, df_audio_features, on='track_id', how='inner')
df_all

In [None]:
df_all.shape

In [None]:
df_all.columns

In [None]:
genre_data = df_all[['artist_name','artist_id','followers']]
album_data = df_all[['album_name','album_id','release_date']]
track_features = df_all[[
    'track_name',
    'track_id',
    'artist_id',
    'album_id',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'genre',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]]

In [None]:
print(f'DataFrames artist_data has shape {artist_data.shape}, album_data has shape {album_data.shape} and track_features has shape {track_features.shape}')

In [None]:
# # Imports
# from sqlalchemy import create_engine
# import psycopg2 
# from config import db_password

# # Create connection to database (endpoint to be decided)
# db_string = f"postgresql://postgres:{db_password}@{endpoint}"

# # instantiate engine
# engine = create_engine(db_string)

# genre_data.to_sql(name='genre_data', con=engine, if_exists='replace', index=False)
# album_data.to_sql(name='album_data', con=engine, if_exists='replace', index=False)
# track_features.to_sql(name='track_features', con=engine, if_exists='replace', index=False)