In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from config import cid, secret
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

import timeit

# Authenticate with the client-credentials flow using the ID/secret imported from config,
# then build the Spotify client (`sp`) that the API calls below go through.
auth_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In a separate `config.py` file, set your own credentials as the variables `cid` (client ID) and `secret` (client secret); the first cell imports them from there. Using the Web API may require a paid Spotify subscription (not confirmed). To generate credentials, log in to developer.spotify.com, open the Dashboard, and create an app (give it a name and description, and accept the ToS).

#### Example list of genres we're interested in:

In [2]:
# Genres to query. Only the last five are active; the commented-out entries are
# further candidate genres that are currently disabled.
genre_list = [
#     'soundtrack','indie','jazz','pop','electronic',
#     'folk','hip-hop','rock','alternative','classical',
#     'rap','world','soul','blues','R&B',
    'reggae','ska','dance','country','opera'
]
print(f'We have {len(genre_list)} genres.')

We have 5 genres.


## Get the track/artist/album data

In [3]:
# Time the whole collection run
start = timeit.default_timer()

# One accumulator list per feature (followers is declared here but not filled by this cell)
track_name, popularity, track_id = [], [], []
artist_name, artist_id, artist_genre = [], [], []
album_name, album_id, release_date = [], [], []
followers = []

# Run one track-type search per genre and flatten every hit into the accumulator lists
for genre in genre_list:
    # limit = number of songs to grab per genre
    track_looped = sp.search(q='genre:' + genre, type='track', limit=2)
    time.sleep(10)

    # Each item is a nested dict; pull out the fields of interest
    for track in track_looped['tracks']['items']:
        lead_artist = track['artists'][0]  # only the first listed artist is recorded

        artist_name.append(lead_artist['name'])
        artist_id.append(lead_artist['id'])
        album_name.append(track['album']['name'])
        album_id.append(track['album']['id'])
        track_name.append(track['name'])
        track_id.append(track['id'])
        popularity.append(track['popularity'])
        release_date.append(track['album']['release_date'])
        artist_genre.append(genre)  # label the track with the genre that was searched for

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

This took 25.702692000000003 seconds to complete.


In [4]:
# Sanity check: how many track IDs the search loop collected in total.
print(f'Total number of track IDs: {len(track_id)}')

Total number of track IDs: 10


In [5]:
# Inspect the collected artist IDs (one entry per track, so an ID can appear more than once).
artist_id

['4obzFoKoKRHIphyHzJ35G3',
 '4q3ewBCX7sLwd24euuV69X',
 '0EdvGhlC1FkGItLOWQzG4J',
 '5LfGQac0EIXyAN8aUwmNAQ',
 '5cj0lLjcoR7YOSnhnX0Po5',
 '5cj0lLjcoR7YOSnhnX0Po5',
 '4oUHIQIBe0LHzYfvXNW4QM',
 '4oUHIQIBe0LHzYfvXNW4QM',
 '6cXMpsP9x0SH4kFfMyVezF',
 '4JSWO1Pf2zV991fq64uAce']

In [6]:
# # Lastly, using the artist IDs collected from the first search, look up each artist to retrieve the number of followers per artist (only available from the artist lookup)
# artist_looped = []
# artist_looped = sp.artists(artists=artist_id)
# for i, n in enumerate(artist_looped['artists']):
#     followers.append(n['followers']['total'])
# print(followers)
# print(f'Total: {len(followers)}')

#### Checking total tracks gathered:

In [22]:
# Every feature list should report the same length: one entry per collected track.
collected_lists = [
    artist_genre, artist_name, artist_id,
    track_name, track_id, popularity,
    album_name, album_id, release_date,
]
print(*(len(values) for values in collected_lists))

10 10 10 10 10 10 10 10 10


#### Load the lists as a DataFrame:

In [23]:
# Column name -> collected list; dict order fixes the column order of the frame.
# 'followers' stays disabled because the follower lookup is not run.
track_columns = {
    'genre': artist_genre,
    'artist_name': artist_name,
    'artist_id': artist_id,
    # 'followers': followers,
    'track_name': track_name,
    'track_id': track_id,
    'popularity': popularity,
    'album_name': album_name,
    'album_id': album_id,
    'release_date': release_date,
}
df_tracks = pd.DataFrame(track_columns)
df_tracks

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06


In [15]:
# View the tracks ordered by artist name, then by track name.
group_track_artist_id = df_tracks.sort_values(by=['artist_name','track_name'])
group_track_artist_id
# Data-model notes:
# - concatenate artist_id and track_id and assess unique values
# - there is a one-to-many relationship between track and genre
# - store all songs in one table (PK = track_id), with a separate genre table and an artist table
# - NOTE(review): the original note here was left unfinished; presumably: if Spotify
#   classifies a song under multiple genres, popular songs could be aggregated under
#   each genre they are associated with

Unnamed: 0,genre,artist_name,artist_id,followers,track_name,track_id,popularity,album_name,album_id,release_date
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,45995488,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,11174590,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,17232250,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,17232250,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,60381,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,777889,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,3127931,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,3127931,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,2186497,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,4605992,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17


In [16]:
# Shape before de-duplication, for comparison with the shape printed after drop_duplicates.
df_tracks.shape

(10, 10)

In [17]:
# Remove rows that are identical across all columns (mutates df_tracks in place),
# then show the shape again to see whether anything was dropped.
df_tracks.drop_duplicates(inplace=True)
df_tracks.shape

(10, 10)

#### Create natural key 

In [18]:
# Natural key = "<artist name>_<track name>"; the artist name is cast to str first.
artist_part = df_tracks['artist_name'].map(str)
df_tracks['natural_key'] = artist_part + "_" + df_tracks['track_name']

df_tracks

Unnamed: 0,genre,artist_name,artist_id,followers,track_name,track_id,popularity,album_name,album_id,release_date,natural_key
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,11174590,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10,Becky G_MAMIII
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,45995488,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04,Bad Bunny_Yonaguni
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,2186497,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30,Sublime_Santeria
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,4605992,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17,"The Offspring_You're Gonna Go Far, Kid"
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,17232250,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,Doja Cat_Woman
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,17232250,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,Doja Cat_Need to Know
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,3127931,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08,Morgan Wallen_Wasted On You
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,3127931,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27,Morgan Wallen_Whiskey Glasses
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,777889,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11,Josh Groban_You Raise Me Up
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,60381,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06,Jean-Philippe Rameau_The Arts and the Hours


In [24]:
# Check the column dtypes; note that release_date is an object column here, not a datetime.
df_tracks.dtypes

genre           object
artist_name     object
artist_id       object
track_name      object
track_id        object
popularity       int64
album_name      object
album_id        object
release_date    object
dtype: object

#### Create new column to bin dates to seasons

In [None]:
# Bin each track's release date into a season, using the meteorological
# Northern-Hemisphere convention (Dec-Feb winter, Mar-May spring, Jun-Aug summer,
# Sep-Nov autumn).
# The month is read straight from the 'YYYY-MM...' text rather than by parsing full
# dates, so a value that carries no month (e.g. a bare year) gets a missing season
# instead of being silently treated as January.
# NOTE(review): Spotify may report some release dates with year-only precision -- confirm.
SEASON_BY_MONTH = {
    '12': 'winter', '01': 'winter', '02': 'winter',
    '03': 'spring', '04': 'spring', '05': 'spring',
    '06': 'summer', '07': 'summer', '08': 'summer',
    '09': 'autumn', '10': 'autumn', '11': 'autumn',
}
# Two-digit month as text, or NaN where the value has no 'YYYY-MM' prefix
release_month = (
    df_tracks['release_date']
    .astype(str)
    .str.extract(r'^\d{4}-(\d{2})', expand=False)
)
df_tracks['release_season'] = release_month.map(SEASON_BY_MONTH)

#### Are there duplicates?

In [21]:
# Count the rows whose natural_key repeats the key of an earlier row.
int(df_tracks.duplicated('natural_key').sum())

0

## Get the audio features per track

In [None]:
# The audio features search has a limit of 100 track IDs that can be submitted per query,
# so the track IDs are sent in batches of at most `batchsize`.
start = timeit.default_timer()

# rows collects one audio-features dict per track; None_counter counts the tracks
# for which the API returned no features.
rows = []
batchsize = 100
None_counter = 0

all_track_ids = df_tracks['track_id'].tolist()
total_batches = (len(all_track_ids) + batchsize - 1) // batchsize  # ceiling division

for batch_number, offset in enumerate(range(0, len(all_track_ids), batchsize), start=1):
    # Progress is reported per batch; a batch mixes genres, so no genre is shown.
    print(f'Processing batch {batch_number} of {total_batches}')
    batch = all_track_ids[offset:offset + batchsize]
    feature_results = sp.audio_features(batch)

    for features in feature_results:
        if features is None:
            None_counter += 1
        else:
            rows.append(features)

    # Pause between requests (presumably to stay under the API rate limit);
    # there is nothing to wait for after the final batch.
    if batch_number < total_batches:
        time.sleep(60)

print('Number of tracks where no audio features were available:', None_counter)

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

#### Inspect the audio features dataset

In [None]:
# Number of tracks for which audio features were actually returned (None results are excluded from rows).
print(f'Number of elements in audio features dataset: {len(rows)}')

In [None]:
# Each element of rows is one track's feature dict, so each becomes one DataFrame row.
df_audio_features = pd.DataFrame(rows)
df_audio_features.head()

In [None]:
# List the audio-features columns to decide which ones to keep.
df_audio_features.columns

In [None]:
# List the tracks columns for comparison with the audio-features columns before merging.
df_tracks.columns

#### Transform audio features DataFrame

In [None]:
# Drop the URL/type bookkeeping columns and rename 'id' to 'track_id', the key the
# tracks frame uses. A new frame is bound instead of mutating in place, and columns
# that are already absent are ignored, so re-running this cell is a no-op rather
# than a KeyError.
df_audio_features = (
    df_audio_features
    .drop(columns=['track_href', 'analysis_url', 'uri', 'type'], errors='ignore')
    .rename(columns={'id': 'track_id'})
)

### Generate the dataframes in the format of tables in ERD

In [None]:
# Inner join on track_id: only tracks present in both frames are kept.
df_all = df_tracks.merge(df_audio_features, how='inner', on='track_id')
df_all

In [None]:
# Shape of the merged frame (rows, columns).
df_all.shape

In [None]:
# Columns available after the merge, for choosing the per-table column subsets below.
df_all.columns

In [None]:
# Split the merged frame into the three tables of the ERD.
# 'followers' only exists when the artist follower lookup has been run and the column
# has been added to df_tracks; selecting it unconditionally raises a KeyError, so it
# is included only when present.
genre_columns = [
    col for col in ['artist_name', 'artist_id', 'followers']
    if col in df_all.columns
]
genre_data = df_all[genre_columns]
album_data = df_all[['album_name', 'album_id', 'release_date']]
track_features = df_all[[
    'track_name',
    'track_id',
    'artist_id',
    'album_id',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'genre',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]]

In [None]:
# Report the shape of each table. The artist/follower table is bound to the name
# genre_data; no artist_data variable is defined in this notebook.
print(f'DataFrames genre_data has shape {genre_data.shape}, album_data has shape {album_data.shape} and track_features has shape {track_features.shape}')

In [None]:
# # Imports
# from sqlalchemy import create_engine
# import psycopg2 
# from config import db_password

# # Create connection to database (endpoint to be decided)
# db_string = f"postgresql://postgres:{db_password}@{endpoint}"

# # instantiate engine
# engine = create_engine(db_string)

# genre_data.to_sql(name='genre_data', con=engine, if_exists='replace', index=False)
# album_data.to_sql(name='album_data', con=engine, if_exists='replace', index=False)
# track_features.to_sql(name='track_features', con=engine, if_exists='replace', index=False)