In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from config import cid, secret
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

import timeit

auth_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In a separate .py file, set your own credentials (CID and secret). I think using the Web API requires a paid subscription to Spotify. Log into developer.spotify.com, navigate to Dashboards and create an app (give an app name, description, accept the ToS) to generate your credentials.

Code is borrowed heavily from tgel0 (https://github.com/tgel0/spotify-data/blob/master/notebooks/SpotifyDataRetrieval.ipynb)

#### Example list of genres we're interested in:

In [2]:
genre_list = [
    'soundtrack','indie','jazz','pop','electronic',
    'folk','hip-hop','rock','alternative','classical',
    'rap','world','soul','blues','R&B',
    'reggae','ska','dance','country','opera'
]
print(f'We have {len(genre_list)} genres.')

We have 20 genres.


## Get the track/artist/album data

In [3]:
# Start timer
start = timeit.default_timer()

# Set empty lists for each feature
track_name = []
popularity = []
track_id = []
artist_name = []
artist_id = []
artist_genre = []
album_name = []
album_id = []
release_date = []
followers = []

# Loop through the genre list and use each genre as a parameter when running the track-type search 

for genre in genre_list:
    # save track-type search in variable
    track_looped = sp.search(q='genre:'+genre, type='track',limit=50) # set limit to number of songs to grab per genre.
    time.sleep(60)

    # loop through saved search results, append according to their index/position within the nested levels of dictionaries/lists
    for i, t in enumerate(track_looped['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        artist_id.append(t['artists'][0]['id'])
        album_name.append(t['album']['name'])
        album_id.append(t['album']['id'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
        release_date.append(t['album']['release_date'])
        artist_genre.append(genre)

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

This took 1203.6422868 seconds to complete.


In [4]:
print(f'Total number of track IDs: {len(track_id)}')

Total number of track IDs: 1000


In [5]:
artist_id

['57DlMWmbVIf2ssJ8QBpBau',
 '4kLvhMAuCloLxoP1aVM7Lr',
 '0YC192cP3KPCRWx8zr8MfZ',
 '08tfDO4dSrwxax35a3HIMC',
 '24eDfi2MSYo3A87hCcgpIL',
 '0YC192cP3KPCRWx8zr8MfZ',
 '7nzSoJISlVJsn7O0yTeMOB',
 '03GruNQP8X25PCoWzdvIGZ',
 '3dRfiJ2650SZu6GbydcHNb',
 '57DlMWmbVIf2ssJ8QBpBau',
 '7nzSoJISlVJsn7O0yTeMOB',
 '18oYqNtcLUHrqO7LfX7qni',
 '4kLvhMAuCloLxoP1aVM7Lr',
 '3EAHF3jdnHHdko5DBrhRUP',
 '1csBgT42N4pPPs1HJhxXIK',
 '566MlWaCa63jvMZV9YMj3V',
 '24eDfi2MSYo3A87hCcgpIL',
 '4kLvhMAuCloLxoP1aVM7Lr',
 '24eDfi2MSYo3A87hCcgpIL',
 '7nzSoJISlVJsn7O0yTeMOB',
 '57DlMWmbVIf2ssJ8QBpBau',
 '5qBZETtyzfYnXOobDXbmcD',
 '57DlMWmbVIf2ssJ8QBpBau',
 '0Xk15jHKly4c3AhPr5vjoA',
 '5sy77gt4bfsLcSQ8GIe4ZZ',
 '77yY2QmM6bYvjJ3y5L2R0v',
 '00sazWvoTLOqg5MFwC68Um',
 '4kLvhMAuCloLxoP1aVM7Lr',
 '0Riv2KnFcLZA3JSVryRg4y',
 '0YC192cP3KPCRWx8zr8MfZ',
 '3dRfiJ2650SZu6GbydcHNb',
 '0YC192cP3KPCRWx8zr8MfZ',
 '0s1ec6aPpRZ4DCj15w1EFg',
 '4kLvhMAuCloLxoP1aVM7Lr',
 '2M4eNCvV3CJUswavkhAQg2',
 '57DlMWmbVIf2ssJ8QBpBau',
 '4L0gz16xuM1as1OgYzh1SR',
 

#### Checking total tracks gathered:

In [6]:
print(
    len(artist_genre),
    len(artist_name),
    len(artist_id),
    len(track_name),
    len(track_id),
    len(popularity),
    len(album_name),
    len(album_id),
    len(release_date)
     )

1000 1000 1000 1000 1000 1000 1000 1000 1000


#### Load the lists as a DataFrame:

In [7]:
df_tracks = pd.DataFrame({
    'genre':artist_genre,
    'artist_name':artist_name, 
    'artist_id':artist_id,
    'track_name':track_name, 
    'track_id':track_id,
    'popularity':popularity,
    'album_name':album_name,
    'album_id':album_id,
    'release_date':release_date
})
df_tracks

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date
0,soundtrack,Toby Fox,57DlMWmbVIf2ssJ8QBpBau,Fallen Down,1jDMi92a9zNQuPD3uPMkla,77,UNDERTALE Soundtrack,2M2Ae2SvZe3fmzUtlVOV5Z,2015-09-15
1,soundtrack,Michael Giacchino,4kLvhMAuCloLxoP1aVM7Lr,The Batman,1NkI8DtCnjcWVCVLF0gB71,74,The Batman (Original Motion Picture Soundtrack),18nTX27XXEYARGmWMTgD19,2022-02-24
2,soundtrack,Hans Zimmer,0YC192cP3KPCRWx8zr8MfZ,Cornfield Chase,6pWgRkpqVfxnj3WuIcJ7WP,78,Interstellar (Original Motion Picture Soundtra...,3B61kSKTxlY36cYgzvf3cP,2014-11-18
3,soundtrack,Toshifumi Hinata,08tfDO4dSrwxax35a3HIMC,Reflections,5Zf25eS8E1znm9mez4cGsm,73,ひとつぶの海,0k4j3uetJHFATIt4VxwCPz,1986-11-28
4,soundtrack,Ludwig Goransson,24eDfi2MSYo3A87hCcgpIL,Pandas Unite / Nobody Like U (Reprise),4dksoaJqf0k9mwrvAuUcqR,70,Turning Red (Original Motion Picture Soundtrack),6z1EZ0KfoiVW0bXIbOWAu3,2022-03-11
...,...,...,...,...,...,...,...,...,...
995,opera,Secret Garden,6GUg2fRLklsnSJsG3PrTt0,Adagio,4qqXbRF8ZWBLP9wwL8QtzB,60,Songs From A Secret Garden,2yHyt3K2n1u6Qe6INOTosb,1995-01-01
996,opera,Bernward Koch,3I47W6SIIS24cECbGOeakB,An Evening Walk,0qeGdBTMsG8arBbybvH17q,58,Gentle Spirit,3aTqk4g13Bau6oedCbnKdE,2009-05-19
997,opera,Gioachino Rossini,0roWUeP7Ac4yK4VN6L2gF4,Il barbiere di Siviglia: Overture (Sinfonia),1chTrqszWQEOP4RsNuOZf7,55,Rossini: Complete Overtures,6r3UcK6cVlZZ28pVKszcaX,2003-01-01
998,opera,William Joseph,0XfkH3kgVTy5EYdv0DzDCQ,Safe & Sound,3JvdZv9dxIHZlHu0eCKajZ,54,Safe & Sound,00g18289CQoHu1R0CXn8bY,2012-10-24


In [8]:
group_track_artist_id = df_tracks.sort_values(by=['artist_name','track_name'])
group_track_artist_id


Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date
848,ska,+44,7gkRNHOOt7QfhhXf0rEnmj,When Your Heart Stops Beating,1VpTwecl7EbQiGyVsvwFQE,60,When Your Heart Stops Beating,46aFyiJiOIKPuxt6j574XA,2006-01-01
507,rap,2Pac,1ZwdS5xdxEREPySFridCfh,Hail Mary,2zoobJFEB9h15fjYjRd6oP,71,Greatest Hits,1WBZyULtlANBKed7Zf9cDP,1998-01-01
519,rap,2Pac,1ZwdS5xdxEREPySFridCfh,Hit 'Em Up - Single Version,0Z2J91b2iTGLVTZC4fKgxf,86,Greatest Hits,1WBZyULtlANBKed7Zf9cDP,1998-01-01
428,alternative,3 Doors Down,2RTUTCvo6onsAnheUk3aL9,Kryptonite,6ZOBP3NvffbU4SZcrnt1k6,83,The Better Life,5gO2acKSOaJnP0Mcy8IpU6,2000
130,jazz,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,Can I Kick It?,3Ti0GdlrotgwsAVBBugv0I,75,People's Instinctive Travels and the Paths of ...,3kV0i1qqudjf0PGawJ4jck,1990-04-17
...,...,...,...,...,...,...,...,...,...
398,rock,fun.,5nCi3BB41mBaMH9gfr6Su0,We Are Young (feat. Janelle Monáe),7a86XRg84qjasly9f6bPSD,79,Some Nights,7iycyHwOW2plljYIK6I1Zo,2012-02-21
71,indie,girl in red,3uwAm6vQy7kWPS2bciKWx9,we fell in love in october,1BYZxKSf0aTxp8ZFoeyM3d,88,we fell in love in october / forget her,4oRcMHsdNG9IAF3xwv1kWu,2018-11-21
844,ska,zebrahead,6SiyKSeJo6gcsS2NvuAbsl,All My Friends Are Nobodies,3xhvI8X8PGCKuKd49ZyFNJ,69,Brain Invaders (Deluxe Edition),5JxQ55Wvq2kCWtOQ0gSr9h,2019-08-09
472,classical,Ólafur Arnalds,7E3BRXV9ZbCt5lQTCXMTia,Saudade (When We Are Born),1ijwLR1iybtxaUbasUj7kJ,68,Saudade (When We Are Born),3s5UDbcu9PBKBJ4xIz0zcW,2021-06-11


In [9]:
df_tracks.shape

(1000, 9)

In [10]:
df_tracks.drop_duplicates(inplace=True)
df_tracks.shape

(1000, 9)

In [12]:
df_tracks[df_tracks['track_id'] == '4kbj5MwxO1bq9wjT5g9HaA']

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date
394,rock,WALK THE MOON,6DIS6PRrLS3wbnZsf7vYic,Shut Up and Dance,4kbj5MwxO1bq9wjT5g9HaA,86,TALKING IS HARD,3mNoFlD1wsoXfkljfFzExT,2014-12-02
447,alternative,WALK THE MOON,6DIS6PRrLS3wbnZsf7vYic,Shut Up and Dance,4kbj5MwxO1bq9wjT5g9HaA,86,TALKING IS HARD,3mNoFlD1wsoXfkljfFzExT,2014-12-02
895,dance,WALK THE MOON,6DIS6PRrLS3wbnZsf7vYic,Shut Up and Dance,4kbj5MwxO1bq9wjT5g9HaA,86,TALKING IS HARD,3mNoFlD1wsoXfkljfFzExT,2014-12-02


#### Create natural key 

In [13]:
df_tracks['track_natural_key'] = df_tracks.artist_name.map(str) + "_" + df_tracks.track_name

df_tracks

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date,track_natural_key
0,soundtrack,Toby Fox,57DlMWmbVIf2ssJ8QBpBau,Fallen Down,1jDMi92a9zNQuPD3uPMkla,77,UNDERTALE Soundtrack,2M2Ae2SvZe3fmzUtlVOV5Z,2015-09-15,Toby Fox_Fallen Down
1,soundtrack,Michael Giacchino,4kLvhMAuCloLxoP1aVM7Lr,The Batman,1NkI8DtCnjcWVCVLF0gB71,74,The Batman (Original Motion Picture Soundtrack),18nTX27XXEYARGmWMTgD19,2022-02-24,Michael Giacchino_The Batman
2,soundtrack,Hans Zimmer,0YC192cP3KPCRWx8zr8MfZ,Cornfield Chase,6pWgRkpqVfxnj3WuIcJ7WP,78,Interstellar (Original Motion Picture Soundtra...,3B61kSKTxlY36cYgzvf3cP,2014-11-18,Hans Zimmer_Cornfield Chase
3,soundtrack,Toshifumi Hinata,08tfDO4dSrwxax35a3HIMC,Reflections,5Zf25eS8E1znm9mez4cGsm,73,ひとつぶの海,0k4j3uetJHFATIt4VxwCPz,1986-11-28,Toshifumi Hinata_Reflections
4,soundtrack,Ludwig Goransson,24eDfi2MSYo3A87hCcgpIL,Pandas Unite / Nobody Like U (Reprise),4dksoaJqf0k9mwrvAuUcqR,70,Turning Red (Original Motion Picture Soundtrack),6z1EZ0KfoiVW0bXIbOWAu3,2022-03-11,Ludwig Goransson_Pandas Unite / Nobody Like U ...
...,...,...,...,...,...,...,...,...,...,...
995,opera,Secret Garden,6GUg2fRLklsnSJsG3PrTt0,Adagio,4qqXbRF8ZWBLP9wwL8QtzB,60,Songs From A Secret Garden,2yHyt3K2n1u6Qe6INOTosb,1995-01-01,Secret Garden_Adagio
996,opera,Bernward Koch,3I47W6SIIS24cECbGOeakB,An Evening Walk,0qeGdBTMsG8arBbybvH17q,58,Gentle Spirit,3aTqk4g13Bau6oedCbnKdE,2009-05-19,Bernward Koch_An Evening Walk
997,opera,Gioachino Rossini,0roWUeP7Ac4yK4VN6L2gF4,Il barbiere di Siviglia: Overture (Sinfonia),1chTrqszWQEOP4RsNuOZf7,55,Rossini: Complete Overtures,6r3UcK6cVlZZ28pVKszcaX,2003-01-01,Gioachino Rossini_Il barbiere di Siviglia: Ove...
998,opera,William Joseph,0XfkH3kgVTy5EYdv0DzDCQ,Safe & Sound,3JvdZv9dxIHZlHu0eCKajZ,54,Safe & Sound,00g18289CQoHu1R0CXn8bY,2012-10-24,William Joseph_Safe & Sound


In [14]:
df_tracks.dtypes

genre                object
artist_name          object
artist_id            object
track_name           object
track_id             object
popularity            int64
album_name           object
album_id             object
release_date         object
track_natural_key    object
dtype: object

#### Create new column to bin dates to seasons

In [15]:
# convert release_date column to datetime format 
df_tracks['release_date'] = pd.to_datetime(df_tracks.release_date, format='%Y-%m-%d')

In [16]:
df_tracks.dtypes

genre                        object
artist_name                  object
artist_id                    object
track_name                   object
track_id                     object
popularity                    int64
album_name                   object
album_id                     object
release_date         datetime64[ns]
track_natural_key            object
dtype: object

In [17]:
# new column for seasons (borrowed from https://stackoverflow.com/questions/60285557/extract-seasons-from-datetime-pandas)
    # numeric trick is to wasp the winter separation at 12-31 / 01-01. 
    # By subtracting the end of winter, which is 03-21, and take modulo, you effectively change 01-01 to - 320 = 980 mod 1300, 
    # which is larger than the last day of autumn. So now your winter season is in one chunk instead of two.

date_offset = (df_tracks.release_date.dt.month*100 + df_tracks.release_date.dt.day - 320)%1300

df_tracks['release_season'] = pd.cut(date_offset, [0, 300, 602, 900, 1300], 
                                     labels=['spring', 'summer', 'autumn', 'winter'],
                                     include_lowest = True)
df_tracks

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date,track_natural_key,release_season
0,soundtrack,Toby Fox,57DlMWmbVIf2ssJ8QBpBau,Fallen Down,1jDMi92a9zNQuPD3uPMkla,77,UNDERTALE Soundtrack,2M2Ae2SvZe3fmzUtlVOV5Z,2015-09-15,Toby Fox_Fallen Down,summer
1,soundtrack,Michael Giacchino,4kLvhMAuCloLxoP1aVM7Lr,The Batman,1NkI8DtCnjcWVCVLF0gB71,74,The Batman (Original Motion Picture Soundtrack),18nTX27XXEYARGmWMTgD19,2022-02-24,Michael Giacchino_The Batman,winter
2,soundtrack,Hans Zimmer,0YC192cP3KPCRWx8zr8MfZ,Cornfield Chase,6pWgRkpqVfxnj3WuIcJ7WP,78,Interstellar (Original Motion Picture Soundtra...,3B61kSKTxlY36cYgzvf3cP,2014-11-18,Hans Zimmer_Cornfield Chase,autumn
3,soundtrack,Toshifumi Hinata,08tfDO4dSrwxax35a3HIMC,Reflections,5Zf25eS8E1znm9mez4cGsm,73,ひとつぶの海,0k4j3uetJHFATIt4VxwCPz,1986-11-28,Toshifumi Hinata_Reflections,autumn
4,soundtrack,Ludwig Goransson,24eDfi2MSYo3A87hCcgpIL,Pandas Unite / Nobody Like U (Reprise),4dksoaJqf0k9mwrvAuUcqR,70,Turning Red (Original Motion Picture Soundtrack),6z1EZ0KfoiVW0bXIbOWAu3,2022-03-11,Ludwig Goransson_Pandas Unite / Nobody Like U ...,winter
...,...,...,...,...,...,...,...,...,...,...,...
995,opera,Secret Garden,6GUg2fRLklsnSJsG3PrTt0,Adagio,4qqXbRF8ZWBLP9wwL8QtzB,60,Songs From A Secret Garden,2yHyt3K2n1u6Qe6INOTosb,1995-01-01,Secret Garden_Adagio,winter
996,opera,Bernward Koch,3I47W6SIIS24cECbGOeakB,An Evening Walk,0qeGdBTMsG8arBbybvH17q,58,Gentle Spirit,3aTqk4g13Bau6oedCbnKdE,2009-05-19,Bernward Koch_An Evening Walk,spring
997,opera,Gioachino Rossini,0roWUeP7Ac4yK4VN6L2gF4,Il barbiere di Siviglia: Overture (Sinfonia),1chTrqszWQEOP4RsNuOZf7,55,Rossini: Complete Overtures,6r3UcK6cVlZZ28pVKszcaX,2003-01-01,Gioachino Rossini_Il barbiere di Siviglia: Ove...,winter
998,opera,William Joseph,0XfkH3kgVTy5EYdv0DzDCQ,Safe & Sound,3JvdZv9dxIHZlHu0eCKajZ,54,Safe & Sound,00g18289CQoHu1R0CXn8bY,2012-10-24,William Joseph_Safe & Sound,autumn


#### How many duplicates are there per track_natural_key?

In [18]:
t_dup = df_tracks[df_tracks.duplicated('track_natural_key')]
len(t_dup)

161

## Get the audio features per track

In [19]:
# The audio features search has a limit of 100 track IDs that can be submitted per query.
start = timeit.default_timer()

# empty list for rows, define the batchsize as the limit per query, zeroing the None_counter
rows = []
batchsize = 100
None_counter = 0

for i in range(0, len(df_tracks['track_id']), batchsize):
    batch = df_tracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)

    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter +1
        else:
            rows.append(t)
    
    time.sleep(60)
    
print(f'Number of tracks where no audio features were available:', None_counter)

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

Number of tracks where no audio features were available: 0
This took 601.2577963000001 seconds to complete.


#### Inspect the audio features dataset

In [21]:
print(f'Number of elements in audio features dataset: {len(rows)}')

Number of elements in audio features dataset: 1000


In [22]:
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
df_audio_features.head()

# time_signature ranges from 3 to 7 indicating time signatures of "3/4", to "7/4".
# mode, major = 1 and minor = 0
# key will remain in pitch class notation starting from C=0, C#=1, D=2, etc. which ignores frequency difference in octaves (https://en.wikipedia.org/wiki/Pitch_class)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.624,0.66,2,-12.779,1,0.0569,0.797,0.786,0.114,0.871,110.021,audio_features,1jDMi92a9zNQuPD3uPMkla,spotify:track:1jDMi92a9zNQuPD3uPMkla,https://api.spotify.com/v1/tracks/1jDMi92a9zNQ...,https://api.spotify.com/v1/audio-analysis/1jDM...,57818,3
1,0.209,0.152,10,-16.519,0,0.0324,0.5,0.903,0.103,0.0482,80.136,audio_features,1NkI8DtCnjcWVCVLF0gB71,spotify:track:1NkI8DtCnjcWVCVLF0gB71,https://api.spotify.com/v1/tracks/1NkI8DtCnjcW...,https://api.spotify.com/v1/audio-analysis/1NkI...,407533,4
2,0.18,0.226,9,-16.457,0,0.042,0.951,0.982,0.0998,0.048,94.079,audio_features,6pWgRkpqVfxnj3WuIcJ7WP,spotify:track:6pWgRkpqVfxnj3WuIcJ7WP,https://api.spotify.com/v1/tracks/6pWgRkpqVfxn...,https://api.spotify.com/v1/audio-analysis/6pWg...,126960,4
3,0.224,0.146,9,-16.331,0,0.0338,0.961,0.712,0.113,0.142,94.255,audio_features,5Zf25eS8E1znm9mez4cGsm,spotify:track:5Zf25eS8E1znm9mez4cGsm,https://api.spotify.com/v1/tracks/5Zf25eS8E1zn...,https://api.spotify.com/v1/audio-analysis/5Zf2...,130693,3
4,0.476,0.439,9,-10.827,0,0.0348,0.164,5.3e-05,0.101,0.12,104.707,audio_features,4dksoaJqf0k9mwrvAuUcqR,spotify:track:4dksoaJqf0k9mwrvAuUcqR,https://api.spotify.com/v1/tracks/4dksoaJqf0k9...,https://api.spotify.com/v1/audio-analysis/4dks...,185493,4


In [23]:
df_audio_features.shape

(1000, 18)

In [24]:
df_audio_features.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature'],
      dtype='object')

In [25]:
df_tracks.columns

Index(['genre', 'artist_name', 'artist_id', 'track_name', 'track_id',
       'popularity', 'album_name', 'album_id', 'release_date',
       'track_natural_key', 'release_season'],
      dtype='object')

#### Transform audio features DataFrame

In [26]:
# convert duration_ms to minutes. Solution from https://stackoverflow.com/questions/67438170/how-to-convert-milliseconds-to-minutesseconds-output
# duration_mins is a datetime.time object, not datetime64[ns], need to check if compatible with sql table format
mins = []
ms = df_audio_features['duration_ms']
for i in ms:
    seconds, i = divmod(i, 1000)
    minutes, seconds = divmod(seconds, 60)
    mins.append(f'{int(minutes):01d}:{int(seconds):02d}')

df_audio_features['duration_mins'] = mins
df_audio_features['duration_mins'] = pd.to_datetime(df_audio_features.duration_mins, format='%M:%S').dt.time

df_audio_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,duration_mins
0,0.624,0.660,2,-12.779,1,0.0569,0.797,0.786000,0.1140,0.8710,110.021,audio_features,1jDMi92a9zNQuPD3uPMkla,spotify:track:1jDMi92a9zNQuPD3uPMkla,https://api.spotify.com/v1/tracks/1jDMi92a9zNQ...,https://api.spotify.com/v1/audio-analysis/1jDM...,57818,3,00:00:57
1,0.209,0.152,10,-16.519,0,0.0324,0.500,0.903000,0.1030,0.0482,80.136,audio_features,1NkI8DtCnjcWVCVLF0gB71,spotify:track:1NkI8DtCnjcWVCVLF0gB71,https://api.spotify.com/v1/tracks/1NkI8DtCnjcW...,https://api.spotify.com/v1/audio-analysis/1NkI...,407533,4,00:06:47
2,0.180,0.226,9,-16.457,0,0.0420,0.951,0.982000,0.0998,0.0480,94.079,audio_features,6pWgRkpqVfxnj3WuIcJ7WP,spotify:track:6pWgRkpqVfxnj3WuIcJ7WP,https://api.spotify.com/v1/tracks/6pWgRkpqVfxn...,https://api.spotify.com/v1/audio-analysis/6pWg...,126960,4,00:02:06
3,0.224,0.146,9,-16.331,0,0.0338,0.961,0.712000,0.1130,0.1420,94.255,audio_features,5Zf25eS8E1znm9mez4cGsm,spotify:track:5Zf25eS8E1znm9mez4cGsm,https://api.spotify.com/v1/tracks/5Zf25eS8E1zn...,https://api.spotify.com/v1/audio-analysis/5Zf2...,130693,3,00:02:10
4,0.476,0.439,9,-10.827,0,0.0348,0.164,0.000053,0.1010,0.1200,104.707,audio_features,4dksoaJqf0k9mwrvAuUcqR,spotify:track:4dksoaJqf0k9mwrvAuUcqR,https://api.spotify.com/v1/tracks/4dksoaJqf0k9...,https://api.spotify.com/v1/audio-analysis/4dks...,185493,4,00:03:05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.115,0.328,7,-11.154,0,0.0416,0.911,0.771000,0.1010,0.0611,51.096,audio_features,4qqXbRF8ZWBLP9wwL8QtzB,spotify:track:4qqXbRF8ZWBLP9wwL8QtzB,https://api.spotify.com/v1/tracks/4qqXbRF8ZWBL...,https://api.spotify.com/v1/audio-analysis/4qqX...,175040,4,00:02:55
996,0.269,0.127,0,-16.917,0,0.0357,0.940,0.909000,0.1140,0.0981,125.241,audio_features,0qeGdBTMsG8arBbybvH17q,spotify:track:0qeGdBTMsG8arBbybvH17q,https://api.spotify.com/v1/tracks/0qeGdBTMsG8a...,https://api.spotify.com/v1/audio-analysis/0qeG...,239053,3,00:03:59
997,0.322,0.140,4,-20.747,1,0.0489,0.906,0.759000,0.1250,0.2850,92.122,audio_features,1chTrqszWQEOP4RsNuOZf7,spotify:track:1chTrqszWQEOP4RsNuOZf7,https://api.spotify.com/v1/tracks/1chTrqszWQEO...,https://api.spotify.com/v1/audio-analysis/1chT...,425000,4,00:07:05
998,0.440,0.355,7,-14.050,1,0.0276,0.725,0.911000,0.1250,0.0536,73.059,audio_features,3JvdZv9dxIHZlHu0eCKajZ,spotify:track:3JvdZv9dxIHZlHu0eCKajZ,https://api.spotify.com/v1/tracks/3JvdZv9dxIHZ...,https://api.spotify.com/v1/audio-analysis/3Jvd...,241259,4,00:04:01


In [28]:
df_audio_features[df_audio_features['id'] == '4kbj5MwxO1bq9wjT5g9HaA']

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,duration_mins
394,0.578,0.866,1,-3.804,1,0.0619,0.00701,0.0,0.257,0.619,128.038,audio_features,4kbj5MwxO1bq9wjT5g9HaA,spotify:track:4kbj5MwxO1bq9wjT5g9HaA,https://api.spotify.com/v1/tracks/4kbj5MwxO1bq...,https://api.spotify.com/v1/audio-analysis/4kbj...,199080,4,00:03:19
447,0.578,0.866,1,-3.804,1,0.0619,0.00701,0.0,0.257,0.619,128.038,audio_features,4kbj5MwxO1bq9wjT5g9HaA,spotify:track:4kbj5MwxO1bq9wjT5g9HaA,https://api.spotify.com/v1/tracks/4kbj5MwxO1bq...,https://api.spotify.com/v1/audio-analysis/4kbj...,199080,4,00:03:19
895,0.578,0.866,1,-3.804,1,0.0619,0.00701,0.0,0.257,0.619,128.038,audio_features,4kbj5MwxO1bq9wjT5g9HaA,spotify:track:4kbj5MwxO1bq9wjT5g9HaA,https://api.spotify.com/v1/tracks/4kbj5MwxO1bq...,https://api.spotify.com/v1/audio-analysis/4kbj...,199080,4,00:03:19


In [29]:
df_audio_features.shape

(1000, 19)

In [32]:
df_audio_features.drop_duplicates(inplace=True)
df_audio_features.shape

(860, 19)

In [33]:
df_audio_features[df_audio_features['id'] == '4kbj5MwxO1bq9wjT5g9HaA']

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,duration_mins
394,0.578,0.866,1,-3.804,1,0.0619,0.00701,0.0,0.257,0.619,128.038,audio_features,4kbj5MwxO1bq9wjT5g9HaA,spotify:track:4kbj5MwxO1bq9wjT5g9HaA,https://api.spotify.com/v1/tracks/4kbj5MwxO1bq...,https://api.spotify.com/v1/audio-analysis/4kbj...,199080,4,00:03:19


In [34]:
df_audio_features.dtypes

danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
id                   object
uri                  object
track_href           object
analysis_url         object
duration_ms           int64
time_signature        int64
duration_mins        object
dtype: object

In [35]:
df_audio_features.drop(['track_href','analysis_url','uri','type'], axis=1, inplace=True)
df_audio_features.rename(columns={'id':'track_id'}, inplace=True)

### Generate the dataframes in the format of tables in ERD

In [36]:
df_all = pd.merge(df_tracks, df_audio_features, on='track_id', how='inner')
df_all

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date,track_natural_key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duration_mins
0,soundtrack,Toby Fox,57DlMWmbVIf2ssJ8QBpBau,Fallen Down,1jDMi92a9zNQuPD3uPMkla,77,UNDERTALE Soundtrack,2M2Ae2SvZe3fmzUtlVOV5Z,2015-09-15,Toby Fox_Fallen Down,...,1,0.0569,0.797,0.786000,0.1140,0.8710,110.021,57818,3,00:00:57
1,soundtrack,Michael Giacchino,4kLvhMAuCloLxoP1aVM7Lr,The Batman,1NkI8DtCnjcWVCVLF0gB71,74,The Batman (Original Motion Picture Soundtrack),18nTX27XXEYARGmWMTgD19,2022-02-24,Michael Giacchino_The Batman,...,0,0.0324,0.500,0.903000,0.1030,0.0482,80.136,407533,4,00:06:47
2,soundtrack,Hans Zimmer,0YC192cP3KPCRWx8zr8MfZ,Cornfield Chase,6pWgRkpqVfxnj3WuIcJ7WP,78,Interstellar (Original Motion Picture Soundtra...,3B61kSKTxlY36cYgzvf3cP,2014-11-18,Hans Zimmer_Cornfield Chase,...,0,0.0420,0.951,0.982000,0.0998,0.0480,94.079,126960,4,00:02:06
3,soundtrack,Toshifumi Hinata,08tfDO4dSrwxax35a3HIMC,Reflections,5Zf25eS8E1znm9mez4cGsm,73,ひとつぶの海,0k4j3uetJHFATIt4VxwCPz,1986-11-28,Toshifumi Hinata_Reflections,...,0,0.0338,0.961,0.712000,0.1130,0.1420,94.255,130693,3,00:02:10
4,soundtrack,Ludwig Goransson,24eDfi2MSYo3A87hCcgpIL,Pandas Unite / Nobody Like U (Reprise),4dksoaJqf0k9mwrvAuUcqR,70,Turning Red (Original Motion Picture Soundtrack),6z1EZ0KfoiVW0bXIbOWAu3,2022-03-11,Ludwig Goransson_Pandas Unite / Nobody Like U ...,...,0,0.0348,0.164,0.000053,0.1010,0.1200,104.707,185493,4,00:03:05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,opera,Secret Garden,6GUg2fRLklsnSJsG3PrTt0,Adagio,4qqXbRF8ZWBLP9wwL8QtzB,60,Songs From A Secret Garden,2yHyt3K2n1u6Qe6INOTosb,1995-01-01,Secret Garden_Adagio,...,0,0.0416,0.911,0.771000,0.1010,0.0611,51.096,175040,4,00:02:55
996,opera,Bernward Koch,3I47W6SIIS24cECbGOeakB,An Evening Walk,0qeGdBTMsG8arBbybvH17q,58,Gentle Spirit,3aTqk4g13Bau6oedCbnKdE,2009-05-19,Bernward Koch_An Evening Walk,...,0,0.0357,0.940,0.909000,0.1140,0.0981,125.241,239053,3,00:03:59
997,opera,Gioachino Rossini,0roWUeP7Ac4yK4VN6L2gF4,Il barbiere di Siviglia: Overture (Sinfonia),1chTrqszWQEOP4RsNuOZf7,55,Rossini: Complete Overtures,6r3UcK6cVlZZ28pVKszcaX,2003-01-01,Gioachino Rossini_Il barbiere di Siviglia: Ove...,...,1,0.0489,0.906,0.759000,0.1250,0.2850,92.122,425000,4,00:07:05
998,opera,William Joseph,0XfkH3kgVTy5EYdv0DzDCQ,Safe & Sound,3JvdZv9dxIHZlHu0eCKajZ,54,Safe & Sound,00g18289CQoHu1R0CXn8bY,2012-10-24,William Joseph_Safe & Sound,...,1,0.0276,0.725,0.911000,0.1250,0.0536,73.059,241259,4,00:04:01


In [37]:
# the same check with the same multiply-classified song now has 3 instances per genre. 
df_all[df_all['track_natural_key'] == 'WALK THE MOON_Shut Up and Dance']

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date,track_natural_key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duration_mins
477,rock,WALK THE MOON,6DIS6PRrLS3wbnZsf7vYic,Shut Up and Dance,4kbj5MwxO1bq9wjT5g9HaA,86,TALKING IS HARD,3mNoFlD1wsoXfkljfFzExT,2014-12-02,WALK THE MOON_Shut Up and Dance,...,1,0.0619,0.00701,0.0,0.257,0.619,128.038,199080,4,00:03:19
478,alternative,WALK THE MOON,6DIS6PRrLS3wbnZsf7vYic,Shut Up and Dance,4kbj5MwxO1bq9wjT5g9HaA,86,TALKING IS HARD,3mNoFlD1wsoXfkljfFzExT,2014-12-02,WALK THE MOON_Shut Up and Dance,...,1,0.0619,0.00701,0.0,0.257,0.619,128.038,199080,4,00:03:19
479,dance,WALK THE MOON,6DIS6PRrLS3wbnZsf7vYic,Shut Up and Dance,4kbj5MwxO1bq9wjT5g9HaA,86,TALKING IS HARD,3mNoFlD1wsoXfkljfFzExT,2014-12-02,WALK THE MOON_Shut Up and Dance,...,1,0.0619,0.00701,0.0,0.257,0.619,128.038,199080,4,00:03:19


In [38]:
df_left_join = pd.merge(df_tracks, df_audio_features, on='track_id', how='left')

In [None]:
df_left_join[df_left_join['track_natural_key'] == 'WALK THE MOON_Shut Up and Dance']

In [39]:
df_all.shape

(1000, 25)

In [40]:
df_all.columns

Index(['genre', 'artist_name', 'artist_id', 'track_name', 'track_id',
       'popularity', 'album_name', 'album_id', 'release_date',
       'track_natural_key', 'release_season', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'duration_mins'],
      dtype='object')

In [41]:
genre_data = df_all[['genre']].drop_duplicates(ignore_index=True)
album_data = df_all[['album_id','album_name','release_season']]
track_features = df_all[[
    'track_natural_key',
    'track_name',
    'artist_name',
    'album_id',
    'acousticness',
    'danceability',
    'duration_mins',
    'duration_ms',
    'energy',
    'genre',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]]

In [42]:
genre_data

Unnamed: 0,genre
0,soundtrack
1,classical
2,indie
3,pop
4,rock
5,alternative
6,folk
7,country
8,jazz
9,soul


In [43]:
print(f'DataFrames genre_data has shape {genre_data.shape}, album_data has shape {album_data.shape} and track_features has shape {track_features.shape}')

DataFrames genre_data has shape (20, 1), album_data has shape (1000, 3) and track_features has shape (1000, 20)


In [44]:
# Imports
from sqlalchemy import create_engine
import psycopg2 
from config import db_password

# Create connection to database (endpoint to be decided)
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/spotify_db"

# instantiate engine
engine = create_engine(db_string)

track_features.to_sql(name='track_features', con=engine, if_exists='replace', index=False)
genre_data.to_sql(name='genre_data', con=engine, if_exists='replace', index=False)
album_data.to_sql(name='album_data', con=engine, if_exists='replace', index=False)
