In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from config import cid, secret
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import time

import timeit

auth_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In a separate .py file, set your own credentials (CID and secret). I think using the Web API requires a paid subscription to Spotify. Log into developer.spotify.com, navigate to Dashboards and create an app (give an app name, description, accept the ToS) to generate your credentials.

Code is borrowed heavily from tgel0 (https://github.com/tgel0/spotify-data/blob/master/notebooks/SpotifyDataRetrieval.ipynb)

#### Example list of genres we're interested in:

In [2]:
genre_list = [
#     'soundtrack','indie','jazz','pop','electronic',
#     'folk','hip-hop','rock','alternative','classical',
#     'rap','world','soul','blues','R&B',
    'reggae','ska','dance','country','opera'
]
print(f'We have {len(genre_list)} genres.')

We have 5 genres.


## Get the track/artist/album data

In [3]:
# Start timer
start = timeit.default_timer()

# Set empty lists for each feature
track_name = []
popularity = []
track_id = []
artist_name = []
artist_id = []
artist_genre = []
album_name = []
album_id = []
release_date = []
followers = []

# Loop through the genre list and use each genre as a parameter when running the track-type search 

for genre in genre_list:
    # save track-type search in variable
    track_looped = sp.search(q='genre:'+genre, type='track',limit=2) # set limit to number of songs to grab per genre.
    time.sleep(10)        

    # loop through saved search results, append according to their index/position within the nested levels of dictionaries/lists
    for i, t in enumerate(track_looped['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        artist_id.append(t['artists'][0]['id'])
        album_name.append(t['album']['name'])
        album_id.append(t['album']['id'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
        release_date.append(t['album']['release_date'])
        artist_genre.append(genre)

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

This took 25.702692000000003 seconds to complete.


In [4]:
print(f'Total number of track IDs: {len(track_id)}')

Total number of track IDs: 10


In [5]:
artist_id

['4obzFoKoKRHIphyHzJ35G3',
 '4q3ewBCX7sLwd24euuV69X',
 '0EdvGhlC1FkGItLOWQzG4J',
 '5LfGQac0EIXyAN8aUwmNAQ',
 '5cj0lLjcoR7YOSnhnX0Po5',
 '5cj0lLjcoR7YOSnhnX0Po5',
 '4oUHIQIBe0LHzYfvXNW4QM',
 '4oUHIQIBe0LHzYfvXNW4QM',
 '6cXMpsP9x0SH4kFfMyVezF',
 '4JSWO1Pf2zV991fq64uAce']

#### Checking total tracks gathered:

In [22]:
print(
    len(artist_genre),
    len(artist_name),
    len(artist_id),
    len(track_name),
    len(track_id),
    len(popularity),
    len(album_name),
    len(album_id),
    len(release_date)
     )

10 10 10 10 10 10 10 10 10


#### Load the lists as a DataFrame:

In [23]:
df_tracks = pd.DataFrame({
    'genre':artist_genre,
    'artist_name':artist_name, 
    'artist_id':artist_id,
    'track_name':track_name, 
    'track_id':track_id,
    'popularity':popularity,
    'album_name':album_name,
    'album_id':album_id,
    'release_date':release_date
})
df_tracks

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06


In [15]:
group_track_artist_id = df_tracks.sort_values(by=['artist_name','track_name'])
group_track_artist_id


Unnamed: 0,genre,artist_name,artist_id,followers,track_name,track_id,popularity,album_name,album_id,release_date
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,45995488,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,11174590,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,17232250,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,17232250,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,60381,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,777889,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,3127931,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,3127931,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,2186497,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,4605992,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17


In [16]:
df_tracks.shape

(10, 10)

In [17]:
df_tracks.drop_duplicates(inplace=True)
df_tracks.shape

(10, 10)

#### Create natural key 

In [94]:
df_tracks['track_natural_key'] = df_tracks.artist_name.map(str) + "_" + df_tracks.track_name

df_tracks

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date,date_offset,release_season,natural_key
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10,1190,winter,Becky G_MAMIII
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04,284,spring,Bad Bunny_Yonaguni
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30,410,summer,Sublime_Santeria
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17,297,spring,"The Offspring_You're Gonna Go Far, Kid"
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,305,summer,Doja Cat_Woman
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,305,summer,Doja Cat_Need to Know
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08,1088,winter,Morgan Wallen_Wasted On You
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27,107,spring,Morgan Wallen_Whiskey Glasses
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11,791,autumn,Josh Groban_You Raise Me Up
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06,1286,winter,Jean-Philippe Rameau_The Arts and the Hours


In [95]:
df_tracks.dtypes

genre                     object
artist_name               object
artist_id                 object
track_name                object
track_id                  object
popularity                 int64
album_name                object
album_id                  object
release_date      datetime64[ns]
date_offset                int64
release_season          category
natural_key               object
dtype: object

#### Create new column to bin dates to seasons

In [96]:
# convert release_date column to datetime format 
df_tracks['release_date'] = pd.to_datetime(df_tracks.release_date, format='%Y-%m-%d')

In [97]:
df_tracks.dtypes

genre                     object
artist_name               object
artist_id                 object
track_name                object
track_id                  object
popularity                 int64
album_name                object
album_id                  object
release_date      datetime64[ns]
date_offset                int64
release_season          category
natural_key               object
dtype: object

In [98]:
# new column for seasons (borrowed from https://stackoverflow.com/questions/60285557/extract-seasons-from-datetime-pandas)
    # numeric trick is to wasp the winter separation at 12-31 / 01-01. 
    # By subtracting the end of winter, which is 03-21, and take modulo, you effectively change 01-01 to - 320 = 980 mod 1300, 
    # which is larger than the last day of autumn. So now your winter season is in one chunk instead of two.

date_offset = (df_tracks.release_date.dt.month*100 + df_tracks.release_date.dt.day - 320)%1300

df_tracks['release_season'] = pd.cut(date_offset, [0, 300, 602, 900, 1300], 
                                     labels=['spring', 'summer', 'autumn', 'winter'],
                                     include_lowest = True)
df_tracks

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date,date_offset,release_season,natural_key
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10,1190,winter,Becky G_MAMIII
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04,284,spring,Bad Bunny_Yonaguni
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30,410,summer,Sublime_Santeria
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17,297,spring,"The Offspring_You're Gonna Go Far, Kid"
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,305,summer,Doja Cat_Woman
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,305,summer,Doja Cat_Need to Know
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08,1088,winter,Morgan Wallen_Wasted On You
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27,107,spring,Morgan Wallen_Whiskey Glasses
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11,791,autumn,Josh Groban_You Raise Me Up
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06,1286,winter,Jean-Philippe Rameau_The Arts and the Hours


#### Are there duplicates?

In [99]:
len(df_tracks[df_tracks.duplicated('track_natural_key')])

0

## Get the audio features per track

In [47]:
# The audio features search has a limit of 100 track IDs that can be submitted per query.
start = timeit.default_timer()

# empty list for rows, define the batchsize as the limit per query, zeroing the None_counter
rows = []
batchsize = 100
None_counter = 0

for i in range(0, len(df_tracks['track_id']), batchsize):
    batch = df_tracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)

    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter +1
        else:
            rows.append(t)
    
    time.sleep(60)
    
print(f'Number of tracks where no audio features were available:', None_counter)

stop = timeit.default_timer()
print(f'This took {stop - start} seconds to complete.')

Number of tracks where no audio features were available: 0
This took 60.18565949999902 seconds to complete.


#### Inspect the audio features dataset

In [100]:
print(f'Number of elements in audio features dataset: {len(rows)}')

Number of elements in audio features dataset: 10


In [101]:
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
df_audio_features.head()

# time_signature ranges from 3 to 7 indicating time signatures of "3/4", to "7/4".
# mode, major = 1 and minor = 0
# key will remain in pitch class notation starting from C=0, C#=1, D=2, etc. which ignores frequency difference in octaves (https://en.wikipedia.org/wiki/Pitch_class)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.843,0.7,4,-3.563,0,0.0803,0.0934,0.0,0.14,0.899,93.991,audio_features,1ri9ZUkBJVFUdgwzCnfcYs,spotify:track:1ri9ZUkBJVFUdgwzCnfcYs,https://api.spotify.com/v1/tracks/1ri9ZUkBJVFU...,https://api.spotify.com/v1/audio-analysis/1ri9...,226088,4
1,0.644,0.648,1,-4.601,1,0.118,0.276,0.0,0.135,0.44,179.951,audio_features,2JPLbjOn0wPCngEot2STUS,spotify:track:2JPLbjOn0wPCngEot2STUS,https://api.spotify.com/v1/tracks/2JPLbjOn0wPC...,https://api.spotify.com/v1/audio-analysis/2JPL...,206710,4
2,0.682,0.765,1,-5.021,0,0.0395,0.0268,3.4e-05,0.188,0.567,90.807,audio_features,2hnMS47jN0etwvFPzYk11f,spotify:track:2hnMS47jN0etwvFPzYk11f,https://api.spotify.com/v1/tracks/2hnMS47jN0et...,https://api.spotify.com/v1/audio-analysis/2hnM...,182747,4
3,0.55,0.917,0,-3.159,1,0.0638,0.00428,0.0,0.197,0.601,126.115,audio_features,6TfBA04WJ3X1d1wXhaCFVT,spotify:track:6TfBA04WJ3X1d1wXhaCFVT,https://api.spotify.com/v1/tracks/6TfBA04WJ3X1...,https://api.spotify.com/v1/audio-analysis/6TfB...,177827,4
4,0.824,0.764,5,-4.175,0,0.0854,0.0888,0.00294,0.117,0.881,107.998,audio_features,6Uj1ctrBOjOas8xZXGqKk4,spotify:track:6Uj1ctrBOjOas8xZXGqKk4,https://api.spotify.com/v1/tracks/6Uj1ctrBOjOa...,https://api.spotify.com/v1/audio-analysis/6Uj1...,172627,4


In [102]:
df_audio_features.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature'],
      dtype='object')

In [103]:
df_tracks.columns

Index(['genre', 'artist_name', 'artist_id', 'track_name', 'track_id',
       'popularity', 'album_name', 'album_id', 'release_date', 'date_offset',
       'release_season', 'natural_key'],
      dtype='object')

#### Transform audio features DataFrame

In [104]:
# convert duration_ms to minutes. Solution from https://stackoverflow.com/questions/67438170/how-to-convert-milliseconds-to-minutesseconds-output
# duration_mins is a datetime.time object, not datetime64[ns], need to check if compatible with sql table format
mins = []
ms = df_audio_features['duration_ms']
for i in ms:
    seconds, i = divmod(i, 1000)
    minutes, seconds = divmod(seconds, 60)
    mins.append(f'{int(minutes):01d}:{int(seconds):02d}')

df_audio_features['duration_mins'] = mins
df_audio_features['duration_mins'] = pd.to_datetime(df_audio_features.duration_mins, format='%M:%S').dt.time

df_audio_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,duration_mins
0,0.843,0.7,4,-3.563,0,0.0803,0.0934,0.0,0.14,0.899,93.991,audio_features,1ri9ZUkBJVFUdgwzCnfcYs,spotify:track:1ri9ZUkBJVFUdgwzCnfcYs,https://api.spotify.com/v1/tracks/1ri9ZUkBJVFU...,https://api.spotify.com/v1/audio-analysis/1ri9...,226088,4,00:03:46
1,0.644,0.648,1,-4.601,1,0.118,0.276,0.0,0.135,0.44,179.951,audio_features,2JPLbjOn0wPCngEot2STUS,spotify:track:2JPLbjOn0wPCngEot2STUS,https://api.spotify.com/v1/tracks/2JPLbjOn0wPC...,https://api.spotify.com/v1/audio-analysis/2JPL...,206710,4,00:03:26
2,0.682,0.765,1,-5.021,0,0.0395,0.0268,3.4e-05,0.188,0.567,90.807,audio_features,2hnMS47jN0etwvFPzYk11f,spotify:track:2hnMS47jN0etwvFPzYk11f,https://api.spotify.com/v1/tracks/2hnMS47jN0et...,https://api.spotify.com/v1/audio-analysis/2hnM...,182747,4,00:03:02
3,0.55,0.917,0,-3.159,1,0.0638,0.00428,0.0,0.197,0.601,126.115,audio_features,6TfBA04WJ3X1d1wXhaCFVT,spotify:track:6TfBA04WJ3X1d1wXhaCFVT,https://api.spotify.com/v1/tracks/6TfBA04WJ3X1...,https://api.spotify.com/v1/audio-analysis/6TfB...,177827,4,00:02:57
4,0.824,0.764,5,-4.175,0,0.0854,0.0888,0.00294,0.117,0.881,107.998,audio_features,6Uj1ctrBOjOas8xZXGqKk4,spotify:track:6Uj1ctrBOjOas8xZXGqKk4,https://api.spotify.com/v1/tracks/6Uj1ctrBOjOa...,https://api.spotify.com/v1/audio-analysis/6Uj1...,172627,4,00:02:52
5,0.664,0.609,1,-6.509,1,0.0707,0.304,0.0,0.0926,0.194,130.041,audio_features,3Vi5XqYrmQgOYBajMWSvCi,spotify:track:3Vi5XqYrmQgOYBajMWSvCi,https://api.spotify.com/v1/tracks/3Vi5XqYrmQgO...,https://api.spotify.com/v1/audio-analysis/3Vi5...,210560,4,00:03:30
6,0.505,0.657,11,-5.24,0,0.0318,0.373,0.00107,0.126,0.252,196.0,audio_features,3cBsEDNhFI9E82vPj3kvi3,spotify:track:3cBsEDNhFI9E82vPj3kvi3,https://api.spotify.com/v1/tracks/3cBsEDNhFI9E...,https://api.spotify.com/v1/audio-analysis/3cBs...,178520,3,00:02:58
7,0.614,0.68,6,-4.58,1,0.0289,0.369,2e-06,0.115,0.707,149.959,audio_features,6foY66mWZN0pSRjZ408c00,spotify:track:6foY66mWZN0pSRjZ408c00,https://api.spotify.com/v1/tracks/6foY66mWZN0p...,https://api.spotify.com/v1/audio-analysis/6foY...,234347,4,00:03:54
8,0.25,0.136,10,-12.162,1,0.037,0.673,0.000157,0.107,0.0981,118.086,audio_features,4TbNLKRLKlxZDlS0pu7Lsy,spotify:track:4TbNLKRLKlxZDlS0pu7Lsy,https://api.spotify.com/v1/tracks/4TbNLKRLKlxZ...,https://api.spotify.com/v1/audio-analysis/4TbN...,292333,4,00:04:52
9,0.415,0.026,1,-36.7,1,0.0496,0.995,0.938,0.107,0.268,136.191,audio_features,1G3QPQkoXvpChjNUhd6Tfs,spotify:track:1G3QPQkoXvpChjNUhd6Tfs,https://api.spotify.com/v1/tracks/1G3QPQkoXvpC...,https://api.spotify.com/v1/audio-analysis/1G3Q...,285443,4,00:04:45


In [105]:
df_audio_features.dtypes

danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
type                 object
id                   object
uri                  object
track_href           object
analysis_url         object
duration_ms           int64
time_signature        int64
duration_mins        object
dtype: object

In [106]:
df_audio_features.drop(['track_href','analysis_url','uri','type'], axis=1, inplace=True)
df_audio_features.rename(columns={'id':'track_id'}, inplace=True)

### Generate the dataframes in the format of tables in ERD

In [107]:
df_all = pd.merge(df_tracks, df_audio_features, on='track_id', how='inner')
df_all

Unnamed: 0,genre,artist_name,artist_id,track_name,track_id,popularity,album_name,album_id,release_date,date_offset,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duration_mins
0,reggae,Becky G,4obzFoKoKRHIphyHzJ35G3,MAMIII,1ri9ZUkBJVFUdgwzCnfcYs,98,MAMIII,6GHUywBU0u92lg0Dhrt40R,2022-02-10,1190,...,0,0.0803,0.0934,0.0,0.14,0.899,93.991,226088,4,00:03:46
1,reggae,Bad Bunny,4q3ewBCX7sLwd24euuV69X,Yonaguni,2JPLbjOn0wPCngEot2STUS,95,Yonaguni,6VSOIs13DaSG2IPilNviX5,2021-06-04,284,...,1,0.118,0.276,0.0,0.135,0.44,179.951,206710,4,00:03:26
2,ska,Sublime,0EdvGhlC1FkGItLOWQzG4J,Santeria,2hnMS47jN0etwvFPzYk11f,81,Sublime,14eK347GdWO4mBBx78tsut,1996-07-30,410,...,0,0.0395,0.0268,3.4e-05,0.188,0.567,90.807,182747,4,00:03:02
3,ska,The Offspring,5LfGQac0EIXyAN8aUwmNAQ,"You're Gonna Go Far, Kid",6TfBA04WJ3X1d1wXhaCFVT,84,"Rise And Fall, Rage And Grace",67v63ubEsvDUQkYMzI7A9t,2008-06-17,297,...,1,0.0638,0.00428,0.0,0.197,0.601,126.115,177827,4,00:02:57
4,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Woman,6Uj1ctrBOjOas8xZXGqKk4,97,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,305,...,0,0.0854,0.0888,0.00294,0.117,0.881,107.998,172627,4,00:02:52
5,dance,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,Need to Know,3Vi5XqYrmQgOYBajMWSvCi,94,Planet Her,1nAQbHeOWTfQzbOoFrvndW,2021-06-25,305,...,1,0.0707,0.304,0.0,0.0926,0.194,130.041,210560,4,00:03:30
6,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Wasted On You,3cBsEDNhFI9E82vPj3kvi3,83,Dangerous: The Double Album,6JlCkqkqobGirPsaleJpFr,2021-01-08,1088,...,0,0.0318,0.373,0.00107,0.126,0.252,196.0,178520,3,00:02:58
7,country,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,83,If I Know Me,1IR2nlwX6YVTXXeu2qzoWO,2018-04-27,107,...,1,0.0289,0.369,2e-06,0.115,0.707,149.959,234347,4,00:03:54
8,opera,Josh Groban,6cXMpsP9x0SH4kFfMyVezF,You Raise Me Up,4TbNLKRLKlxZDlS0pu7Lsy,61,Closer,6Uj9VARQcRBOQZ2uBywlrH,2003-11-11,791,...,1,0.037,0.673,0.000157,0.107,0.0981,118.086,292333,4,00:04:52
9,opera,Jean-Philippe Rameau,4JSWO1Pf2zV991fq64uAce,The Arts and the Hours,1G3QPQkoXvpChjNUhd6Tfs,65,"Rameau, Ólafsson: The Arts and the Hours",4omnRCjOvnHa3ETQlqZpd1,2020-03-06,1286,...,1,0.0496,0.995,0.938,0.107,0.268,136.191,285443,4,00:04:45


In [108]:
df_all.shape

(10, 26)

In [109]:
df_all.columns

Index(['genre', 'artist_name', 'artist_id', 'track_name', 'track_id',
       'popularity', 'album_name', 'album_id', 'release_date', 'date_offset',
       'release_season', 'natural_key', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'duration_mins'],
      dtype='object')

In [112]:
genre_data = df_all[['genre']].drop_duplicates(ignore_index=True)
album_data = df_all[['album_id','album_name','release_season']]
track_features = df_all[[
    'track_natural_key',
    'track_name',
    'artist_name',
    'album_id',
    'acousticness',
    'danceability',
    'duration_mins',
    'duration_ms',
    'energy',
    'genre',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]]

In [114]:
print(f'DataFrames genre_data has shape {genre_data.shape}, album_data has shape {album_data.shape} and track_features has shape {track_features.shape}')

DataFrames genre_data has shape (5, 1), album_data has shape (10, 3) and track_features has shape (10, 20)


In [117]:
# Imports
from sqlalchemy import create_engine
import psycopg2 
from config import db_password

# Create connection to database (endpoint to be decided)
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/spotify_mockup"

# instantiate engine
engine = create_engine(db_string)

track_features.to_sql(name='track_features', con=engine, if_exists='replace', index=False)
genre_data.to_sql(name='genre_data', con=engine, if_exists='replace', index=False)
album_data.to_sql(name='album_data', con=engine, if_exists='replace', index=False)
