<a href="https://www.kaggle.com/code/sejungjenn/spotify-api-with-spotipy-collecting-data?scriptVersionId=115959323" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# How to extract data from Spotify using the Spotify API

# Import libraries

In [1]:
!pip install python-dotenv spotipy
!pip install spotipy
import csv
import os

import pandas as pd
import spotipy as spotipy
from spotipy.oauth2 import SpotifyClientCredentials 

[0m

# Authenticate with Spotipy

In [2]:
# Read the credentials from the environment (CLIENT_ID / CLIENT_SECRET) so that
# real secrets never have to be typed into, or saved with, the notebook.
# The '...' fallbacks are placeholders only; they are not valid credentials.
client_id = os.getenv("CLIENT_ID", '...')
client_secret = os.getenv("CLIENT_SECRET", '...')

# `sp` is the authenticated client that the cells below pass to the helper functions.
client_credentials_manager = SpotifyClientCredentials(client_id = client_id, client_secret = client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

# Create functions for extracting data

In [3]:
def get_playlist_track_infos(playlist_link, sp, output_file_name):
    '''Collect basic information for every track of a playlist and save it as CSV.

    Parameters
    ----------
    playlist_link : str
        Playlist URL (or bare playlist id). Only the last path segment is used;
        a trailing slash and any query string such as ``?si=...`` are discarded.
    sp : object
        Client exposing ``playlist_tracks(playlist_id, fields=, limit=, offset=,
        market=)`` -- in this notebook a ``spotipy.Spotify`` instance.
    output_file_name : str
        Output path without extension; ``.csv`` is appended.

    Returns
    -------
    pandas.DataFrame
        One row per track with the columns ``track_uri``, ``track_name``,
        ``artist`` (first listed artist) and ``track_popularity``. The same
        data is written to ``<output_file_name>.csv``.
    '''
    page_size = 100
    # Links copied from the Spotify app may carry a "?si=..." suffix that is not part of the id.
    playlist_uri = playlist_link.rstrip("/").split("/")[-1].split("?")[0]

    # Keep requesting pages until the response no longer advertises a next page,
    # so that playlists longer than one page are read completely.
    items = []
    offset = 0
    while True:
        content = sp.playlist_tracks(playlist_uri, fields = None, limit = page_size, offset = offset, market = None)
        items += content['items']
        if content['next'] is None:
            break
        offset += page_size

    track_info_list = []
    for item in items:
        track = item.get("track")
        if not track:
            # The API may return playlist entries whose "track" is null
            # (e.g. tracks that are no longer available); there is nothing to record.
            continue
        # Guard against an empty artist list instead of raising IndexError.
        artists = track.get("artists") or [{}]
        track_info_list.append([track["uri"], track["name"], artists[0].get("name"), track["popularity"]])

    df = pd.DataFrame(track_info_list, columns = ['track_uri', 'track_name', 'artist', 'track_popularity'])
    df.to_csv("{}.csv".format(output_file_name), index = False)
    return df

In [4]:
def get_playlist_audio_features(df, sp, output_file_name):
    '''Fetch the audio features of every track in ``df`` and save them as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``track_uri`` column identifying the tracks.
    sp : object
        Client exposing ``audio_features(tracks)`` that accepts a list of track
        identifiers and returns one entry per requested track -- in this
        notebook a ``spotipy.Spotify`` instance.
    output_file_name : str
        Output path without extension; ``.csv`` is appended.

    Returns
    -------
    pandas.DataFrame
        One row per track for which features were returned, with ``track_uri``
        followed by the ten feature columns. The same data is written to
        ``<output_file_name>.csv``.
    '''
    feature_names = ['danceability', 'valence', 'energy', 'tempo', 'loudness',
                     'speechiness', 'instrumentalness', 'liveness', 'acousticness', 'key']
    # spotipy documents a maximum of 100 ids per audio_features call; batching
    # replaces one HTTP request per track with one per 100 tracks.
    batch_size = 100

    track_list = df['track_uri'].to_list()
    features_list = []
    for start in range(0, len(track_list), batch_size):
        batch = track_list[start:start + batch_size]
        # Results are expected in request order, so they can be paired with the uris.
        for uri, features in zip(batch, sp.audio_features(batch)):
            if features is None:
                # No features available for this track: skip it rather than
                # failing on features[...] with a TypeError.
                continue
            features_list.append([uri] + [features[name] for name in feature_names])

    df = pd.DataFrame(features_list, columns = ['track_uri'] + feature_names)
    df.to_csv("{}.csv".format(output_file_name), index = False)
    return df

In [6]:
# One playlist per year; links[i] is exported under the year label at the same position.
links = [
    "https://open.spotify.com/playlist/37i9dQZF1DXe2bobNYDtW8",
    "https://open.spotify.com/playlist/37i9dQZF1DWVRSukIED0e9",
    "https://open.spotify.com/playlist/2fmTTbBkXi8pewbUvG3CeZ",
    "https://open.spotify.com/playlist/5GhQiRkGuqzpWZSE7OU4Se",
    "https://open.spotify.com/playlist/4hMcqod7ERKJ9mtjgdimeV",
]

# Export every playlist to '<year>.csv', then load that file back into a dataframe.
yearly_tracks = []
for playlist_link, year_label in zip(links, ['2018', '2019', '2020', '2021', '2022']):
    get_playlist_track_infos(playlist_link, sp, year_label)
    yearly_tracks.append(pd.read_csv(year_label + '.csv'))

track_18, track_19, track_20, track_21, track_22 = yearly_tracks

In [7]:
# Export the audio features of each yearly track table to 'audio_<yy>.csv',
# then load the file back into a dataframe of the same name.
yearly_audio = []
for track_frame, output_name in zip(
    [track_18, track_19, track_20, track_21, track_22],
    ['audio_18', 'audio_19', 'audio_20', 'audio_21', 'audio_22'],
):
    get_playlist_audio_features(track_frame, sp, output_name)
    yearly_audio.append(pd.read_csv(output_name + '.csv'))

audio_18, audio_19, audio_20, audio_21, audio_22 = yearly_audio

In [8]:
# Preview the first rows of the 2022 audio-features table.
audio_22.head()

Unnamed: 0,track_uri,danceability,valence,energy,tempo,loudness,speechiness,instrumentalness,liveness,acousticness,key
0,spotify:track:3nqQXoyQOWXiESFLlDF1hG,0.714,0.238,0.472,131.121,-7.375,0.0864,5e-06,0.266,0.013,2
1,spotify:track:1Qrg8KqiBpW07V7PNxwwwL,0.644,0.418,0.735,88.98,-5.747,0.0391,0.144,0.161,0.0521,8
2,spotify:track:4uUG5RXrOk84mYEfFvj3cK,0.561,0.304,0.965,128.04,-3.673,0.0343,7e-06,0.371,0.00383,7
3,spotify:track:0V3wPSX9ygBnCm8psDIegu,0.637,0.533,0.643,97.008,-6.571,0.0519,2e-06,0.142,0.13,4
4,spotify:track:2dHHgzDwk4BJdRwy9uXhTO,0.715,0.172,0.62,97.95,-6.005,0.0484,0.0,0.0822,0.417,1


In [9]:
# Merge dataframes: join each year's track table with its audio-features table
# on 'track_uri', keeping only tracks present in both.
df_18, df_19, df_20, df_21, df_22 = (
    pd.merge(track_frame, audio_frame, how = 'inner', on = 'track_uri')
    for track_frame, audio_frame in zip(
        [track_18, track_19, track_20, track_21, track_22],
        [audio_18, audio_19, audio_20, audio_21, audio_22],
    )
)

# Confirm
df_18.info()
df_22.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_uri         100 non-null    object 
 1   track_name        100 non-null    object 
 2   artist            100 non-null    object 
 3   track_popularity  100 non-null    int64  
 4   danceability      100 non-null    float64
 5   valence           100 non-null    float64
 6   energy            100 non-null    float64
 7   tempo             100 non-null    float64
 8   loudness          100 non-null    float64
 9   speechiness       100 non-null    float64
 10  instrumentalness  100 non-null    float64
 11  liveness          100 non-null    float64
 12  acousticness      100 non-null    float64
 13  key               100 non-null    int64  
dtypes: float64(9), int64(2), object(3)
memory usage: 11.7+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 220 entries, 0 to 219
Data colu

# Data Quality Assessment & Reporting

In [12]:
# Show four randomly sampled rows of the merged 2018 table.
# NOTE(review): the saved output below already shows the 'recent_popularity' column and
# track ids without the 'spotify:track:' prefix, i.e. this cell was last executed after
# the Wrangle cells (In [10] / In [11]) that appear further down in the notebook.
df_18.sample(4)

Unnamed: 0,track_uri,track_name,artist,recent_popularity,danceability,valence,energy,tempo,loudness,speechiness,instrumentalness,liveness,acousticness,key
14,2dpaYNEQHiRxtZbfNsse99,Happier,Marshmello,84,0.687,0.671,0.792,100.015,-2.749,0.0452,0.0,0.167,0.191,5
84,6kPJZM97LwdG9QIsT7khp6,Solo (feat. Demi Lovato),Clean Bandit,71,0.737,0.565,0.636,105.005,-4.546,0.0437,6.7e-05,0.35,0.0441,11
57,3Vo4wInECJQuz9BIBMOu8i,Finesse - Remix; feat. Cardi B,Bruno Mars,75,0.704,0.926,0.859,105.115,-4.877,0.0996,0.0,0.0215,0.0185,5
87,10Igtw8bSDyyFs7KIsKngZ,Freaky Friday (feat. Chris Brown),Lil Dicky,71,0.755,0.755,0.599,133.123,-5.042,0.224,0.0,0.109,0.147,8


In [13]:
# Show four randomly sampled rows of the merged 2019 table.
df_19.sample(4)

Unnamed: 0,track_uri,track_name,artist,recent_popularity,danceability,valence,energy,tempo,loudness,speechiness,instrumentalness,liveness,acousticness,key
24,22vgEDb5hykfaTwLuskFGD,Sucker,Jonas Brothers,82,0.842,0.952,0.734,137.958,-5.065,0.0588,0.0,0.106,0.0427,1
50,0FZ4Dmg8jJJAPJnvBIzD9z,ZEZE (feat. Travis Scott & Offset),Kodak Black,78,0.861,0.504,0.603,98.043,-5.788,0.176,0.0,0.0924,0.0521,8
71,0KoiMHhqKLoL4xM54rcWYY,"Taki Taki (feat. Selena Gomez, Ozuna & Cardi B)",DJ Snake,75,0.842,0.617,0.801,95.881,-4.167,0.228,5e-06,0.0642,0.157,8
7,2qxmye6gAegTMjLKEBoR3d,Let Me Down Slowly,Alec Benjamin,86,0.652,0.483,0.557,150.073,-5.714,0.0318,0.0,0.124,0.74,1


In [14]:
# Summary statistics of the numeric columns of the merged 2019 table.
df_19.describe()

Unnamed: 0,recent_popularity,danceability,valence,energy,tempo,loudness,speechiness,instrumentalness,liveness,acousticness,key
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,77.71,0.71115,0.485144,0.61982,120.44846,-6.12776,0.125805,0.006473,0.152672,0.24658,5.24
std,4.770056,0.137789,0.222649,0.162395,29.628311,2.403021,0.111219,0.039004,0.114017,0.239313,3.470278
min,69.0,0.351,0.0976,0.111,67.016,-14.505,0.0259,0.0,0.0574,0.000677,0.0
25%,74.0,0.6375,0.326,0.53,98.0075,-7.06725,0.04595,0.0,0.0933,0.063725,2.0
50%,78.0,0.7365,0.493,0.6275,116.1255,-5.7105,0.08435,0.0,0.1105,0.1635,5.0
75%,82.0,0.80825,0.633,0.73425,140.0285,-4.50825,0.16175,3e-06,0.1605,0.34675,8.0
max,90.0,0.95,0.952,0.919,202.015,-2.253,0.467,0.334,0.795,0.978,11.0


In [15]:
# Summary statistics of the numeric columns of the merged 2020 table.
df_20.describe()

Unnamed: 0,recent_popularity,danceability,valence,energy,tempo,loudness,speechiness,instrumentalness,liveness,acousticness,key
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,67.112782,0.703729,0.506097,0.617271,121.330256,-6.359233,0.110994,0.010115,0.174078,0.230216,5.571429
std,22.437001,0.122631,0.227563,0.154487,26.507167,2.194805,0.104059,0.061439,0.13787,0.247238,3.660365
min,0.0,0.333,0.0517,0.219,73.537,-14.454,0.0277,0.0,0.0411,0.00115,0.0
25%,66.0,0.639,0.346,0.522,101.085,-7.34,0.0457,0.0,0.0904,0.0342,2.0
50%,75.0,0.724,0.522,0.62,117.973,-6.266,0.0625,0.0,0.115,0.145,6.0
75%,80.0,0.79,0.684,0.728,140.042,-4.593,0.135,5.4e-05,0.217,0.316,9.0
max,90.0,0.935,0.925,0.955,180.067,-2.975,0.487,0.657,0.792,0.917,11.0


In [16]:
# Count repeated 'track_uri' values in each yearly table (0 means no duplicates).
dfs = [df_18, df_19, df_20, df_21, df_22]

for yearly_frame in dfs:
    duplicate_count = yearly_frame['track_uri'].duplicated().sum()
    print(duplicate_count)

0
0
0
0
0


Data Quality Report

1. All dataframes seem to be complete, with no duplicated entries and no missing data.
2. Audio features in the dataframes are within the ranges specified by Spotify, with no outliers.
3. The unnecessary prefix 'spotify:track:' should be removed from the 'track_uri' column.
4. The 'track_popularity' column needs a more accurate name. According to Spotify, track popularity is calculated, for the most part, from the total number of plays a track has had and how recent those plays are. Songs that are being played a lot now will have a higher popularity score than songs that were played a lot in the past.

# Wrangle

In [10]:
# Keep only the part of 'track_uri' after the last ':' (drops the 'spotify:track:' prefix).
dfs = [df_18, df_19, df_20, df_21, df_22]

for yearly_frame in dfs:
    uri_parts = yearly_frame['track_uri'].str.split(":")
    yearly_frame['track_uri'] = uri_parts.str[-1]

# Confirm
df_22.head(4)

Unnamed: 0,track_uri,track_name,artist,track_popularity,danceability,valence,energy,tempo,loudness,speechiness,instrumentalness,liveness,acousticness,key
0,3nqQXoyQOWXiESFLlDF1hG,Unholy (feat. Kim Petras),Sam Smith,100,0.714,0.238,0.472,131.121,-7.375,0.0864,5e-06,0.266,0.013,2
1,1Qrg8KqiBpW07V7PNxwwwL,Kill Bill,SZA,91,0.644,0.418,0.735,88.98,-5.747,0.0391,0.144,0.161,0.0521,8
2,4uUG5RXrOk84mYEfFvj3cK,I'm Good (Blue),David Guetta,98,0.561,0.304,0.965,128.04,-3.673,0.0343,7e-06,0.371,0.00383,7
3,0V3wPSX9ygBnCm8psDIegu,Anti-Hero,Taylor Swift,97,0.637,0.533,0.643,97.008,-6.571,0.0519,2e-06,0.142,0.13,4


In [11]:
# Rename popularity columns
# (the mapping is deliberately not named `map`, which would shadow the Python builtin
# for every cell that runs afterwards)
popularity_column_names = {'track_popularity': 'recent_popularity'}

for df in dfs:
    # Renamed in place so that df_18 ... df_22, the very objects held in `dfs`, are updated.
    df.rename(columns = popularity_column_names, inplace = True)

# Confirm
df_18.columns, df_21.columns

(Index(['track_uri', 'track_name', 'artist', 'recent_popularity',
        'danceability', 'valence', 'energy', 'tempo', 'loudness', 'speechiness',
        'instrumentalness', 'liveness', 'acousticness', 'key'],
       dtype='object'),
 Index(['track_uri', 'track_name', 'artist', 'recent_popularity',
        'danceability', 'valence', 'energy', 'tempo', 'loudness', 'speechiness',
        'instrumentalness', 'liveness', 'acousticness', 'key'],
       dtype='object'))

### Now that you have your desired dataset, you can carry on with exploratory data analysis and visualisation. Enjoy exploring the tracks and their audio features!

### Another option for extracting the information, limited to 100 track entries per playlist

In [None]:
!pip install python-dotenv spotipy
!pip install spotipy
import csv
import os
import re
from dotenv import load_dotenv
import pandas as pd
import spotipy as spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import requests
import base64
from requests import post 
import json 

In [None]:
# Authenticate with Spotipy with client_id and client_secret
load_dotenv()

client_id = os.getenv("CLIENT_ID","") # tyoe in your client_id in between the quotation marks
client_secret = os.getenv("CLIENT_SECRET", "") # input your client_secret in between the quotation marks

client_credentials_manager = SpotifyClientCredentials(client_id = client_id, client_secret= client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

### Extract Track Information

In [None]:
links = ["https://open.spotify.com/playlist/37i9dQZF1DXe2bobNYDtW8", "https://open.spotify.com/playlist/37i9dQZF1DWVRSukIED0e9", "https://open.spotify.com/playlist/2fmTTbBkXi8pewbUvG3CeZ", "https://open.spotify.com/playlist/5GhQiRkGuqzpWZSE7OU4Se", "https://open.spotify.com/playlist/4hMcqod7ERKJ9mtjgdimeV"]
output_files = ["2018.csv", "2019.csv", "2020.csv", "2021.csv", "2022.csv"]

# artist URI -> artist object; an artist appearing on several tracks or playlists
# is looked up only once instead of once per track.
artist_cache = {}

for link, output_file in zip(links, output_files):
    # newline='' is what the csv module expects for files it writes to;
    # without it, extra blank rows can appear on Windows.
    with open(output_file, "w", encoding = 'utf-8', newline = '') as file:
        uris = link.split("/")[-1]
        writer = csv.writer(file)
        writer.writerow(['track_uri', 'track', 'artist', 'artist_popularity', 'followers', 'artist_genre', 'track_popularity', 'album'])
        # Only the single page returned by playlist_tracks is read here (no pagination).
        for item in sp.playlist_tracks(uris)["items"]:
            track = item["track"]
            if not track or not track.get("artists"):
                # Entries without track data or without any artist cannot be described; skip them.
                continue

            # Main artist = first listed artist
            main_artist = track["artists"][0]
            artist_uri = main_artist["uri"]
            if artist_uri not in artist_cache:
                artist_cache[artist_uri] = sp.artist(artist_uri)
            artist_info = artist_cache[artist_uri]

            # One row per track; artist_genre is written as the list returned by the API.
            writer.writerow([
                track["uri"],
                track["name"],
                main_artist["name"],
                artist_info["popularity"],
                artist_info["followers"]['total'],
                artist_info["genres"],
                track["popularity"],
                track["album"]["name"],
            ])

In [None]:
# Load the five yearly track files ('2018.csv' ... '2022.csv') written by the previous cell.
df2018, df2019, df2020, df2021, df2022 = (
    pd.read_csv(f"{year}.csv") for year in range(2018, 2023)
)
df2018.head()

In [None]:
# Tag every row of each yearly table with the year of the playlist it came from.
yearly_frames = (df2018, df2019, df2020, df2021, df2022)
for yearly_frame, year in zip(yearly_frames, range(2018, 2023)):
    yearly_frame['year'] = year

### Extract Audio Features

In [None]:
def get_token():
    '''Request an access token from the Spotify accounts service.

    Uses the client-credentials grant with the module-level ``client_id`` and
    ``client_secret``, sent as an HTTP Basic authorization header.

    Returns
    -------
    str
        The ``access_token`` field of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the token endpoint answers with an error status (for example
        because the credentials are wrong). Without this check the failure
        would only surface as a ``KeyError`` on ``"access_token"``.
    '''
    auth_string = client_id + ":" + client_secret
    auth_base64 = base64.b64encode(auth_string.encode("utf-8")).decode("utf-8")

    url = "https://accounts.spotify.com/api/token"

    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded",
    }

    data = {'grant_type': "client_credentials"}
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    result = post(url, headers = headers, data = data, timeout = 30)
    result.raise_for_status()
    return result.json()["access_token"]

# Fetch a token once and build the bearer authorization header reused by later requests.
token = get_token()

headers = {'Authorization': f'Bearer {token}'}

In [None]:
base_url = 'https://api.spotify.com/v1/'
df_lists = [df2018, df2019, df2020, df2021, df2022]
output_files = ['track2018.csv', 'track2019.csv', 'track2020.csv', 'track2021.csv', 'track2022.csv']
feature_names = ["danceability", "valence", "energy", "tempo", "loudness", "speechiness", "instrumentalness", "liveness", "acousticness", "key"]

for df, output_file in zip(df_lists, output_files):
    # newline='' is what the csv module expects for files it writes to;
    # without it, extra blank rows can appear on Windows.
    with open(output_file, "w", encoding = 'utf-8', newline = '') as file:
        writer = csv.writer(file)
        writer.writerow(["track_uri"] + feature_names)
        for uri in df['track_uri'].to_list():
            # 'track_uri' holds full URIs of the form 'spotify:track:<id>', while the
            # audio-features endpoint is addressed by the bare track id.
            track_id = uri.split(":")[-1]

            response = requests.get(base_url + 'audio-features/' + track_id, headers = headers, timeout = 30)
            # Fail loudly on an HTTP error (expired token, rate limit, unknown id) instead of
            # hitting a KeyError on the error payload a few lines later.
            response.raise_for_status()
            s = response.json()

            # The original URI is written unchanged so the later merge on 'track_uri' still matches.
            writer.writerow([uri] + [s[name] for name in feature_names])

In [None]:
# Load the five yearly audio-feature files ('track2018.csv' ... 'track2022.csv').
df2_2018, df2_2019, df2_2020, df2_2021, df2_2022 = (
    pd.read_csv(f"track{year}.csv") for year in range(2018, 2023)
)

In [None]:
# Join each year's track information with its audio features on 'track_uri',
# keeping only tracks present in both tables, and rebind the yearly names to the result.
df2018, df2019, df2020, df2021, df2022 = (
    pd.merge(track_info, audio_features, how = 'inner', on = 'track_uri')
    for track_info, audio_features in zip(
        [df2018, df2019, df2020, df2021, df2022],
        [df2_2018, df2_2019, df2_2020, df2_2021, df2_2022],
    )
)