In [1]:
# "Data Mining" course Project
# Dimensionality reduction for Spotify music data

# Group 3:
# Athanasios Karampalis
# Reinis Sestakovskis
# Onur Yuksel

# 2022

In [2]:
# Read in secrets from secrets.json

import os
import json

secrets_filename = "secrets.json"

def read_secrets() -> dict:
    filename = os.path.join(secrets_filename)
    try:
        with open(filename, mode='r') as f:
            return json.loads(f.read())
    except FileNotFoundError:
        print(f"Could not find file {secrets_filename}!")
        return {}

secrets = read_secrets()

In [3]:
# Install necessary PIP packages

%pip install numpy pandas spotipy tqdm

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Reinis\anaconda3\python.exe -m pip install --upgrade pip' command.


In [4]:
# Use Spotipy for easier access to the Spotify API
# https://spotipy.readthedocs.io/
# https://github.com/plamere/spotipy

import spotipy
from spotipy.oauth2 import SpotifyOAuth

# scope = "user-library-read"

client_id = secrets["CLIENT_ID"]
client_secret = secrets["CLIENT_SECRET"]
redirect_url = secrets["REDIRECT_URI"]

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id, client_secret=client_secret, redirect_uri=redirect_url))

In [5]:
# Use tqdm to see a progress bar when loading the data
from tqdm import tqdm

# Use pandas for DataFrames
import pandas as pd

In [6]:
categories = []

for batch_start_index in tqdm(range(0, 150, 50)):
    categories_results = sp.categories(offset=batch_start_index, limit=50)
    if len(categories_results['categories']['items']) == 0:
        print('Found all categories, finishing early')
        break
    categories = categories + categories_results['categories']['items']
print(len(categories))

  0%|          | 0/3 [00:00<?, ?it/s]Using `localhost` as redirect URI without a port. Specify a port (e.g. `localhost:8080`) to allow automatic retrieval of authentication code instead of having to copy and paste the URL your browser is redirected to.
 33%|███▎      | 1/3 [00:15<00:31, 15.87s/it]

Found all categories, finishing early
46





In [7]:
categories_df = pd.json_normalize(categories)
categories_df.to_csv(path_or_buf=os.path.join('./data', 'categories.csv'))
categories_df.tail(3)

Unnamed: 0,href,icons,id,name
43,https://api.spotify.com/v1/browse/categories/0...,"[{'height': None, 'url': 'https://t.scdn.co/im...",0JQ5DAqbMKFRNXsIvgZF9A,Music + Talk
44,https://api.spotify.com/v1/browse/categories/0...,"[{'height': None, 'url': 'https://t.scdn.co/im...",0JQ5DAqbMKFQ1UFISXj59F,Arab
45,https://api.spotify.com/v1/browse/categories/0...,"[{'height': None, 'url': 'https://t.scdn.co/im...",0JQ5DAqbMKFLVaM30PMBm4,Summer


In [12]:
mood_category_id = categories_df[categories_df['name'] == 'Mood'].iloc[0]['id']
print(mood_category_id)

0JQ5DAqbMKFzHmL4tf05da


In [13]:
playlists = []

for batch_start_index in tqdm(range(0, 200, 50)):
    playlists_results = sp.category_playlists(category_id=mood_category_id, offset=batch_start_index, limit=50)
    if len(playlists_results['playlists']['items']) == 0:
        print('Found all playlists, finishing early')
        break
    playlists = playlists + playlists_results['playlists']['items']
print(len(playlists))

 75%|███████▌  | 3/4 [00:01<00:00,  2.82it/s]

Found all playlists, finishing early
131





In [14]:
playlists_df = pd.json_normalize(playlists)
playlists_df.to_csv(path_or_buf=os.path.join('./data', 'playlists.csv'))
playlists_df.tail(3)

Unnamed: 0,collaborative,description,href,id,images,name,primary_color,public,snapshot_id,type,uri,external_urls.spotify,owner.display_name,owner.external_urls.spotify,owner.href,owner.id,owner.type,owner.uri,tracks.href,tracks.total
128,False,The melancholic and soft sound of Jazz.,https://api.spotify.com/v1/playlists/37i9dQZF1...,37i9dQZF1DWWR73B3Bnjfh,"[{'height': None, 'url': 'https://i.scdn.co/im...",Jazz Noir,,,MTY2NDk3NzY4MiwwMDAwMDAwMGIyNThlNTEyNGViMmQ4MD...,playlist,spotify:playlist:37i9dQZF1DWWR73B3Bnjfh,https://open.spotify.com/playlist/37i9dQZF1DWW...,Spotify,https://open.spotify.com/user/spotify,https://api.spotify.com/v1/users/spotify,spotify,user,spotify:user:spotify,https://api.spotify.com/v1/playlists/37i9dQZF1...,100
129,False,Beautifully sad piano music,https://api.spotify.com/v1/playlists/37i9dQZF1...,37i9dQZF1DWZrc3lwvImLj,"[{'height': None, 'url': 'https://i.scdn.co/im...",Melancholy Instrumentals,,,MTY2MzkzOTkzNywwMDAwMDAwMDA1ZDYxYzgyMzAwZjA4Y2...,playlist,spotify:playlist:37i9dQZF1DWZrc3lwvImLj,https://open.spotify.com/playlist/37i9dQZF1DWZ...,Spotify,https://open.spotify.com/user/spotify,https://api.spotify.com/v1/users/spotify,spotify,user,spotify:user:spotify,https://api.spotify.com/v1/playlists/37i9dQZF1...,169
130,False,cry just a little,https://api.spotify.com/v1/playlists/37i9dQZF1...,37i9dQZF1DWU4lunzhQdRx,"[{'height': None, 'url': 'https://i.scdn.co/im...",sad girl country,,,MTY2MzAzMDMwNSwwMDAwMDAwMGRkOWRkNjMzNDgwMjBlMD...,playlist,spotify:playlist:37i9dQZF1DWU4lunzhQdRx,https://open.spotify.com/playlist/37i9dQZF1DWU...,Spotify,https://open.spotify.com/user/spotify,https://api.spotify.com/v1/users/spotify,spotify,user,spotify:user:spotify,https://api.spotify.com/v1/playlists/37i9dQZF1...,50


In [16]:
playlist_ids = playlists_df['id'].to_numpy()
print(playlist_ids[:5])

['37i9dQZF1DX3rxVfibe1L0' '37i9dQZF1DWSf2RDTDayIx'
 '37i9dQZF1DWSqmBTGDYngZ' '37i9dQZF1DX6VdMW310YC7'
 '37i9dQZF1DXdPec7aLTmlC']


In [17]:
tracks = []

for playlist_id in tqdm(playlist_ids):
    for batch_start_index in range(0, 500, 50):
        track_results = sp.playlist_items(playlist_id=playlist_id, offset=batch_start_index, limit=50)
        if len(track_results['items']) == 0:
            # Found all tracks, finishing early for current playlist
            break
        # Add current playlist id to the current batch of tracks
        for track in track_results['items']:
            track['playlist_id'] = playlist_id
        tracks = tracks + track_results['items']
print(len(tracks))

100%|██████████| 131/131 [02:17<00:00,  1.05s/it]

17860





In [34]:
tracks_df = pd.json_normalize(tracks)
# Drop tracks which have no id
tracks_df = tracks_df[tracks_df['track.id'].notna()]
print(len(tracks_df))
tracks_df = tracks_df.drop_duplicates(subset=['playlist_id', 'track.name', 'track.album.name', 'track.album.release_date'])
print(len(tracks_df))
tracks_df.to_csv(path_or_buf=os.path.join('./data', 'tracks.csv'))
tracks_df.tail(3)

17856
16350


Unnamed: 0,added_at,is_local,primary_color,playlist_id,added_by.external_urls.spotify,added_by.href,added_by.id,added_by.type,added_by.uri,track.album.album_type,...,track.show.id,track.show.images,track.show.is_externally_hosted,track.show.languages,track.show.media_type,track.show.name,track.show.publisher,track.show.total_episodes,track.show.type,track.show.uri
17857,2022-09-13T00:51:45Z,False,,37i9dQZF1DWU4lunzhQdRx,https://open.spotify.com/user/,https://api.spotify.com/v1/users/,,user,spotify:user:,album,...,,,,,,,,,,
17858,2022-09-13T00:51:45Z,False,,37i9dQZF1DWU4lunzhQdRx,https://open.spotify.com/user/,https://api.spotify.com/v1/users/,,user,spotify:user:,album,...,,,,,,,,,,
17859,2022-09-13T00:51:45Z,False,,37i9dQZF1DWU4lunzhQdRx,https://open.spotify.com/user/,https://api.spotify.com/v1/users/,,user,spotify:user:,album,...,,,,,,,,,,


In [35]:
# Retrieve track ids to a list
# These will be needed to retrieve the audio features from the Spotify API
track_ids = tracks_df['track.id'].tolist()
print(track_ids[:10])

['6YIivt0G3ZUuNrM62jp9pL', '4MTmAFWHpvB9kPMSRgLFRp', '6y6xhAgZjvxy5kR5rigpY3', '4h9wh7iOZ0GGn8QVp4RAOB', '3o9kpgkIcffx0iSwxhuNI2', '1imMjt1YGNebtrtTAprKV7', '4svaKoxMMP7ImmKrD5hfQu', '0T5iIrXA4p5GsubkhuBIKV', '0zzVTGyRrWpQu8Fr28NRAv', '18XlJEroUwFo0tLZxscgXE']


In [20]:
# Retrieve audio features from the Spotify API
audio_features = []

for batch_start_index in tqdm(range(0, len(track_ids), 50)):
    audio_features_results = sp.audio_features(track_ids[batch_start_index:batch_start_index+50])
    audio_features = audio_features + audio_features_results
print(len(audio_features))

100%|██████████| 327/327 [00:47<00:00,  6.90it/s]

16350





In [80]:
# Filter out non-object (non-dict) results (example: "None")
audio_features_filtered = [item for item in audio_features if isinstance(item, dict)]
print(len(audio_features_filtered))

16269


In [81]:
# Convert to a Pandas dataframe and check what the data looks like
audio_features_df = pd.DataFrame(audio_features_filtered)
audio_features_df.to_csv(path_or_buf=os.path.join('./data', 'audio_features.csv'))
audio_features_df.tail(3)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
16266,0.621,0.399,3,-6.807,0,0.0262,0.594,0.0,0.107,0.334,80.993,audio_features,0VsDzurvIUoRNT4fIactL3,spotify:track:0VsDzurvIUoRNT4fIactL3,https://api.spotify.com/v1/tracks/0VsDzurvIUoR...,https://api.spotify.com/v1/audio-analysis/0VsD...,188853,4
16267,0.517,0.378,7,-6.982,1,0.0265,0.0995,1.1e-05,0.0989,0.218,141.903,audio_features,69meckPoEWAVOUXjkp5ShS,spotify:track:69meckPoEWAVOUXjkp5ShS,https://api.spotify.com/v1/tracks/69meckPoEWAV...,https://api.spotify.com/v1/audio-analysis/69me...,247447,4
16268,0.503,0.431,8,-8.171,1,0.0366,0.275,2.4e-05,0.108,0.176,163.728,audio_features,56DsYXpdqkl3t2G3xM8MtA,spotify:track:56DsYXpdqkl3t2G3xM8MtA,https://api.spotify.com/v1/tracks/56DsYXpdqkl3...,https://api.spotify.com/v1/audio-analysis/56Ds...,184760,3


In [82]:
# Merge the two track and the audio feature dataframes into one using the track ids
merged_df = pd.merge(audio_features_df, tracks_df, how='inner', left_on = 'id', right_on = 'track.id')
merged_df.tail(3)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,track.show.id,track.show.images,track.show.is_externally_hosted,track.show.languages,track.show.media_type,track.show.name,track.show.publisher,track.show.total_episodes,track.show.type,track.show.uri
20484,0.621,0.399,3,-6.807,0,0.0262,0.594,0.0,0.107,0.334,...,,,,,,,,,,
20485,0.517,0.378,7,-6.982,1,0.0265,0.0995,1.1e-05,0.0989,0.218,...,,,,,,,,,,
20486,0.503,0.431,8,-8.171,1,0.0366,0.275,2.4e-05,0.108,0.176,...,,,,,,,,,,


In [83]:
# Merge with the playlist dataframe
merged_df = pd.merge(merged_df, playlists_df, how='inner', left_on = 'playlist_id', right_on = 'id')
merged_df.tail(3)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,uri_y,external_urls.spotify,owner.display_name,owner.external_urls.spotify,owner.href,owner.id,owner.type,owner.uri,tracks.href,tracks.total
22358,0.611,0.227,8,-12.682,1,0.038,0.986,0.00126,0.102,0.237,...,spotify:playlist:37i9dQZF1DX70dqoLSWJrU,https://open.spotify.com/playlist/37i9dQZF1DX7...,Spotify,https://open.spotify.com/user/spotify,https://api.spotify.com/v1/users/spotify,spotify,user,spotify:user:spotify,https://api.spotify.com/v1/playlists/37i9dQZF1...,100
22359,0.356,0.0868,6,-15.106,0,0.0352,0.994,0.0598,0.103,0.0966,...,spotify:playlist:37i9dQZF1DX70dqoLSWJrU,https://open.spotify.com/playlist/37i9dQZF1DX7...,Spotify,https://open.spotify.com/user/spotify,https://api.spotify.com/v1/users/spotify,spotify,user,spotify:user:spotify,https://api.spotify.com/v1/playlists/37i9dQZF1...,100
22360,0.243,0.271,10,-15.593,1,0.035,0.642,0.00116,0.275,0.353,...,spotify:playlist:37i9dQZF1DX70dqoLSWJrU,https://open.spotify.com/playlist/37i9dQZF1DX7...,Spotify,https://open.spotify.com/user/spotify,https://api.spotify.com/v1/users/spotify,spotify,user,spotify:user:spotify,https://api.spotify.com/v1/playlists/37i9dQZF1...,100


In [84]:
# Check the columns of our new dataframe
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22361 entries, 0 to 22360
Columns: 107 entries, danceability to tracks.total
dtypes: bool(2), float64(18), int64(5), object(82)
memory usage: 18.1+ MB


In [85]:
# Select only needed columns

columns_to_select = [
    "id_x",             # Track id
    "track.artists",
    "track.name",
    "playlist_id",
    "name",             # Playlist name
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
    "time_signature",
    "track.popularity"
]

cleaned_df = merged_df[columns_to_select].drop_duplicates(subset=['id_x', 'playlist_id'])
print(len(cleaned_df))
print(cleaned_df.tail(3))

16269
                         id_x  \
22358  5wIXpK3BAW5geIlEmYf23n   
22359  5sOGjH3QgwqhNRwT9I1nYl   
22360  1ScLuFh6ZPY0bkSrv9uVMO   

                                           track.artists  \
22358  [{'external_urls': {'spotify': 'https://open.s...   
22359  [{'external_urls': {'spotify': 'https://open.s...   
22360  [{'external_urls': {'spotify': 'https://open.s...   

                                              track.name  \
22358                           When Your Lover Has Gone   
22359                                    I Cried For You   
22360  The End of a Love Affair - Mono Take 4 with Vo...   

                  playlist_id             name  danceability  energy  key  \
22358  37i9dQZF1DX70dqoLSWJrU  Heartbreak Jazz         0.611  0.2270    8   
22359  37i9dQZF1DX70dqoLSWJrU  Heartbreak Jazz         0.356  0.0868    6   
22360  37i9dQZF1DX70dqoLSWJrU  Heartbreak Jazz         0.243  0.2710   10   

       loudness  mode  speechiness  acousticness  instrumentalness  li

In [86]:
# Convert the 'track.artists' column to just a list of artists' names

cleaned_df['track.artists'] = cleaned_df['track.artists'] \
    .map(lambda obj_list: ', '.join([obj['name'] for obj in obj_list]))

In [87]:
# Check that the "track.artists" column has been cleaned up
cleaned_df['track.artists'].head(20)

0                                     AJR
4                           The Lumineers
8           Alesso, Marshmello, James Bay
12                            OneRepublic
18                             Em Beihold
22                               Dua Lipa
24                                    Jax
28                        Stephen Sanchez
32                Marshmello, Demi Lovato
36                               Maroon 5
38                              Rosa Linn
42                             Ed Sheeran
44                                FINNEAS
46          Lost Frequencies, Calum Scott
48                                  Adele
50                            George Ezra
52                     Alesso, Katy Perry
54                            OneRepublic
58       Hailee Steinfeld, Anderson .Paak
62    benny blanco, Marshmello, Vance Joy
Name: track.artists, dtype: object

In [88]:
# Save the final dataframe to a CSV file
cleaned_df.to_csv(path_or_buf=os.path.join('./data', 'dataset.csv'))