# Getting Spotify's Data on songs through various popular playlists

## Imports

In [1]:
import os

import numpy as np
import pandas as pd
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
#pip install spotipy --upgrade

## Credentials

In [3]:
#Credentials
# Read the Spotify API credentials from environment variables so real secrets never
# have to be saved inside the notebook. The 'XX' placeholders are only used when the
# variables are not set.
cid = os.environ.get('SPOTIPY_CLIENT_ID', 'XX')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'XX')

# `sp` is the authenticated client used by every cell below.
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Finding popular playlists to use

In [None]:
#Getting popular playlist IDs
#example below: search playlists matching the query 'mega hit' in the US market, 50 results per call
sp.search(q='mega hit', type='playlist', market = 'US', limit=50)

I varied the search query and picked out the playlist IDs that I wanted, aiming for popular playlists and songs.

### List of playlist IDs
1. Today's Top Hits ID: '37i9dQZF1DXcBWIGoYBM5M'
2. Mega Hit Mix: '37i9dQZF1DXbYM3nMM0oPk'

## Concrete example of getting the data using a single playlist (Today's Top Hits)

In [5]:
# note: a single playlist_tracks() call returns one page of at most 100 items,
# so a longer playlist would be cut off here
playlist1 = pd.DataFrame(sp.playlist_tracks('37i9dQZF1DXcBWIGoYBM5M')['items'])

### Getting track IDs, popularity score, and name

In [6]:
# Collect the ID, popularity score and name of every track on the playlist,
# one list per attribute and all in playlist order
tracks = playlist1['track']
track_ids = [t['id'] for t in tracks]
track_pop = [t['popularity'] for t in tracks]
track_name = [t['name'] for t in tracks]

print(track_ids)
print(track_pop)
print(track_name)

['6WrI0LAC5M1Rw2MnX2ZvEg', '0sf12qNH5qcw8qpgymFOqD', '364dI1bYnvamSnBJ8JcNzN', '4TnjEaWOeW0eKTKIEvJyCa', '0nbXyq5TXYPCO7pr3N8S4I', '5yY9lUy8nbvjM1Uyo1Uqoc', '2usxQITOSDqvkYiI0oIwao', '3yOlyBJuViE2YSGn3nVE1K', '3ZCTVFBt2Brf31RLEnCkWJ', '3Dv1eDb0MEgF93GpLXlucZ', '696DnlkuDOXcMAnKlTgXXK', '4wNIkl5XGiAACjFBlDWuSd', '7HMmFQsKsljwTw8bS7lu19', '421leiR6jKlH5KDdwLYrOs', '44fXOB2eBG8uaQJGwh26Bk', '0TrPqhAMoaKUFLR7iYDokf', '41L3O37CECZt3N7ziG2z7l', '1M4qEo4HE3PRaCOM7EXNJq', '3cqPu20DGTGUoZtbJH2Dmi', '7ce20yLkzuXXLUhzIDoZih', '2slqvGLwzZZYsT4K4Y1GBC', '5JSVa5i6lFPoyOzK4gj0Ox', '4yJiXq86uM56uIfIZgE440', '7FIWs0pqAYbP91WWM0vlTQ', '4VginDwYTP2eaHJzO0QMjG', '0JQ5MbyriK6ruD3t6RZ7ix', '5IUOU5xkzGHsRFOYNu3GSK', '7CHi4DtfK4heMlQaudCuHK', '6U0FIYXCQ3TGrk4tFpLrEA', '6wJYhPfqk3KGhHRG76WzOh', '1Cv1YLb4q0RzL6pybtaMLo', '5hff5RQeE84pznOt8WEWeO', '3o1CUVeHIid49sabk6A6Nf', '07KXEDMj78x68D884wgVEm', '4k3uABcX9iaGlt5pRJhumi', '24Yi9hE78yPEbZ4kxyoXAI', '6p8eEdiZLKJH8tcjGZuNTK', '2kJwzbxV2ppxnQoYw4GLBZ', '7hpJ9tATWb

In [7]:
# sanity check: the three lists should all have the same length (one entry per track)
print(len(track_ids))

print(len(track_pop))

len(track_name)

50
50


50

### Getting artist of the track and artist ID

This got tricky because the lengths of the artist and artist ID lists did not match the length of the playlist. The artist and artist ID lists are longer because one song can have multiple artists. For example, the song 'Intentions' is by Justin Bieber but features Quavo: one song, but two artists.

In [8]:
#full artist data: the 'artists' entry of the track at index 1 is a list with one dict per artist
playlist1['track'][1]['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Xyo4u8uXC1ZmMpatF05PJ'},
  'href': 'https://api.spotify.com/v1/artists/1Xyo4u8uXC1ZmMpatF05PJ',
  'id': '1Xyo4u8uXC1ZmMpatF05PJ',
  'name': 'The Weeknd',
  'type': 'artist',
  'uri': 'spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ'}]

In [9]:
#this code gets only the first (main) artist listed on a song
#track[1] is 'Blinding Lights' by The Weeknd, which has a single artist; on a song with
#several artists (e.g. 'Intentions' by Justin Bieber featuring Quavo) index [0] still
#returns only the first-listed artist, which is the one we want
playlist1['track'][1]['artists'][0]['name']

'The Weeknd'

We want to generalize this idea so we can pull only the first artist within our function.

In [10]:
#Getting artist and artist ID
# index 0 of a track's 'artists' list is the first artist listed, which is the main one
main_artists = [t['artists'][0] for t in playlist1['track']]
artist = [a['name'] for a in main_artists]
artist_ids = [a['id'] for a in main_artists]

print(artist)
print(artist_ids)

['Dua Lipa', 'The Weeknd', 'Justin Bieber', 'Trevor Daniel', 'Roddy Ricch', 'Future', 'Halsey', 'Camila Cabello', 'Billie Eilish', 'Doja Cat', 'Arizona Zervas', 'Jonas Brothers', 'Selena Gomez', 'Tones and I', 'Dua Lipa', 'Khalid', 'Justin Bieber', 'Harry Styles', 'Ali Gatie', 'Lewis Capaldi', 'Taylor Swift', '5 Seconds of Summer', 'blackbear', 'Eminem', 'Post Malone', 'Tones and I', 'Niall Horan', 'MEDUZA', 'BROCKHAMPTON', 'Lil Mosey', 'Surfaces', 'Anne-Marie', 'Kygo', 'Roddy Ricch', 'Tate McRae', 'SAINt JHN', 'Joji', 'JP Saxe', 'Conan Gray', 'Alan Walker', 'KAROL G', 'Mac Miller', 'Selena Gomez', 'YNW Melly', 'Ant Saunders', 'Demi Lovato', 'Noah Cyrus', 'Stormzy', 'Lil Baby', 'The Black Eyed Peas']
['6M2wZ9GZgrQXHCFfjv46we', '1Xyo4u8uXC1ZmMpatF05PJ', '1uNFoZAHBGtllmzznpCI3s', '7uaIm6Pw7xplS8Dy06V6pT', '757aE44tKEUQEqRuT6GnEB', '1RyvyyTE3xzB2ZywiAwp0i', '26VFTg2z8YR0cCuwLzESi2', '4nDoRrQiYLoBzwC5BhVJzF', '6qqNVTkY8uBg9cP3Jd7DAH', '5cj0lLjcoR7YOSnhnX0Po5', '0vRvGUQVUjytro0xpb26bs', '7g

We need to make sure the lengths of everything are the same

In [11]:
# both lists should have exactly one entry per track
print(len(artist))
len(artist_ids)

50


50

### Audio Features

Getting audio features for every song on a playlist

In [12]:
# Audio features for every track on the playlist, trimmed to the columns of interest.
# Eventually this frame gets concatenated with the one holding the artists and song names.
feature_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'type', 'id', 'duration_ms', 'time_signature',
]
audio = pd.DataFrame(sp.audio_features(tracks=track_ids))[feature_columns]
audio

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,duration_ms,time_signature
0,0.794,0.793,11,-4.521,0,0.0842,0.0125,0.0,0.0952,0.677,123.941,audio_features,6WrI0LAC5M1Rw2MnX2ZvEg,183290,4
1,0.513,0.796,1,-4.075,1,0.0629,0.00147,0.000209,0.0938,0.345,171.017,audio_features,0sf12qNH5qcw8qpgymFOqD,201573,4
2,0.811,0.553,9,-6.644,1,0.0552,0.317,0.0,0.105,0.86,148.014,audio_features,364dI1bYnvamSnBJ8JcNzN,212869,4
3,0.784,0.43,10,-8.756,0,0.0364,0.123,0.0,0.0887,0.236,127.087,audio_features,4TnjEaWOeW0eKTKIEvJyCa,159382,4
4,0.896,0.586,10,-6.687,0,0.0559,0.104,0.0,0.79,0.642,116.971,audio_features,0nbXyq5TXYPCO7pr3N8S4I,196653,4
5,0.676,0.609,2,-5.831,0,0.481,0.0706,0.0,0.152,0.508,142.037,audio_features,5yY9lUy8nbvjM1Uyo1Uqoc,237735,4
6,0.591,0.585,2,-6.35,1,0.0277,0.143,0.0,0.109,0.324,110.94,audio_features,2usxQITOSDqvkYiI0oIwao,205473,4
7,0.724,0.491,8,-6.024,1,0.0296,0.018,1.3e-05,0.0887,0.383,105.046,audio_features,3yOlyBJuViE2YSGn3nVE1K,170746,4
8,0.704,0.225,6,-14.454,0,0.0994,0.902,0.657,0.106,0.243,120.006,audio_features,3ZCTVFBt2Brf31RLEnCkWJ,245426,4
9,0.787,0.673,11,-4.577,0,0.158,0.256,4e-06,0.0904,0.786,110.962,audio_features,3Dv1eDb0MEgF93GpLXlucZ,237893,4


### Featured Artists Column

Making a column that has a value of 1 if there are featured artists on the track and a 0 if not

In [13]:
# Flag each track: 1 when more than one artist is credited on it, otherwise 0
featured_artist = [1 if len(t['artists']) > 1 else 0 for t in playlist1['track']]

len(featured_artist)

50

## General Functions

In [14]:
#function that'll take in a playlist ID and return songs and artist info
def playlist_id(num):
    """Return one row per track of a Spotify playlist with track and main-artist info.

    Parameters
    ----------
    num : str
        Spotify playlist ID, passed to ``sp.playlist_tracks``.

    Returns
    -------
    pandas.DataFrame
        Columns ``track_ids``, ``track_pop``, ``track_name``, ``artist``,
        ``artist_ids`` and ``featured_artist``, in playlist order.
        ``artist`` / ``artist_ids`` describe the first artist listed on the track;
        ``featured_artist`` is 1 when the track lists more than one artist, else 0.
        Column dtypes are inferred from the values (e.g. integer popularity)
        rather than all being ``object``.
    """
    # playlist_tracks returns a single page of results; follow the paging links so
    # playlists longer than one page are not silently truncated.
    page = sp.playlist_tracks(num)
    items = list(page['items'])
    while page.get('next'):
        page = sp.next(page)
        items.extend(page['items'])

    rows = []
    for item in items:
        track = item.get('track')
        if track is None:
            # a playlist entry can carry no track object; there is nothing to record for it
            continue
        main_artist = track['artists'][0]  # first artist listed on the track is the main one
        rows.append({
            'track_ids': track['id'],
            'track_pop': track['popularity'],
            'track_name': track['name'],
            'artist': main_artist['name'],
            'artist_ids': main_artist['id'],
            'featured_artist': 1 if len(track['artists']) > 1 else 0,
        })

    columns = ['track_ids', 'track_pop', 'track_name', 'artist', 'artist_ids', 'featured_artist']
    # passing `columns` keeps the expected layout even when the playlist is empty
    return pd.DataFrame(rows, columns=columns)

In [15]:
#Function that will take in a list of artist objects (dicts with 'popularity' and 'genres',
#e.g. the 'artists' list returned by sp.artists) and return their genres and pop scores
def getting_genres(col):
    """Build a frame with each artist's popularity score and first-listed genre.

    Parameters
    ----------
    col : iterable of dict
        Artist objects; each must have a ``'popularity'`` entry and may have a
        ``'genres'`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist_pop`` and ``genre``, one row per artist in input order.
        ``genre`` is the first entry of the artist's genre list, or the string
        ``'NA'`` when the list is empty or missing. ``artist_pop`` keeps its
        numeric dtype.
    """
    rows = []
    for artist_info in col:
        genres = artist_info.get('genres') or []  # treat a missing / None genre list as empty
        rows.append({
            'artist_pop': artist_info['popularity'],
            'genre': genres[0] if genres else 'NA',
        })
    # building from records (not a transposed list of lists) keeps per-column dtypes,
    # and `columns` keeps the layout for empty input
    return pd.DataFrame(rows, columns=['artist_pop', 'genre'])

In [16]:
#function that will take in a list of track ID's and return the audio features
def audio(col, batch_size=100):
    """Fetch Spotify audio features for a collection of track IDs.

    Parameters
    ----------
    col : iterable of str
        Track IDs (e.g. the ``track_ids`` column of a playlist frame).
    batch_size : int, default 100
        Number of IDs sent per ``sp.audio_features`` request; the endpoint
        accepts at most 100 IDs per call.

    Returns
    -------
    pandas.DataFrame
        One row per input ID, in input order, with the selected audio-feature
        columns. A track for which no features come back yields a row of NaN.
        Empty input returns an empty frame with the same columns and makes no
        request.
    """
    feature_columns = [
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'type', 'id', 'duration_ms', 'time_signature',
    ]
    track_ids = list(col)

    rows = []
    for start in range(0, len(track_ids), batch_size):
        batch = sp.audio_features(tracks=track_ids[start:start + batch_size])
        # a None entry (track without audio features) becomes an empty record so
        # the row positions still line up with `col`
        rows.extend(features if features is not None else {} for features in batch)

    # reindex both selects the wanted columns and creates them when `rows` is empty
    return pd.DataFrame(rows).reindex(columns=feature_columns)

### Example of creating a full playlist dataframe with Today's Top Hits Playlist - 50 songs

In [17]:
# build the track / main-artist table for the Today's Top Hits playlist
todays_top_hits = playlist_id('37i9dQZF1DXcBWIGoYBM5M')

In [18]:
# look up the full artist objects for the first 50 artist IDs in a single request
first = sp.artists(list(todays_top_hits['artist_ids'])[0:50])['artists']

In [19]:
# popularity score and first-listed genre for each artist object fetched above
todays_top_hits_genres = getting_genres(first)

In [20]:
# put the artist popularity / genre columns next to the track columns (rows are matched on the index)
# NOTE: todays_top_hits is overwritten, so re-running this cell adds the genre columns a second time
todays_top_hits = pd.concat([todays_top_hits, todays_top_hits_genres], axis=1, sort=False)
todays_top_hits.head()

Unnamed: 0,track_ids,track_pop,track_name,artist,artist_ids,featured_artist,artist_pop,genre
0,6WrI0LAC5M1Rw2MnX2ZvEg,97,Don't Start Now,Dua Lipa,6M2wZ9GZgrQXHCFfjv46we,0,94,dance pop
1,0sf12qNH5qcw8qpgymFOqD,98,Blinding Lights,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,0,96,canadian contemporary r&b
2,364dI1bYnvamSnBJ8JcNzN,86,Intentions,Justin Bieber,1uNFoZAHBGtllmzznpCI3s,1,97,canadian pop
3,4TnjEaWOeW0eKTKIEvJyCa,97,Falling,Trevor Daniel,7uaIm6Pw7xplS8Dy06V6pT,0,88,alternative r&b
4,0nbXyq5TXYPCO7pr3N8S4I,100,The Box,Roddy Ricch,757aE44tKEUQEqRuT6GnEB,0,96,melodic rap


In [21]:
# preview the audio features for the playlist's tracks (the result is displayed, not stored)
audio(todays_top_hits['track_ids'])

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,duration_ms,time_signature
0,0.794,0.793,11,-4.521,0,0.0842,0.0125,0.0,0.0952,0.677,123.941,audio_features,6WrI0LAC5M1Rw2MnX2ZvEg,183290,4
1,0.513,0.796,1,-4.075,1,0.0629,0.00147,0.000209,0.0938,0.345,171.017,audio_features,0sf12qNH5qcw8qpgymFOqD,201573,4
2,0.811,0.553,9,-6.644,1,0.0552,0.317,0.0,0.105,0.86,148.014,audio_features,364dI1bYnvamSnBJ8JcNzN,212869,4
3,0.784,0.43,10,-8.756,0,0.0364,0.123,0.0,0.0887,0.236,127.087,audio_features,4TnjEaWOeW0eKTKIEvJyCa,159382,4
4,0.896,0.586,10,-6.687,0,0.0559,0.104,0.0,0.79,0.642,116.971,audio_features,0nbXyq5TXYPCO7pr3N8S4I,196653,4
5,0.676,0.609,2,-5.831,0,0.481,0.0706,0.0,0.152,0.508,142.037,audio_features,5yY9lUy8nbvjM1Uyo1Uqoc,237735,4
6,0.591,0.585,2,-6.35,1,0.0277,0.143,0.0,0.109,0.324,110.94,audio_features,2usxQITOSDqvkYiI0oIwao,205473,4
7,0.724,0.491,8,-6.024,1,0.0296,0.018,1.3e-05,0.0887,0.383,105.046,audio_features,3yOlyBJuViE2YSGn3nVE1K,170746,4
8,0.704,0.225,6,-14.454,0,0.0994,0.902,0.657,0.106,0.243,120.006,audio_features,3ZCTVFBt2Brf31RLEnCkWJ,245426,4
9,0.787,0.673,11,-4.577,0,0.158,0.256,4e-06,0.0904,0.786,110.962,audio_features,3Dv1eDb0MEgF93GpLXlucZ,237893,4


In [22]:
# add the audio-feature columns to the combined frame; audio() is called again here,
# which issues a new request instead of reusing an earlier result
# NOTE: todays_top_hits is overwritten, so re-running this cell adds the audio columns a second time
todays_top_hits = pd.concat([todays_top_hits, audio(todays_top_hits['track_ids'])], axis =1, sort = False )
todays_top_hits

Unnamed: 0,track_ids,track_pop,track_name,artist,artist_ids,featured_artist,artist_pop,genre,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,duration_ms,time_signature
0,6WrI0LAC5M1Rw2MnX2ZvEg,97,Don't Start Now,Dua Lipa,6M2wZ9GZgrQXHCFfjv46we,0,94,dance pop,0.794,0.793,...,0.0842,0.0125,0.0,0.0952,0.677,123.941,audio_features,6WrI0LAC5M1Rw2MnX2ZvEg,183290,4
1,0sf12qNH5qcw8qpgymFOqD,98,Blinding Lights,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,0,96,canadian contemporary r&b,0.513,0.796,...,0.0629,0.00147,0.000209,0.0938,0.345,171.017,audio_features,0sf12qNH5qcw8qpgymFOqD,201573,4
2,364dI1bYnvamSnBJ8JcNzN,86,Intentions,Justin Bieber,1uNFoZAHBGtllmzznpCI3s,1,97,canadian pop,0.811,0.553,...,0.0552,0.317,0.0,0.105,0.86,148.014,audio_features,364dI1bYnvamSnBJ8JcNzN,212869,4
3,4TnjEaWOeW0eKTKIEvJyCa,97,Falling,Trevor Daniel,7uaIm6Pw7xplS8Dy06V6pT,0,88,alternative r&b,0.784,0.43,...,0.0364,0.123,0.0,0.0887,0.236,127.087,audio_features,4TnjEaWOeW0eKTKIEvJyCa,159382,4
4,0nbXyq5TXYPCO7pr3N8S4I,100,The Box,Roddy Ricch,757aE44tKEUQEqRuT6GnEB,0,96,melodic rap,0.896,0.586,...,0.0559,0.104,0.0,0.79,0.642,116.971,audio_features,0nbXyq5TXYPCO7pr3N8S4I,196653,4
5,5yY9lUy8nbvjM1Uyo1Uqoc,96,Life Is Good (feat. Drake),Future,1RyvyyTE3xzB2ZywiAwp0i,1,94,atl hip hop,0.676,0.609,...,0.481,0.0706,0.0,0.152,0.508,142.037,audio_features,5yY9lUy8nbvjM1Uyo1Uqoc,237735,4
6,2usxQITOSDqvkYiI0oIwao,85,You should be sad,Halsey,26VFTg2z8YR0cCuwLzESi2,0,95,dance pop,0.591,0.585,...,0.0277,0.143,0.0,0.109,0.324,110.94,audio_features,2usxQITOSDqvkYiI0oIwao,205473,4
7,3yOlyBJuViE2YSGn3nVE1K,93,My Oh My (feat. DaBaby),Camila Cabello,4nDoRrQiYLoBzwC5BhVJzF,1,95,dance pop,0.724,0.491,...,0.0296,0.018,1.3e-05,0.0887,0.383,105.046,audio_features,3yOlyBJuViE2YSGn3nVE1K,170746,4
8,3ZCTVFBt2Brf31RLEnCkWJ,96,everything i wanted,Billie Eilish,6qqNVTkY8uBg9cP3Jd7DAH,0,99,electropop,0.704,0.225,...,0.0994,0.902,0.657,0.106,0.243,120.006,audio_features,3ZCTVFBt2Brf31RLEnCkWJ,245426,4
9,3Dv1eDb0MEgF93GpLXlucZ,91,Say So,Doja Cat,5cj0lLjcoR7YOSnhnX0Po5,0,88,la indie,0.787,0.673,...,0.158,0.256,4e-06,0.0904,0.786,110.962,audio_features,3Dv1eDb0MEgF93GpLXlucZ,237893,4


In [23]:
# save the combined frame; the ./Data directory must already exist, and the row index
# is written as the first (unnamed) column of the CSV
todays_top_hits.to_csv('./Data/todays_top_hits.csv')

### Another example making sure it works with >50 songs: Mega Hit Mix 75 songs

In [None]:
# build the track / main-artist table for the Mega Hit Mix playlist
mega_hit_mix = playlist_id('37i9dQZF1DXbYM3nMM0oPk')

In [None]:
# the artist lookup is done in two batches of up to 50 IDs each:
# positions 0-49 first, then positions 50-99
mega_artist_ids = list(mega_hit_mix['artist_ids'])

first2 = sp.artists(mega_artist_ids[0:50])['artists']
last2 = sp.artists(mega_artist_ids[50:100])['artists']

In [None]:
# Stack the genre frames of the two artist batches. pd.concat replaces
# DataFrame.append (removed in pandas 2.0); ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each batch's own 0-based labels.
mega_hit_mix_genres = pd.concat(
    [getting_genres(first2), getting_genres(last2)], ignore_index=True
)

In [None]:
# display the combined artist popularity / genre frame for the Mega Hit Mix artists
mega_hit_mix_genres

In [None]:
#note: neither 'concat' or 'join' was working so we had to hardcode adding the genre dataframe
# .values assigns by position, so the index labels of mega_hit_mix_genres are ignored;
# this relies on both frames having the same number of rows in the same order
mega_hit_mix['artist_pop'] = mega_hit_mix_genres['artist_pop'].values
mega_hit_mix['genre'] = mega_hit_mix_genres['genre'].values

In [None]:
# preview the audio features for the Mega Hit Mix tracks (the result is displayed, not stored)
audio(mega_hit_mix['track_ids'])

In [None]:
# join the audio features onto the playlist frame by row index; how='right' keeps the rows of the audio frame
# NOTE: mega_hit_mix is overwritten, so this cell is meant to be run once on a freshly built mega_hit_mix
mega_hit_mix = mega_hit_mix.join(audio(mega_hit_mix['track_ids']), how = 'right')
mega_hit_mix.head()

In [None]:
# save the combined frame; the ./Data directory must already exist, and the row index
# is written as the first (unnamed) column of the CSV
mega_hit_mix.to_csv('./Data/mega_hit_mix.csv')