In [1]:
! pip install pycurl
! pip install certifi
print("done")

done


In [2]:
# based on pycURL tutorial on https://brightdata.com/blog/how-tos/curl-with-python
# .. Spotify's WebAPI tutorial on https://developer.spotify.com/documentation/web-api/tutorials/getting-started
# .. and this Stack Overflow thread https://stackoverflow.com/questions/31826814/curl-post-request-into-pycurl-code

# Preparation
import pycurl
import certifi
from io import BytesIO
from json import loads

# Set buffer and Curl object.
buffer = BytesIO()
c = pycurl.Curl()

# Set request options.
## Set the request destination.
c.setopt(c.URL, "https://accounts.spotify.com/api/token")

## Set content type
c.setopt(pycurl.HTTPHEADER, ["Content-Type: application/x-www-form-urlencoded"])

## Set the request's body.
c.setopt(pycurl.POST, 1)
postfields = "grant_type=client_credentials&client_id=f77088cfc5c9478bb9a7e849cee4a57b&client_secret=36cba06f1080414bb01a763c7882b710"
c.setopt(c.POSTFIELDS, postfields)

## Set the buffer as the destination of the request's response.
c.setopt(c.WRITEDATA, buffer)

## Refer to the installed certificate authority bundle for validating the SSL certificate.
c.setopt(c.CAINFO, certifi.where())

# Execute and close the request.
c.perform()
c.close()

# Print the buffer's content with a Latin1 (iso-8859-1) encoding.
body = buffer.getvalue()
print(body.decode('iso-8859-1'))


access_string = body.decode('iso-8859-1')
# using json.loads() method
my_access = loads(access_string)
my_token_type = my_access["token_type"]
my_access_tok = my_access["access_token"]
access_str = "Authorization: "+my_token_type+"  "+my_access_tok


{"access_token":"BQDiipQk9-ovtUtWpz54pRwE8PkSGZSv-wR31_VAhhj2zJVqgBP8Dqt86qWCBaNG5mTWwP-ZvAi-uGBLSzQds49ELXj_IX3xqRvonJJ9vUxVMITEoZc5","token_type":"Bearer","expires_in":3600}


In [3]:
# TEST CALL USING PYCURL WITH HARRY STYLES AS EXAMPLE
# https://open.spotify.com/artist/6KImCVD70vtIoJWnq6nGn3?si=GQIpyQMRRx-tyfYzXFlcgA

# Set buffer and Curl object.
buffer = BytesIO()
c = pycurl.Curl()

# Set request options.
## Set the request destination.
artist_id = '6KImCVD70vtIoJWnq6nGn3?si=GQIpyQMRRx-tyfYzXFlcgA'
artist_url = "https://api.spotify.com/v1/artists/"+artist_id
c.setopt(c.URL, artist_url)

## State authorisation code
c.setopt(pycurl.HTTPHEADER, [access_str])

## Set the buffer as the destination of the request's response.
c.setopt(c.WRITEDATA, buffer)

## Refer to the installed certificate authority bundle for validating the SSL certificate.
c.setopt(c.CAINFO, certifi.where())

# Execute and close the request.
c.perform()
c.close()

# Print the buffer's content with a Latin1 (iso-8859-1) encoding.
body = buffer.getvalue()
data = body.decode('iso-8859-1')
print(data)

{
  "external_urls" : {
    "spotify" : "https://open.spotify.com/artist/6KImCVD70vtIoJWnq6nGn3"
  },
  "followers" : {
    "href" : null,
    "total" : 27053226
  },
  "genres" : [ "pop" ],
  "href" : "https://api.spotify.com/v1/artists/6KImCVD70vtIoJWnq6nGn3",
  "id" : "6KImCVD70vtIoJWnq6nGn3",
  "images" : [ {
    "height" : 640,
    "url" : "https://i.scdn.co/image/ab6761610000e5ebf7db7c8ede90a019c54590bb",
    "width" : 640
  }, {
    "height" : 320,
    "url" : "https://i.scdn.co/image/ab67616100005174f7db7c8ede90a019c54590bb",
    "width" : 320
  }, {
    "height" : 160,
    "url" : "https://i.scdn.co/image/ab6761610000f178f7db7c8ede90a019c54590bb",
    "width" : 160
  } ],
  "name" : "Harry Styles",
  "popularity" : 89,
  "type" : "artist",
  "uri" : "spotify:artist:6KImCVD70vtIoJWnq6nGn3"
}


In [4]:
buffer = BytesIO()
c = pycurl.Curl()

# my_query = "search?q="
genre_endpoint = "recommendations/available-genre-seeds"
url_prefix = "https://api.spotify.com/v1/" 
endpoint_url = url_prefix+genre_endpoint
c.setopt(c.URL, endpoint_url)

c.setopt(pycurl.HTTPHEADER, [access_str])
c.setopt(c.WRITEDATA, buffer)
c.setopt(c.CAINFO, certifi.where())
c.perform()
c.close()

body = buffer.getvalue()
data_string = body.decode('iso-8859-1')
my_genres = loads(data_string)
print(my_genres)

{'genres': ['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime', 'black-metal', 'bluegrass', 'blues', 'bossanova', 'brazil', 'breakbeat', 'british', 'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club', 'comedy', 'country', 'dance', 'dancehall', 'death-metal', 'deep-house', 'detroit-techno', 'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'holidays', 'honky-tonk', 'house', 'idm', 'indian', 'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino', 'malay', 'mandopop', 'metal', 'metal-misc', 'metalcore', 'minimal-techno', 'movies', 'mpb', 'new-age', 'new-release', 'opera', 'pagode', 'party', 'philippines-opm', 'piano', 'pop', 'pop-film', 'post-dubstep'

In [5]:
def spotify_call(some_url):
    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, some_url)
    c.setopt(pycurl.HTTPHEADER, [access_str])
    c.setopt(c.WRITEDATA, buffer)
    c.setopt(c.CAINFO, certifi.where())
    c.perform()
    c.close()
    body = buffer.getvalue()
    data_string = body.decode('iso-8859-1')
    data_dict = loads(data_string)
    return data_dict

In [6]:
# top hit playlists for the years 2018, 2019, 2020, and 2022 respectively 
# https://open.spotify.com/playlist/37i9dQZF1DXe2bobNYDtW8?si=a0746f3c724c4680
# https://open.spotify.com/playlist/37i9dQZF1DWVRSukIED0e9?si=92153ca85f32434a
# https://open.spotify.com/playlist/37i9dQZF1DX7Jl5KP2eZaS?si=3ba10c81634b4018
# https://open.spotify.com/playlist/37i9dQZF1DX18jTM2l2fJY?si=c53aa4358b0d44b8

# https://open.spotify.com/playlist/37i9dQZF1DXcBWIGoYBM5M

top_tracks_data = {2018:[],2019:[],2020:[],2022:[]}
playlist_ids = {
    2018:"37i9dQZF1DXe2bobNYDtW8?si=a0746f3c724c4680",
    2019:"37i9dQZF1DWVRSukIED0e9?si=92153ca85f32434a",
    2020:"37i9dQZF1DX7Jl5KP2eZaS?si=3ba10c81634b4018",
    2022:"37i9dQZF1DX18jTM2l2fJY?si=c53aa4358b0d44b8"
}
url_prefix = "https://api.spotify.com/v1/playlists/"
my_query = "/tracks" # ?fields=items(track(name,href,artist(name,genres,popularity)))"

#requesting top 50 items from playlist
curr_offset = 0
lim = 50

for year in playlist_ids:
    my_offset_n_limit = f'&offset={curr_offset}&limit={lim}'
    endpoint_url = url_prefix + playlist_ids[year] + my_query + my_offset_n_limit
    tracks_data = spotify_call(endpoint_url) 
    print(len(tracks_data['tracks']['items']))
    top_tracks_data[year].append(tracks_data) 

    total_tracks = top_tracks_data[year][0]['tracks']['total']
        
    print(f"{year}'s top {total_tracks} tracks according to spotify scraped")
print("done")

# print(f'{size} tracks in total scraped')

100
2018's top 100 tracks according to spotify scraped
100
2019's top 100 tracks according to spotify scraped
50
2020's top 50 tracks according to spotify scraped
50
2022's top 50 tracks according to spotify scraped
done


In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [8]:
print(top_tracks_data[2018][0].keys())
print()
print(top_tracks_data[2018][0]['tracks']['items'][0].keys())

dict_keys(['collaborative', 'description', 'external_urls', 'followers', 'href', 'id', 'images', 'name', 'owner', 'primary_color', 'public', 'snapshot_id', 'tracks', 'type', 'uri'])

dict_keys(['added_at', 'added_by', 'is_local', 'primary_color', 'track', 'video_thumbnail'])


In [9]:
# want to build appropiate dataset now
# DESIRABLE ATTRIBUTES:
# index number  (0,1,2,3,4...)
# song id   (e.g. top_tracks_data[2018][0]['tracks']['items'][index]['track']['id'])
# song name  (e.g. top_tracks_data[2018][0]['tracks']['items'][index]['track']['name'])
# date added  (e.g. top_tracks_data[2018][0]['tracks']['items'][index]['added_at'])
# artist names  (e.g. top_tracks_data[2018][0]['tracks']['items'][index]['track']['artists'][0]['name'])
# artist ids  (e.g. top_tracks_data[2018][0]['tracks']['items'][index]['track']['artists'][0]['id'])
# artist popularity  (query needed)
# artist genres  (query needed)
# album popularity  (query needed)
# album id  (e.g. top_tracks_data[2018][0]['tracks']['items'][index]['track']['album']['id'])
# explicit  (e.g. top_tracks_data[2018][0]['tracks']['items'][index]['track']['explicit'])


my_columns=['index', 'year', 'song_id', 'song_title', 'date_added', 'artists', 'artist_ids', 'artist_popularity', 
            'mean_artist_popularity', 'artist_genres', 'album_popularity', 'album_id', 'is_explicit']
width = len(my_columns)
# my_matrix = np.empty([size, width])
my_matrix = []
# years = [2018,2019,2020,2022]

index = 0

for year in top_tracks_data: 
    pages = len(top_tracks_data[year])
    for page_num in range(pages):
        
        curr_list = top_tracks_data[year][page_num]['tracks']['items']
        for song in curr_list:
            date_added = song['added_at']
            track  = song['track']
            song_id = track['id']
            song_title = track['name']
            is_explicit = track['explicit']
            artists = track['artists']
            artist_names = []
            artist_ids = []
            for artist in artists:
                artist_names.append(artist['name'])
                artist_ids.append(artist['id'])
            album_id = track['album']['id']
            
            curr_row = [index, year, song_id, song_title, date_added, artist_names, 
                        artist_ids, [], 0.0, [], 0, album_id, is_explicit]
            my_matrix.append(curr_row)
            index+=1


df = pd.DataFrame(my_matrix, columns=my_columns)
df.head()


Unnamed: 0,index,year,song_id,song_title,date_added,artists,artist_ids,artist_popularity,mean_artist_popularity,artist_genres,album_popularity,album_id,is_explicit
0,0,2018,6DCZcSspjsKoFjzjrWoCdn,God's Plan,2020-06-22T07:16:14Z,[Drake],[3TVXtAsR1Inumwj472S9r4],[],0.0,[],0,1ATL5GLyefJaxhQzSPVrLX,True
1,1,2018,0e7ipj03S05BNilyu5bRzt,rockstar (feat. 21 Savage),2020-06-22T07:16:14Z,"[Post Malone, 21 Savage]","[246dkjvS1zLTtiykXe5h60, 1URnnhqYAYcrqrcwql10ft]",[],0.0,[],0,6trNtQUgC8cgbWcqoMYkOR,True
2,2,2018,0u2P5u6lvoDfwTYjAADbn4,lovely (with Khalid),2020-06-22T07:16:14Z,"[Billie Eilish, Khalid]","[6qqNVTkY8uBg9cP3Jd7DAH, 6LuN9FCkKOj5PcnpouEgny]",[],0.0,[],0,2sBB17RXTamvj7Ncps15AK,False
3,3,2018,09mEdoA6zrmBPgTEN5qXmN,Call Out My Name,2020-06-22T07:16:14Z,[The Weeknd],[1Xyo4u8uXC1ZmMpatF05PJ],[],0.0,[],0,4qZBW3f2Q8y0k1A84d4iAO,False
4,4,2018,3GCdLUSnKSMJhs4Tj6CV3s,All The Stars (with SZA),2020-06-22T07:16:14Z,"[Kendrick Lamar, SZA]","[2YZyLoL8N0Wb9xBt1NhZWg, 7tYKF4w9nC0nq9CsPZTHyP]",[],0.0,[],0,3pLdWdkj83EYfDN6H2N8MR,True


In [10]:
#now want to send queries to locate artists' popularity, artists' genres and album's genre

size = df.shape[0]
popularities = []
artist_genres = []
album_pops = []
next_percentile = 20

for ind in range(size):
    
    percentage_done = round((ind/size)*100)
    if ((percentage_done>next_percentile) and (not (next_percentile==100))):
        print(f"completed more than {next_percentile}% of queries...")
        next_percentile+=20
        
    curr_row = df.iloc[ind]
    popularity = []
    genres = []
    first_artist = True
    for artist in curr_row['artist_ids']:
        artist_url = "https://api.spotify.com/v1/artists/"+str(artist)
        artists_data = spotify_call(artist_url)
        popularity.append(artists_data['popularity'])
        if first_artist:
            genres = artists_data['genres']
            first_artist = False
        else:
            for genre in artists_data['genres']:
                if (not (genre in genres)):
                    genres.append(genre)
                    
    album_url = "https://api.spotify.com/v1/albums/"+str(curr_row['album_id'])
    album_data = spotify_call(album_url)
    album_popularity = album_data['popularity']
    
    popularities.append(popularity)
    artist_genres.append(genres)
    album_pops.append(album_popularity)

print("finished with queries regarding artists' and albums' associated genres along with artist popularity")
    
df['artist_popularity'] = popularities
df['artist_genres'] = artist_genres
df['album_popularity'] = album_pops

df.head()
    

completed more than 20% of queries...
completed more than 40% of queries...
completed more than 60% of queries...
completed more than 80% of queries...
finished with queries regarding artists' and albums' associated genres along with artist popularity


Unnamed: 0,index,year,song_id,song_title,date_added,artists,artist_ids,artist_popularity,mean_artist_popularity,artist_genres,album_popularity,album_id,is_explicit
0,0,2018,6DCZcSspjsKoFjzjrWoCdn,God's Plan,2020-06-22T07:16:14Z,[Drake],[3TVXtAsR1Inumwj472S9r4],[97],0.0,"[canadian hip hop, canadian pop, hip hop, pop,...",87,1ATL5GLyefJaxhQzSPVrLX,True
1,1,2018,0e7ipj03S05BNilyu5bRzt,rockstar (feat. 21 Savage),2020-06-22T07:16:14Z,"[Post Malone, 21 Savage]","[246dkjvS1zLTtiykXe5h60, 1URnnhqYAYcrqrcwql10ft]","[89, 92]",0.0,"[dfw rap, melodic rap, pop, rap, atl hip hop, ...",87,6trNtQUgC8cgbWcqoMYkOR,True
2,2,2018,0u2P5u6lvoDfwTYjAADbn4,lovely (with Khalid),2020-06-22T07:16:14Z,"[Billie Eilish, Khalid]","[6qqNVTkY8uBg9cP3Jd7DAH, 6LuN9FCkKOj5PcnpouEgny]","[89, 86]",0.0,"[art pop, electropop, pop, pop r&b, rap]",82,2sBB17RXTamvj7Ncps15AK,False
3,3,2018,09mEdoA6zrmBPgTEN5qXmN,Call Out My Name,2020-06-22T07:16:14Z,[The Weeknd],[1Xyo4u8uXC1ZmMpatF05PJ],[97],0.0,"[canadian contemporary r&b, canadian pop, pop]",88,4qZBW3f2Q8y0k1A84d4iAO,False
4,4,2018,3GCdLUSnKSMJhs4Tj6CV3s,All The Stars (with SZA),2020-06-22T07:16:14Z,"[Kendrick Lamar, SZA]","[2YZyLoL8N0Wb9xBt1NhZWg, 7tYKF4w9nC0nq9CsPZTHyP]","[90, 92]",0.0,"[conscious hip hop, hip hop, rap, west coast r...",81,3pLdWdkj83EYfDN6H2N8MR,True


In [11]:
pop_means = []
for ind in range(size):
    curr_pop = df.iloc[ind]['artist_popularity']
    my_mean = round(sum(curr_pop)/len(curr_pop), 1)
    pop_means.append(my_mean)

df['mean_artist_popularity'] = pop_means
df.head()

Unnamed: 0,index,year,song_id,song_title,date_added,artists,artist_ids,artist_popularity,mean_artist_popularity,artist_genres,album_popularity,album_id,is_explicit
0,0,2018,6DCZcSspjsKoFjzjrWoCdn,God's Plan,2020-06-22T07:16:14Z,[Drake],[3TVXtAsR1Inumwj472S9r4],[97],97.0,"[canadian hip hop, canadian pop, hip hop, pop,...",87,1ATL5GLyefJaxhQzSPVrLX,True
1,1,2018,0e7ipj03S05BNilyu5bRzt,rockstar (feat. 21 Savage),2020-06-22T07:16:14Z,"[Post Malone, 21 Savage]","[246dkjvS1zLTtiykXe5h60, 1URnnhqYAYcrqrcwql10ft]","[89, 92]",90.5,"[dfw rap, melodic rap, pop, rap, atl hip hop, ...",87,6trNtQUgC8cgbWcqoMYkOR,True
2,2,2018,0u2P5u6lvoDfwTYjAADbn4,lovely (with Khalid),2020-06-22T07:16:14Z,"[Billie Eilish, Khalid]","[6qqNVTkY8uBg9cP3Jd7DAH, 6LuN9FCkKOj5PcnpouEgny]","[89, 86]",87.5,"[art pop, electropop, pop, pop r&b, rap]",82,2sBB17RXTamvj7Ncps15AK,False
3,3,2018,09mEdoA6zrmBPgTEN5qXmN,Call Out My Name,2020-06-22T07:16:14Z,[The Weeknd],[1Xyo4u8uXC1ZmMpatF05PJ],[97],97.0,"[canadian contemporary r&b, canadian pop, pop]",88,4qZBW3f2Q8y0k1A84d4iAO,False
4,4,2018,3GCdLUSnKSMJhs4Tj6CV3s,All The Stars (with SZA),2020-06-22T07:16:14Z,"[Kendrick Lamar, SZA]","[2YZyLoL8N0Wb9xBt1NhZWg, 7tYKF4w9nC0nq9CsPZTHyP]","[90, 92]",91.0,"[conscious hip hop, hip hop, rap, west coast r...",81,3pLdWdkj83EYfDN6H2N8MR,True


In [12]:
# descriptive statistics
# mean, stnd dev, min and max of mean_artist_popularity

print("descriptive statistics regarding the artists' current popularity")
df.describe()['mean_artist_popularity']

descriptive statistics regarding the artists' current popularity


count    300.000000
mean      82.350667
std        7.967868
min       59.000000
25%       77.500000
50%       83.000000
75%       88.350000
max      100.000000
Name: mean_artist_popularity, dtype: float64

In [14]:
# descriptive statistics
# frequency/distribution of genres
# most 'popular'/frequent genre
# least 'popular'/frequent genre
# most and least popular artist

top_genres = {}
for ind in range(size):
    curr_genres = df.iloc[ind]['artist_genres']
    for genre in curr_genres:
        top_genres[genre] = 0
        
for ind in range(size):
    curr_genres = df.iloc[ind]['artist_genres']
    for genre in curr_genres:
        top_genres[genre] +=1
        
top_genres

{'canadian hip hop': 15,
 'canadian pop': 30,
 'hip hop': 43,
 'pop': 188,
 'rap': 89,
 'toronto rap': 14,
 'dfw rap': 11,
 'melodic rap': 30,
 'atl hip hop': 17,
 'trap': 30,
 'art pop': 13,
 'electropop': 15,
 'pop r&b': 9,
 'canadian contemporary r&b': 7,
 'conscious hip hop': 6,
 'west coast rap': 4,
 'r&b': 9,
 'chicago rap': 5,
 'complextro': 1,
 'dance pop': 77,
 'edm': 24,
 'german techno': 1,
 'pop dance': 23,
 'tropical house': 14,
 'contemporary country': 10,
 'texas country': 1,
 'brostep': 9,
 'progressive electro house': 8,
 'hel': 2,
 'reggaeton': 38,
 'trap latino': 38,
 'urbano latino': 40,
 'reggaeton colombiano': 13,
 'alt z': 9,
 'bedroom pop': 3,
 'indie pop': 3,
 'norwegian indie': 1,
 'electro house': 4,
 'house': 5,
 'progressive house': 2,
 'uk dance': 12,
 'uk pop': 26,
 'slap house': 12,
 'metropopolis': 2,
 'modern rock': 9,
 'neo mellow': 1,
 'emo rap': 6,
 'miami hip hop': 4,
 'atl trap': 2,
 'pop rap': 16,
 'rock': 5,
 'post-teen pop': 21,
 'australian po