In [1]:
import pandas as pd
import numpy as np
import math
import os
import spotipy

from IPython.display import display
pd.options.display.max_columns = None

This is a feature engineering script that generates variables that can be inferred from the Spotify Top 200 Charts in the US.

### Import raw Spotify charts data

Import the raw dataset of all songs on the Spotify Top 200 Charts since 2017 (assembled by Brunnell), regardless of whether they feature a collaboration. To generate artist-specific variables on the number of appearances/songs on the chart, we need a complete history of each artist, including all the songs they have released without collaborations.

In [2]:
# Read the raw chart export, discard rows that are exact duplicates, and
# convert the 'Date' column from text to datetimes.
data_top200 = (
    pd.read_csv('../input/spotify_top200_us.csv')
    .drop_duplicates()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [3]:
# Keep only the US chart rows, ordered by artist, chart date and track name.
data_us = (
    data_top200[data_top200['Country'] == 'United States']
    .sort_values(by=['Artist', 'Date', 'Track Name'])
)

# Strip the URL prefix so only the bare track ID remains. regex=False makes
# this a literal substring replacement: the dots in the URL are not treated as
# regex wildcards, and the result no longer depends on the default value of
# `regex`, which is not the same in every pandas release.
data_us['Track URI'] = data_us['Track URL'].str.replace(
    'https://open.spotify.com/track/', '', regex=False)

Conduct a few sanity checks to make sure the date range is complete.

In [4]:
# Latest and earliest chart dates. The Series reductions are vectorised and
# skip missing dates, unlike the Python built-ins max()/min(), which iterate
# over the column element by element.
print(data_us['Date'].max())
print(data_us['Date'].min())

2020-12-31 00:00:00
2017-01-01 00:00:00


Check that 200 unique positions per day x 1456 unique chart days (between 2017-01-01 and 2020-12-31) = 291200 observations in the US.

In [5]:
# Total number of US chart rows.
print(len(data_us.index))

# Number of rows recorded for each chart date, broadcast back onto every row.
# Equal max and min means every date has the same number of positions.
# DataFrame.max()/.min() reduce column by column explicitly; np.max/np.min
# forward axis=None, whose handling by pandas has changed between releases.
x = data_us.groupby('Date')[['Position']].transform('count')
print(x.max())
print(x.min())

# Number of dates on which each chart position occurs, broadcast the same way.
# Equal max and min means no position is missing on any date.
y = data_us.groupby('Position')[['Date']].transform('count')
print(y.max())
print(y.min())

291200
Position    200
dtype: int64
Position    200
dtype: int64
Date    1456
dtype: int64
Date    1456
dtype: int64


### Setting up Spotipy Credentials

In [7]:
# The API client ID and secret are read from a local CSV (first row) rather
# than being written into the notebook.
creds = pd.read_csv('../input/credentials.csv')
SPOTIPY_CLIENT_ID = creds.loc[0, 'Client ID']
SPOTIPY_CLIENT_SECRET = creds.loc[0, 'Client Secret']

# Build the credentials manager and the shared API client `sp` that the
# download helpers below use.
client_credentials_manager = spotipy.SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### Collect track information from the API

In [8]:
# Distinct track IDs found in the US chart data.
track_uris = pd.unique(data_us['Track URI'])
len(track_uris)

7547

For each unique track in data_us, download the list of artists/collaborators involved (their IDs and names), the album ID, and the album's release date from the API.

In [39]:
def get_track_data(track_uris, n, client=None):
    """Download artist and album details for each track, in batches.

    Parameters
    ----------
    track_uris : sequence of str
        Track IDs to look up. They are sliced into consecutive batches.
    n : int
        Number of track IDs sent per API request.
    client : object, optional
        Any object with a ``tracks(ids)`` method returning a dict whose
        ``'tracks'`` entry is a list of track records. Defaults to the
        notebook-level Spotipy client ``sp``.

    Returns
    -------
    pandas.DataFrame
        One row per track with the columns 'Track URI', 'Artist URI' (list of
        artist IDs), 'Artist Name' (list of artist names), 'Album URI',
        'Album Release' and 'No. of Artists'.
    """
    if client is None:
        client = sp  # shared client created in the credentials cell

    total = len(track_uris)
    track_artist_uris = []
    track_artist_names = []
    track_album_uri = []
    track_album_release = []

    for batch_no in range(math.ceil(total / n)):
        # Request one batch of at most n tracks.
        batch = client.tracks(track_uris[n * batch_no : n * (batch_no + 1)])['tracks']

        for track in batch:
            album = track['album']
            track_album_uri.append(album['id'])
            track_album_release.append(album['release_date'])

            # Keep the IDs and names as parallel lists, in the order the API lists them.
            artists = track['artists']
            track_artist_uris.append([artist['id'] for artist in artists])
            track_artist_names.append([artist['name'] for artist in artists])

        # Cap the running total so the last, possibly partial, batch is not over-reported.
        print(f'Found information for {min(n * (batch_no + 1), total)} songs')

    # Combine all lists into a dataframe
    track_data = pd.DataFrame(list(zip(track_uris, track_artist_uris, track_artist_names,
                                       track_album_uri, track_album_release)),
                              columns=['Track URI', 'Artist URI', 'Artist Name',
                                       'Album URI', 'Album Release'])

    # Count the credited artists directly from the list length instead of
    # counting commas in its string form.
    track_data['No. of Artists'] = track_data['Artist URI'].map(len)
    return track_data

In [40]:
# Download metadata for every unique track, 50 track IDs per API request.
track_data = get_track_data(track_uris, n=50)
# Alternative kept for reference: reload a previously exported copy instead
# of calling the API again.
# track_data = pd.read_csv('../output/2021.01.01 Track Artist URI 7313.csv')\
# .drop(columns = ['Unnamed: 0'])

Found information for 50 songs
Found information for 100 songs
Found information for 150 songs
Found information for 200 songs
Found information for 250 songs
Found information for 300 songs
Found information for 350 songs
Found information for 400 songs
Found information for 450 songs
Found information for 500 songs
Found information for 550 songs
Found information for 600 songs
Found information for 650 songs
Found information for 700 songs
Found information for 750 songs
Found information for 800 songs
Found information for 850 songs
Found information for 900 songs
Found information for 950 songs
Found information for 1000 songs
Found information for 1050 songs
Found information for 1100 songs
Found information for 1150 songs
Found information for 1200 songs
Found information for 1250 songs
Found information for 1300 songs
Found information for 1350 songs
Found information for 1400 songs
Found information for 1450 songs
Found information for 1500 songs
Found information for 1550 son

### Collect artist information from the API

In [41]:
# Flatten the per-track artist ID lists into one column and keep each ID once.
artist_uris = track_data['Artist URI'].explode().unique()
len(artist_uris)

1663

In [42]:
def get_artist_data(artist_uris, n, client=None):
    """Download the genre list of each artist, in batches.

    Parameters
    ----------
    artist_uris : sequence of str
        Artist IDs to look up. They are sliced into consecutive batches.
    n : int
        Number of artist IDs sent per API request.
    client : object, optional
        Any object with an ``artists(ids)`` method returning a dict whose
        ``'artists'`` entry is a list of artist records. Defaults to the
        notebook-level Spotipy client ``sp``.

    Returns
    -------
    pandas.DataFrame
        One row per artist with the columns 'Artist URI' and 'Artist Genre'
        (the list stored under the record's 'genres' key).
    """
    if client is None:
        client = sp  # shared client created in the credentials cell

    total = len(artist_uris)
    artist_genres = []

    # getting artist information in batches of size n
    for batch_no in range(math.ceil(total / n)):
        batch = client.artists(artist_uris[n * batch_no : n * (batch_no + 1)])['artists']
        artist_genres.extend(artist['genres'] for artist in batch)

        # The loop variable is left untouched (no manual increment), and the
        # running total is capped so the reported count never exceeds the
        # number of artists actually requested.
        print(f'Found information for {min(n * (batch_no + 1), total)} artists')

    artist_data = pd.DataFrame(list(zip(artist_uris, artist_genres)),
                               columns=['Artist URI', 'Artist Genre'])
    return artist_data

In [43]:
# Download the genre list for every unique artist, 50 artist IDs per API request.
artist_data = get_artist_data(artist_uris, n=50)

Found information for 100 artists
Found information for 150 artists
Found information for 200 artists
Found information for 250 artists
Found information for 300 artists
Found information for 350 artists
Found information for 400 artists
Found information for 450 artists
Found information for 500 artists
Found information for 550 artists
Found information for 600 artists
Found information for 650 artists
Found information for 700 artists
Found information for 750 artists
Found information for 800 artists
Found information for 850 artists
Found information for 900 artists
Found information for 950 artists
Found information for 1000 artists
Found information for 1050 artists
Found information for 1100 artists
Found information for 1150 artists
Found information for 1200 artists
Found information for 1250 artists
Found information for 1300 artists
Found information for 1350 artists
Found information for 1400 artists
Found information for 1450 artists
Found information for 1500 artists
Fou

In [44]:
artist_data

Unnamed: 0,Artist URI,Artist Genre
0,1VPmR4DJC1PlOtd0IADAO0,"[dark trap, new orleans rap, underground hip hop]"
1,6Ff53KvcvAj5U7Z1vojB5o,"[boy band, dance pop, europop, pop]"
2,6PfSUFtkMVoDkx4MQkzOi3,"[glitchcore, hyperpop]"
3,4UXqAaa6dQYAk18Lv7PEgX,"[emo, modern rock, pop punk]"
4,5aYf0AInMznHfXGaemKEBv,[]
...,...,...
1658,3QJUFtGBGL05vo0kCJZsmT,"[modern indie pop, social media pop]"
1659,3h1fFIofdTGrHbqisHyWgI,[]
1660,6IhKl7lqJc5omkhODEJinj,[]
1661,6IrDpI3xcuzTUiEc3fnc9H,[]


In [None]:
# sp.track('4HBZA5flZLE435QTztThqH')['album']

In [None]:
# sp.album('5mUdh6YWnUvf0MfklEk1oi')['genres']

In [None]:
# sp.artist('66CXWjxzNUsdJxJ2JdwvnR')['genres']

In [None]:
# track_data.to_csv('../output/2021.01.04 Track Artist URI 7313.csv')
# artist_data.to_csv('../output/2021.01.04 Artist Genres 1611.csv')

### Generate variables at a song/track level

Generate variables on the timing of the song's release. 

In [45]:
# Parse the album release date once, store it back, then derive the two
# calendar features from the parsed values: '%m' is the zero-padded month
# number and '%a' the abbreviated weekday name.
album_release = pd.to_datetime(track_data['Album Release'])
track_data['Album Release'] = album_release
for column, fmt in (('Album_release_month', '%m'),
                    ('Album_release_dayweek', '%a')):
    track_data[column] = album_release.dt.strftime(fmt)

In [17]:
track_data

Unnamed: 0,Track URI,Artist URI,Artist Name,Album URI,Album Release,No. of Artists,Album_release_month,Album_release_dayweek
0,4nutwPQrK56fFmrAMgyPhz,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
1,7v2azTfke2BR57lh2HxPQo,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
2,0fyBYsrmpihh1mfalssDlB,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
3,6VFKlX5qzxwmIiezqeqNYG,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
4,53AiGAa0Qi2VbX7eUpur1U,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
...,...,...,...,...,...,...,...,...
7542,3RXkboS74UYzN14xTqzPyY,[6IhKl7lqJc5omkhODEJinj],[],4yQC726OERjevM2YKtORnm,2017-07-14,1,07,Fri
7543,3eFJqPe8VUYrABbFjSauuj,[3h1fFIofdTGrHbqisHyWgI],[],6zP4EpOz9M0vhrjp7FNwKN,2017-07-17,1,07,Mon
7544,3bVbQvGVIe4n24AzyXovXh,[3h1fFIofdTGrHbqisHyWgI],[],6zP4EpOz9M0vhrjp7FNwKN,2017-07-17,1,07,Mon
7545,6Br5mChPdgQNmLF0G0gjPH,[6IrDpI3xcuzTUiEc3fnc9H],[],3jd8KzVKriW0uzSsIwxfZM,2017-07-18,1,07,Tue


Check that no songs were missed from the original list of track_uris.

In [46]:
# Track IDs that appear in the chart data but not in the downloaded metadata.
# A length of 0 means every requested track came back from the API.
setdiff = np.setdiff1d(track_uris, track_data['Track URI'])
len(setdiff)

0

Confirm that there are 2843 songs with collaborations, as Brunnell has found previously.

In [None]:
# bru = pd.read_csv('../input/spotify_top200_reduced.csv').drop_duplicates()
# bru_us = bru[bru['Country']=='United States']

# uri1 = track_data.loc[track_data['No. of Collaborators']!= 1,'Track URI'].drop_duplicates()
# uri2 = bru_us['Track URI'].drop_duplicates()
# print(len(uri1))
# print(len(uri2))
# print(len(np.setdiff1d(uri1, uri2)))
# print(len(np.intersect1d(uri1, uri2)))

Merge data_us onto track_data

In [47]:
# Attach the per-track metadata to every chart row; rows without a match keep
# their chart data and get missing values in the metadata columns.
data1 = pd.merge(data_us, track_data, how='left', on='Track URI')

In [31]:
data1

Unnamed: 0.1,Unnamed: 0,Date,Track URL,Position,Track Name,Artist,Streams,Country,Track URI,Artist URI,Artist Name,Album URI,Album Release,No. of Artists,Album_release_month,Album_release_dayweek
0,279795,2018-09-07,https://open.spotify.com/track/4nutwPQrK56fFmr...,196.0,"10,000 Degrees",$uicideBoy$,232152.0,United States,4nutwPQrK56fFmrAMgyPhz,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
1,279776,2018-09-07,https://open.spotify.com/track/7v2azTfke2BR57l...,177.0,Bring Out Your Dead,$uicideBoy$,247359.0,United States,7v2azTfke2BR57lh2HxPQo,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
2,279745,2018-09-07,https://open.spotify.com/track/0fyBYsrmpihh1mf...,146.0,King Tulip,$uicideBoy$,270299.0,United States,0fyBYsrmpihh1mfalssDlB,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
3,279789,2018-09-07,https://open.spotify.com/track/6VFKlX5qzxwmIie...,190.0,Meet Mr. NICEGUY,$uicideBoy$,236054.0,United States,6VFKlX5qzxwmIiezqeqNYG,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
4,279739,2018-09-07,https://open.spotify.com/track/53AiGAa0Qi2VbX7...,140.0,Nicotine Patches,$uicideBoy$,279299.0,United States,53AiGAa0Qi2VbX7eUpur1U,[1VPmR4DJC1PlOtd0IADAO0],[$uicideBoy$],72I2i7wwU3Q7mJGxbNW12D,2018-09-07,1,09,Fri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291195,31581,2017-11-10,https://open.spotify.com/track/1YqcGlCHNquxBhl...,182.0,,,205945.0,United States,1YqcGlCHNquxBhlUZsjhMT,[2uwXb02RFgtGOr6s6XMWlB],[],2oxjmU5c2mZzc7mSQqWTE9,2017-11-07,1,11,Tue
291196,31950,2017-11-11,https://open.spotify.com/track/1YqcGlCHNquxBhl...,151.0,,,200095.0,United States,1YqcGlCHNquxBhlUZsjhMT,[2uwXb02RFgtGOr6s6XMWlB],[],2oxjmU5c2mZzc7mSQqWTE9,2017-11-07,1,11,Tue
291197,32359,2017-11-12,https://open.spotify.com/track/1YqcGlCHNquxBhl...,160.0,,,176687.0,United States,1YqcGlCHNquxBhlUZsjhMT,[2uwXb02RFgtGOr6s6XMWlB],[],2oxjmU5c2mZzc7mSQqWTE9,2017-11-07,1,11,Tue
291198,32769,2017-11-13,https://open.spotify.com/track/1YqcGlCHNquxBhl...,170.0,,,190496.0,United States,1YqcGlCHNquxBhlUZsjhMT,[2uwXb02RFgtGOr6s6XMWlB],[],2oxjmU5c2mZzc7mSQqWTE9,2017-11-07,1,11,Tue


### Generate variables that change on a song-day basis

**Song_first_onchart**: a song's earliest appearance on the chart.

**Song_days_since_first**: the number of days between the current chart day and the song's first day on the chart.

**Song_days_onchart**: the actual number of days that the song has appeared on the chart.

**Song_days_since_release**: the number of days between the current chart day and the song's album's release date.

**Date_diff**: the number of days elapsed between a song's current appearance and its last appearance. If Date_diff = 1, then the song appeared yesterday and today.

**Song_new_streak**: when Date_diff is > 1, we assume that the song has temporarily dropped off the chart (hence ending its last consecutive streak), so we create a binary variable "Song_new_streak" that signals the beginning of a new streak.

**Song_streak_id**: for each song's consecutive streak, assign a unique id by performing a cumulative sum of the 'Song_new_streak' variable.

**Song_consec_day**: for each consecutive day within a streak, assign a unique id.

In [48]:
# One row per (chart date, track), ordered by date so that the per-track
# running counters and lags below move forward in time.
track_days = (
    data1[['Date', 'Track URI', 'Album Release']]
    .drop_duplicates()
    .sort_values(by=['Date', 'Track URI'])
)

# Calendar parts of the chart date.
track_days['Year_chart'] = track_days['Date'].dt.strftime('%Y').astype(int)
track_days['Month_chart'] = track_days['Date'].dt.strftime('%m')

# First chart date of each track and the whole days elapsed since then.
track_days['Song_first_onchart'] = track_days.groupby('Track URI')['Date'].transform('min')
track_days['Song_days_since_first'] = (
    track_days['Date'] - track_days['Song_first_onchart']
).dt.days

# Running count of chart rows per track (1 on its first chart day).
track_days['Song_days_onchart'] = track_days.groupby('Track URI').cumcount() + 1

# Whole days between the chart date and the album release date.
track_days['Song_days_since_release'] = (
    track_days['Date'] - track_days['Album Release']
).dt.days

# Previous chart date of the same track and the gap to it in days; both are
# missing on a track's first chart day.
track_days['Date_lag'] = track_days.groupby('Track URI')['Date'].shift(1)
track_days['Date_diff'] = (track_days['Date'] - track_days['Date_lag']).dt.days

# A streak starts on the first chart day or after a gap of more than one day.
first_chart_day = track_days['Song_days_onchart'] == 1
gap_since_last = track_days['Date_diff'] > 1
track_days['Song_new_streak'] = np.where(first_chart_day | gap_since_last, 1, 0)

# Number the streaks within each track, then the days within each streak.
track_days['Song_streak_id'] = track_days.groupby('Track URI')['Song_new_streak'].cumsum()
track_days['Song_consec_day'] = track_days.groupby(['Track URI', 'Song_streak_id']).cumcount() + 1

track_days

Unnamed: 0,Date,Track URI,Album Release,Year_chart,Month_chart,Song_first_onchart,Song_days_since_first,Song_days_onchart,Song_days_since_release,Date_lag,Date_diff,Song_new_streak,Song_streak_id,Song_consec_day
98988,2017-01-01,04CttTezSnv71USiiG9mIo,2016-11-11,2017,01,2017-01-01,0,1,51,NaT,,1,1,1
182322,2017-01-01,04DwTuZ2VBdJCCC5TROn7L,2017-04-21,2017,01,2017-01-01,0,1,-110,NaT,,1,1,1
49972,2017-01-01,05Z7jet4VDNVgNQWcYHnrk,2016-12-16,2017,01,2017-01-01,0,1,16,NaT,,1,1,1
233117,2017-01-01,08WPvDEsHvTFuB9w8tC2OS,2017-07-21,2017,01,2017-01-01,0,1,-201,NaT,,1,1,1
248434,2017-01-01,0B8B8cVRFIG1yznoQe7c9s,2016-11-04,2017,01,2017-01-01,0,1,58,NaT,,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125187,2020-12-31,7vrJn5hDSXRmdXoR30KgF1,2020-10-23,2020,12,2020-11-20,41,38,69,2020-12-30,1.0,0,3,6
222624,2020-12-31,7xQAfvXzm3AkraOtGPWIZg,2019-09-06,2020,12,2019-09-16,472,423,482,2020-11-19,42.0,1,7,1
185249,2020-12-31,7yiSvALPjMrBLDDrbcDRNy,2020-11-20,2020,12,2020-11-20,41,26,41,2020-12-30,1.0,0,3,5
58879,2020-12-31,7ytR5pFWmSjzHJIeQkgog4,2020-04-17,2020,12,2020-04-17,258,259,258,2020-12-30,1.0,0,1,259


## Convert list of collaborator URIs and names into long format

For each song/track URI, we expand/explode the lists in the Artist URI and Artist Name columns so that each artist is on a separate row. This allows us to merge the characteristics of each artist later on.

In [49]:
# Drop the columns that are not carried into the artist-level table, then
# give every artist ID in a track's list its own row with a fresh 0..N-1 index.
artist_uri = data1.drop(columns=['Artist Name', 'Track URL', 'Artist', 'Album Release'])
artist_uri2 = artist_uri.explode('Artist URI', ignore_index=True)

# Preview the rows that belong to tracks with more than one credited artist.
artist_uri2.loc[artist_uri2['No. of Artists'].ne(1)]

Unnamed: 0.1,Unnamed: 0,Date,Position,Track Name,Streams,Country,Track URI,Artist URI,Album URI,No. of Artists,Album_release_month,Album_release_dayweek
199,140981,2020-07-10,182.0,hand crushed by a mallet (Remix) [feat. Fall O...,238674.0,United States,5Mm2CJzNRiICC5MWRWQnBo,6PfSUFtkMVoDkx4MQkzOi3,0qnExDZfz0kVeBjixPsyjS,4,07,Fri
200,140981,2020-07-10,182.0,hand crushed by a mallet (Remix) [feat. Fall O...,238674.0,United States,5Mm2CJzNRiICC5MWRWQnBo,4UXqAaa6dQYAk18Lv7PEgX,0qnExDZfz0kVeBjixPsyjS,4,07,Fri
201,140981,2020-07-10,182.0,hand crushed by a mallet (Remix) [feat. Fall O...,238674.0,United States,5Mm2CJzNRiICC5MWRWQnBo,5aYf0AInMznHfXGaemKEBv,0qnExDZfz0kVeBjixPsyjS,4,07,Fri
202,140981,2020-07-10,182.0,hand crushed by a mallet (Remix) [feat. Fall O...,238674.0,United States,5Mm2CJzNRiICC5MWRWQnBo,0MfC3pip8rY8OFLJVVNvBO,0qnExDZfz0kVeBjixPsyjS,4,07,Fri
203,186,2017-01-01,187.0,Good Drank,146863.0,United States,39pS70eeDvyCAF3t8NAlVV,17lzZA2AlOHwCwFALHttmp,5vvvo79z68vWj9yimoygfS,3,06,Fri
...,...,...,...,...,...,...,...,...,...,...,...,...
454193,157591,2020-10-01,192.0,Prospect (ft. Lil Baby),210504.0,United States,4iHSE5R1U8jf84tRn52xRt,5f7VJjfbwm532GiveGC0ZK,4Wb5bU9FkmZ84WkkL37rKA,2,06,Fri
454197,83765,2019-09-27,166.0,Chicken Noodle Soup (feat. Becky G),257406.0,United States,6wyr4ReB05D9sJB1Rsmcqo,0b1sIQumIAsNbqAoIClSpy,76IRLp7YzBVLKsat6Ro9ae,2,09,Fri
454198,83765,2019-09-27,166.0,Chicken Noodle Soup (feat. Becky G),257406.0,United States,6wyr4ReB05D9sJB1Rsmcqo,4obzFoKoKRHIphyHzJ35G3,76IRLp7YzBVLKsat6Ro9ae,2,09,Fri
454239,175995,2017-03-17,196.0,Dennis Rodman,166943.0,United States,73cAKC1NbxHuFPcQ4slGtl,2o8lOQRjzsSC8UdbNN88HN,5WS1g0cKtjfK6eDoSLdv7d,2,03,Fri


In [50]:
# Same expansion for the artist names: one row per name, with a fresh
# 0..N-1 index.
artist_name = data1.loc[:, ['Date', 'Track URI', 'No. of Artists', 'Artist Name']]
artist_name2 = artist_name.explode('Artist Name', ignore_index=True)

# Preview the rows that belong to tracks with more than one credited artist.
artist_name2.loc[artist_name2['No. of Artists'].ne(1)]

Unnamed: 0,Date,Track URI,No. of Artists,Artist Name
199,2020-07-10,5Mm2CJzNRiICC5MWRWQnBo,4,100 gecs
200,2020-07-10,5Mm2CJzNRiICC5MWRWQnBo,4,Fall Out Boy
201,2020-07-10,5Mm2CJzNRiICC5MWRWQnBo,4,Craig Owens
202,2020-07-10,5Mm2CJzNRiICC5MWRWQnBo,4,Nicole Dollanganger
203,2017-01-01,39pS70eeDvyCAF3t8NAlVV,3,2 Chainz
...,...,...,...,...
454193,2020-10-01,4iHSE5R1U8jf84tRn52xRt,2,Lil Baby
454197,2019-09-27,6wyr4ReB05D9sJB1Rsmcqo,2,j-hope
454198,2019-09-27,6wyr4ReB05D9sJB1Rsmcqo,2,Becky G
454239,2017-03-17,73cAKC1NbxHuFPcQ4slGtl,2,mansionz


In [51]:
# artist_name2 and artist_uri2 are both exploded from data1 with
# ignore_index=True, so they share a 0..N-1 index and row i of each describes
# the same track, chart date and artist slot. Verify that before pairing them.
key_cols = ['Date', 'Track URI', 'No. of Artists']
assert artist_name2[key_cols].equals(artist_uri2[key_cols]), \
    'exploded artist-name and artist-URI frames are not row-aligned'

# Pair names with IDs by an index join. A single merge call that combines
# `on=` with left_index/right_index mixes two join modes and raises a
# MergeError in current pandas. The key columns are dropped from the right
# side so they appear only once. Artist genres are then attached by artist ID.
data2 = (
    artist_name2
    .join(artist_uri2.drop(columns=key_cols))
    .merge(artist_data, how='left', on=['Artist URI'])
)

# Position of each artist within its track's credit list on a given chart date.
data2['Artist No.'] = data2.groupby(['Track URI', 'Date']).cumcount() + 1

In [None]:
# Preview the artist-level rows of tracks with more than one credited artist.
data2.loc[data2['No. of Artists'].ne(1)]

### Generate variables that change on an artist-day basis

**Artist_first_onchart**: identify the first day an artist appears on the Top 200 chart.

**Artist_new_song**: generate a dummy variable that = 1 if this is the first time a track URI appears (Song_days_onchart == 1).

**Artist_new_collab**: generate a dummy variable that = 1 if this is the first time a track URI appears that is also a collab (No. of Artists != 1).

**Artist_new_solo**: generate a dummy variable that = 1 if this is the first time a track URI appears that is a solo (No. of Artists == 1).

**Artist_cumsum_songs**: calculate the cumulative sum of artist_new_song.

However, in the event that an artist has multiple songs on the chart on the same day, then this cumsum will be different for each song. To remedy this, we take the max cumsum per day for an artist to show how many songs an artist has on that day in total.

**Artist_cumu_songs**: the cumulative number of songs for an artist in a given day
(for each collaborator URI and date, find the max artist_cumsum_songs).

Analogously for Artist_cumu_collab and Artist_cumu_solo.

Artist_cumu_songs should equal Artist_cumu_collabs + Artist_cumu_solo.

In [52]:
# Attach the song-day variables to every artist row and order the rows by
# chart date so the per-artist cumulative sums below run forward in time.
data2d = (
    data2
    .merge(track_days, how='left', on=['Date', 'Track URI'])
    .sort_values(by=['Date', 'Track URI'])
)

# Earliest chart date on which each artist appears.
data2d['Artist_first_onchart'] = data2d.groupby('Artist URI')['Date'].transform('min')

# Flags marking a track's first chart day, overall and split by whether the
# track credits more than one artist.
is_first_day = data2d['Song_days_onchart'] == 1
is_collab = data2d['No. of Artists'] != 1
data2d['Artist_new_song'] = np.where(is_first_day, 1, 0)
data2d['Artist_new_collab'] = np.where(is_first_day & is_collab, 1, 0)
data2d['Artist_new_solo'] = np.where(is_first_day & ~is_collab, 1, 0)

# For each flag: a running total per artist, then the largest running total
# reached on each artist-day, so every row of that day carries the same count.
for new_flag, running_col, daily_col in (
    ('Artist_new_song', 'Artist_cumsum_songs', 'Artist_cumu_songs'),
    ('Artist_new_collab', 'Artist_cumsum_collab', 'Artist_cumu_collab'),
    ('Artist_new_solo', 'Artist_cumsum_solo', 'Artist_cumu_solo'),
):
    data2d[running_col] = data2d.groupby('Artist URI')[new_flag].cumsum()
    data2d[daily_col] = data2d.groupby(['Artist URI', 'Date'])[running_col].transform('max')

In [None]:
# data2d

For each artist-day, find how many days the artist has been on the chart (cumulatively, not necessarily consecutively).

In [53]:
# Distinct (artist, chart date) pairs in chronological order per artist,
# numbered 1, 2, 3, ... to give the artist's cumulative days on the chart.
artist_days = (
    data2d[['Artist URI', 'Date']]
    .sort_values(by=['Artist URI', 'Date'])
    .drop_duplicates()
)
artist_days['Artist_days_onchart'] = artist_days.groupby('Artist URI').cumcount() + 1

artist_days

Unnamed: 0,Artist URI,Date,Artist_days_onchart
414600,002HSjuWsGMinkXTa7JcRp,2020-04-03,1
224509,00FQb4jTyendYWaN8pK0wa,2017-02-19,1
224510,00FQb4jTyendYWaN8pK0wa,2017-02-20,2
224511,00FQb4jTyendYWaN8pK0wa,2017-02-21,3
224512,00FQb4jTyendYWaN8pK0wa,2017-02-22,4
...,...,...,...
97541,7z5WFjZAIYejWy0NI5lv4T,2020-12-25,1000
97542,7z5WFjZAIYejWy0NI5lv4T,2020-12-28,1001
97544,7z5WFjZAIYejWy0NI5lv4T,2020-12-29,1002
97546,7z5WFjZAIYejWy0NI5lv4T,2020-12-30,1003


Merge data2d with artist days information.

In [54]:
# Add the artist's cumulative days-on-chart count to every artist row.
data2e = pd.merge(data2d, artist_days, how='left', on=['Artist URI', 'Date'])

In [None]:
# data2e

Drop intermediary (dummy) variables not useful for EDAs/model building.

In [55]:
# Helper columns used only to build the cumulative counters and streak IDs,
# plus the leftover 'Unnamed: 0' column.
intermediate_cols = [
    'Artist_new_song', 'Artist_cumsum_songs',
    'Artist_new_collab', 'Artist_cumsum_collab',
    'Artist_new_solo', 'Artist_cumsum_solo',
    'Date_diff', 'Date_lag',
    'Song_new_streak',
    'Unnamed: 0',
]
data3 = data2e.drop(columns=intermediate_cols)

For each track URI-date, find the average characteristics for all artists in the collaboration:

**Collab_avg_days_onchart**: average number of days on the chart of all collaborators

**Collab_avg_cumu_songs**: average number of songs on the chart of all collaborators

**Collab_avg_cumu_collab**: average number of previous collaborations of all collaborators

**Collab_avg_cumu_solo**: average number of previous solo songs released of all collaborators

In [56]:
# Work on an independent copy: a plain `data3b = data3` only creates a second
# name for the same frame, so adding the columns below would also modify data3.
data3b = data3.copy()

# For every track and chart date, average each artist-level measure over all
# artists credited on the track, and write that average on each of their rows.
track_day_groups = data3.groupby(['Track URI', 'Date'])
for avg_col, artist_col in (
    ('Collab_avg_days_onchart', 'Artist_days_onchart'),
    ('Collab_avg_cumu_songs', 'Artist_cumu_songs'),
    ('Collab_avg_cumu_collab', 'Artist_cumu_collab'),
    ('Collab_avg_cumu_solo', 'Artist_cumu_solo'),
):
    data3b[avg_col] = track_day_groups[artist_col].transform('mean')

Reorder columns

In [57]:
# Columns to show first, followed by every remaining column in its existing order.
cols1 = ['Date', 'Track Name', 'Streams', 'Position',
         'Artist Name', 'Artist No.', 'No. of Artists', 'Artist Genre']
cols2 = [col for col in data3b.columns if col not in cols1]
data3c = data3b[cols1 + cols2]

In [58]:
data3c

Unnamed: 0,Date,Track Name,Streams,Position,Artist Name,Artist No.,No. of Artists,Artist Genre,Track URI,Country,Artist URI,Album URI,Album_release_month,Album_release_dayweek,Album Release,Year_chart,Month_chart,Song_first_onchart,Song_days_since_first,Song_days_onchart,Song_days_since_release,Song_streak_id,Song_consec_day,Artist_first_onchart,Artist_cumu_songs,Artist_cumu_collab,Artist_cumu_solo,Artist_days_onchart,Collab_avg_days_onchart,Collab_avg_cumu_songs,Collab_avg_cumu_collab,Collab_avg_cumu_solo
0,2017-01-01,Lighthouse - Andrelli Remix,149929.0,183.0,Hearts & Colors,1,2,[viral pop],04CttTezSnv71USiiG9mIo,United States,3wjsrpfO6odEphTZWx45RQ,4ywy3ahNM8FMH99Ueuf9ZA,11,Fri,2016-11-11,2017,01,2017-01-01,0,1,51,1,1,2017-01-01,1,1,0,1,1.0,1.000000,1.000000,0.0
1,2017-01-01,Lighthouse - Andrelli Remix,149929.0,183.0,Andrelli,2,2,[swedish pop],04CttTezSnv71USiiG9mIo,United States,5M2y5A6d5QZjw9JeKClagC,4ywy3ahNM8FMH99Ueuf9ZA,11,Fri,2016-11-11,2017,01,2017-01-01,0,1,51,1,1,2017-01-01,1,1,0,1,1.0,1.000000,1.000000,0.0
2,2017-01-01,In the Name of Love,435945.0,27.0,Martin Garrix,1,2,"[big room, dance pop, edm, electro house, pop,...",04DwTuZ2VBdJCCC5TROn7L,United States,60d24wfXkVzDSfLS6hyCjZ,75kX486cBBkuaLkZGjBptl,04,Fri,2017-04-21,2017,01,2017-01-01,0,1,-110,1,1,2017-01-01,1,1,0,1,1.0,2.000000,1.500000,0.5
3,2017-01-01,In the Name of Love,435945.0,27.0,Bebe Rexha,2,2,"[dance pop, electropop, pop, pop dance, post-t...",04DwTuZ2VBdJCCC5TROn7L,United States,64M6ah0SkkRsnPGtGiRAbb,75kX486cBBkuaLkZGjBptl,04,Fri,2017-04-21,2017,01,2017-01-01,0,1,-110,1,1,2017-01-01,3,2,1,1,1.0,2.000000,1.500000,0.5
4,2017-01-01,Party,151474.0,178.0,Chris Brown,1,3,"[dance pop, pop, pop rap, r&b, rap]",05Z7jet4VDNVgNQWcYHnrk,United States,7bXgB6jMjp9ATFy66eO08Z,35ljAE1f5Qmp2ZvVir34tL,12,Fri,2016-12-16,2017,01,2017-01-01,0,1,16,1,1,2017-01-01,1,1,0,1,1.0,1.666667,1.666667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454317,2020-12-31,Girls in the Hood,243157.0,160.0,Megan Thee Stallion,1,1,"[houston rap, pop, pop rap, trap queen]",7yiSvALPjMrBLDDrbcDRNy,United States,181bsRPaVXVlUKXrxwZfHK,0KjckH1EE6HRRurMIXSc0r,11,Fri,2020-11-20,2020,12,2020-11-20,41,26,41,3,5,2019-05-17,34,20,14,525,525.0,34.000000,20.000000,14.0
454318,2020-12-31,ROCKSTAR (feat. Roddy Ricch),581768.0,19.0,DaBaby,1,2,"[north carolina hip hop, rap]",7ytR5pFWmSjzHJIeQkgog4,United States,4r63FhuTkUYltbVAg5TQnk,623PL2MBg50Br5dLXC9E9e,04,Fri,2020-04-17,2020,12,2020-04-17,258,259,258,1,259,2019-03-30,76,52,24,641,674.5,55.500000,39.500000,16.0
454319,2020-12-31,ROCKSTAR (feat. Roddy Ricch),581768.0,19.0,Roddy Ricch,2,2,"[melodic rap, rap, trap]",7ytR5pFWmSjzHJIeQkgog4,United States,757aE44tKEUQEqRuT6GnEB,623PL2MBg50Br5dLXC9E9e,04,Fri,2020-04-17,2020,12,2020-04-17,258,259,258,1,259,2018-11-30,35,27,8,708,674.5,55.500000,39.500000,16.0
454320,2020-12-31,M3tamorphosis (feat. Kid Cudi),230435.0,182.0,Playboi Carti,1,2,"[atl hip hop, rap, trap]",7zLMYtNJcabv4h4wBnjNQI,United States,699OTQXzgjhIYAHMy9RyPD,2QRedhP5RmKJiJ1i8VgDGR,12,Fri,2020-12-25,2020,12,2020-12-25,6,7,6,1,7,2017-04-14,66,36,30,844,556.5,48.500000,24.500000,24.0


Set index using Date, Track URI, and the position of the artist within the track (Artist No.).
In the event that we need to pivot the dataframe to wide format (so that each artist's information is in a separate column), we can unstack using Artist No.

In [None]:
# Index by chart date, track and artist slot, replacing the existing index.
data3d = data3c.set_index(['Date', 'Track URI', 'Artist No.'])

In [None]:
# data3e = data3d.unstack(level=2)

In [None]:
# data3f = data3e
# data3f.columns = [' '.join(col).strip() for col in data3f.columns.values]
# data3f

In [59]:
data3c.describe()

Unnamed: 0,Streams,Position,Artist No.,No. of Artists,Year_chart,Song_days_since_first,Song_days_onchart,Song_days_since_release,Song_streak_id,Song_consec_day,Artist_cumu_songs,Artist_cumu_collab,Artist_cumu_solo,Artist_days_onchart,Collab_avg_days_onchart,Collab_avg_cumu_songs,Collab_avg_cumu_collab,Collab_avg_cumu_solo
count,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0,454322.0
mean,395190.6,99.593145,1.4931,1.986199,2018.471408,144.707925,125.967239,455.869535,2.657153,62.055093,30.356635,17.293895,13.06274,409.872639,409.872639,30.356635,17.293895,13.06274
std,274556.0,57.720472,0.809487,1.108856,1.112439,192.587274,163.758139,2147.929308,4.786989,78.732474,34.233954,20.603702,17.980469,336.572517,308.397882,30.09772,18.062873,15.606805
min,122488.0,1.0,1.0,1.0,2017.0,0.0,1.0,-208.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
25%,236443.0,49.0,1.0,1.0,2017.0,23.0,21.0,31.0,1.0,9.0,5.0,2.0,1.0,128.0,156.666667,7.333333,3.25,2.0
50%,300042.0,99.0,1.0,2.0,2018.0,75.0,67.0,101.0,1.0,33.0,18.0,9.0,6.0,326.0,350.0,20.75,11.333333,7.5
75%,450181.0,150.0,2.0,2.0,2019.0,188.0,165.0,254.0,3.0,86.0,43.0,26.0,19.0,624.0,598.0,43.0,26.0,18.0
max,5749019.0,200.0,22.0,22.0,2020.0,1460.0,1436.0,28849.0,103.0,705.0,219.0,130.0,103.0,1455.0,1455.0,215.0,126.0,103.0


In [60]:
# Summary statistics restricted to tracks with more than one credited artist.
data3c[data3c['No. of Artists'].ne(1)].describe()

Unnamed: 0,Streams,Position,Artist No.,No. of Artists,Year_chart,Song_days_since_first,Song_days_onchart,Song_days_since_release,Song_streak_id,Song_consec_day,Artist_cumu_songs,Artist_cumu_collab,Artist_cumu_solo,Artist_days_onchart,Collab_avg_days_onchart,Collab_avg_cumu_songs,Collab_avg_cumu_collab,Collab_avg_cumu_solo
count,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0,283753.0
mean,397618.4,97.62326,1.789511,2.579021,2018.430445,126.460563,113.757789,287.652828,2.281509,61.265558,29.967842,19.308902,10.658939,408.309033,408.309033,29.967842,19.308902,10.658939
std,264835.5,57.632805,0.902854,1.016166,1.105799,164.498296,142.624298,1593.743582,3.952194,75.126857,34.948523,21.476665,17.390058,338.444169,292.316975,28.202054,17.434016,13.219871
min,125484.0,1.0,1.0,2.0,2017.0,0.0,1.0,-194.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
25%,237797.0,47.0,1.0,2.0,2017.0,22.0,21.0,28.0,1.0,10.0,4.0,3.0,0.0,126.0,173.0,9.0,6.0,1.5
50%,305342.0,95.0,2.0,2.0,2018.0,69.0,63.0,87.0,1.0,34.0,17.0,11.0,3.0,321.0,356.75,21.0,14.25,6.333333
75%,459826.0,147.0,2.0,3.0,2019.0,165.0,152.0,208.0,2.0,86.0,43.0,28.0,14.0,626.0,587.75,42.0,27.666667,14.333333
max,4444027.0,200.0,22.0,22.0,2020.0,1460.0,1166.0,28849.0,92.0,700.0,219.0,130.0,103.0,1455.0,1454.5,205.5,109.5,96.0


Export dataframe with collaborations only

In [None]:
# Write only the rows of tracks with more than one credited artist.
collab_rows = data3c['No. of Artists'] != 1
data3c.loc[collab_rows].to_csv('../output/2021.01.09 spotify_us_collab_fe.csv')

Export dataframe with all songs in the US

In [61]:
# Write the full artist-level feature table for all US chart songs.
data3c.to_csv('../output/2021.01.09 spotify_us_all_fe.csv')