In [2]:
# !pip install spotipy

In [230]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import json
import time
import homebrew as hb
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
with open('/Users/patrickfuller/.secrets/spotify_api.json') as f:
    creds = json.load(f)
client_id = creds['client_id']
key = creds['api_key']

Spotipy's client credential manager takes id and key when instantiating the object  
The manager is then input when instantiating the spotify object

In [3]:
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=key)

spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

The featured playlists method will return playlits that were featured at a given time.
I also suspect they may be user specific. 

In [4]:
# yyyy-MM-ddTHH:mm:ss
response = spotify.featured_playlists(limit=50)#, timestamp='2019-06-22T12:00:00')

In [5]:
response.keys()

dict_keys(['message', 'playlists'])

In [6]:
response['playlists'].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [7]:
featured_playlists = response['playlists']['items']

In [8]:
len(featured_playlists)

11

In [9]:
featured_playlists

[{'collaborative': False,
  'external_urls': {'spotify': 'https://open.spotify.com/playlist/37i9dQZF1DXdQvOLqzNHSW'},
  'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1DXdQvOLqzNHSW',
  'id': '37i9dQZF1DXdQvOLqzNHSW',
  'images': [{'height': None,
    'url': 'https://pl.scdn.co/images/pl/default/ed8ca7d059f668db5a86c73cf16262507cb0137a',
    'width': None}],
  'name': 'Late Night Vibes',
  'owner': {'display_name': 'Spotify',
   'external_urls': {'spotify': 'https://open.spotify.com/user/spotify'},
   'href': 'https://api.spotify.com/v1/users/spotify',
   'id': 'spotify',
   'type': 'user',
   'uri': 'spotify:user:spotify'},
  'primary_color': None,
  'public': None,
  'snapshot_id': 'MTU2MTQ5NzUxNSwwMDAwMDAwMGQ0MWQ4Y2Q5OGYwMGIyMDRlOTgwMDk5OGVjZjg0Mjdl',
  'tracks': {'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1DXdQvOLqzNHSW/tracks',
   'total': 75},
  'type': 'playlist',
  'uri': 'spotify:playlist:37i9dQZF1DXdQvOLqzNHSW'},
 {'collaborative': False,
  'external_urls': 

The id and name are nested deep

In [10]:
featured_playlists[0]['id'], featured_playlists[0]['name']

('37i9dQZF1DXdQvOLqzNHSW', 'Late Night Vibes')

In [11]:
#We will also use the owner id
featured_playlists[0]['owner']['id']

'spotify'

Using an abstracted custom function to retreive many id and name pairs at once

In [12]:
hb.extract_name_and_id(response)

{'37i9dQZF1DXdQvOLqzNHSW': {'name': 'Late Night Vibes', 'owner_id': 'spotify'},
 '37i9dQZF1DX1gRalH1mWrP': {'name': 'Summer Hits', 'owner_id': 'spotify'},
 '37i9dQZF1DXcWBRiUaG3o5': {'name': 'Evening Acoustic', 'owner_id': 'spotify'},
 '37i9dQZF1DWXRqgorJj26U': {'name': 'Rock Classics', 'owner_id': 'spotify'},
 '37i9dQZF1DX4wta20PHgwo': {'name': 'Late Night Jazz', 'owner_id': 'spotify'},
 '37i9dQZF1DX7rOY2tZUw1k': {'name': 'Timeless Love Songs',
  'owner_id': 'spotify'},
 '37i9dQZF1DX1s9knjP51Oa': {'name': 'Calm Vibes', 'owner_id': 'spotify'},
 '37i9dQZF1DX5lDysu4GbKR': {'name': 'Acoustic Soul', 'owner_id': 'spotify'},
 '37i9dQZF1DX79Y9Kr2M2tM': {'name': 'Lush + Atmospheric',
  'owner_id': 'spotify'},
 '37i9dQZF1DWVl5gPCRkquk': {'name': 'Chill as Folk', 'owner_id': 'spotify'},
 '37i9dQZF1DWVA1Gq4XHa6U': {'name': 'Gold School', 'owner_id': 'spotify'}}

In [13]:
acoustic = spotify.search('acoustic', type='playlist', market = 'US', limit=50)

In [14]:
hb.extract_name_and_id(acoustic)

{'37i9dQZF1DXb9LIXaj5WhL': {'name': '90s Acoustic', 'owner_id': 'spotify'},
 '37i9dQZF1DWXmlLSKkfdAk': {'name': 'Acoustic Covers', 'owner_id': 'spotify'},
 '37i9dQZF1DWX9VXBLRgDqu': {'name': 'Acoustic Favorites',
  'owner_id': 'spotify'},
 '37i9dQZF1DX8TvdyVZSYFY': {'name': 'Acoustic Throwbacks',
  'owner_id': 'spotify'},
 '37i9dQZF1DX0rCrO4CFRfM': {'name': 'Acoustic Rock', 'owner_id': 'spotify'},
 '37i9dQZF1DX504r1DvyvxG': {'name': 'Classic Acoustic', 'owner_id': 'spotify'},
 '37i9dQZF1DX4VvfRBFClxm': {'name': 'Acoustic Hits', 'owner_id': 'spotify'},
 '37i9dQZF1DWSlwBojgQEcN': {'name': 'Acoustic Love', 'owner_id': 'spotify'},
 '37i9dQZF1DX4CgJVlGEIo5': {'name': 'Easy Acoustic', 'owner_id': 'spotify'},
 '37i9dQZF1DWUNIrSzKgQbP': {'name': 'Summer Acoustic', 'owner_id': 'spotify'},
 '37i9dQZF1DXc3FQfFV3K6V': {'name': 'Laidback Acoustic',
  'owner_id': 'spotify'},
 '37i9dQZF1DWYGZAMYFDM8S': {'name': 'Acoustic Chill', 'owner_id': 'spotify'},
 '37i9dQZF1DXaImRpG7HXqp': {'name': 'Acoustic Ca

It seems to work for other playlists as well. We can now begin to compile a master playlist... list  
But first lets compile a list of potential playlist queries.

In [15]:
spotify.categories(country='US', limit=2)

{'categories': {'href': 'https://api.spotify.com/v1/browse/categories?country=US&offset=0&limit=2',
  'items': [{'href': 'https://api.spotify.com/v1/browse/categories/toplists',
    'icons': [{'height': 275,
      'url': 'https://t.scdn.co/media/derived/toplists_11160599e6a04ac5d6f2757f5511778f_0_0_275_275.jpg',
      'width': 275}],
    'id': 'toplists',
    'name': 'Top Lists'},
   {'href': 'https://api.spotify.com/v1/browse/categories/pride',
    'icons': [{'height': None,
      'url': 'https://t.scdn.co/images/90f4c163-46f6-4cda-bd84-e78ff90d4959.jpg',
      'width': None}],
    'id': 'pride',
    'name': 'Pride'}],
  'limit': 2,
  'next': 'https://api.spotify.com/v1/browse/categories?country=US&offset=2&limit=2',
  'offset': 0,
  'previous': None,
  'total': 44}}

In [16]:
spotify.categories(country='US', limit=50)['categories']['items'][:2]

[{'href': 'https://api.spotify.com/v1/browse/categories/toplists',
  'icons': [{'height': 275,
    'url': 'https://t.scdn.co/media/derived/toplists_11160599e6a04ac5d6f2757f5511778f_0_0_275_275.jpg',
    'width': 275}],
  'id': 'toplists',
  'name': 'Top Lists'},
 {'href': 'https://api.spotify.com/v1/browse/categories/pride',
  'icons': [{'height': None,
    'url': 'https://t.scdn.co/images/90f4c163-46f6-4cda-bd84-e78ff90d4959.jpg',
    'width': None}],
  'id': 'pride',
  'name': 'Pride'}]

In [17]:
# These are nested similar to the playlist items
categories_list = []
for category in spotify.categories(country='US',
                                   limit=50)['categories']['items']:
    categories_list.append(category['id'])

In [18]:
categories_list

['toplists',
 'pride',
 'summer',
 'pop',
 'hiphop',
 'mood',
 'workout',
 'decades',
 'country',
 'focus',
 'latin',
 'chill',
 'edm_dance',
 'rnb',
 'rock',
 'soul',
 'indie_alt',
 'roots',
 'party',
 'sleep',
 'classical',
 'jazz',
 'inspirational',
 'gaming',
 'romance',
 'kpop',
 'popculture',
 'arab',
 'desi',
 'afro',
 'ellen',
 'metal',
 'reggae',
 'blues',
 'punk',
 'funk',
 'dinner',
 'blackhistorymonth',
 'sessions',
 'travel',
 'family',
 'comedy',
 'word',
 'mexican']

In [43]:
for cat in ['family', 'comedy', 'word', 'ellen', 'sleep']:   
    categories_list.remove(cat)
categories_list                      # Think we can pass on these categories for now.

['toplists',
 'pride',
 'summer',
 'pop',
 'hiphop',
 'mood',
 'workout',
 'decades',
 'country',
 'focus',
 'latin',
 'chill',
 'edm_dance',
 'rnb',
 'rock',
 'soul',
 'indie_alt',
 'roots',
 'party',
 'classical',
 'jazz',
 'inspirational',
 'gaming',
 'romance',
 'kpop',
 'popculture',
 'arab',
 'desi',
 'afro',
 'metal',
 'reggae',
 'blues',
 'punk',
 'funk',
 'dinner',
 'blackhistorymonth',
 'sessions',
 'travel',
 'mexican']

In [44]:
big_categories_50 = []
for category in categories_list:
    playlist_list = spotify.category_playlists(category_id=category,
                           limit=50, country='US', offset=0
                           )['playlists']['items']
    n = len(playlist_list)
    if n == 50:                             # If there are 50 playlists it may be only because
                                            # the limit is trimming the result and there are
                                            # actually more that we want to get
        big_categories_50.append(category)
    print(f'{category} has {n} playlists')

toplists has 14 playlists
pride has 24 playlists
summer has 44 playlists
pop has 41 playlists
hiphop has 46 playlists
mood has 50 playlists
workout has 50 playlists
decades has 50 playlists
country has 50 playlists
focus has 41 playlists
latin has 50 playlists
chill has 50 playlists
edm_dance has 49 playlists
rnb has 25 playlists
rock has 50 playlists
soul has 23 playlists
indie_alt has 50 playlists
roots has 50 playlists
party has 42 playlists
classical has 50 playlists
jazz has 39 playlists
inspirational has 50 playlists
gaming has 48 playlists
romance has 50 playlists
kpop has 31 playlists
popculture has 42 playlists
arab has 38 playlists
desi has 50 playlists
afro has 50 playlists
metal has 33 playlists
reggae has 17 playlists
blues has 24 playlists
punk has 27 playlists
funk has 12 playlists
dinner has 19 playlists
blackhistorymonth has 22 playlists
sessions has 10 playlists
travel has 30 playlists
mexican has 40 playlists


In [45]:
big_categories_50

['mood',
 'workout',
 'decades',
 'country',
 'latin',
 'chill',
 'rock',
 'indie_alt',
 'roots',
 'classical',
 'inspirational',
 'romance',
 'desi',
 'afro']

In [46]:
big_categories_100 = []
for category in big_categories_50:
    playlist_list = spotify.category_playlists(category_id=category,
                           limit=50, country='US', offset=50
                           )['playlists']['items']
    n = len(playlist_list)
    if n == 50:
        big_categories_100.append(category)
    print(f'{category} has {n} playlists')

mood has 35 playlists
workout has 16 playlists
decades has 6 playlists
country has 7 playlists
latin has 50 playlists
chill has 48 playlists
rock has 29 playlists
indie_alt has 1 playlists
roots has 50 playlists
classical has 50 playlists
inspirational has 34 playlists
romance has 2 playlists
desi has 13 playlists
afro has 28 playlists


In [47]:
big_categories_150 = []
for category in big_categories_100:
    playlist_list = spotify.category_playlists(category_id=category,
                           limit=50, country='US', offset=100
                           )['playlists']['items']
    n = len(playlist_list)
    if n == 50:
        big_categories_150.append(category)
    print(f'{category} has {n} playlists')

latin has 50 playlists
roots has 7 playlists
classical has 15 playlists


In [48]:
playlist_list = spotify.category_playlists(category_id='latin',
                       limit=50, country='US', offset=150
                       )['playlists']['items']
len(playlist_list)

50

WOW! 200 Playlists for Latin!! 200 will probably be enough. 

Lets start smashing some playlist lists together.  
Thankfully the dictionary class will not add any playlist twice,  
and the update function will allow us to merge new playlists.

In [49]:
master_playlist_dict = {}

In [50]:
for category in categories_list:
    offset = 0
    while True:
        response = spotify.category_playlists(category_id=category,
                                                             offset=offset,
                                                             limit=50, country='US',
                                                             )
        current_names_ids = hb.extract_name_and_id(response)
        if current_names_ids == {}:                     # If we run out of playlists in a category
            break                                       # we move on to the next category
        master_playlist_dict.update(current_names_ids)
        offset += 50

In [51]:
len(master_playlist_dict.keys())

1752

In [52]:
list(master_playlist_dict.items())[0]

('37i9dQZF1DXcBWIGoYBM5M', {'name': "Today's Top Hits", 'owner_id': 'spotify'})

Lets see if we can find artists in a playlist!

In [53]:
playlist_tracks_response = spotify.user_playlist_tracks(user='spotify',
                                             playlist_id='37i9dQZF1DXcBWIGoYBM5M',
                                             offset=0,)
playlist_tracks_response.keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [54]:
type(playlist_tracks_response['items'])

list

In [55]:
n = len(playlist_tracks_response['items'])
print(f'There are {n} songs in this response')

There are 50 songs in this response


In [56]:
first_track = playlist_tracks_response['items'][0]

In [57]:
first_track.keys()

dict_keys(['added_at', 'added_by', 'is_local', 'primary_color', 'track', 'video_thumbnail'])

In [58]:
first_track['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [59]:
first_track['track']['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/7n2wHs1TKAczGzO7Dd2rGr'},
  'href': 'https://api.spotify.com/v1/artists/7n2wHs1TKAczGzO7Dd2rGr',
  'id': '7n2wHs1TKAczGzO7Dd2rGr',
  'name': 'Shawn Mendes',
  'type': 'artist',
  'uri': 'spotify:artist:7n2wHs1TKAczGzO7Dd2rGr'},
 {'external_urls': {'spotify': 'https://open.spotify.com/artist/4nDoRrQiYLoBzwC5BhVJzF'},
  'href': 'https://api.spotify.com/v1/artists/4nDoRrQiYLoBzwC5BhVJzF',
  'id': '4nDoRrQiYLoBzwC5BhVJzF',
  'name': 'Camila Cabello',
  'type': 'artist',
  'uri': 'spotify:artist:4nDoRrQiYLoBzwC5BhVJzF'}]

In [60]:
for artist in first_track['track']['artists']:
    print(artist['name'])

Shawn Mendes
Camila Cabello


Whew. Really nested in there. Lets abstract that to a homebrew function.

In [61]:
playlist_artists = hb.get_artists_in_playlist(playlist_tracks_response)

print(f'{len(playlist_artists)} artists in this playlist')
print(f'Five artists featured in this playlist: {playlist_artists[:5]}')

72 artists in this playlist
Five artists featured in this playlist: ['RANI', 'Tyler, The Creator', 'Miley Cyrus', '21 Savage', 'Tiësto']


In [62]:
list(master_playlist_dict.items())[0]

('37i9dQZF1DXcBWIGoYBM5M', {'name': "Today's Top Hits", 'owner_id': 'spotify'})

In [65]:
with open('playlists_with_artists.json', 'w') as f:
    
    for playlist_id, values in master_playlist_dict.items():
        try:
            owner_id = values['owner_id']
            name = values['name']
            artists_in_playlist = []
            offset = 0
            for _ in range(2):
                    # Just going to look at the first 200 songs in a playlist
                response = spotify.user_playlist_tracks(user=owner_id, playlist_id=playlist_id)
                artists = hb.get_artists_in_playlist(response)
                artists_in_playlist.extend(artists)
                offset += 100
            unique = list(set(artists_in_playlist))
            row = {'playlist_id': playlist_id,
                       'name': name,
                       'artists': unique
                      }
            json.dump(row, f)
            f.write('\n')
        except:
            pass 

In [69]:
playlists_df = pd.read_json('playlists_with_artists.json', lines=True)
playlists_df.head()

Unnamed: 0,artists,name,playlist_id
0,"[RANI, Tyler, The Creator, Miley Cyrus, Tiësto...",Today's Top Hits,37i9dQZF1DXcBWIGoYBM5M
1,"[Bas, Lil Baby, French Montana, NAV, Tyler, Th...",RapCaviar,37i9dQZF1DX0XUsuxWHRQd
2,"[Slipknot, Sam Fender, Bad Religion, Sum 41, A...",Rock This,37i9dQZF1DXcF6B6QPhFDv
3,"[James Hype, Shaun Ross, SACHI, Dimitri Vegas ...",mint,37i9dQZF1DX4dyzvuaRJ0n
4,"[Scotty McCreery, Dylan Scott, Dustin Lynch, D...",Hot Country,37i9dQZF1DX1lVhptIYRda


In [72]:
master_artist_list = []
for artist_list in df.artists:
    for artist in artist_list:
        if artist not in master_artist_list:
            master_artist_list.append(artist)

In [74]:
len(master_artist_list)

43869

In [158]:
artists_df = pd.DataFrame(master_artist_list, columns=['artist'])
artists_df.head()

Unnamed: 0,artist
0,RANI
1,"Tyler, The Creator"
2,Miley Cyrus
3,Tiësto
4,YG


In [159]:
for artist_list, playlist_id in zip(playlists_df.artists, playlists_df.playlist_id):
    if 'RANI' in artist_list:
        print(playlist_id)

37i9dQZF1DXcBWIGoYBM5M
37i9dQZEVXbLiRSasKsNU9
37i9dQZEVXbKuaTI1Z1Afx
37i9dQZF1DX1gRalH1mWrP
37i9dQZF1DWUa8ZRTfalHk
37i9dQZF1DX3rxVfibe1L0
37i9dQZF1DWUSyphfcc6aL
37i9dQZF1DX8tZsk68tuDw
37i9dQZF1DXdIpacQDPDV5
3CVqLcJ1CeZ4Ejp22yU1iG


In [160]:
hb.get_playlists_w_artist(artist='RANI', df=playlists_df)

['37i9dQZF1DXcBWIGoYBM5M',
 '37i9dQZEVXbLiRSasKsNU9',
 '37i9dQZEVXbKuaTI1Z1Afx',
 '37i9dQZF1DX1gRalH1mWrP',
 '37i9dQZF1DWUa8ZRTfalHk',
 '37i9dQZF1DX3rxVfibe1L0',
 '37i9dQZF1DWUSyphfcc6aL',
 '37i9dQZF1DX8tZsk68tuDw',
 '37i9dQZF1DXdIpacQDPDV5',
 '3CVqLcJ1CeZ4Ejp22yU1iG']

In [166]:
# for artist in artists_df['artist']:
#     playlist_list = hb.get_playlists_w_artist(artist=artist,
#                                               df=playlists_df)
#     for playlist in playlist_list:
#         artists_df.loc[artists_df['artist']==artist, playlist] = 1
    

In [167]:
artists_df.head()

Unnamed: 0,artist,37i9dQZF1DXcBWIGoYBM5M,37i9dQZEVXbLiRSasKsNU9,37i9dQZEVXbKuaTI1Z1Afx,37i9dQZF1DX1gRalH1mWrP,37i9dQZF1DWUa8ZRTfalHk,37i9dQZF1DX3rxVfibe1L0,37i9dQZF1DWUSyphfcc6aL,37i9dQZF1DX8tZsk68tuDw,37i9dQZF1DXdIpacQDPDV5,...,37i9dQZF1DX37bXS7EGI3f,37i9dQZF1DWVw3oyaj4jsN,37i9dQZF1DXdTb8AG95jne,37i9dQZF1DX1cJWWyylDuw,37i9dQZF1DX5dWVbnmtiDv,37i9dQZF1DXaGNG7NmtmZv,37i9dQZF1DWXhcuQw7KIeM,37i9dQZF1DWY3X53lmPYk9,37i9dQZF1DWY7TEf86LRjN,37i9dQZF1DWVr1Mv0HkuSR
0,RANI,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,"Tyler, The Creator",1.0,,1.0,,1.0,,,,,...,,,,,,,,,,
2,Miley Cyrus,1.0,1.0,1.0,1.0,1.0,1.0,,,,...,,,,,,,,,,
3,Tiësto,1.0,,,1.0,1.0,1.0,1.0,,1.0,...,,,,,,,,,,
4,YG,1.0,1.0,1.0,,1.0,,,,,...,,,,,,,,,,


In [176]:
artists_shard = master_artist_list[:2000:250]

In [177]:
artists_shard

['RANI',
 'Matt Nash',
 'Jorge & Mateus',
 'Annie Lennox',
 'WNNR',
 'Lauren Bousfield',
 'Fredrik Ullen',
 'Sonny & Cher']

In [243]:
toy = playlists_df.head().copy()
toy

Unnamed: 0,artists,name,playlist_id
0,"[RANI, Tyler, The Creator, Miley Cyrus, Tiësto...",Today's Top Hits,37i9dQZF1DXcBWIGoYBM5M
1,"[Bas, Lil Baby, French Montana, NAV, Tyler, Th...",RapCaviar,37i9dQZF1DX0XUsuxWHRQd
2,"[Slipknot, Sam Fender, Bad Religion, Sum 41, A...",Rock This,37i9dQZF1DXcF6B6QPhFDv
3,"[James Hype, Shaun Ross, SACHI, Dimitri Vegas ...",mint,37i9dQZF1DX4dyzvuaRJ0n
4,"[Scotty McCreery, Dylan Scott, Dustin Lynch, D...",Hot Country,37i9dQZF1DX1lVhptIYRda


In [220]:
taco = pd.DataFrame(toy['artists'].tolist(),
             index=toy.index).stack()

In [225]:
taco = taco.reset_index().drop('level_1', axis=1)

In [227]:
taco.columns = ['playlist_index', 'artists']
taco.head()

Unnamed: 0,playlist_index,artists
0,0,RANI
1,0,"Tyler, The Creator"
2,0,Miley Cyrus
3,0,Tiësto
4,0,YG


* join `toy` onto `taco` via index
* drop index in favor of `playlist_id`
* OHE `playlist_id`

In [229]:
taco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Data columns (total 2 columns):
playlist_index    361 non-null int64
artists           361 non-null object
dtypes: int64(1), object(1)
memory usage: 5.7+ KB


In [232]:
encoder = OneHotEncoder(drop='first', categories='auto')
encoder.fit(taco[['artists']])
ohe = pd.DataFrame(encoder.transform(taco[['artists']]).toarray(),
                   columns=encoder.get_feature_names(['artist']))
ohe.head()

Unnamed: 0,artist_21 Savage,artist_3LAU,artist_5 Seconds of Summer,artist_6ix9ine,artist_A Boogie Wit da Hoodie,artist_A R I Z O N A,artist_A Touch Of Class,artist_A$AP Ferg,artist_A$AP Rocky,artist_ARTY,...,artist_Young Nudy,artist_Young Thug,artist_YoungBoy Never Broke Again,artist_Zac Brown Band,artist_Zero,artist_blink-182,artist_grandson,artist_josh pan,artist_ricky retro,artist_twoloud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [234]:
taco_ohe = pd.concat([taco.drop('artists', axis=1), ohe], axis=1)
taco_ohe.head()

Unnamed: 0,playlist_index,artist_21 Savage,artist_3LAU,artist_5 Seconds of Summer,artist_6ix9ine,artist_A Boogie Wit da Hoodie,artist_A R I Z O N A,artist_A Touch Of Class,artist_A$AP Ferg,artist_A$AP Rocky,...,artist_Young Nudy,artist_Young Thug,artist_YoungBoy Never Broke Again,artist_Zac Brown Band,artist_Zero,artist_blink-182,artist_grandson,artist_josh pan,artist_ricky retro,artist_twoloud
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [236]:
taco_ohe.shape

(361, 333)

In [237]:
toy.shape

(5, 3)

In [238]:
taco_ohe_dedup = taco_ohe.groupby('playlist_index')[ohe.columns].agg('sum').reset_index()

In [239]:
taco_ohe_dedup.head()

Unnamed: 0,playlist_index,artist_21 Savage,artist_3LAU,artist_5 Seconds of Summer,artist_6ix9ine,artist_A Boogie Wit da Hoodie,artist_A R I Z O N A,artist_A Touch Of Class,artist_A$AP Ferg,artist_A$AP Rocky,...,artist_Young Nudy,artist_Young Thug,artist_YoungBoy Never Broke Again,artist_Zac Brown Band,artist_Zero,artist_blink-182,artist_grandson,artist_josh pan,artist_ricky retro,artist_twoloud
0,0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [240]:
output = pd.merge(toy, taco_ohe_dedup, left_index=True,
                  right_on='playlist_index', how='left'
                 ).drop('playlist_index', axis=1)

In [244]:
output

Unnamed: 0,artists,name,playlist_id,artist_21 Savage,artist_3LAU,artist_5 Seconds of Summer,artist_6ix9ine,artist_A Boogie Wit da Hoodie,artist_A R I Z O N A,artist_A Touch Of Class,...,artist_Young Nudy,artist_Young Thug,artist_YoungBoy Never Broke Again,artist_Zac Brown Band,artist_Zero,artist_blink-182,artist_grandson,artist_josh pan,artist_ricky retro,artist_twoloud
0,"[RANI, Tyler, The Creator, Miley Cyrus, Tiësto...",Today's Top Hits,37i9dQZF1DXcBWIGoYBM5M,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[Bas, Lil Baby, French Montana, NAV, Tyler, Th...",RapCaviar,37i9dQZF1DX0XUsuxWHRQd,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[Slipknot, Sam Fender, Bad Religion, Sum 41, A...",Rock This,37i9dQZF1DXcF6B6QPhFDv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3,"[James Hype, Shaun Ross, SACHI, Dimitri Vegas ...",mint,37i9dQZF1DX4dyzvuaRJ0n,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,"[Scotty McCreery, Dylan Scott, Dustin Lynch, D...",Hot Country,37i9dQZF1DX1lVhptIYRda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
