In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import json
import time
import homebrew as hb
%load_ext autoreload
%autoreload 2

In [2]:
with open('/Users/patrickfuller/.secrets/spotify_api.json') as f:
    creds = json.load(f)
client_id = creds['client_id']
key = creds['api_key']

In [3]:
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=key)

spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [9]:
cat_list = []
response = spotify.categories(country='US', limit=50)
items = response['categories']['items']
for item in items:
    category = item['id']
    cat_list.append(category)

In [10]:
for cat in ['family', 'comedy', 'word', 'ellen', 'sleep']:   
    cat_list.remove(cat)

In [12]:
master_playlist_dict = {}
for category in cat_list:
    offset = 0
    while True:
        response = spotify.category_playlists(category_id=category,
                                                             offset=offset,
                                                             limit=50, country='US',
                                                             )
        current_names_ids = hb.extract_name_and_id(response)
        if current_names_ids == {}:                     # If we run out of playlists in a category
            break                                       # we move on to the next category
        master_playlist_dict.update(current_names_ids)
        offset += 50

In [25]:
with open('playlists_artists_redundant.json', 'w') as f:
    for pl_id, pl_data in list(master_playlist_dict.items()):
        pl_artists = []
        offset = 0
        for _ in range(2):
            try:
                response = spotify.user_playlist_tracks(user=pl_data['owner_id'],
                                             playlist_id=pl_id,
                                             limit=100)
                current_artists = hb.get_artists_in_playlist(response)
                pl_artists.extend(current_artists)
                unique = set(pl_artists)
                for artist in unique:
                    line = {
                        'artist':(artist),
                        'playlist':(pl_data['name'], pl_id),
                    }
                    json.dump(line, f)
                    f.write('\n')
            except:
                pass

In [61]:
df = pd.read_json('playlists_artists_redundant.json', lines=True)

In [62]:
display(df.shape)
df.head()

(218706, 2)

Unnamed: 0,artist,playlist
0,DJ Snake,"[Today's Top Hits, 37i9dQZF1DXcBWIGoYBM5M]"
1,TINI,"[Today's Top Hits, 37i9dQZF1DXcBWIGoYBM5M]"
2,Bon Iver,"[Today's Top Hits, 37i9dQZF1DXcBWIGoYBM5M]"
3,Jon Z,"[Today's Top Hits, 37i9dQZF1DXcBWIGoYBM5M]"
4,Martin Garrix,"[Today's Top Hits, 37i9dQZF1DXcBWIGoYBM5M]"


In [63]:
len(df.artist.unique())

43786

In [64]:
df.artist.value_counts()[:20]  # These artists are showing up in the most playlists

J Balvin          248
Bad Bunny         214
Drake             202
Cardi B           186
Daddy Yankee      182
Farruko           180
Nicky Jam         178
Ozuna             176
Maluma            174
Khalid            172
Ed Sheeran        170
Nicki Minaj       165
Camila Cabello    150
Justin Bieber     146
Marshmello        146
Diplo             144
Beyoncé           143
Ty Dolla $ign     138
Anuel Aa          136
Shawn Mendes      132
Name: artist, dtype: int64

In [65]:
df.playlist = df.playlist.apply(lambda x: x[1]) # OneHot doesn't like lists or sets

In [66]:
df.head()

Unnamed: 0,artist,playlist
0,DJ Snake,37i9dQZF1DXcBWIGoYBM5M
1,TINI,37i9dQZF1DXcBWIGoYBM5M
2,Bon Iver,37i9dQZF1DXcBWIGoYBM5M
3,Jon Z,37i9dQZF1DXcBWIGoYBM5M
4,Martin Garrix,37i9dQZF1DXcBWIGoYBM5M


In [67]:
encoder = OneHotEncoder(drop='first', categories='auto')
encoder.fit(df[['playlist']])

ohe_playlists= pd.DataFrame(encoder.transform(df[['playlist']]).toarray(),
                          columns=encoder.get_feature_names(['playlist']))

In [68]:
artists_ohe_playlists = pd.concat([df.drop('playlist', axis=1),
                                   ohe_playlists],
                                   axis=1)

In [69]:
artists_ohe_playlists.shape

(218706, 1749)

In [70]:
artist_w_flagged_playlists = artists_ohe_playlists.groupby('artist').sum()

In [71]:
artist_w_flagged_playlists.shape

(43786, 1748)

In [72]:
artist_w_flagged_playlists.head(20)

Unnamed: 0_level_0,playlist_08CJ6fWrTA1vMfwVwAxWUl,playlist_0Gy1TwCxPFTMCiwxGLVkc1,playlist_0T22esVKW7cTZEN7XzXV14,playlist_0YMVY3uhQNd7WUf0hSByrX,playlist_0ZWYUjUMhaq0Rammqq3qu1,playlist_0bx5Dgu5yOEhST49LP67su,playlist_0fmLn46c92nP4dePQSZlsz,playlist_0k7YwkrfJ2BsRMFBP5W5xK,playlist_0yfTPCk6Cnhm9wtoEaXXlo,playlist_13thjkLTYZmZvjdz4u6kxh,...,playlist_73gXbLcjix7p1pPsQClusE,playlist_7A2YimOfIrmAWkCeSIY8Rq,playlist_7EnyqmmhFgfhicejZiVV13,playlist_7EpJpnU6KrSc430F8B5tM7,playlist_7FDyC1JdV3jkGzAE9BONKt,playlist_7HQu1GUDVSx64GdCpaB88I,playlist_7JXJKP3xMvIMbEBBGApTPQ,playlist_7b9DiMN2kttG8lYQogfQBw,playlist_7fzFQUIVM4TumzRWo5UFP2,playlist_7kdOsNnHtzwncTBnI3J17w
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""CHEN",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Cats"" 1981 Original London Cast",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Mendelssohn-Duo""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Mista Dj Paul",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#TocoParaVos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$NOT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$Pacely,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$tupid Young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$uicideBoy$,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
