# SpotiFinder


---

### Spotify request code

In [15]:
import sys
import json
import timeit
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv


class SpotiFinder():
    '''docstring'''
    load_dotenv()
    
    
    def __init__(self):
        self.sp       = None
        self.token    = None
        self.username = None
        
        
        if len(sys.argv) > 1:
            self.username = sys.argv[1]
        else:
            print('Whoops, need your username!')
            print('Usage: python SpotiFinder.py [username]')
            sys.exit()
        self.token = util.prompt_for_user_token(self.username)
        
    def get_saved_tracks(self):
        '''Grabs a list of all a user's saved tracks ids'''
        
        
        scope = 'user-library-read'

        if self.token:
            sp           = spotipy.Spotify(auth = self.token)
            saved_tracks = self.sp.current_user_saved_tracks()
            
            track_id_list = []
            for iterator in range(0, len(saved_tracks)):
                track_id_list.append(saved_tracks['items'][iterator]['track']['id'])
            
            return track_id_list
        else:
            print('Can\'t get token for', self.username)
    
    def song_features(self, track_id_list):
        '''Takes in a list of track ids and returns a feature list for use with our nearest neighbors model.'''
        
        
        if self.token:
            sp           = spotipy.Spotify(auth = self.token)
            
            start        = timeit.default_timer()
            rows         = []
            batchsize    = 100
            None_counter = 0

            for iterator in range(0, len(track_id_list), batchsize):

                batch           = track_id_list[iterator: iterator + batchsize]
                feature_results = self.sp.audio_features(batch)

                for i, features in enumerate(feature_results):
                    if features == None:
                        None_counter = None_counter + 1
                    else:
                        rows.append(features)

            print('Number of tracks where no audio features were available:', None_counter)
            stop = timeit.default_timer()
            print('Time to run this code (in seconds):', stop - start)
            return rows
        else:
                print('Can\'t get token for', self.username)

    def get_saved_playlists(self):
        '''Grabs a list of all a user's saved playlists'''
        
        if self.token:
            sp        = spotipy.Spotify(auth = self.token)
            playlists = self.sp.user_playlists(self.username)

            for playlist in playlists['items']:
                if playlist['owner']['id'] == self.username:
                    print()
                    print(playlist['name'])
                    print ('  total tracks', playlist['tracks']['total'])
                    results = sp.playlist(playlist['id'], fields = "tracks,next")
                    tracks  = results['tracks']
                    show_tracks(tracks)
                    while tracks['next']:
                        tracks = sp.next(tracks)
                        show_tracks(tracks)
        else:
            print("Can't get token for", self.username)
            
spot = SpotiFinder()

In [23]:
spot.get_saved_playlists()


AttributeError: 'NoneType' object has no attribute 'user_playlists'

In [24]:
tracks = spot.get_saved_tracks()
tracks

AttributeError: 'NoneType' object has no attribute 'current_user_saved_tracks'

In [9]:
new_tracks = spot.song_features(tracks)
new_tracks[0]

Number of tracks where no audio features were available: 0
Time to run this code (in seconds): 0.11849349999999959


{'danceability': 0.284,
 'energy': 0.722,
 'key': 7,
 'loudness': -8.284,
 'mode': 1,
 'speechiness': 0.0559,
 'acousticness': 0.00461,
 'instrumentalness': 0.873,
 'liveness': 0.163,
 'valence': 0.0383,
 'tempo': 116.292,
 'type': 'audio_features',
 'id': '4COR2ZPEyUn0lsbAouRWxA',
 'uri': 'spotify:track:4COR2ZPEyUn0lsbAouRWxA',
 'track_href': 'https://api.spotify.com/v1/tracks/4COR2ZPEyUn0lsbAouRWxA',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4COR2ZPEyUn0lsbAouRWxA',
 'duration_ms': 506763,
 'time_signature': 3}

In [69]:
# Creates a json file of tracks
with open('tracks.json', 'w') as outfile:
    json.dump(tracks['items'][0]['track'], outfile)

In [2]:
#     if token:
#         sp        = spotipy.Spotify(auth = token)
#         playlists = sp.user_playlists(username)
        
#         for playlist in playlists['items']:
#             if playlist['owner']['id'] == username:
#                 print()
#                 print(playlist['name'])
#                 print ('  total tracks', playlist['tracks']['total'])
#                 results = sp.playlist(playlist['id'], fields = "tracks,next")
#                 tracks  = results['tracks']
#                 show_tracks(tracks)
#                 while tracks['next']:
#                     tracks = sp.next(tracks)
#                     show_tracks(tracks)
#     else:
#         print("Can't get token for", username)

In [14]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb


list_of_results      = data[0]["items"]
list_of_artist_names = []
list_of_artist_uri   = []
list_of_song_names   = []
list_of_song_uri     = []
list_of_durations_ms = []
list_of_explicit     = []
list_of_albums       = []
list_of_popularity   = []

for result in list_of_results:
    result["album"]
    this_artists_name = result["artists"][0]["name"]
    list_of_artist_names.append(this_artists_name)
    this_artists_uri  = result["artists"][0]["uri"]
    list_of_artist_uri.append(this_artists_uri)
    list_of_songs     = result["name"]
    list_of_song_names.append(list_of_songs)
    song_uri          = result["uri"]
    list_of_song_uri.append(song_uri)
    list_of_duration  = result["duration_ms"]
    list_of_durations_ms.append(list_of_duration)
    song_explicit     = result["explicit"]
    list_of_explicit.append(song_explicit)
    this_album        = result["album"]["name"]
    list_of_albums.append(this_album)
    song_popularity   = result["popularity"]
    list_of_popularity.append(song_popularity)

NameError: name 'data' is not defined

In [None]:
all_songs = pd.DataFrame(
    {'artist': list_of_artist_names,
     'artist_uri': list_of_artist_uri,
     'song': list_of_song_names,
     'song_uri': list_of_song_uri,
     'duration_ms': list_of_durations_ms,
     'explicit': list_of_explicit,
     'album': list_of_albums,
     'popularity': list_of_popularity
     
    })

all_songs_saved = all_songs.to_csv('top50_songs.csv')

In [8]:
client_credentials_manager = SpotifyClientCredentials(client_id     = Client
                                                     ,client_secret = Secret
                                                     )
sp                         = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [24]:
# timeit library to measure the time needed to run this code
import timeit
start = timeit.default_timer()

# create empty lists where the results are going to be stored
artist_name = []
track_name  = []
popularity  = []
track_id    = []

for i in range(0, 10000, 50):
    track_results = sp.search(q      = 'year:2018'
                             ,type   = 'track'
                             ,limit  = 50
                             ,offset = i
                             )
    for i, t in enumerate(track_results['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
      

stop = timeit.default_timer()
print ('Time to run this code (in seconds):', stop - start)

NameError: name 'sp' is not defined

### Song features

In [25]:
import timeit


start        = timeit.default_timer()
rows         = []
batchsize    = 100
None_counter = 0

df_tracks    = pd.DataFrame({'artist_name': artist_name
                            ,'track_name' : track_name
                            ,'track_id'   : track_id
                            ,'popularity' : popularity
                            }
                           )

for iterator in range(0, len(df_tracks['track_id']), batchsize):
    
    batch           = df_tracks['track_id'][iterator: iterator + batchsize]
    feature_results = sp.audio_features(batch)
    
    for i, features in enumerate(feature_results):
        if features == None:
            None_counter = None_counter + 1
        else:
            rows.append(features)
            
print('Number of tracks where no audio features were available:', None_counter)

stop = timeit.default_timer()
print ('Time to run this code (in seconds):', stop - start)

Number of tracks where no audio features were available: 0
Time to run this code (in seconds): 0.002217700000073819


In [26]:
rows[0]

IndexError: list index out of range

## Dataframe stuff

In [19]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split


df      = pd.read_csv('http://www.zernach.com/wp-content/uploads/2020/02/SpotifyAudioFeaturesApril2019.csv')
# profile = ProfileReport(df
#                        ,title = 'Pandas Profiling Report'
# #                        ,html  = {'style':{'full_width':True}}
#                        )

target   = 'track_id'
features = ['acousticness'
           ,'danceability'
           ,'energy'
           ,'instrumentalness'
           ,'key'
           ,'liveness'
           ,'loudness'
           ,'mode'
           ,'speechiness'
           ,'tempo'
           ,'valence'
           ]

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X
                                                   ,y
                                                   ,train_size   = 0.8
                                                   ,random_state = 6
                                                   )
X_train.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
79844,0.0128,0.577,0.809,2e-06,0,0.308,-3.737,0,0.0811,172.475,0.783
71346,0.44,0.741,0.548,0.0,11,0.188,-9.122,0,0.461,174.997,0.519
69319,0.00179,0.387,0.983,0.641,7,0.133,-1.295,1,0.639,171.781,0.0876
50534,0.99,0.375,0.0211,0.904,0,0.0694,-27.907,0,0.0532,86.271,0.0443
117154,0.995,0.344,0.00367,0.903,6,0.0654,-35.48,1,0.048,102.868,0.137


### Nearest Neighbors

In [67]:
# Load
import joblib


filename = 'Nearest_Neighbors.sav'
nn       = joblib.load(filename)
print('Model Loaded.')

Model Loaded.


In [16]:
from sklearn.neighbors import NearestNeighbors


# Fit on DTM
nearest = 10

nn = NearestNeighbors(n_neighbors = nearest
                     ,algorithm   = 'kd_tree'
                     )
nn.fit(X_train, y_train)
nn.kneighbors(X_test)

(array([[0.35943454, 0.3912381 , 0.49772777, ..., 0.65004361, 0.71786337,
         0.75240598],
        [1.00081604, 1.00939867, 1.21788131, ..., 1.48020205, 1.48630182,
         1.50388653],
        [0.60524097, 0.72794451, 1.00118408, ..., 1.11901486, 1.13712675,
         1.16546482],
        ...,
        [0.56813504, 0.59744219, 0.70007179, ..., 1.08162478, 1.10318403,
         1.12194971],
        [1.08791094, 1.17370865, 1.20648166, ..., 1.43699797, 1.44623926,
         1.48630686],
        [1.03551939, 1.14338649, 1.15505548, ..., 1.36791522, 1.40147052,
         1.41089263]]),
 array([[ 42642, 101183,  63650, ...,  78442,  63162,  46320],
        [ 83274,  19647,  31075, ...,  75559,  12263,  81370],
        [ 74596,  84621,  45780, ...,  57128,   2408,  39370],
        ...,
        [ 21923,  24417,  52868, ..., 103787,  43270,   9099],
        [ 20873,  30739,  80960, ...,  74010,  74407,  88279],
        [ 39042,  95439,  25256, ...,  37333,  25249,  47876]], dtype=int64))

In [68]:
df['track_id']

0         2RM4jf1Xa9zPgMGRDiht8O
1         1tHDG53xJNGsItRA3vfVgs
2         6Wosx2euFPMT14UXiWudMy
3         3J2Jpw61sO7l6Hc7qdYV91
4         2jbYvQCyPgX3CdmAzeVeuS
                   ...          
130658    0cvfSKcm9VeduwyYPrxtLx
130659    43MP9F7UzvfilSrw2SqZGJ
130660    4TWlUuFk81NGUNKwndyS5Q
130661    5iGBXzOoRo4sBTy8wdzMyK
130662    7LNtyuekYHiZ99UxkrfCQR
Name: track_id, Length: 130663, dtype: object

In [69]:
track_list = ['2RM4jf1Xa9zPgMGRDiht8O']

In [70]:
df['track_id'].isin(track_list)

0          True
1         False
2         False
3         False
4         False
          ...  
130658    False
130659    False
130660    False
130661    False
130662    False
Name: track_id, Length: 130663, dtype: bool

In [71]:
df[df['track_id'].isin(track_list)]

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1.0,0.0812,-7.678,1.0,0.409,203.927,4,0.118,15


In [72]:
df[df['track_id'].isin(track_list)][features]

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,0.00582,0.743,0.339,0.0,1.0,0.0812,-7.678,1.0,0.409,203.927,0.118


In [46]:
df[df['track_id'].isin(['2RM4jf1Xa9zPgMGRDiht8O'])][features].dtypes

acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
speechiness         float64
tempo               float64
valence             float64
dtype: object

In [37]:
df[df['track_id'].isin(['2RM4jf1Xa9zPgMGRDiht8O'])][features]

0    1.0
Name: mode, dtype: float64

In [63]:
track

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
130613,0.498,0.676,0.844,0.0,7.0,0.866,-3.396,1.0,0.109,159.969,0.716
130614,0.023,0.544,0.573,0.0,1.0,0.103,-6.936,1.0,0.0297,94.143,0.349
130615,0.0648,0.578,0.91,0.0,2.0,0.311,-2.307,1.0,0.0637,133.055,0.823
130616,0.444,0.624,0.291,0.877,11.0,0.239,-13.459,0.0,0.0996,194.011,0.251
130617,0.98,0.298,0.0462,0.941,8.0,0.125,-28.427,1.0,0.04,61.299,0.28
130618,0.0459,0.846,0.692,0.0,9.0,0.127,-3.303,1.0,0.0623,123.95,0.228
130619,0.508,0.76,0.559,0.0,11.0,0.367,-7.274,0.0,0.499,74.954,0.48
130620,0.605,0.605,0.598,0.00999,3.0,0.157,-9.263,1.0,0.0323,105.945,0.753
130621,0.0238,0.658,0.913,0.623,11.0,0.0747,-4.89,0.0,0.0509,123.998,0.711
130622,0.691,0.494,0.344,0.112,2.0,0.126,-9.379,1.0,0.0532,113.822,0.329


In [73]:
import json


track_list = df['track_id'].tail(50)

# Get track features
# This can be replaced with a spotipy request or a specific track from a user as long as it contains the features.
track = df[df['track_id'].isin(track_list)][features]

# This returns a dataframe of the top 10 most likely songs to recommend.
predictions      = nn.kneighbors(track)[1][0]
df_top_similar   = df.iloc[predictions]

# This converts our dataframe to json
json_top_similar = json.loads(df_top_similar.to_json())


# To save as a json file
# df_top_similar.to_json(r'./predict.json')

In [75]:
predictions

array([ 1987, 98359, 79094, 21104, 59537, 10281,  8241, 15964,  2249,
       97195], dtype=int64)

In [74]:
df_top_similar

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
1987,El Komander,7dXwmw2CI19UrzGPhzhJZ2,La Botella,0.629,0.726,163505,0.685,1.3e-05,2.0,0.0461,-4.965,1.0,0.0617,145.016,4,0.968,27
98359,Shridhar V. Sambaram,5QYK1uAzcuyu92S5zBSIVW,Megha Megha Karaoke,0.296,0.647,271957,0.484,0.808,9.0,0.102,-11.538,1.0,0.0329,157.022,5,0.118,0
79094,C.O.Z.,2f89aCFZWHL52UlCzUeqDz,On Fire - Original Mix,0.000294,0.889,391000,0.897,0.81,4.0,0.113,-7.696,0.0,0.099,124.001,4,0.839,11
21104,Cody Ray,5MRZ1YQtINE1tanEKAaXOW,Growing Up Too Fast,0.444,0.792,230400,0.624,2e-06,4.0,0.0776,-11.516,1.0,0.226,100.038,4,0.705,24
59537,Slow Dancing Society,05nOXGoDSDVojMSotVOSjM,East Sprague,0.168,0.679,167173,0.397,0.944,11.0,0.105,-15.566,0.0,0.0286,119.977,4,0.165,1
10281,Aaron Camper,56TNQyAhvug0ZWVNB4FV7e,Commandments,0.339,0.52,234067,0.597,0.0,8.0,0.192,-8.783,0.0,0.0884,80.919,4,0.667,19
8241,Valentino Khan,0mrCPHs4Fmr2IlbjiVVN06,Gold - Remix,0.00106,0.755,215094,0.833,2e-06,9.0,0.616,-3.056,1.0,0.0631,106.056,4,0.198,40
15964,Ginny Vee,29jy786zW9SfzX5PPjB6MX,Love strong - Manovski & Alan Aguero Edit,0.0693,0.604,191739,0.823,0.0,2.0,0.0532,-3.145,0.0,0.0461,184.005,4,0.556,13
2249,John Prine,1LMWAiVZFfih1JRwZ42pc9,Forbidden Jimmy,0.525,0.645,172280,0.618,1e-06,7.0,0.134,-13.81,1.0,0.0492,91.672,4,0.949,6
97195,Seven Lions,41Umx93nsocrD8ckKBSuno,Start Again (feat. Fiora),0.119,0.379,334707,0.754,1.8e-05,5.0,0.222,-5.433,1.0,0.0352,149.882,4,0.0895,57


In [2]:
spot   = SpotiFinder()

In [106]:
spot   = SpotiFinder()
tracks = spot.get_saved_tracks()
tracks

['4COR2ZPEyUn0lsbAouRWxA',
 '0XhcbuMPBRRcaqRiHMrnKl',
 '5MAPt0beSbEQdPM5SfJwyl',
 '5LfS278Zsmem9pmxmBXgix',
 '5ZGtjzyg3F6XftEyllHDnC',
 '6gHsenqqwzj67gp1OzCIi2',
 '3KtFA8wHjyg44eVtiI4QqE']

In [5]:
tracky = '0XhcbuMPBRRcaqRiHMrnKl'

In [6]:
spot.song_features(tracky)

Number of tracks where no audio features were available: 0
Time to run this code (in seconds): 0.11925030000000447


[{'danceability': 0.544,
  'energy': 0.912,
  'key': 1,
  'loudness': -5.515,
  'mode': 1,
  'speechiness': 0.0743,
  'acousticness': 0.00757,
  'instrumentalness': 5.64e-05,
  'liveness': 0.0946,
  'valence': 0.794,
  'tempo': 157.603,
  'type': 'audio_features',
  'id': '0XhcbuMPBRRcaqRiHMrnKl',
  'uri': 'spotify:track:0XhcbuMPBRRcaqRiHMrnKl',
  'track_href': 'https://api.spotify.com/v1/tracks/0XhcbuMPBRRcaqRiHMrnKl',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0XhcbuMPBRRcaqRiHMrnKl',
  'duration_ms': 244253,
  'time_signature': 4}]

In [20]:
# Save
import joblib


filename = 'Nearest_Neighbors.sav'
joblib.dump(nn, filename)
print('Model saved!')

Model saved!


---

### Neural Net

In [None]:
model.load('Sequential_model.h5')
print('Saved model!')

In [None]:
import pandas as pd
import numpy
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import wandb
from wandb.keras import WandbCallback
from category_encoders import TargetEncoder
from category_encoders import OrdinalEncoder
from tensorflow import keras


seed = 7
numpy.random.seed(seed)

# Important Hyperparameters
batch_size = 100
epochs     = 1
optimizer  = 'adam'

print('Building model...')
model = Sequential()

# Input
model.add(Dense(13, input_dim = 13, activation = 'sigmoid'))

# Hidden
model.add(Dense(28   ,activation = 'sigmoid'))
model.add(Dense(56   ,activation = 'sigmoid'))

# Output
model.add(Dense(130663, activation = 'sigmoid'))
print('Model set.')

In [None]:
#Compile
print('Compiling...')
model.compile(loss      = 'sparse_categorical_crossentropy'
             ,optimizer = optimizer
             ,metrics   = ['accuracy']
             )

print('Finished.')

In [None]:
model.save('Sequential_model.h5')
print('Saved model!')

In [None]:
from tensorflow.keras.models import load_model


model = load_model('Sequential_model.h5')

model.summary()


score = model.evaluate(X_test, y_test, verbose = 0)
print(f'{model.metrics_names[1]}: {score[1] * 100:.2f}')

In [None]:
model.predict(X_test)

In [49]:
print('Building hist...')
history = model.fit(X_train, y_train
#                    ,validation_data = (X_test, y_test)
                   ,epochs          = epochs
                   ,batch_size      = batch_size
                   ,verbose         = False
                   )
print('Hist complete.')

model.summary()

Building hist...
Hist complete.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 13)                182       
_________________________________________________________________
dense_9 (Dense)              (None, 28)                392       
_________________________________________________________________
dense_10 (Dense)             (None, 56)                1624      
_________________________________________________________________
dense_11 (Dense)             (None, 130663)            7447791   
Total params: 7,449,989
Trainable params: 7,449,989
Non-trainable params: 0
_________________________________________________________________


---

In [None]:
import pandas as pd


df = pd.read_csv('http://www.zernach.com/wp-content/uploads/2020/02/SpotifyAudioFeaturesApril2019.csv')

results = {}


for idx, row in df.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], df['id'][i]) for i in similar_indices]

    results[row['id']] = similar_items[1:]
    
print('done!')

In [None]:
def item(id):
    return ds.loc[ds['id'] == id]['description'].tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

recommend(item_id = 11, num = 5)