In [1]:
import logging
import os

import numpy as np
import pandas as pd
import seaborn as sns
import spotipy
import spotipy.util as util
from sklearn.cluster import KMeans
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

pd.set_option('display.max_rows', 1000)

In [2]:
### Set up API auth
# Credentials are read from the environment so that secrets never live in the
# notebook (or its git history). Export SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before launching Jupyter; a missing variable raises
# KeyError here instead of failing later with an opaque auth error.
# NOTE: the client secret that was previously hardcoded in this cell should be
# rotated in the Spotify developer dashboard.
cid = os.environ['SPOTIPY_CLIENT_ID']  # client ID
secret = os.environ['SPOTIPY_CLIENT_SECRET']  # client secret
user = 'gza59mo1lxjaokuzj5752p5m9'
scope = "user-library-read"
sp = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=cid,
        client_secret=secret,
        scope=scope,
        redirect_uri='http://localhost:8000',
    )
)

In [None]:
# Get songs from playlist URI and user name
def get_songs_from_playlistURL(plURL, user, sp):
    """Return the id and name of every track in a playlist.

    Parameters
    ----------
    plURL : str
        Playlist URL/URI/ID, passed straight to ``sp.user_playlist``.
    user : str
        Spotify user name, passed straight to ``sp.user_playlist``.
    sp : object
        Client exposing ``user_playlist(user, playlist)`` and ``next(page)``
        (e.g. a ``spotipy.Spotify`` instance).

    Returns
    -------
    pandas.DataFrame
        Columns ``'id'`` and ``'names'``, one row per track, in playlist order
        with a default 0..n-1 index.
    """
    track_ids = []
    track_names = []

    # The first call only carries the first page of items, so keep following
    # the paging links; otherwise long playlists are silently truncated.
    page = sp.user_playlist(user, plURL)['tracks']
    while page:
        for item in page['items']:
            track = item.get('track')
            # Skip entries without a usable track id (e.g. unavailable or
            # local tracks) - they cannot be looked up for audio features.
            if not track or track.get('id') is None:
                continue
            track_ids.append(track['id'])
            track_names.append(track['name'])
        page = sp.next(page) if page.get('next') else None

    return pd.DataFrame({'id': track_ids, 'names': track_names})
# Get song features
def get_song_features(all_songs, sp, batch_size=100):
    """Fetch audio features for every track in ``all_songs``.

    Parameters
    ----------
    all_songs : pandas.DataFrame
        Must contain an ``'id'`` column of track ids. Its index is not used,
        so it does not need to be 0..n-1.
    sp : object
        Client exposing ``audio_features(tracks)`` (e.g. ``spotipy.Spotify``).
    batch_size : int, default 100
        Number of ids sent per ``audio_features`` call. 100 is the maximum the
        Spotify endpoint accepts per request.

    Returns
    -------
    pandas.DataFrame
        Indexed by track ``'id'``, with the non-feature columns listed in
        ``unused_columns`` removed. Tracks for which the API returns no
        features are omitted. Empty input yields an empty frame.
    """
    unused_columns = ['key', 'type', 'mode', 'uri', 'track_href',
                      'analysis_url', 'time_signature', 'duration_ms']

    track_ids = all_songs['id'].dropna().tolist()

    # Request features in batches and build the frame once at the end,
    # instead of one request and one pd.concat per track.
    records = []
    for start in range(0, len(track_ids), batch_size):
        batch = sp.audio_features(track_ids[start:start + batch_size])
        # The API yields None for ids it has no features for.
        records.extend(row for row in (batch or []) if row is not None)

    if not records:
        return pd.DataFrame(index=pd.Index([], name='id'))

    features_df = pd.DataFrame(records).set_index('id')
    return features_df.drop(columns=unused_columns, errors='ignore')
# Print results
def print_crossval_param_scores(grid_result):
    """Print the best score/params of a fitted search, then one line per candidate.

    ``grid_result`` must expose ``best_score_``, ``best_params_`` and a
    ``cv_results_`` mapping with the keys ``'mean_test_score'``,
    ``'std_test_score'`` and ``'params'``. Each candidate line shows the mean
    test score, its standard deviation in parentheses, and the parameter set.
    """
    best = (grid_result.best_score_, grid_result.best_params_)
    print("Best: %f using %s" % best)

    results = grid_result.cv_results_
    candidates = zip(results['mean_test_score'],
                     results['std_test_score'],
                     results['params'])
    for candidate in candidates:
        # candidate is a (mean, std, params) tuple, matching the three placeholders
        print("%f (%f) with: %r" % candidate)

In [None]:
### Construct an example training set:
# Spotify playlists of genres the user would NOT like,
# i.e. Christian, Classical, and Rap.
not_liked_playlist_urls = {
    'christian_rock': 'https://open.spotify.com/playlist/30wdzfOKmW7JmLPiQI0BGC?si=ddfbb1ce031b49ef',
    'classical': 'https://open.spotify.com/playlist/27Zm1P410dPfedsdoO9fqm?si=483250c08cd64782',
    'rap': 'https://open.spotify.com/playlist/4riovLwMCrY3q0Cd4e0Sqp?si=42c183a75939403b',
}

# Fetch each playlist with the same call (uses the `user` defined in the auth
# cell rather than repeating the id) and report how many songs it contributed.
not_liked_by_genre = {}
for genre, playlist_url in not_liked_playlist_urls.items():
    not_liked_by_genre[genre] = get_songs_from_playlistURL(playlist_url, user, sp)
    print(len(not_liked_by_genre[genre]))

# Keep the per-genre names available for later cells.
christian_rock_songs = not_liked_by_genre['christian_rock']
classical_songs = not_liked_by_genre['classical']
rap_songs = not_liked_by_genre['rap']

# Combine all. ignore_index=True gives a clean 0..n-1 index without leaving the
# old per-playlist positions behind as a stray 'index' column.
not_liked_songs = pd.concat(
    [christian_rock_songs, classical_songs, rap_songs],
    ignore_index=True,
)
not_liked_songs

In [None]:
# Training set data cleaning
##### Get all not-liked song features
# get_song_features already removes the unused columns ('key', 'type', 'mode',
# 'uri', 'track_href', 'analysis_url', 'time_signature', ...) and indexes by
# track id. Dropping them again here raised KeyError, so no further drop is done.
not_liked_features = get_song_features(not_liked_songs, sp)

# get songs that user does like
liked_songs = get_songs_from_playlistURL('https://open.spotify.com/playlist/5G5zCN9WRAi8EQtabAjGHg?si=22288fe4f0de40dd',
                                         'gza59mo1lxjaokuzj5752p5m9',
                                         sp)
liked_song_features = get_song_features(liked_songs, sp)

# Assemble the feature matrix and labels consumed by the standardization and
# classification cells below; without this they fail with NameError on a fresh
# kernel.
# NOTE(review): label convention assumed here is 0 = not liked, 1 = liked
# (the classification notes call 0 the majority class) - confirm.
X = pd.concat([not_liked_features, liked_song_features])
y = np.concatenate([
    np.zeros(len(not_liked_features), dtype=int),
    np.ones(len(liked_song_features), dtype=int),
])

## Normalization (Standardization)

In [None]:
# Standardize data
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from X and apply them in a single step; the
# fitted scaler stays available as `std_scaler`.
std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)

## Classification
#### First, test a naive classifier that guesses the majority class for all instances (0)
#### Then, train a random forest on the constructed training set

In [None]:
### For tuning hyperparameters and model training:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
### Random Forest
from sklearn.ensemble import RandomForestClassifier
# Seed the forest as well as the CV splitter; otherwise the cross-validated
# scores (and possibly the selected parameters) change from run to run.
rfc = RandomForestClassifier(random_state=1)

# Tuning hyperparameters:
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

grid = dict(n_estimators=n_estimators, max_features=max_features)

# 10-fold stratified CV repeated 3 times -> 30 fits per parameter combination
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
# error_score=0 records a failed fit as a score of 0 instead of raising
grid_search = GridSearchCV(estimator = rfc, param_grid = grid, n_jobs = -1, cv = cv, scoring = 'f1_micro', error_score=0)
grid_result = grid_search.fit(X, y)

print_crossval_param_scores(grid_result)
# Score noted from an earlier run: 0.9405 (presumably the best score, recorded
# before the forest was seeded - re-check after re-running)