In [3]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import sys
import json
from dotenv import load_dotenv
import pandas as pd
import datetime
import pytz
from dateutil.relativedelta import relativedelta

# Dataset Setup 

In [4]:
all_tracks_df = pd.read_csv('tracks_features.csv')
all_tracks_df = all_tracks_df.drop(['album_id', 'track_number', 'disc_number', 'year', 'release_date'],axis=1)
all_tracks_df

Unnamed: 0,id,name,album,artists,artist_ids,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],False,0.470,0.978,7,-5.399,1,0.0727,0.02610,0.000011,0.3560,0.503,117.906,210133,4.0
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],True,0.599,0.957,11,-5.764,1,0.1880,0.01290,0.000071,0.1550,0.489,103.680,206200,4.0
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],False,0.315,0.970,7,-5.424,1,0.4830,0.02340,0.000002,0.1220,0.370,149.749,298893,4.0
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],True,0.440,0.967,11,-5.830,0,0.2370,0.16300,0.000004,0.1210,0.574,96.752,213640,4.0
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105000,0.0789,0.539,127.059,205600,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204020,0EsMifwUmMfJZxzoMPXJKZ,Gospel of Juke,Notch - EP,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],False,0.264,0.966,5,-6.970,0,0.0672,0.00935,0.002240,0.3370,0.415,159.586,276213,4.0
1204021,2WSc2TB1CSJgGE0PEzVeiu,Prism Visions,Notch - EP,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],False,0.796,0.701,11,-6.602,0,0.0883,0.10400,0.644000,0.0749,0.781,121.980,363179,4.0
1204022,6iProIgUe3ETpO6UT0v5Hg,Tokyo 360,Notch - EP,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],False,0.785,0.796,9,-5.960,0,0.0564,0.03040,0.918000,0.0664,0.467,121.996,385335,4.0
1204023,37B4SXC8uoBsUyKCWnhPfX,Yummy!,Notch - EP,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],False,0.665,0.856,6,-6.788,0,0.0409,0.00007,0.776000,0.1170,0.227,124.986,324455,4.0


In [5]:
from sklearn.preprocessing import MinMaxScaler

def create_feature_vectores(all_tracks_df):
    """
    Creates feature vector for each song in all_tracks_df.

    Parameters:
    - all_tracks_df: consists of all tracks in the used dataset, mimicking the "spotify db"
    Returns:
    - dataframe consisting of each track id, and their feature vector normalized.
    """

    # seperate feature columns and id column
    features_only_df = all_tracks_df.drop(['name','album','artists','artist_ids', 'id'],axis=1).reset_index(drop = True)
    id_df = all_tracks_df[['id']]

    scaler = MinMaxScaler()
    features_scaled = pd.DataFrame(scaler.fit_transform(features_only_df), columns=features_only_df.columns) * 0.2

    final_df = pd.concat([id_df, features_scaled], axis = 1)
    return final_df

In [6]:
full_feature_set_df = create_feature_vectores(all_tracks_df)
full_feature_set_df

Unnamed: 0,id,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7lmeHLHBe4nmXzuXc0HDjk,0.0,0.0940,0.1956,0.127273,0.162421,0.2,0.015005,0.005241,2.180000e-06,0.07120,0.1006,0.094729,0.006902,0.16
1,1wsRitfRRtWyEapl0q22o8,0.2,0.1198,0.1914,0.200000,0.161335,0.2,0.038803,0.002590,1.412000e-05,0.03100,0.0978,0.083299,0.006772,0.16
2,1hR0fIFK2qRG3f3RF70pb7,0.0,0.0630,0.1940,0.127273,0.162346,0.2,0.099690,0.004699,4.060000e-07,0.02440,0.0740,0.120312,0.009831,0.16
3,2lbASgTSoDO7MTuLAXlTW0,0.2,0.0880,0.1934,0.200000,0.161139,0.0,0.048916,0.032731,7.280000e-07,0.02420,0.1148,0.077733,0.007018,0.16
4,1MQTmpYOZ6fcMQc56Hdo7T,0.0,0.0852,0.1858,0.036364,0.158464,0.2,0.014469,0.000325,2.100000e-02,0.01578,0.1078,0.102082,0.006752,0.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204020,0EsMifwUmMfJZxzoMPXJKZ,0.0,0.0528,0.1932,0.090909,0.157748,0.0,0.013870,0.001878,4.480000e-04,0.06740,0.0830,0.128216,0.009083,0.16
1204021,2WSc2TB1CSJgGE0PEzVeiu,0.0,0.1592,0.1402,0.200000,0.158842,0.0,0.018225,0.020884,1.288000e-01,0.01498,0.1562,0.098002,0.011953,0.16
1204022,6iProIgUe3ETpO6UT0v5Hg,0.0,0.1570,0.1592,0.163636,0.160752,0.0,0.011641,0.006104,1.836000e-01,0.01328,0.0934,0.098015,0.012684,0.16
1204023,37B4SXC8uoBsUyKCWnhPfX,0.0,0.1330,0.1712,0.109091,0.158289,0.0,0.008442,0.000014,1.552000e-01,0.02340,0.0454,0.100417,0.010675,0.16


# Connecting to Spotify API

In [7]:
load_dotenv()
scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [8]:
playlists_res = sp.current_user_playlists()['items']
playlists = {}

for item in playlists_res:
  playlists[item['name']] = item['id']

playlists
id = playlists['RO tation'] # This is what the input to the model will be, playlist ID

# Constructing Input Data From Playlist

In [9]:
def _tracks_from_playlist(id):
    """
    Given a playlist id, returns a pandas dataframe consisting of key elements of each song
    """
    playlist = sp.playlist(id)
    tracks = []
    for item in playlist['tracks']['items']:
        if item['track']['id'] is not None:
            track = item['track']
            track_id = track['id']
            track_info = {
                'id': track_id,
                'duration_ms': track['duration_ms'],
                'explicit': track['explicit'],
                'popularity': track['popularity'],
                'date_added': item['added_at']
            }
            tracks.append(track_info)

    tracks_df = pd.DataFrame(tracks)
    tracks_df['date_added'] = pd.to_datetime(tracks_df['date_added'], utc=True)
    now = datetime.datetime.now(pytz.utc)
    tracks_df['months_since_added'] = tracks_df['date_added'].apply(lambda x: relativedelta(now, x).months)
    tracks_df = tracks_df.drop(['date_added'], axis=1)

    return tracks_df

In [10]:
def _extract_tracks_features(ids):
  """
  Given a list of track ids, returns a pandas dataframe of key audio features of each track
  """
  audio_features_list = []
  for track_id in ids:
    if track_id is not None:
      audio_features = sp.audio_features(track_id)[0]
      audio_features_list.append(audio_features)

  # convert list of dictionaries to Pandas DataFrame
  audio_features_df = pd.DataFrame.from_records(audio_features_list, columns=audio_features_list[0].keys())

  # add track ID column to DataFrame
  audio_features_df['id'] = ids

  # re-order columns to put track_id first
  cols = audio_features_df.columns.tolist()
  cols = cols[-1:] + cols[:-1]
  audio_features_df = audio_features_df[cols]
  audio_features_df = audio_features_df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url','duration_ms'], axis=1)
  return audio_features_df

In [11]:
def create_playlist_df(id):
    """
    Returns a dataframe for the playlsit with the given id. Attributes include those returned by
    Spotify Web API's GET Audio Features endpoint and track meta data like name, artist, etc.
    """
    df1 = _tracks_from_playlist(id)
    df2 = _extract_tracks_features(df1['id'])
    playlist_df = pd.concat([df1,df2], axis=1)
    return playlist_df

In [12]:
playlist_df = create_playlist_df(id)
playlist_df

Unnamed: 0,id,duration_ms,explicit,popularity,months_since_added,time_signature,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,1sFstGV1Z3Aw5TDFCiT7vK,185062,True,63,3,4,0.725,0.601,1,-9.030,1,0.377,0.0321,0.000000,0.0504,0.811,155.918
1,0v9Wz8o0BT8DU38R4ddjeH,304606,True,67,3,4,0.652,0.795,11,-5.192,0,0.174,0.1560,0.000000,0.1230,0.788,135.018
2,3ZLyt2ndLFBh148XRYjYYZ,203794,False,57,3,4,0.531,0.544,0,-8.615,0,0.462,0.3080,0.000002,0.0835,0.398,131.921
3,1QBwk6GTCxVdC2hoSw9tlM,307640,True,64,3,4,0.417,0.887,5,-5.551,0,0.367,0.0786,0.000000,0.8340,0.303,167.579
4,1ZM8toCOlnfBKJdvR8GqUq,233922,True,59,3,4,0.687,0.783,0,-4.573,1,0.191,0.0526,0.000000,0.0718,0.359,156.141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,52uU5uNt9lOstGCc6C7QEJ,251906,True,51,0,4,0.473,0.913,11,-3.372,0,0.444,0.0271,0.000000,0.0784,0.721,94.571
95,564oa00vY05d1uYnTEAAmE,183760,True,61,0,4,0.885,0.595,7,-5.487,1,0.286,0.6560,0.000004,0.1050,0.754,148.034
96,6igOR5QCn09zAVF2ikZjsV,184093,True,55,0,4,0.930,0.561,0,-8.053,1,0.250,0.1010,0.000000,0.0992,0.596,140.987
97,1GeNui6m825V8jP4uKiIaH,149040,True,69,0,4,0.974,0.596,6,-8.888,1,0.184,0.0976,0.000000,0.1510,0.892,111.959


# Generate Recommendations

In [13]:
def create_playlist_vector(full_feature_set_df, playlist_df, weight_factor=1.2):
    """
    Generates a single vector desribing a playlist dataframe.
    Removes those songs in the playlist from the full_feature_set_df

    Parameters:
    - full_feature_set_df: All tracks of the dataset
    - playlist_df: Dataframe consisting of the songs in the playlist and their features
    - weight_factor: value representing bias of more recently added songs
    Returns:
    - sum_vect: 1D vector summarizing the features of the playlist
    - refined: All tracks in dataset as defined by full_feature_set_df, except those in the playlist_df
    """
    # Compute full_feature_set_df
    # refined_complete_df = 
    merged_df = pd.merge(full_feature_set_df, playlist_df['id'], on='id', how='left', indicator=True)
    pruned_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')


    playlist_df= playlist_df.drop(['id'],axis=1)
    # Note: Popularity feature is not given in kaggle dataset;may need to drop that column as well.
    playlist_df= playlist_df.drop(['popularity'],axis=1)
    

    # Compute Weight for each song of playlist; given from their months_since_added
    # More recent the song was added, the more weight is is given
    playlist_df['weight'] = playlist_df['months_since_added'].apply(lambda x: weight_factor ** (-x))
    playlist_df.update(playlist_df.mul(playlist_df.weight,0))
    playlist_df = playlist_df.drop(['weight', 'months_since_added'],axis=1)

    #Normalize Data (15 features)
    df = playlist_df.apply(lambda iterator: ((iterator - iterator.mean())/iterator.std()).round(2))
    df_normalized = df.apply(lambda iterator: ((iterator.max() - iterator)/(iterator.max() - iterator.min())).round(2))
    sum_vect = df_normalized.sum(axis = 0)


    return sum_vect, pruned_df

In [14]:
playlist_vect, refined_feature_set = create_playlist_vector(full_feature_set_df,playlist_df)
playlist_vect

duration_ms         63.02
explicit            34.96
time_signature      38.23
danceability        64.19
energy              62.57
key                 66.26
loudness            39.41
mode                67.84
speechiness         79.08
acousticness        82.41
instrumentalness    97.80
liveness            79.91
valence             64.23
tempo               63.94
dtype: float64

In [15]:
refined_feature_set

Unnamed: 0,id,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7lmeHLHBe4nmXzuXc0HDjk,0.0,0.0940,0.1956,0.127273,0.162421,0.2,0.015005,0.005241,2.180000e-06,0.07120,0.1006,0.094729,0.006902,0.16
1,1wsRitfRRtWyEapl0q22o8,0.2,0.1198,0.1914,0.200000,0.161335,0.2,0.038803,0.002590,1.412000e-05,0.03100,0.0978,0.083299,0.006772,0.16
2,1hR0fIFK2qRG3f3RF70pb7,0.0,0.0630,0.1940,0.127273,0.162346,0.2,0.099690,0.004699,4.060000e-07,0.02440,0.0740,0.120312,0.009831,0.16
3,2lbASgTSoDO7MTuLAXlTW0,0.2,0.0880,0.1934,0.200000,0.161139,0.0,0.048916,0.032731,7.280000e-07,0.02420,0.1148,0.077733,0.007018,0.16
4,1MQTmpYOZ6fcMQc56Hdo7T,0.0,0.0852,0.1858,0.036364,0.158464,0.2,0.014469,0.000325,2.100000e-02,0.01578,0.1078,0.102082,0.006752,0.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204020,0EsMifwUmMfJZxzoMPXJKZ,0.0,0.0528,0.1932,0.090909,0.157748,0.0,0.013870,0.001878,4.480000e-04,0.06740,0.0830,0.128216,0.009083,0.16
1204021,2WSc2TB1CSJgGE0PEzVeiu,0.0,0.1592,0.1402,0.200000,0.158842,0.0,0.018225,0.020884,1.288000e-01,0.01498,0.1562,0.098002,0.011953,0.16
1204022,6iProIgUe3ETpO6UT0v5Hg,0.0,0.1570,0.1592,0.163636,0.160752,0.0,0.011641,0.006104,1.836000e-01,0.01328,0.0934,0.098015,0.012684,0.16
1204023,37B4SXC8uoBsUyKCWnhPfX,0.0,0.1330,0.1712,0.109091,0.158289,0.0,0.008442,0.000014,1.552000e-01,0.02340,0.0454,0.100417,0.010675,0.16


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def generate_recommendations(spotify_df, playlist_vect, refined_feature_set):
    """
    Return Recommenmdations based on playlist.

    Parameters:
    - spotify_df : Dataframe of all songs in spotify (or in the used dataset)
    - playlist_vect: vector representing the playlist
    - refined_feature_set: feature set of songs that are not in playlist
    
    Returns:
    - recommended_10_songs: Top 10 recommended songs based on playlists
    """

    non_playlist_df = spotify_df[spotify_df['id'].isin(refined_feature_set['id'].values)]
    non_playlist_df['sim'] = cosine_similarity(refined_feature_set.drop('id', axis = 1).values, playlist_vect.values.reshape(1, -1))[:,0]
    recommended_10_songs = non_playlist_df.sort_values('sim',ascending = False).head(10)
    recommended_10_songs['url'] = recommended_10_songs['id'].apply(lambda x: sp.track(x)['album']['images'][1]['url'])

    return recommended_10_songs

In [19]:
rotaion_recs = generate_recommendations(all_tracks_df,playlist_vect, refined_feature_set)
rotaion_recs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_playlist_df['sim'] = cosine_similarity(refined_feature_set.drop('id', axis = 1).values, playlist_vect.values.reshape(1, -1))[:,0]


Unnamed: 0,id,name,album,artists,artist_ids,explicit,danceability,energy,key,loudness,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,sim,url
585515,4HEOlBKEndbe89c1co7G26,Honky Tonk Train Blues,Live at Foxrock Folk Club - The Parish Hall Tapes,['Tony Drennan'],['6drU7Xc313Dzc8hiia9Z36'],False,0.479,0.664,7,-10.689,...,0.439,0.898,0.79,0.864,0.936,168.554,158452,4.0,0.918525,https://i.scdn.co/image/ab67616d00001e023858a7...
776274,6TTWUsIPydoLs9DKFwUyXT,Darktown Strutter's Ball,Benny Goodman On The Air 1937 - 38,['Benny Goodman'],['1pBuKaLHJlIlqYxQQaflve'],False,0.451,0.658,10,-10.895,...,0.454,0.98,0.907,0.937,0.774,210.264,149267,4.0,0.916696,https://i.scdn.co/image/ab67616d00001e028b2b3e...
3187,1FbXA7ikBbxb8mHQBAU1Co,Track 2,"Duo: Wesleyan, 1994","['Abraham Adzinyah', 'Anthony Braxton']","['2ystOZyRIZhdUIYeairXEq', '3UXq4fckDmcPmleixl...",False,0.366,0.248,5,-26.94,...,0.0391,0.936,0.89,0.673,0.626,156.382,2987943,4.0,0.915766,https://i.scdn.co/image/ab67616d00001e025a37e0...
551482,1RsFpT7TSXMEkMRIaE6u4Y,Off to California / The Greencastle - Hornpipes,Voice of the People 14: Troubles They Are But Few,['Willy Taylor'],['21ijoXArUT0XDQ4zk1UzFY'],False,0.378,0.399,7,-13.289,...,0.0542,0.774,0.876,0.826,0.976,168.214,195773,4.0,0.915363,https://i.scdn.co/image/ab67616d00001e02c3bfd0...
823553,15DPxFDjv8H9reyvhL4iYQ,El Caballo Blanco,Corridos Can Banda,['Banda la Sinaloense'],['0Nq3UWpsptgad94f4yWIgZ'],False,0.469,0.479,10,-11.753,...,0.0813,0.821,0.794,0.835,0.971,189.669,112053,3.0,0.9136,https://i.scdn.co/image/ab67616d00001e0275696a...
124633,1gAlzdwwIwOJsjLEU64OJK,Life - Remix,Team B,['Team B'],['4VDGqdw2vo0Mow8GiSNarg'],False,0.674,0.441,9,-14.017,...,0.453,0.958,0.951,0.803,0.948,142.747,144147,4.0,0.912778,https://i.scdn.co/image/ab67616d00001e02dd5d67...
912746,5fUkozAN5DWXKHYPXStuXs,Randy Lynn Rag,24 Greatest Bluegrass Hits,"[""Lester Flatt's Nashville Grass""]",['0QanYsnKFsyMuvZJt0mGfl'],False,0.397,0.637,8,-11.555,...,0.0381,0.816,0.943,0.924,0.985,173.43,97693,4.0,0.912062,https://i.scdn.co/image/ab67616d00001e025bb7c9...
775813,0a94FNCXmUi1M6jxeY8VYo,In the Mood,"The Chesterfield Broadcasts, Volume 1 (with Th...",['Glenn Miller'],['2aAHdB5HweT3mFcRzm0swc'],False,0.468,0.502,8,-11.957,...,0.278,0.838,0.791,0.744,0.701,171.752,196373,4.0,0.912049,https://i.scdn.co/image/ab67616d00001e0202dc17...
206603,21Ttrr0EruSy8UOKFxOjk5,In the Mood,Glenn Miller And The Andrews Sisters: The Ches...,['Glenn Miller'],['2aAHdB5HweT3mFcRzm0swc'],False,0.468,0.502,8,-11.957,...,0.278,0.838,0.791,0.744,0.701,171.752,196373,4.0,0.912049,https://i.scdn.co/image/ab67616d00001e029fc17e...
650032,6IxV9uNX46VNavAma5Gs7X,The Manchester Hornpipe,Voice of the People 09: Rig-A-Jig-Jig: Dance M...,['Ruth Askew & George Privett'],['3zOYXLGBurCrHRSwTqUrKm'],False,0.583,0.61,9,-12.178,...,0.241,0.957,0.967,0.908,0.958,137.769,129173,4.0,0.91158,https://i.scdn.co/image/ab67616d00001e029a3532...


# Full Pipeline

In [20]:
def recommend_tracks(playlist_id, spotify_df):
    """
    Generate recommendations based on a playlist given by its id and with the 
    spotify_df datafram mimicking the full available set of tracks to recommend from.

    Parameters:
    - playlist_id: id of the user's playlist
    - spotify_df: dataframe of the full available songs to recommend based off of

    Returns:
    - recommended_10: 10 recommended tracks based off the playlist
    """

    full_feature_set_df = create_feature_vectores(spotify_df)
    
    playlist_df = create_playlist_df(playlist_id)
    playlist_vect, refined_feature_set = create_playlist_vector(full_feature_set_df,playlist_df)
    recommended_10 = generate_recommendations(all_tracks_df,playlist_vect, refined_feature_set)

    return recommended_10

## Algorithm Pipeline

User Action:
- Authenticate Spotify User
- Select Playlist (playlist id)

Algorithm Steps:
- Create feature vectors for each song in the Spotify dataset
- Create a Pandas dataframe from the indicated Playlist
- Create a feature vector summarizing the playlist, and remove songs from the Spotify dataset dataframe that are in the playlist
- Recommend 10 songs