# VERSION 1

See modelv2.ipynb for current model

In [8]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import pandas as pd
import datetime
import pytz
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')

# Dataset Setup 

In [9]:
all_tracks_df = pd.read_csv('../tracks_features.csv')
all_tracks_df = all_tracks_df.drop(['album_id', 'track_number', 'disc_number', 'year', 'release_date','album','artists','artist_ids'],axis=1)
all_tracks_df

Unnamed: 0,id,name,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,False,0.470,0.978,7,-5.399,1,0.0727,0.02610,0.000011,0.3560,0.503,117.906,210133,4.0
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,True,0.599,0.957,11,-5.764,1,0.1880,0.01290,0.000071,0.1550,0.489,103.680,206200,4.0
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,False,0.315,0.970,7,-5.424,1,0.4830,0.02340,0.000002,0.1220,0.370,149.749,298893,4.0
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,True,0.440,0.967,11,-5.830,0,0.2370,0.16300,0.000004,0.1210,0.574,96.752,213640,4.0
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105000,0.0789,0.539,127.059,205600,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204020,0EsMifwUmMfJZxzoMPXJKZ,Gospel of Juke,False,0.264,0.966,5,-6.970,0,0.0672,0.00935,0.002240,0.3370,0.415,159.586,276213,4.0
1204021,2WSc2TB1CSJgGE0PEzVeiu,Prism Visions,False,0.796,0.701,11,-6.602,0,0.0883,0.10400,0.644000,0.0749,0.781,121.980,363179,4.0
1204022,6iProIgUe3ETpO6UT0v5Hg,Tokyo 360,False,0.785,0.796,9,-5.960,0,0.0564,0.03040,0.918000,0.0664,0.467,121.996,385335,4.0
1204023,37B4SXC8uoBsUyKCWnhPfX,Yummy!,False,0.665,0.856,6,-6.788,0,0.0409,0.00007,0.776000,0.1170,0.227,124.986,324455,4.0


In [10]:
spotify2_df = pd.read_csv('../SpotifyFeatures.csv')
spotify2_df = spotify2_df.drop(['artist_name'], axis=1)
# Has Genre and Popularity; does not have explicit --> Net total of 1 more feature.
spotify2_df

Unnamed: 0,genre,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61100,0.389,99373,0.910,0.000000,C#,0.3460,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.24600,0.590,137373,0.737,0.000000,F#,0.1510,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.95200,0.663,170267,0.131,0.000000,C,0.1030,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.70300,0.240,152427,0.326,0.000000,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95000,0.331,82625,0.225,0.123000,F,0.2020,-21.150,Major,0.0456,140.576,4/4,0.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,Soul,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,39,0.00384,0.687,326240,0.714,0.544000,D,0.0845,-10.626,Major,0.0316,115.542,4/4,0.962
232721,Soul,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,0.03290,0.785,282447,0.683,0.000880,E,0.2370,-6.944,Minor,0.0337,113.830,4/4,0.969
232722,Soul,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,47,0.90100,0.517,166960,0.419,0.000000,D,0.0945,-8.282,Major,0.1480,84.135,4/4,0.813
232723,Soul,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,44,0.26200,0.745,222442,0.704,0.000000,A,0.3330,-7.137,Major,0.1460,100.031,4/4,0.489


In [11]:
from sklearn.preprocessing import MinMaxScaler

def create_feature_vectores(all_tracks_df):
    """
    Creates feature vector for each song in all_tracks_df.

    Parameters:
    - all_tracks_df: consists of all tracks in the used dataset, mimicking the "spotify db"
    Returns:
    - dataframe consisting of each track id, and their feature vector normalized.
    """

    # seperate feature columns and id column
    features_only_df = all_tracks_df.drop(['name','id'],axis=1).reset_index(drop = True)
    id_df = all_tracks_df[['id']]

    scaler = MinMaxScaler()
    features_scaled = pd.DataFrame(scaler.fit_transform(features_only_df), columns=features_only_df.columns) * 0.2

    final_df = pd.concat([id_df, features_scaled], axis = 1)
    return final_df

In [12]:
full_feature_set_df = create_feature_vectores(all_tracks_df)
full_feature_set_df.head()

Unnamed: 0,id,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7lmeHLHBe4nmXzuXc0HDjk,0.0,0.094,0.1956,0.127273,0.162421,0.2,0.015005,0.005241,2.18e-06,0.0712,0.1006,0.094729,0.006902,0.16
1,1wsRitfRRtWyEapl0q22o8,0.2,0.1198,0.1914,0.2,0.161335,0.2,0.038803,0.00259,1.412e-05,0.031,0.0978,0.083299,0.006772,0.16
2,1hR0fIFK2qRG3f3RF70pb7,0.0,0.063,0.194,0.127273,0.162346,0.2,0.09969,0.004699,4.06e-07,0.0244,0.074,0.120312,0.009831,0.16
3,2lbASgTSoDO7MTuLAXlTW0,0.2,0.088,0.1934,0.2,0.161139,0.0,0.048916,0.032731,7.28e-07,0.0242,0.1148,0.077733,0.007018,0.16
4,1MQTmpYOZ6fcMQc56Hdo7T,0.0,0.0852,0.1858,0.036364,0.158464,0.2,0.014469,0.000325,0.021,0.01578,0.1078,0.102082,0.006752,0.16


# Connecting to Spotify API

In [13]:
load_dotenv()
scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [14]:
playlists_res = sp.current_user_playlists()['items']
playlists = {}

for item in playlists_res:
  playlists[item['name']] = item['id']

playlists
id = playlists['RO tation'] # This is what the input to the model will be, playlist ID

# Constructing Input Data From Playlist

In [15]:
def _tracks_from_playlist(id):
    """
    Given a playlist id, returns a pandas dataframe consisting of key elements of each song
    """
    playlist = sp.playlist(id)
    tracks = []
    for item in playlist['tracks']['items']:
        if item['track']['id'] is not None:
            track = item['track']
            track_id = track['id']
            track_info = {
                'id': track_id,
                'duration_ms': track['duration_ms'],
                'explicit': track['explicit'],
                'popularity': track['popularity'],
                'date_added': item['added_at']
            }
            tracks.append(track_info)

    tracks_df = pd.DataFrame(tracks)
    tracks_df['date_added'] = pd.to_datetime(tracks_df['date_added'], utc=True)
    now = datetime.datetime.now(pytz.utc)
    tracks_df['months_since_added'] = tracks_df['date_added'].apply(lambda x: relativedelta(now, x).months)
    tracks_df = tracks_df.drop(['date_added'], axis=1)

    return tracks_df

In [16]:
def _extract_tracks_features(ids):
  """
  Given a list of track ids, returns a pandas dataframe of key audio features of each track
  """
  audio_features_list = []
  for track_id in ids:
    if track_id is not None:
      audio_features = sp.audio_features(track_id)[0]
      audio_features_list.append(audio_features)

  # convert list of dictionaries to Pandas DataFrame
  audio_features_df = pd.DataFrame.from_records(audio_features_list, columns=audio_features_list[0].keys())

  # add track ID column to DataFrame
  audio_features_df['id'] = ids

  # re-order columns to put track_id first
  cols = audio_features_df.columns.tolist()
  cols = cols[-1:] + cols[:-1]
  audio_features_df = audio_features_df[cols]
  audio_features_df = audio_features_df.drop(['type', 'id', 'uri', 'track_href', 'analysis_url','duration_ms'], axis=1)
  return audio_features_df

In [17]:
def create_playlist_df(id):
    """
    Returns a dataframe for the playlsit with the given id. Attributes include those returned by
    Spotify Web API's GET Audio Features endpoint and track meta data like name, artist, etc.
    """
    df1 = _tracks_from_playlist(id)
    df2 = _extract_tracks_features(df1['id'])
    playlist_df = pd.concat([df1,df2], axis=1)
    return playlist_df

In [18]:
playlist_df = create_playlist_df(id)
playlist_df.head()

Unnamed: 0,id,duration_ms,explicit,popularity,months_since_added,time_signature,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,1sFstGV1Z3Aw5TDFCiT7vK,185062,True,63,3,4,0.725,0.601,1,-9.03,1,0.377,0.0321,0.0,0.0504,0.811,155.918
1,0v9Wz8o0BT8DU38R4ddjeH,304606,True,67,3,4,0.652,0.795,11,-5.192,0,0.174,0.156,0.0,0.123,0.788,135.018
2,3ZLyt2ndLFBh148XRYjYYZ,203794,False,58,3,4,0.531,0.544,0,-8.615,0,0.462,0.308,2e-06,0.0835,0.398,131.921
3,1QBwk6GTCxVdC2hoSw9tlM,307640,True,64,3,4,0.417,0.887,5,-5.551,0,0.367,0.0786,0.0,0.834,0.303,167.579
4,1ZM8toCOlnfBKJdvR8GqUq,233922,True,59,3,4,0.687,0.783,0,-4.573,1,0.191,0.0526,0.0,0.0718,0.359,156.141


# Generate Recommendations

In [19]:
def create_playlist_vector(full_feature_set_df, playlist_df, weight_factor=1.2):
    """
    Generates a single vector desribing a playlist dataframe.
    Removes those songs in the playlist from the full_feature_set_df

    Parameters:
    - full_feature_set_df: All tracks of the dataset
    - playlist_df: Dataframe consisting of the songs in the playlist and their features
    - weight_factor: value representing bias of more recently added songs
    Returns:
    - sum_vect: 1D vector summarizing the features of the playlist
    - refined: All tracks in dataset as defined by full_feature_set_df, except those in the playlist_df
    """
    # Compute full_feature_set_df
    # refined_complete_df = 
    merged_df = pd.merge(full_feature_set_df, playlist_df['id'], on='id', how='left', indicator=True)
    pruned_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')


    playlist_df= playlist_df.drop(['id'],axis=1)
    # Note: Popularity feature is not given in kaggle dataset;may need to drop that column as well.
    playlist_df= playlist_df.drop(['popularity'],axis=1)
    

    # Compute Weight for each song of playlist; given from their months_since_added
    # More recent the song was added, the more weight is is given
    playlist_df['weight'] = playlist_df['months_since_added'].apply(lambda x: weight_factor ** (-x))
    playlist_df.update(playlist_df.mul(playlist_df.weight,0))
    playlist_df = playlist_df.drop(['weight', 'months_since_added'],axis=1)

    #Normalize Data (15 features)
    df = playlist_df.apply(lambda iterator: ((iterator - iterator.mean())/iterator.std()).round(2))
    df_normalized = df.apply(lambda iterator: ((iterator.max() - iterator)/(iterator.max() - iterator.min())).round(2))
    sum_vect = df_normalized.sum(axis = 0)


    return sum_vect, pruned_df

In [20]:
playlist_vect, refined_feature_set = create_playlist_vector(full_feature_set_df,playlist_df)
playlist_vect

duration_ms         64.23
explicit            35.85
time_signature      39.71
danceability        65.19
energy              64.41
key                 67.88
loudness            39.42
mode                68.10
speechiness         75.84
acousticness        83.57
instrumentalness    98.80
liveness            80.94
valence             65.65
tempo               65.59
dtype: float64

In [21]:
refined_feature_set.head()

Unnamed: 0,id,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7lmeHLHBe4nmXzuXc0HDjk,0.0,0.094,0.1956,0.127273,0.162421,0.2,0.015005,0.005241,2.18e-06,0.0712,0.1006,0.094729,0.006902,0.16
1,1wsRitfRRtWyEapl0q22o8,0.2,0.1198,0.1914,0.2,0.161335,0.2,0.038803,0.00259,1.412e-05,0.031,0.0978,0.083299,0.006772,0.16
2,1hR0fIFK2qRG3f3RF70pb7,0.0,0.063,0.194,0.127273,0.162346,0.2,0.09969,0.004699,4.06e-07,0.0244,0.074,0.120312,0.009831,0.16
3,2lbASgTSoDO7MTuLAXlTW0,0.2,0.088,0.1934,0.2,0.161139,0.0,0.048916,0.032731,7.28e-07,0.0242,0.1148,0.077733,0.007018,0.16
4,1MQTmpYOZ6fcMQc56Hdo7T,0.0,0.0852,0.1858,0.036364,0.158464,0.2,0.014469,0.000325,0.021,0.01578,0.1078,0.102082,0.006752,0.16


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

def generate_recommendations(spotify_df, playlist_vect, refined_feature_set):
    """
    Return Recommenmdations based on playlist.

    Parameters:
    - spotify_df : Dataframe of all songs in spotify (or in the used dataset)
    - playlist_vect: vector representing the playlist
    - refined_feature_set: feature set of songs that are not in playlist
    
    Returns:
    - recommended_10_songs: Top 10 recommended songs based on playlists
    """

    non_playlist_df = spotify_df[spotify_df['id'].isin(refined_feature_set['id'].values)]
    non_playlist_df['sim'] = cosine_similarity(refined_feature_set.drop('id', axis = 1).values, playlist_vect.values.reshape(1, -1))[:,0]
    recommended_10_songs = non_playlist_df.sort_values('sim',ascending = False).head(10)
    recommended_10_songs['url'] = recommended_10_songs['id'].apply(lambda x: sp.track(x)['album']['images'][1]['url'])

    return recommended_10_songs

In [23]:
rotaion_recs = generate_recommendations(all_tracks_df,playlist_vect, refined_feature_set)
rotaion_recs

Unnamed: 0,id,name,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,sim,url
585515,4HEOlBKEndbe89c1co7G26,Honky Tonk Train Blues,False,0.479,0.664,7,-10.689,1,0.439,0.898,0.79,0.864,0.936,168.554,158452,4.0,0.918418,https://i.scdn.co/image/ab67616d00001e023858a7...
776274,6TTWUsIPydoLs9DKFwUyXT,Darktown Strutter's Ball,False,0.451,0.658,10,-10.895,1,0.454,0.98,0.907,0.937,0.774,210.264,149267,4.0,0.916,https://i.scdn.co/image/ab67616d00001e028b2b3e...
551482,1RsFpT7TSXMEkMRIaE6u4Y,Off to California / The Greencastle - Hornpipes,False,0.378,0.399,7,-13.289,1,0.0542,0.774,0.876,0.826,0.976,168.214,195773,4.0,0.914455,https://i.scdn.co/image/ab67616d00001e02c3bfd0...
3187,1FbXA7ikBbxb8mHQBAU1Co,Track 2,False,0.366,0.248,5,-26.94,1,0.0391,0.936,0.89,0.673,0.626,156.382,2987943,4.0,0.914334,https://i.scdn.co/image/ab67616d00001e025a37e0...
823553,15DPxFDjv8H9reyvhL4iYQ,El Caballo Blanco,False,0.469,0.479,10,-11.753,1,0.0813,0.821,0.794,0.835,0.971,189.669,112053,3.0,0.91326,https://i.scdn.co/image/ab67616d00001e0275696a...
206603,21Ttrr0EruSy8UOKFxOjk5,In the Mood,False,0.468,0.502,8,-11.957,1,0.278,0.838,0.791,0.744,0.701,171.752,196373,4.0,0.911813,https://i.scdn.co/image/ab67616d00001e029fc17e...
775813,0a94FNCXmUi1M6jxeY8VYo,In the Mood,False,0.468,0.502,8,-11.957,1,0.278,0.838,0.791,0.744,0.701,171.752,196373,4.0,0.911813,https://i.scdn.co/image/ab67616d00001e0202dc17...
378009,59FNX3v0kOR6m3cgWX0JZp,Back Then,True,0.3,0.824,3,-10.267,1,0.608,0.99,0.786,0.645,0.84,160.657,26387,3.0,0.911548,https://i.scdn.co/image/ab67616d00001e0202199d...
124633,1gAlzdwwIwOJsjLEU64OJK,Life - Remix,False,0.674,0.441,9,-14.017,1,0.453,0.958,0.951,0.803,0.948,142.747,144147,4.0,0.911398,https://i.scdn.co/image/ab67616d00001e02dd5d67...
1125468,4dMnOw6MYNWFK16mgXs7pZ,Asdfghjkl,True,0.767,0.403,9,-12.467,1,0.583,0.936,0.816,0.624,0.557,139.959,73748,4.0,0.911272,https://i.scdn.co/image/ab67616d00001e0211bfaf...


# Full Pipeline

In [24]:
def recommend_tracks(playlist_id, spotify_df):
    """
    Generate recommendations based on a playlist given by its id and with the 
    spotify_df datafram mimicking the full available set of tracks to recommend from.

    Parameters:
    - playlist_id: id of the user's playlist
    - spotify_df: dataframe of the full available songs to recommend based off of

    Returns:
    - recommended_10: 10 recommended tracks based off the playlist
    """

    full_feature_set_df = create_feature_vectores(spotify_df)
    
    playlist_df = create_playlist_df(playlist_id)
    playlist_vect, refined_feature_set = create_playlist_vector(full_feature_set_df,playlist_df)
    recommended_10 = generate_recommendations(all_tracks_df,playlist_vect, refined_feature_set)

    return recommended_10

In [25]:
chill_recommended = recommend_tracks(playlists['chill'], all_tracks_df)
chill_recommended
dict = chill_recommended.set_index('id')['name'].to_dict()
dict

{'4dMnOw6MYNWFK16mgXs7pZ': 'Asdfghjkl',
 '2vGZgsxzd43skBb4AsCq8Q': '?????',
 '6QWCGHpEzkbQ8KHGsmbwxE': 'Kiss Of Death',
 '2avrNQwsUDQMwtbLDAIIjQ': "The Ladies' Pantalettes / Scotch Mary / Crowely's Reel (Live)",
 '2kaZByUJccaYKQIssLDAkd': 'Matando',
 '6vQ46r1V8pEOVj8578SUqQ': 'Luni Coleone Drop',
 '3xPAmz1frMn1H39rKqeexe': 'Heillos Warnung',
 '3MllNkYBYMZshZylK6eRNs': 'Reminiscing',
 '3CBTYl0gUOC3SzJHoV81xh': 'Habanera',
 '6i8ZFueCPYEo4ViFZw6TaC': 'Love Is Strange'}

## Algorithm Pipeline

User Action:
- Authenticate Spotify User
- Select Playlist (playlist id)

Algorithm Steps:
- Create feature vectors for each song in the Spotify dataset
- Create a Pandas dataframe from the indicated Playlist
- Create a feature vector summarizing the playlist, and remove songs from the Spotify dataset dataframe that are in the playlist
- Recommend 10 songs