In [4]:
import config
import spotipy
import json
import requests
from spotipy.oauth2 import SpotifyClientCredentials
import pprint
import pandas as pd
import numpy as np
import base64
import csv
import pickle
from sklearn import datasets # sklearn comes with some toy datasets to practise
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import silhouette_score
import random
import difflib

In [3]:
# Authenticate with Spotify API
client_credentials_manager = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## 1. create list of playlists

In [37]:
playlists = sp.user_playlists('spotify')
playlists["items"]

[{'collaborative': False,
  'description': 'Rema & Selena Gomez are on top of the Hottest 50!',
  'external_urls': {'spotify': 'https://open.spotify.com/playlist/37i9dQZF1DXcBWIGoYBM5M'},
  'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1DXcBWIGoYBM5M',
  'id': '37i9dQZF1DXcBWIGoYBM5M',
  'images': [{'height': None,
    'url': 'https://i.scdn.co/image/ab67706f000000031d051b0d3dcb712c312eb94b',
    'width': None}],
  'name': "Today's Top Hits",
  'owner': {'display_name': 'Spotify',
   'external_urls': {'spotify': 'https://open.spotify.com/user/spotify'},
   'href': 'https://api.spotify.com/v1/users/spotify',
   'id': 'spotify',
   'type': 'user',
   'uri': 'spotify:user:spotify'},
  'primary_color': None,
  'public': True,
  'snapshot_id': 'MTY4MjY5MDMzOCwwMDAwMDAwMGY0MTVhOTRmYjk3ZjgzY2I3M2YwODAxNmZmY2NhYjFk',
  'tracks': {'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1DXcBWIGoYBM5M/tracks',
   'total': 50},
  'type': 'playlist',
  'uri': 'spotify:playlist:37i9dQZF1DXcBW

In [38]:
# get playlists from spotify
playlists = sp.user_playlists('spotify')
playlist_id_lst = []
while playlists:
    for i, playlist in enumerate(playlists['items']):
        playlist_id_lst.append(playlist["uri"])
    if playlist['uri']:
        playlists = sp.next(playlists)
    else:
        pass

## 2. get track ids from playlists

In [39]:
user = "spotify"
# define function to get tracks from a playlist
def get_playlist_tracks(username, playlist_id):
    tracks = []
    results = sp.user_playlist_tracks(user=username, playlist_id=playlist_id, fields="items.track.id", limit=20)
    for i, result in enumerate(results["items"]):
        tracks.append(result["track"]["id"])
    #while results['next']:
     #   results = sp.next(results)
     #   tracks.extend(results['items'])
    return tracks

In [40]:
# make list of tracks from the first 20 songs in each playlist contained in playlist_id_lst
tracks = []
for item in playlist_id_lst:
    try:
        tracks.extend(get_playlist_tracks(user, item))
    except:
        pass

In [41]:
len(tracks)

22339

In [42]:
# make a dataframe containing only the ids
df_tracks = pd.DataFrame(data = tracks, columns=["ID"])

In [43]:
len(df_tracks)

22339

In [44]:
# save track ids to csv
df_tracks.to_csv("tracks.csv", index=False)

## 3. get features of tracks (takes 5-10 min)

In [38]:
def get_track_id(song):
    id = sp.search(q=song,limit=1)["tracks"]["items"][0]["id"]
    return id

In [39]:
def get_features(track_id):
    features = []
    features.extend(sp.audio_features(track_id))
    features = pd.DataFrame(features, columns=["danceability","energy","loudness","mode","speechiness","acousticness", "instrumentalness","liveness","valence","tempo","id","duration_ms"])
    features.index = features["id"]
    features = features.drop("id", axis=1)
    return features

In [90]:
df_tracks = df_tracks.dropna()

In [5]:
df_tracks = pd.read_csv("tracks.csv")

In [13]:
features_df = pd.DataFrame(data=[], columns=["danceability","energy","loudness","mode","speechiness","acousticness", "instrumentalness","liveness","valence","tempo","duration_ms"])
for track in df_tracks["ID"]:
    try:
        features_df = pd.concat([features_df, get_features(track)], axis=0)
    except:
        pass

In [17]:
# save features to csv
features_df.to_csv("features.csv")

## 4. scale the data

In [10]:
# define function to load data from pkl files
def load(filename):
    try:
        with open("Model/"+ filename, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        print("File not found!")

In [11]:
def scale_input(X):
    # load scaler fit from training data
    scaler = load("scaler_22k.pkl")
    X_scaled = scaler.transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns, index=X.index)
    return X_scaled_df
#display(scale_input(get_features(get_track_id("test"))))

In [21]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns, index=X.index)

# save scaler in pickle file
with open("Model/scaler_22k.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [5]:
features_df = pd.read_csv("features.csv")

In [15]:
scaler = load("scaler_22k.pkl")

In [12]:
X_scaled_df = scale_input(X)

NameError: name 'X' is not defined

## 5. create clusters

K = range(2, 20)
silhouette = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(X_scaled_df)

    filename = "Model/kmeans_" + str(k) + ".pickle"
    with open(filename, "wb") as f:
        pickle.dump(kmeans,f)

    silhouette.append(silhouette_score(X_scaled_df, kmeans.predict(X_scaled_df)))

plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Silhouette Method showing the optimal k')

In [17]:
kmeans12 = KMeans(n_clusters=12, random_state=1234)
kmeans12.fit(X_scaled_df)

NameError: name 'X_scaled_df' is not defined

In [23]:
# save kmeans in pickle file
file_path2 = "Model/kmeans_12.pickle"
with open(file_path2, "wb") as f:
    pickle.dump(kmeans12,f)

NameError: name 'kmeans' is not defined

In [16]:
# open a file, where you stored the pickled data
kmeans12 = load("kmeans_12.pickle")

EOFError: Ran out of input

In [29]:
X_scaled_df['cluster'] = kmeans12.predict(X_scaled_df)

In [30]:
X_scaled_df

Unnamed: 0,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,cluster
0WtM2NBVQNNJLh6scP13H8,1.307194,0.883769,0.661813,0.716523,-0.416618,0.085255,-0.491939,-0.439962,1.289611,-0.398406,0.098803,4
4VMRsbfZzd3SfQtaJ1Wpwi,0.187951,-0.017513,0.351442,-1.395629,-0.466735,-0.994558,-0.494080,2.112204,-1.466125,0.008283,0.335690,5
1Qrg8KqiBpW07V7PNxwwwL,0.432959,0.616024,0.580421,0.716523,-0.406791,-0.862696,-0.027172,-0.125128,-0.207351,-1.006852,-0.600554,4
0yLdNVWF3Srea0uzk55zFn,0.783767,0.412387,0.794356,0.716523,-0.134584,-0.830801,-0.494092,-0.987907,0.681470,-0.026970,-0.219562,4
5w40ZYhbBMAlHYNDaVJIUu,-0.380023,1.226934,0.706796,0.716523,-0.348812,-1.008495,-0.494108,-0.413167,-0.382776,1.725634,-0.354254,6
...,...,...,...,...,...,...,...,...,...,...,...,...
0ygTmpa6uSotkBkTiwcMZ4,-0.407865,0.069221,0.570341,-1.395629,-0.405808,-0.038304,-0.494108,-0.393072,0.120110,1.725837,-0.007128,5
7GJClzimvMSghjcrKxuf1M,0.822745,-0.454956,0.201147,0.716523,-0.519801,-0.758677,-0.494108,-0.480153,-0.254131,0.303541,-0.217391,4
5pY3ovFxbvAg7reGZjJQSp,0.416254,-1.491995,-0.313981,0.716523,-0.448064,0.820857,-0.494092,0.484444,-0.679050,-1.444099,0.588921,7
3NNqqioprPCnYcVtDn3wvS,1.502087,-0.273945,-0.041674,-1.395629,-0.320314,0.458803,-0.490895,-0.460058,-1.411938,-0.465501,-0.254427,0


In [31]:
# save clustered data
X_scaled_df.to_csv("df_clustered.csv")

## 5. make a recommendation

In [40]:
get_features(get_track_id("Yeah")).shape

(1, 11)

In [41]:
user_song_cluster = kmeans12.predict(get_features(get_track_id("Yeah")))[0]
user_song_cluster

11

In [34]:
full_df = pd.read_csv("df_clustered.csv", index_col="Unnamed: 0")

In [53]:
song_rec = random.choice(full_df[full_df["cluster"] == user_song_cluster].index)
song_rec

'0FMdDfm8alEAhS6TyUn4sk'

## 6. UX flow

In [62]:
def str_matcher(user_input, song_list):
    # Find the closest matching song title to the input string
    best_match = difflib.get_close_matches(user_input, song_list["song"], n=1, cutoff=0.8)
    # Print the closest matching song title and the corresponding similarity score
    if best_match:
       similarity = difflib.SequenceMatcher(None, user_input, best_match[0]).ratio()
       corrected_input = best_match[0]
       match = True
    else:
        corrected_input = user_input
        match = False
    return match, corrected_input

In [63]:
def user_input(song_list):
    # get user input
    user_choice = input("Please choose a hot song you like. To exit, type '/exit'.")
    # check if the program should be terminated
    if user_choice == "/exit":
        print("Program terminated.")
        match = False
    else:
        # return list with [match=True/False, corrected_input]
        match, user_choice = str_matcher(user_choice, song_list)

        # check if the corrected string appeares in full in the list of songs
        if match:
           print("Your song is in the list of 100 hottest songs.")
        else:
            #print("Your song is not in the list of the hottest 100 songs, please enter a song from the list:", np.array(song_list["song"].str.title()))
            # return the original str as output
            user_choice = user_choice

    return match, user_choice

In [64]:
def recommender(song_list, clustered_df):
    # call user_input
    match, user_choice = user_input(song_list)

    if match:
        # get the dataframe without the input song
        song_list_out = pd.DataFrame(song_list[~song_list["song"].str.contains(user_choice)])
        # choose random song from the list of songs
        recommended_song = song_list_out.iloc[randint(0, len(song_list)), 0].title()
        print("Based on the input song, we recommend you this song:")
    else:
        track_id = get_track_id(user_choice)
        X = get_features(track_id)
        X_scaled_df = scale_input(X)
        predicted_cluster = kmeans12.predict(X_scaled_df)[0]
        recommended_song = random.choice(full_df[full_df["cluster"] == predicted_cluster].index)

    return recommended_song
# now returns name from list or track id for recommencdation from cluster

In [65]:
from IPython.display import IFrame
def play_song(track_id):
    return IFrame(src="https://open.spotify.com/embed/track/"+track_id,
       width="320",
       height="80",
       frameborder="0",
       allowtransparency="true",
       allow="encrypted-media"
      )

In [69]:
hot_100 = pd.read_csv("hot_100.csv")
play_song(recommender(hot_100, full_df))

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': '', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to No search query


SpotifyException: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=&limit=1&offset=0&type=track:
 No search query, reason: None

# Snippets

In [3]:
# create a list containing all markets
markets = list(sp.available_markets()["markets"])