# build the UX experience for the song recommender

- store the list in a .csv
- get input from user
    - check for spelling
    - if song is found in the hot list
        - prompt user to confirm by printing song and artist + requesting confirmation
        - if not, send to alternative recommender
            - make dummy function for this

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from fuzzywuzzy import process
import difflib
import config
import spotipy

# model stuff
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets # sklearn comes with some toy datasets to practise
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot
from sklearn.metrics import silhouette_score
import ast

In [3]:
# Authenticate with Spotify API
client_credentials_manager = spotipy.SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
def load_csv():
    # load list of hot songs as possible choices
    df = pd.read_csv("hot_100.csv")
    df = df.apply(lambda x: x.str.lower())
    return df

In [5]:
# load list of hot songs as possible choices
hot_100 = load_csv()

In [6]:
def str_matcher(user_input, song_list):
    # Find the closest matching song title to the input string
    best_match = difflib.get_close_matches(user_input, song_list["song"], n=1, cutoff=0.8)
    # Print the closest matching song title and the corresponding similarity score
    if best_match:
       similarity = difflib.SequenceMatcher(None, user_input, best_match[0]).ratio()
       corrected_input = best_match[0]
       match = True
    else:
        corrected_input = user_input
        match = False
    return match, corrected_input

In [140]:
str_matcher("hammer", hot_100)

(False, 'hammer')

## get track id & it's features

In [7]:
def get_track_id(song):
    id = sp.search(q=song,limit=1)["tracks"]["items"][0]["id"]
    return id

In [8]:
def get_features(track_id):
    features = []
    features.extend(sp.audio_features(track_id))
    features = pd.DataFrame(features)[["danceability","energy","loudness","speechiness","acousticness", "instrumentalness","liveness","valence","tempo","id","duration_ms"]]
    features.index = features["id"]
    features = features.drop("id", axis=1)
    return features

In [9]:
X = get_features(get_track_id("Yeah"))
X

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5rb9QrpfcKFHM1EUbSIurX,0.894,0.791,-4.699,0.112,0.0183,0,0.0388,0.583,105.018,250373


### scale new track

In [10]:
# define function to load data from pkl files
def load(filename):
    try:
        with open("Model/"+filename, "rb") as f:
            return pickle.load(f)

    except FileNotFoundError:
        print("File not found!")

In [11]:
X

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5rb9QrpfcKFHM1EUbSIurX,0.894,0.791,-4.699,0.112,0.0183,0,0.0388,0.583,105.018,250373


In [12]:
def scale_input(X):
    # load scaler fit from training data
    scaler = load("scaler.pkl")
    X_scaled = scaler.transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns, index=X.index)
    return X_scaled_df
display(scale_input(get_features(get_track_id("test"))))

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1DMEzmAoQIikcL52psptQL,0.07354,0.185867,-0.15551,-0.582553,1.259395,-0.370466,-0.501395,-0.065156,-1.73908,-0.31322


## get model from pkl

In [38]:
# load the model
kmeans4 = load("kmeans_4.pkl")

In [25]:
X_scaled = scale_input(X)

In [26]:
X_scaled_df

NameError: name 'X_scaled_df' is not defined

## predict the cluster of the input song

In [14]:
kmeans15.predict(X_scaled_df)

NameError: name 'X_scaled_df' is not defined

## find all songs of that cluster in my training set

Unnamed: 0,cluster
7DyDjhZMEIK5Ied4juTCyc,2
7jYwZOptDPesQgzj7vhsEF,1
11eYmv0tA3wEoyD1Sad2Nv,0
54eE5H6F1HhyM5L4fRLq8s,1
0yLdNVWF3Srea0uzk55zFn,1
...,...
1MQHacUW73AKPG9nlf28oZ,0
43rQHWHRNUYvjp1kauDnjL,0
1dGC8B4AVlFvTE7hW1ZNK3,2
4AowHOaycPNDRId7N4Npf9,0


In [16]:
df_clustered = pd.read_csv("df_clustered_15.csv")

In [17]:
def cluster_df(clusters):
    # group dataframe by cluster identifiers
    grouped = clusters.groupby("cluster")
    # create a dictionary of dataframes, where each dataframe corresponds to a unique value in the 'cluster' column
    dfs_by_cluster = dict(tuple(grouped))
    clustered_df = {}
    for cluster in range(len(dfs_by_cluster)):
        clustered_df[cluster] = dfs_by_cluster[cluster]

    return clustered_df
clustered_dfs = cluster_df(df_clustered)

In [18]:
clustered_dfs

{0:                         cluster
 1veHwv1HPwzOvlhs5jPxeP        0
 1veHwv1HPwzOvlhs5jPxeP        0
 5HFzAeZkaTDBhlaqwcRmeL        0
 1veHwv1HPwzOvlhs5jPxeP        0
 06633e40krkr5SWIenSngC        0
 ...                         ...
 3Itfj9OAXvpTF8IVOFRMHW        0
 4eGCTk4n1GXu9C1keB4ama        0
 4QlqE1sO6rXsIR9AZOlawN        0
 0bohmotsrCFyZ4Wfouo7CV        0
 3igT2HOsfVZgYTsTObLbxZ        0
 
 [3190 rows x 1 columns],
 1:                         cluster
 7DyDjhZMEIK5Ied4juTCyc        1
 2xIHVoglz5mLrNf145RieQ        1
 4sx6NRwL6Ol3V6m9exwGlQ        1
 4tQA4uDHh6iPaQLWaxeQqH        1
 34FFsLy4HDUjnnDMjGXEvU        1
 ...                         ...
 79qxwHypONUt3AFq0WPpT9        1
 058bYtysQS0bnt7KtEnZsg        1
 0UwvM6Pn8jWh3cNWQunjt0        1
 1ZCeUv9xi2ZPuRbLBXfNaR        1
 74QU8h22gboUmVHpjZAPYg        1
 
 [2803 rows x 1 columns],
 2:                         cluster
 3zkyus0njMCL6phZmNNEeN        2
 28eI53WyFJVjjQwSnWTh9K        2
 4AdhQGPnOKrH0EJWXc2EEM        2
 2IvrIIvwA3

# UX flow

In [19]:
def user_input(song_list):
    # get user input
    user_choice = input("Please choose a hot song you like. To exit, type '/exit'.")
    # check if the program should be terminated
    if user_choice == "/exit":
        print("Program terminated.")
        match = False
    else:
        # return list with [match=True/False, corrected_input]
        match, user_choice = str_matcher(user_choice, song_list)

        # check if the corrected string appeares in full in the list of songs
        if match:
           print("Your song is in the list of 100 hottest songs.")
        else:
            #print("Your song is not in the list of the hottest 100 songs, please enter a song from the list:", np.array(song_list["song"].str.title()))
            # return the original str as output
            user_choice = user_choice

    return match, user_choice

In [150]:
# prompt spotipy to find the given tracks ID and it's features and then predict its cluster, finally returning a random recommendation from the same cluster


In [151]:
get_track_id("hammer")

'5kqr6EkKEafBGKxzjdXOVP'

In [45]:
artist = hot_100.loc[hot_100["song"].str.lower().str.contains("flowers"),:]["artist"][0]

# todo:
#todo
- return iFrame player for top 100 recommendations
- improve recommendation by printing artist name and song title and ask for confirmation
    - if not: reprompt for title + artist
- optinal: clean str_matcher & user_input

In [27]:
def recommender(song_list, clustered_dfs = clustered_dfs):
    # call user_input
    match, user_choice = user_input(song_list)

    if match:
        # get the dataframe without the input song
        song_list_out = pd.DataFrame(song_list[~song_list["song"].str.contains(user_choice)])
        # choose random song from the list of songs
        recommended_song = song_list_out.iloc[randint(0, len(song_list)), 0].title()
        print("Based on the input song, we recommend you this song:")
    else:
        track_id = get_track_id(user_choice)
        X = get_features(track_id)
        X_scaled_df = scale_input(X)
        predicted_cluster = kmeans4.predict(X_scaled_df)[0]
        recommended_song = clustered_dfs[predicted_cluster].index[randint(0, len(clustered_dfs[predicted_cluster]))]

    return recommended_song
# now returns name from list or track id for recommencdation from cluster

In [28]:
from IPython.display import IFrame
def play_song(track_id):
    return IFrame(src="https://open.spotify.com/embed/track/"+track_id,
       width="320",
       height="80",
       frameborder="0",
       allowtransparency="true",
       allow="encrypted-media"
      )

In [37]:
play_song(recommender(hot_100))

In [15]:
get_features(get_track_id("the bells"))

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2lfdtrbDa0NvJPZgKL7o13,0.34,0.476,-11.852,0.0452,0.352,0,0.169,0.546,115.076,175560


In [244]:
recommender(hot_100)

'3UEOT0hD60yGB1OMWJxLQ8'

In [126]:
hot_100 = load_csv()
hot_100.iloc[randint(0, len(hot_100)),0]

NameError: name 'load_csv' is not defined

# code snippets

In [None]:

    if hot_100["song"].str.lower().str.contains(user_choice.lower()).sum() > 1:
        print("There are two songs with this name, please input which artist you meant. These are the artists you can choose from:", hot_100.loc[hot_100["song"].str.contains(user_choice)]["artist"])
        choose_artist = input("Type '/exit' to exit the song recommender.")

        #hot_100_out = pd.DataFrame(hot_100[~hot_100["song"].str.contains(user_choice)])
    else:
        print("fatal error")

In [11]:
# not working yet
def check_duplicates(song_list):
    song_list.loc[len(song_list)+1, "song"] = "Flowers"
    song_list.loc[len(song_list)+1, "artist"] = "Blabla"

    user_choice = user_input(song_list)

    song_list.loc[song_list["song"].str.contains(user_choice)]
