# Recommender App
## By Tobias

### Overview
In this notebook I'd like to present my approach to a music recommendation engine, based on a library of roughly 183,000 songs and a weekly updated Billboard Top 100 list.

## 1.0 Libraries

In [1]:
from IPython.display import display # get ipython for nicer output
import pandas as pd # to build dataframe
from tkinter import * # to have drop.down menues
import numpy as np
from tkinter import ttk
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot
from sklearn.metrics import silhouette_score
from IPython.display import IFrame


## 1.1 Load Data

In [2]:
# Read the current hot list (Top 100) from disk
Top100 = pd.read_csv('data/Top100.csv')

In [3]:
# Read the song library from disk
df = pd.read_csv('data/song_data.csv')

In [4]:
# Drop the old index column ('Unnamed: 0') that came in with the CSV.
# errors='ignore' keeps this cell re-runnable: once the column is gone,
# running the cell a second time no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Peek at the first rows of the 'type' column
df['type'].head()

0    audio_features
1    audio_features
2    audio_features
3    audio_features
4    audio_features
Name: type, dtype: object

In [6]:
# Print the column names, non-null counts and dtypes of the song data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222313 entries, 0 to 222312
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   danceability      222313 non-null  float64
 1   energy            222313 non-null  float64
 2   key               222313 non-null  int64  
 3   loudness          222313 non-null  float64
 4   mode              222313 non-null  int64  
 5   speechiness       222313 non-null  float64
 6   acousticness      222313 non-null  float64
 7   instrumentalness  222313 non-null  float64
 8   liveness          222313 non-null  float64
 9   valence           222313 non-null  float64
 10  tempo             222313 non-null  float64
 11  type              222313 non-null  object 
 12  id                222313 non-null  object 
 13  uri               222313 non-null  object 
 14  track_href        222313 non-null  object 
 15  analysis_url      222313 non-null  object 
 16  duration_ms       22

### Create the new field my lookup is based on

In [7]:
# Build the lookup key "<title>  by  <artist>" for every song, in lower case
df['title_artist'] = (
    df['track_name']
    .str.cat(others=df['artist_name'], sep='  BY  ')
    .str.lower()
)
# Show the number of keys before and after removing repeated title/artist pairs
print(df['title_artist'].count())
df = df.drop_duplicates(subset=['title_artist'], keep='first', ignore_index=True)
print(df['title_artist'].count())

222313
182846


In [8]:
# Show the first few combined "title  by  artist" lookup keys
df['title_artist'].head()

0               aaj ka ye din  by  kalyanji-anandji
1        main hoon tere samne  by  kalyanji-anandji
2        dulha dulhan ki jodi  by  kalyanji-anandji
3    ankh zhapak te khel gaye  by  kalyanji-anandji
4        pyare tere pyar mein  by  kalyanji-anandji
Name: title_artist, dtype: object

In [9]:
# Create an alphabetical list of all the available songs
#songlist = []
#for i in range(len(df['title_artist'])):
#    songlist.append(df['title_artist'][i])
#songlist = sorted(songlist)

## 2. Building the KMeans Model

## 2.1. Preparing the data

In [10]:
# Keep only the numeric audio features: everything that is an identifier,
# a link or descriptive metadata is removed before clustering
non_feature_columns = [
    'type', 'id', 'uri', 'track_href', 'analysis_url',
    'duration_ms', 'time_signature',
    'track_name', 'artist_name', 'artist_id', 'title_artist',
]
df_features = df.drop(columns=non_feature_columns)
df_features.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.491,0.698,1,-8.913,1,0.0503,0.669,1.2e-05,0.452,0.827,106.624
1,0.5,0.472,9,-9.487,0,0.0309,0.712,0.000569,0.263,0.693,109.25
2,0.517,0.621,5,-8.66,1,0.223,0.773,0.0,0.146,0.782,111.71
3,0.409,0.352,3,-11.444,1,0.031,0.709,0.0,0.265,0.516,138.004
4,0.641,0.654,10,-10.219,1,0.122,0.653,0.0,0.0733,0.736,100.351


## 2.2 Scaling the features

In [11]:
# Fit the scaler on the feature table, then standardise every feature column
scaler = StandardScaler().fit(df_features)
X_scaled = scaler.transform(df_features)
# Wrap the scaled array in a DataFrame again so the column names are kept
X_scaled_df = pd.DataFrame(data=X_scaled, columns=df_features.columns)
display(X_scaled_df.head())

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,-0.457822,0.308414,-1.187239,0.009805,0.787163,-0.391332,1.191884,-0.627617,1.092214,1.515025,-0.50341
1,-0.408076,-0.609575,1.029808,-0.103078,-1.270385,-0.540643,1.324135,-0.625991,0.192504,0.984287,-0.414648
2,-0.31411,-0.004353,-0.078715,0.05956,0.787163,0.937847,1.511747,-0.627653,-0.36446,1.336792,-0.331497
3,-0.911069,-1.097003,-0.632977,-0.487941,0.787163,-0.539874,1.314908,-0.627653,0.202025,0.283238,0.557273
4,0.371287,0.12969,1.306939,-0.247033,0.787163,0.160504,1.142675,-0.627653,-0.710538,1.154598,-0.715445


## 2.3 Clustering the songs

In [12]:
# Fit a 7-cluster K-Means model on the scaled features (fixed random_state)
kmeans = KMeans(n_clusters=7, random_state=666).fit(X_scaled_df)
kmeans

KMeans(n_clusters=7, random_state=666)

In [13]:
# Assign every song to a cluster and report the cluster sizes in label order
clusters = kmeans.predict(X_scaled_df)
cluster_sizes = pd.Series(clusters).value_counts().sort_index()
cluster_sizes

0    27763
1    14116
2     5106
3    54559
4    15215
5    27198
6    38889
dtype: int64

## 2.4 Find optimal K

# Candidate numbers of clusters for the elbow plot, and the inertia of each fit
K = range(2, 21)
inertia = []

for k in K:
    print("Training a K-Means model with {} clusters! ".format(k))
    print()
    # Use a separate name for the trial models: rebinding `kmeans` here would
    # replace the fitted 7-cluster model from section 2.3 with a 20-cluster one,
    # so any later (or re-run) `kmeans.predict(...)` would return other labels.
    elbow_model = KMeans(n_clusters=k,
                         random_state=666)
    elbow_model.fit(X_scaled_df)
    inertia.append(elbow_model.inertia_)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

## 2.5 Bring the songs and the clusters together

In [14]:
# Attach each song's cluster label to the song table
df = df.assign(cluster=clusters)
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,uri,track_href,analysis_url,duration_ms,time_signature,track_name,artist_name,artist_id,title_artist,cluster
0,0.491,0.698,1,-8.913,1,0.0503,0.669,1.2e-05,0.452,0.827,...,spotify:track:2OGawqjrXSuT9REyNfDo6w,https://api.spotify.com/v1/tracks/2OGawqjrXSuT...,https://api.spotify.com/v1/audio-analysis/2OGa...,301857,4,Aaj Ka Ye Din,Kalyanji-Anandji,7pWXlTjky0KXWgbdZi9ebq,aaj ka ye din by kalyanji-anandji,3
1,0.5,0.472,9,-9.487,0,0.0309,0.712,0.000569,0.263,0.693,...,spotify:track:5DdZw9x5dLgpzXjj6GHxIn,https://api.spotify.com/v1/tracks/5DdZw9x5dLgp...,https://api.spotify.com/v1/audio-analysis/5DdZ...,279518,4,Main Hoon Tere Samne,Kalyanji-Anandji,7pWXlTjky0KXWgbdZi9ebq,main hoon tere samne by kalyanji-anandji,5
2,0.517,0.621,5,-8.66,1,0.223,0.773,0.0,0.146,0.782,...,spotify:track:3Ho6oTWwISyvTI2y0TqrjP,https://api.spotify.com/v1/tracks/3Ho6oTWwISyv...,https://api.spotify.com/v1/audio-analysis/3Ho6...,364712,4,Dulha Dulhan Ki Jodi,Kalyanji-Anandji,7pWXlTjky0KXWgbdZi9ebq,dulha dulhan ki jodi by kalyanji-anandji,3
3,0.409,0.352,3,-11.444,1,0.031,0.709,0.0,0.265,0.516,...,spotify:track:0eOfVox5SIKMmSAi9ayd7J,https://api.spotify.com/v1/tracks/0eOfVox5SIKM...,https://api.spotify.com/v1/audio-analysis/0eOf...,93519,3,Ankh Zhapak Te Khel Gaye,Kalyanji-Anandji,7pWXlTjky0KXWgbdZi9ebq,ankh zhapak te khel gaye by kalyanji-anandji,5
4,0.641,0.654,10,-10.219,1,0.122,0.653,0.0,0.0733,0.736,...,spotify:track:0XhArL19uo3LVvpWWCY9Zu,https://api.spotify.com/v1/tracks/0XhArL19uo3L...,https://api.spotify.com/v1/audio-analysis/0XhA...,300053,4,Pyare Tere Pyar Mein,Kalyanji-Anandji,7pWXlTjky0KXWgbdZi9ebq,pyare tere pyar mein by kalyanji-anandji,3




## 3 Loading Spotify

In [15]:
# Standard library
import json
import pprint

# Third-party
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Local module with the API credentials
import config

# Build the Spotify client from the client id / secret stored in `config`
credentials = SpotifyClientCredentials(client_id=config.c_id, client_secret=config.c_se)
sp = spotipy.Spotify(auth_manager=credentials)

# Quick lookup: exact, case-insensitive match on the track title
user_input = input("What's you favorite Song:") # user input
user_input = user_input.lower()
# The song table has no 'Title' column (the old `df['Title']` raised a KeyError);
# the track title is stored in 'track_name'.
if (df['track_name'].str.lower() == user_input).any():
    print('You might also like:')
    # Random pick from the whole library
    print(df['title_artist'].sample(n=1).values[0])
else:
    # NOTE(review): the message mentions the hot list, but the lookup above runs
    # against the song library `df` - confirm the intended wording.
    print('Unfortunately, the song is not in the hot list!')

## 4. Recommender

In [18]:
def _show_track(track_id):
    """Embed the Spotify player widget for ``track_id`` in the cell output."""
    display(IFrame(src=f"https://open.spotify.com/embed/track/{track_id}",
                   width="320",
                   height="80",
                   frameborder="0",
                   allowtransparency="true",
                   allow="encrypted-media",
                   ))


def _choose_song(candidates, query):
    """Return the song the user means out of ``candidates``.

    A single candidate is returned directly. With several candidates the
    numbered list is printed and the user is asked for a number until a
    valid one (0 .. len(candidates) - 1) is entered.
    """
    if len(candidates) == 1:
        return candidates[0]

    print('There are more than one song containing "{0}"'.format(query))
    print('Select the song from choices: ')
    for index, name in enumerate(candidates):
        print("{0}: {1}".format(index, name))

    while True:
        answer = input("\nEnter choice number: ")
        try:
            index = int(answer)
        except ValueError:
            index = -1
        # Reject negative numbers too: they would silently index from the end
        if 0 <= index < len(candidates):
            return candidates[index]
        print('Please enter a number between 0 and {0}.'.format(len(candidates) - 1))


def _recommend(selected):
    """Play ``selected`` and suggest one more song.

    If ``selected`` is on the hot list, another hot-list entry is suggested;
    otherwise a random other song from the same cluster is suggested.
    ``selected`` must be a value of ``df['title_artist']``.
    """
    is_selected = df['title_artist'] == selected
    song_id = df.loc[is_selected, 'id'].values[0]

    # regex=False: song names contain characters such as "(", ")", "*" or "?"
    # that must be matched literally, not interpreted as a regular expression.
    on_hot_list = Top100['title_artist'].str.contains(
        selected, case=False, regex=False, na=False)

    if on_hot_list.any():
        print('Your song is very HOT at the moment!')
        _show_track(song_id)

        # Suggest a *different* hot-list entry than the selected song
        other_hot = Top100.loc[~on_hot_list, 'title_artist'].dropna()
        if other_hot.empty:
            return
        hot = other_hot.sample(n=1).values[0]
        print('\nYou know what is also in the top?:\n')
        print(hot)
        # Is the randomly chosen title from the top list in the big list? If yes, call the spotify widget!
        # The library keys are lower case, so compare case-insensitively
        # (the hot-list check above is case-insensitive as well).
        is_hot_in_library = df['title_artist'] == str(hot).lower()
        if is_hot_in_library.any():
            _show_track(df.loc[is_hot_in_library, 'id'].values[0])
    else:
        song_cluster = df.loc[is_selected, 'cluster'].values[0]
        # Same cluster, but never the selected song itself
        neighbours = df[(df['cluster'] == song_cluster) & ~is_selected]

        print('\nSelected song: {0}'.format(selected))
        print('Your song is currently NOT hot!')
        _show_track(song_id)

        if neighbours.empty:
            return
        suggestion = neighbours.sample(n=1).iloc[0]
        print('\nBut you might also like:\n' + suggestion['title_artist'])
        _show_track(suggestion['id'])


user_input = input("Your choice: ")
user_input_low = user_input.lower()

if not user_input_low.strip():
    # An empty query is a substring of every key and would list the whole library
    filtered_songs = []
    print('Please enter at least a part of a song title or artist name.')
else:
    # Literal (non-regex) substring search over the lower-case lookup keys
    is_match = df['title_artist'].str.contains(user_input_low, regex=False, na=False)
    filtered_songs = sorted(df.loc[is_match, 'title_artist'])

    if filtered_songs:
        _recommend(_choose_song(filtered_songs, user_input))
    else:
        print('Unfortunately, there is no song containing "{0}" in the list!'.format(user_input))

Your choice: arcade fire
There are more than one song containing "arcade fire"
Select the song from choices: 
0: (antichrist television blues)  by  arcade fire
1: afterlife  by  arcade fire
2: apocrypha  by  arcade fire
3: awful sound (oh eurydice)  by  arcade fire
4: baby mine - from "dumbo"/soundtrack version  by  arcade fire
5: black mirror  by  arcade fire
6: black wave / bad vibrations  by  arcade fire
7: chemistry  by  arcade fire
8: city with no children  by  arcade fire
9: creature comfort  by  arcade fire
10: crown of love  by  arcade fire
11: crucified again  by  arcade fire
12: culture war  by  arcade fire
13: deep blue  by  arcade fire
14: dimensions  by  arcade fire
15: divorce papers  by  arcade fire
16: electric blue  by  arcade fire
17: empty room  by  arcade fire
18: everything now  by  arcade fire
19: everything now (continued)  by  arcade fire
20: everything now (todo ya) - remix por bomba estéreo  by  arcade fire
21: everything now - recorded at spotify studios nyc 


But you might also like:
mirror signal  by  lexurus
