In [54]:
import spotipy 
import pandas as pd 
from spotipy.oauth2 import SpotifyClientCredentials
import os
import time
import random
import numpy as np
import config
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot
from sklearn.metrics import silhouette_score
from IPython.display import IFrame
import datetime as dt

<a id="functions"></a>
## 1. Functions

### 1.1 general

In [2]:
def load_songs(fp:[str]=None):
    if not fp:
        print("No filepath provided")
        return None
    try:
        return pd.read_csv(fp, sep=";", error_bad_lines=False)
    except FileNotFoundError:
        print("File not found.")
        return None

In [3]:
def load_pickle(fp:[str] = None): 
    if not fp:
        print("no filepath provided.")
        return None
    try: 
        with open(fp, "rb") as f: 
            return pickle.load(f) 
    except FileNotFoundError: 
        print("File not found!")
        return None

### 1.2 Billboard hot 100

In [26]:
def check_last_update_hot100(fp:[str] = None):
    if not fp:
        print("no file path given")
        return
    current_tuesday = None
    if dt.datetime.now().strftime('%A') != "Tuesday":
        for d in range(1, 7):
            if (dt.datetime.now() - dt.timedelta(d)).strftime("%A") == "Monday":
                current_tuesday = (dt.datetime.now() - dt.timedelta(d)).strftime("%Y_%m_%d")
    else:
        current_tuesday = dt.datetime.now().strftime("%Y_%m_%d")
    mod_time_since_epoc = os.path.getmtime(fp)
    # Convert seconds since epoch to readable timestamp
    modification_time = time.strftime('%Y_%m_%d', time.localtime(mod_time_since_epoc))
    if current_tuesday <= modification_time:
        next_update = dt.datetime.strptime(modification_time, "%Y_%m_%d")+dt.timedelta(7)
        print(f"File is up to date (next update expected for {dt.datetime.strftime(next_update, '%d.%m.%Y')}).")
        return False

In [27]:
def update_billboard_hot100(fp:[str]=None):
    if not fp:
        print("no file path given")
        return
    if not check_last_update_hot100(fp=fp):
        if input("Update anyway (Y/n): ") == "Y":
            get_billboard_hot100(save_to_csv=True, fp=fp)
    else:
        get_billboard_hot100(save_to_csv=True, fp=fp)

In [28]:
def get_billboard_hot100(save_to_csv:[bool]=False, fp:[str]=None):
    """visit website, scrape the hot 100 songs and artists and return it as df"""
    response = requests.get(url="https://www.billboard.com/charts/hot-100/index.php")
    response.raise_for_status()
    website = response.text                             # read out the html code as text
    soup = bs4.BeautifulSoup(website, "html.parser")    # make soup
    li_items = soup.select("li.lrv-u-width-100p")
    
    songs = []
    artists = []
    for _ in range(len(li_items)):
        if _ % 2 == 0:
            this_scraped = li_items[_].get_text().replace("\n","_")
            this_entry = []
            for c in this_scraped.split("_"):
                if c != "" and len(c) > 1:
                    this_entry.append(c)
                    if len(this_entry) == 2:
                        break
            songs.append(this_entry[0])
            artists.append(this_entry[1])
    if save_to_csv:
        if fp:
            try:
                pd.DataFrame({"song":songs, "artist":artists}).to_csv(fp, index=False, mode="w+", sep=";")
            except PermissionError:
                print("File already exists.")
        else:
            print("No filepath given.")
    return pd.DataFrame({"song":songs, "artist":artists})

In [29]:
def find_song_or_artists(user_entry:[str]=None, fp:[str]=None):
    if not fp:
        print("no filepath given.")
        return 
    try:
        music_df = pd.read_csv(fp, sep=";")
    except FileNotFoundError:
        print( "file not found")
        return
    else:   
        song_lst = music_df["song"].tolist()
        artist_lst = music_df["artist"].tolist()
        hit_lst = []    # entries will be tuple in form (direct match: 0 or 1, "s"ong or "a"rtist, index in df)
        for song in range(len(song_lst)):
            if user_entry.lower() == song_lst[song].lower():
                hit_lst.append((1, "s", song))
            elif user_entry.lower() in song_lst[song].lower():
                hit_lst.append((0, "s", song))
        for artist in range(len(artist_lst)):
            if user_entry.lower() == artist_lst[artist].lower():
                hit_lst.append((1, "a", artist))
            elif user_entry.lower() in artist_lst[artist].lower():
                hit_lst.append((0, "a", artist))
        return (hit_lst)

In [30]:
def print_result(result_tup:[tuple] = None, fp:[str]=None):
    if not fp:
        print("no filepath given.")
        return 
    try:
        music_df = pd.read_csv(fp, sep=";")
    except FileNotFoundError:
        print("file not found")
        return
    else:
        this_artist = music_df.iloc[result_tup[2]]["artist"]
        this_song = music_df.iloc[result_tup[2]]["song"]
        print(f"{this_song} by {this_artist} on place {result_tup[2]+1}")

In [31]:
def return_random(this_rank:[int]=None, fp:[str]=None):
    if not fp:
        print("no filepath given.")
        return 
    if not this_rank:
        print("no rank given")
        return
    try:
        music_df = pd.read_csv(fp, sep=";")
    except FileNotFoundError:
        print("file not found")
        return
    else:
        recom_index = random.randint(0,99)
        while recom_index == this_rank:
            recom_index = random.randint(0,99)
        reco_artist = music_df.iloc[recom_index]["artist"]
        reco_song = music_df.iloc[recom_index]["song"]
        print(f"Another entry from Billboard Hot 100 would be {reco_artist} with {reco_song} currently on place {recom_index +1}.")

In [32]:
def user_search(fp:[str]=None):
    if not fp:
        print("no file path given")
        return
    user = input("What are you looking for? ")
    if user == "end_now":
        break
    results = find_song_or_artists(user, fp)
    if results:
        if len(results) == 0:
            print("No match found")
            continue
        if len(results) == 1:
            if results[0][0] == 1:
                print("Direct Match:")
                print_result(results[0],fp=fp)
                return_random(results[0][2], fp=fp)
        else:
            print("Partial Matches:")
            for i in range(len(results)):
                print(f" + {i+1}: ",end="")
                print_result(results[i],fp=fp)
            specify = 241
            while 0 > int(specify) or int(specify) > len(results)+1:
                specify = input(f"Please specify your entry (1 to {i+1}): ")
                if specify.isnumeric() == False:
                    print("please check entry.")
                    specify = len(results)+2
                if 0 > int(specify) > len(results)+1:
                    print("please check entry.")
                    specify = len(results)+2
            print(" - - -\nMatch:")
            print_result(results[int(specify)-1],fp=fp)
            return_random(results[int(specify)-1][2], fp=fp)
        print("\n ************** new request *************\n")
        print(f"{fp} not found.")
        return None

### 1.3 Song recommender Spotipy

In [4]:
def user_input_get_features_old(this_df):
    audio_feat = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                  'liveness', 'valence', 'tempo', 'time_signature']
    user_input = input("What song are you looking for? ")
    song_hits = list(this_df[this_df["name"].str.lower()==user_input.lower()]["artist"])
    if song_hits:
        if len(song_hits) == 1:
            numpy_lst = []
            for a_f in audio_feat:
                numpy_lst.append(float(this_df[this_df["name"].str.lower()==user_input.lower()][a_f]))
            return np.array(numpy_lst).reshape(1,-1)
        else:
            print(f'Following artists found for "{user_input}":\n ')
            c = 0
            for a in song_hits:
                c += 1
                print(f" + {c}: {a}")
            print(" -------\n+ S: Search alternative on Spotify")
            user_choice = 241
            while 0 > int(user_choice) or int(user_choice) > len(song_hits)+1:
                user_choice = input(f"Please specify your entry (1 to {len(song_hits)}): ")
                if user_choice.lower() == "s":
                    return get_unknown_song_array(user_input)
                if user_choice.isnumeric() == False:
                    print("please check entry.")
                    user_choice = len(song_hits)+2
                if 0 > int(user_choice) > len(song_hits)+1:
                    print("please check entry.")
            this_df_index = list(this_df[this_df["name"].str.lower()==user_input.lower()].index)[int(user_choice)-1]
            # print(df_.iloc[df_index]["danceability"])
            numpy_lst = []
            for a_f in audio_feat:
                numpy_lst.append(float(this_df.iloc[this_df_index][a_f]))
            return np.array(numpy_lst).reshape(1,-1)
    else:
        print("Song not found in local database, searching on Spotify.")
        return get_unknown_song_array(user_input)

In [5]:
def user_input_get_features(this_df):
    audio_feat = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                  'liveness', 'valence', 'tempo', 'time_signature']
    user_input = input("What song are you looking for? ")
    song_hits = list(this_df[this_df["name"].str.lower()==user_input.lower()]["artist"])
    if song_hits:
        if len(song_hits) == 1:
#             numpy_lst = []
#             for a_f in audio_feat:
#                 numpy_lst.append(float(this_df[this_df["name"].str.lower()==user_input.lower()][a_f]))
#             return np.array(numpy_lst).reshape(1,-1)
            print(f'Following artist found for "{user_input}":\n ')
            print(f' + 1: {song_hits[0]}')
            print(" -------\n+ S: Search alternative on Spotify")
        else:
            print(f'Following artists found for "{user_input}":\n ')
            c = 0
            for a in song_hits:
                c += 1
                print(f" + {c}: {a}")
            print(" -------\n+ S: Search alternative on Spotify")
        user_choice = 241
        while 0 > int(user_choice) or int(user_choice) > len(song_hits)+1:
            user_choice = input(f"Please specify your entry (1 to {len(song_hits)}): ")
            if user_choice.lower() == "s":
                return get_unknown_song_array(user_input)
            if user_choice.isnumeric() == False:
                print("please check entry.")
                user_choice = len(song_hits)+2
            if 0 > int(user_choice) > len(song_hits)+1:
                print("please check entry.")
        this_df_index = list(this_df[this_df["name"].str.lower()==user_input.lower()].index)[int(user_choice)-1]
        # print(df_.iloc[df_index]["danceability"])
        numpy_lst = []
        for a_f in audio_feat:
            numpy_lst.append(float(this_df.iloc[this_df_index][a_f]))
        return np.array(numpy_lst).reshape(1,-1)
    else:
        print("Song not found in local database, searching on Spotify.")
        return get_unknown_song_array(user_input)

In [6]:
def get_unknown_song_array(this_song:[str]=None):
    if not this_song:
        print("no song provided.")
        return None
    x=sp.search(this_song, limit=15)
    if not x["tracks"]["items"]:
        print("song is not on spotify.")
        return None
    if len(x["tracks"]["items"]) == 1:
        pass
    else:
        artists_of_this_song = []
        for _ in range(len(x["tracks"]["items"])):
            artists_of_this_song.append(x["tracks"]["items"][_]["artists"][0]["name"])
        # artists_of_this_song = sorted(set(artists_of_this_song), key=lambda x: x.lower())
        print(f'Following artists found for "{this_song}":\n ')
        c = 0
        for a in artists_of_this_song:
            c += 1
            print(f" + {c}: {a}")
        user_choice = 241
        while 0 > int(user_choice) or int(user_choice) > len(artists_of_this_song) + 1:
            user_choice = input(f"Please specify your entry (1 to {len(artists_of_this_song)}): ")
            if user_choice.isnumeric() == False:
                print("please check entry.")
                user_choice = len(artists_of_this_song) + 2   # +2 to create a wrong int and run another while loop 
            if 0 > int(user_choice) > len(song_hits) + 1:
                print("please check entry.")
        # print((x["tracks"]["items"][int(user_choice)-1]["artists"][0]["name"]))
        song_info=sp.audio_features([x["tracks"]["items"][int(user_choice)-1]["id"]])
        remove_list = ['mode', 'duration_ms', 'id', 'type', 'uri', 'track_href', 'analysis_url'] #remove features not used in the model
        [song_info[0].pop(key) for key in remove_list] 
        song_list=list(song_info[0].values())
        song_list
        song_array = np.array(song_list).reshape(1, -1)
        song_array
        return song_array

In [7]:
def get_random_song_cluster(this_df, this_song_array):
    X = song_df.drop(['id', 'length','name', 'album','artist','release_date', 'type', 'mode', 'popularity', "cluster"], axis=1) 
    scaler = load_pickle("Model/scaler.pickle")
    kmeans = load_pickle("Model/kmeans_10.pickle")
    scaled_song = scaler.transform(this_song_array)
    cluster_pred = kmeans.predict(scaled_song)[0]
    print("cluster:", cluster_pred)
    index_song = this_df[this_df['cluster'] == cluster_pred].sample().index[0]
    id_song = this_df.iloc[index_song]['id']
    return print(f"{sp.track(id_song)['name']} by {sp.track(id_song)['artists'][0]['name']} (Spotify ID: {id_song})")

In [8]:
def add_cluster_column_to_song_df(this_df):
    # load model and cluster songs, add cluster column to df
    X = this_df.drop(['id', 'length','name', 'album','artist','release_date', 'type', 'mode', 'popularity'], axis=1) 
    
    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns)
    
    kmeans = KMeans(n_clusters=10, random_state=42)
    kmeans.fit(X_scaled_df)
    
    clusters = kmeans.predict(X_scaled_df)
    #clusters
    pd.Series(clusters).value_counts().sort_index()
    this_df["cluster"] = clusters
    return this_df

In [9]:
# song_df

In [39]:
def sample_clusters(this_df, num_of_samples:[int]=1):
    """TODO: number of cluster check"""
    for _ in range(10):
        this_song = song_df[song_df.cluster == _][["artist","name"]].sample()
        print(f"Cluster {_}")
        print(this_song)
    

<a id="set_up"></a>
## 2. SET UP

In [36]:
# create spotipy object
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(config.c_id, config.c_se))

In [37]:
# load song DataFrame
song_df = pd.read_csv("data/new_song_db.csv", sep=";")   #  cleaned csv does not need: , error_bad_lines=False)


In [38]:
song_df = add_cluster_column_to_song_df(song_df)

In [48]:
# Update Billboard Hot 100
fp_billbo_hot100 = ("./data//billboard_hot100.csv")
update_billboard_hot100(fp_billbo_hot100)


File is up to date (next update expected for 22.02.2022).
Update anyway (Y/n): n


<a id="song_recommender"></a>
## 3. SONG RECOMMENDER

In [55]:
while input("Start new song search (y/n): ").lower() != "n":
    fp_billbo_hot100 = ("./data/billboard_hot100.csv")
    user_search(fp_billbo_hot100)

Start new song search (y/n): y
What are you looking for? Stay
Direct Match:
Stay by The Kid LAROI & Justin Bieber on place 5
Another entry from Billboard Hot 100 would be Stephanie Beatriz, Olga Merediz & Encanto Cast with The Family Madrigal currently on place 20.

 ************** new request *************

What are you looking for? end_now
Start new song search (y/n): end_now


KeyboardInterrupt: Interrupted by user

In [40]:
song_array = user_input_get_features(song_df)
get_random_song_cluster(song_df, song_array)

What song are you looking for? freude 
Song not found in local database, searching on Spotify.
Following artists found for "freude ":
 
 + 1: Johann Sebastian Bach
 + 2: Johann Sebastian Bach
 + 3: Ludwig van Beethoven
 + 4: Johann Sebastian Bach
 + 5: Johann Sebastian Bach
 + 6: Gotthilf Fischer
 + 7: Johann Sebastian Bach
 + 8: Johann Sebastian Bach
 + 9: Ludwig van Beethoven
 + 10: Johann Sebastian Bach
 + 11: Johann Sebastian Bach
 + 12: Peter Breiner
 + 13: Johann Sebastian Bach
 + 14: Gotthilf Fischer
 + 15: Johann Sebastian Bach
Please specify your entry (1 to 15): 1
cluster: 7
The Moment by Toad The Wet Sprocket (Spotify ID: 2EYXHZ6Mbs9fi3JD5kFXl9)




In [None]:
# 
# track_id = "4yzlbUDqrP1ZriRp7zAcIP"
# IFrame(src="https://open.spotify.com/embed/track/"+track_id,
#        width="320",
#        height="80",
#        frameborder="0",
#        allowtransparency="true",
#        allow="encrypted-media",
#       )