# Intelligent systems course DVA439
## Music recommendation application

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import spotipy
import os
%matplotlib inline

### Reading the data

In [None]:
# Load the track table from the local Kaggle export.
spotify_data = pd.read_csv('./data/kaggle_spotify2/tracks.csv')
# Remove every "NN/NN/" run and every "-NN" run from the string values of the
# 'year' column. Presumably this reduces full release dates to a bare year --
# TODO confirm against the date formats actually present in the CSV.
# NOTE(review): later cells compare 'year' against integers and use it as a
# numeric feature; verify the column ends up numeric after this clean-up.
spotify_data = spotify_data.replace({'year': '[0-9]{2}/[0-9]{2}/'}, {'year': ''}, regex=True)
spotify_data = spotify_data.replace({'year': '-[0-9]{2}'}, {'year': ''}, regex=True)
# Show the first rows as the cell output.
spotify_data.head()

### Values of top 10 most popular songs

In [None]:
# Plot four audio features of the ten most popular tracks as grouped bars.
# The original selected 15 rows, which contradicted both the variable name and
# the "top 10" section title.
top10_songs = spotify_data.nlargest(10, 'popularity')
fig = px.bar(top10_songs, x='name', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='group')
fig.show()

### Clustering Songs with K-Means

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise every numeric column of the dataset and partition the tracks
# into 20 K-Means clusters. Both the pipeline and K-Means log their progress.
scaling_step = ('scaler', StandardScaler())
kmeans_step = ('kmeans', KMeans(n_clusters=20, verbose=2))
song_cluster_pipeline = Pipeline([scaling_step, kmeans_step], verbose=True)

# X is selected before 'cluster_label' is added, so the label never becomes
# one of its own input features.
X = spotify_data.select_dtypes(np.number)
number_cols = X.columns.tolist()

song_cluster_labels = song_cluster_pipeline.fit(X).predict(X)
spotify_data['cluster_label'] = song_cluster_labels

### Visualizing the Song Clusters with PCA

In [None]:
from sklearn.decomposition import PCA
# Reduce the standardised numeric features to two principal components so the
# clusters can be drawn on a plane.
pca_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# One row per track: 2-D coordinates plus the title and cluster used for
# hovering and colouring in the plot.
projection = pd.DataFrame(song_embedding, columns=['x', 'y'])
projection['title'] = spotify_data['name']
projection['cluster'] = spotify_data['cluster_label']

In [None]:
import plotly.express as px
# Scatter plot of the 2-D projection, coloured by cluster label.
fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()

## Finding songs with spotipy that are not in the dataset

In [None]:
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

import os

# Read the Spotify API credentials from the environment instead of keeping
# them in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# before running; a missing variable fails immediately with a KeyError.
# NOTE(review): the key pair previously written in plain text in this cell
# should be considered leaked and be rotated in the Spotify dashboard.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]

# Shared client used by every Spotify lookup below.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id, client_secret))


### Find a song based on the title and year
#### This function returns a dataframe with data for a song given the name and release year. The function uses Spotipy to fetch audio features and metadata for the specified song.

In [None]:
def find_song(name, year=0):
    """Look up a track on Spotify and return its metadata and audio features.

    Args:
        name: Track title to search for.
        year: Release year used to narrow the search. ``0`` (the default)
            searches by title only.

    Returns:
        A one-row ``pandas.DataFrame`` with ``name`` and ``year`` exactly as
        passed in, ``explicit``, ``duration_ms`` and ``popularity`` taken
        from the first search hit, and every field of that hit's
        audio-features record. ``None`` when the search has no hits or no
        audio-features record is available for the hit.
    """
    # NOTE(review): the query keeps the original "track: ..." spelling (with a
    # space after the colon); confirm against the Spotify search syntax.
    if year == 0:
        query = 'track: {}'.format(name)
    else:
        query = 'track: {} year: {}'.format(name, year)
    results = sp.search(q=query, limit=1)

    items = results['tracks']['items']
    if not items:
        return None
    track = items[0]

    # The features call returns a list with one entry per requested id; the
    # entry may be None when Spotify has no analysis for the track, which
    # previously crashed on `.items()`.
    audio_features = sp.audio_features(track['id'])
    if not audio_features or audio_features[0] is None:
        return None

    song_data = {
        'name': name,
        'year': year,
        'explicit': int(track['explicit']),
        'duration_ms': track['duration_ms'],
        'popularity': track['popularity'],
    }
    # Audio-feature fields are added last and win on key clashes
    # (e.g. 'duration_ms'), as in the original implementation.
    song_data.update(audio_features[0])

    return pd.DataFrame([song_data])


def find_songs(songs):
    """Fetch Spotify data for several songs and stack it into one frame.

    Args:
        songs: Iterable of dicts, each with a ``"name"`` and a ``"year"``
            entry, as accepted by ``find_song``.

    Returns:
        A ``pandas.DataFrame`` with one row per song that ``find_song`` could
        resolve, in input order. Songs that cannot be resolved are skipped.
        An empty frame is returned when nothing could be resolved (the
        original raised ``ValueError`` from ``pd.concat`` in that case).
    """
    frames = [find_song(s["name"], s["year"]) for s in songs]
    frames = [frame for frame in frames if frame is not None]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Smoke test of the Spotify lookup: fetch one track by title and year and show
# the result as the cell output.
a = find_song("Euphoria", 2012)
a

### Recommend songs

In [None]:
from collections import defaultdict
from scipy.spatial.distance import cdist
import difflib

# Feature columns used to build song vectors for the recommender. This rebinds
# the `number_cols` name that the clustering cell derived from the dataset.
# NOTE(review): recommend_songs feeds these columns to the scaler that was
# fitted in the clustering cell on all numeric dataset columns; verify that
# both column sets (and their order) are the same.
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

def get_song_data(song, spotify_data):
    """Return the data record for one song, preferring the local dataset.

    Args:
        song: Dict with a ``'name'`` and a ``'year'`` entry.
        spotify_data: DataFrame with at least ``'name'`` and ``'year'``
            columns.

    Returns:
        A ``pandas.Series`` for the first dataset row whose name and year
        both match. When the dataset has no such row, the song is looked up
        through ``find_song`` and the single row of that result is returned
        as a Series as well, so callers always receive the same shape (the
        original returned a one-row DataFrame on this path). ``None`` when
        the song is found in neither place.
    """
    matches = spotify_data[(spotify_data['name'] == song['name'])
                           & (spotify_data['year'] == song['year'])]
    if not matches.empty:
        return matches.iloc[0]

    # Not in the dataset: fall back to the Spotify API.
    fetched = find_song(song['name'], song['year'])
    if fetched is None:
        return None
    return fetched.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """Compute the mean feature vector of a list of songs.

    Args:
        song_list: Iterable of dicts with ``'name'`` and ``'year'`` entries.
        spotify_data: Dataset passed through to ``get_song_data``.

    Returns:
        A 1-D ``numpy`` array holding the per-column mean of the
        ``number_cols`` features over all songs that could be resolved.
        Songs that cannot be resolved are reported with a warning and
        skipped. If no song can be resolved the result is NaN.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data may hand back a Series (dataset hit) or a one-row
        # DataFrame (Spotify fallback). Flatten both to a 1-D float vector so
        # the stacked matrix is never ragged.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)

def flatten_dict_list(dict_list):
    """Turn a list of dicts into one dict of lists.

    Args:
        dict_list: List of dictionaries, e.g. ``[{'name': 'a', 'year': 1}]``.

    Returns:
        A ``defaultdict(list)`` mapping every key that occurs in any input
        dict to the list of its values, in input order. An empty input gives
        an empty mapping (the original raised ``IndexError``), and keys
        missing from the first dict are collected too (the original raised
        ``KeyError``).
    """
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict
       
    
def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend dataset songs closest to the mean of the given songs.

    Args:
        song_list: List of dicts with ``'name'`` and ``'year'`` entries
            describing songs the user has listened to.
        spotify_data: Dataset to recommend from; needs the ``number_cols``
            feature columns plus ``'name'``, ``'year'`` and ``'artists'``.
        n_songs: Number of recommendations to return.

    Returns:
        A list of up to ``n_songs`` dicts with ``'name'``, ``'year'`` and
        ``'artists'``, ordered by increasing cosine distance to the mean
        song vector. Songs whose name appears in ``song_list`` are excluded
        before the cut-off, so the list is only shorter than ``n_songs`` when
        the dataset itself runs out (the original truncated first and could
        return fewer).
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    # Reuse the scaler fitted by the clustering pipeline.
    # NOTE(review): it was fitted on all numeric dataset columns; confirm
    # these match `number_cols`.
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]

    # Rank all rows, drop the input songs, then keep the n_songs closest.
    order = np.argsort(distances)
    is_input_song = spotify_data['name'].isin(song_dict['name']).to_numpy()
    index = order[~is_input_song[order]][:n_songs]

    rec_songs = spotify_data.iloc[index]
    return rec_songs[metadata_cols].to_dict(orient='records')

### Recommend songs from a custom song list

In [None]:
# Recommendations seeded with five songs released between 1991 and 1993; the
# returned list is shown as the cell output.
recommend_songs([{'name': 'Come As You Are', 'year':1991},
                {'name': 'Smells Like Teen Spirit', 'year': 1991},
                {'name': 'Lithium', 'year': 1992},
                {'name': 'All Apologies', 'year': 1993},
                {'name': 'Stay Away', 'year': 1993}],  spotify_data)

In [None]:
# Recommendations seeded with five songs released between 2001 and 2005.
# Names must match the dataset exactly (including capitalisation) to be found
# locally; otherwise the Spotify search fallback is used.
recommend_songs([{'name': 'Toxicity', 'year':2001},
                {'name': 'Forest', 'year': 2001},
                {'name': 'B.Y.O.B.', 'year': 2005},
                {'name':  'Chop suey', 'year': 2001},
                {'name': 'Deer dance', 'year': 2001}], spotify_data)

In [None]:
# Recommendations seeded with five titles given with year 0. For year 0 the
# Spotify fallback searches by title only.
# NOTE(review): presumably no dataset row has year == 0, so every one of these
# goes through the Spotify search -- verify.
recommend_songs([{'name': 'Symphony No. 40 in G minor', 'year': 0},
                {'name': 'Piano Concerto No. 21', 'year': 0},
                {'name': 'Rondo for Piano in D Major, K. 485', 'year': 0},
                {'name': 'Concerto for Piano No. 9 in E-flat major, K. 271 "Jeunehomme": II. Andantino', 'year': 0},
                {'name': 'Concerto for Piano and Orchestra No. 23 in A Major, KV 488: Allegro assai', 'year': 0}], spotify_data)

### Obtain artist and album genres

In [None]:
def album_artist_genres(name, year=0):
    """Return the genres associated with a song on Spotify.

    The first search hit for the song is used. Its album's genres are
    returned when the album has any; otherwise the genres of the track's
    first listed artist are returned.

    The original ran a second free-text search on the artist name and used
    whatever track that returned, which could belong to a different album or
    even a different artist. The hit for the requested song is now used
    directly.

    Args:
        name: Track title to search for.
        year: Release year used to narrow the search. ``0`` (the default)
            searches by title only.

    Returns:
        A list of genre strings. Empty when the search has no hits or when
        neither the album nor the artist carries genres.
    """
    # Same query format as find_song, so both helpers resolve the same track.
    if year == 0:
        query = 'track: {}'.format(name)
    else:
        query = 'track: {} year: {}'.format(name, year)

    items = sp.search(q=query, limit=1)['tracks']['items']
    if not items:
        return []
    track = items[0]

    album = sp.album(track["album"]["external_urls"]["spotify"])
    if album["genres"]:
        return album["genres"]

    # Album has no genres: fall back to the artist's genres.
    artist = sp.artist(track["artists"][0]["external_urls"]["spotify"])
    return artist["genres"]

In [None]:
# Smoke test of the genre lookup for a single title (year 0 = search by title
# only); the genre list is shown as the cell output.
genres = album_artist_genres("Symphony No. 40 in G minor", 0)
genres

In [None]:
# Input songs for the clustering experiments below. A year of 0 makes the
# Spotify lookups search by title only.
song_list = [{'name': 'Come As You Are', 'year':1991},
            {'name': 'Smells Like Teen Spirit', 'year': 1991},
            {'name': 'Lithium', 'year': 1992},
            {'name': 'Toxicity', 'year':2001},
            {'name': 'Forest', 'year': 2001},
            {'name': 'B.Y.O.B.', 'year': 2005},
            {'name': 'Symphony No. 40 in G minor', 'year': 0},
            {'name': 'Piano Concerto No. 21', 'year': 0},
            {'name': 'Rondo for Piano in D Major, K. 485', 'year': 0},
            {'name': 'Euphoria', 'year': 2012},
            {'name': 'Hello', 'year': 2015},
            {'name': 'Someone like you', 'year': 2011}]

# Fetch metadata and audio features for every input song from Spotify. The
# clustering functions below read and modify this module-level frame.
song_list_data = find_songs(song_list)

### Determine the number of clusters by grouping songs in genres

In [None]:
# Estimate how many clusters the input songs need by grouping them on genre.
# Each entry of `clusters` is a list of genres. A song joins every existing
# group it overlaps with sufficiently; otherwise it starts a new group.
clusters = []
for s in song_list:
    genres = album_artist_genres(s["name"], s["year"])
    merged = False
    for c in clusters:
        shared = set(genres) & set(c)

        # The song is close enough when at least a quarter of its genres are
        # already in the group (or the overlap exceeds the group's size).
        if len(shared) >= len(genres) / 4 or len(shared) > len(c):
            # Add only the genres the group does not contain yet.
            c.extend(list(set(genres) - set(c)))
            merged = True

    if not merged:
        # No similar group exists: this song's genres start a new one.
        clusters.append(genres)

clusters_num = len(clusters)
print(clusters)
print("number of clusters:", clusters_num)

### Recommend songs for each cluster

In [None]:
def recommend_clusters(song_list_data, clusters_num):
    """Recommend three songs for every cluster of the input songs.

    For each label ``0 .. clusters_num - 1`` the input songs carrying that
    label are passed to ``recommend_songs`` (against the module-level
    ``spotify_data``), and both the cluster's songs and its recommendations
    are displayed.

    Args:
        song_list_data: DataFrame with ``'name'``, ``'year'`` and
            ``'cluster_label'`` columns.
        clusters_num: Number of cluster labels to iterate over.

    Returns:
        A list with the recommendation dicts of all clusters, concatenated
        in label order.
    """
    all_recommended = []
    for label in range(clusters_num):
        frame = song_list_data.loc[song_list_data['cluster_label'] == label]
        if frame.empty:
            # A clusterer may use fewer labels than requested. There is
            # nothing to average here, and recommend_songs cannot handle an
            # empty song list.
            print("cluster {} has no songs, skipping".format(label))
            continue

        song_list_cluster = [{"name": n, "year": y}
                             for n, y in zip(frame["name"], frame["year"])]
        recommended = recommend_songs(song_list_cluster, spotify_data, 3)
        all_recommended += recommended

        display(pd.DataFrame.from_dict(song_list_cluster))
        display(pd.DataFrame.from_dict(recommended))
        print("--------------------")
    return all_recommended
        

### Recommend songs with K-Means clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def K_clustering(input_songs):
    """Cluster the input songs with K-Means and recommend songs per cluster.

    The module-level ``song_list_data`` frame is clustered into
    ``clusters_num`` groups on its standardised numeric columns, the labels
    are stored in its ``'cluster_label'`` column, and ``recommend_clusters``
    is run on the result.

    Args:
        input_songs: Currently unused. The function works on the
            module-level ``song_list_data``, which must already hold the
            fetched data for the songs of interest.

    Returns:
        The list of recommendation dicts produced by ``recommend_clusters``.
    """
    song_cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                                      ('kmeans', KMeans(n_clusters=clusters_num))])
    # Exclude labels left over from an earlier clustering run; otherwise the
    # old 'cluster_label' column would be used as an input feature.
    X = song_list_data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
    song_cluster_pipeline.fit(X)
    song_cluster_labels = song_cluster_pipeline.predict(X)
    song_list_data['cluster_label'] = song_cluster_labels

    k_recomSongs = recommend_clusters(song_list_data, clusters_num)
    return k_recomSongs

# Run the K-Means recommender on the input songs and show all recommendations
# in one table.
# NOTE(review): despite its name, `table_birch` holds the K-Means result here.
resK=K_clustering(song_list)
table_birch=pd.DataFrame.from_dict(resK)
display(table_birch)

### Recommend songs with BIRCH clustering

In [None]:
from sklearn.cluster import Birch

def BIRCH_clustering():
    """Cluster the input songs with BIRCH and recommend songs per cluster.

    The module-level ``song_list_data`` frame is clustered into
    ``clusters_num`` groups on its numeric columns, the labels are stored in
    its ``'cluster_label'`` column, and ``recommend_clusters`` is run on the
    result.

    Returns:
        The list of recommendation dicts produced by ``recommend_clusters``.
    """
    model = Birch(threshold=0.01, n_clusters=clusters_num)
    # Exclude labels written by an earlier clustering run (K_clustering is
    # executed before this cell); otherwise the previous 'cluster_label'
    # column would be used as an input feature.
    # NOTE(review): unlike the K-Means pipeline, the features are not
    # standardised here -- confirm this is intended.
    X = song_list_data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
    model.fit(X)
    yhat = model.predict(X)
    song_list_data['cluster_label'] = yhat

    birch_recomSongs = recommend_clusters(song_list_data, clusters_num)

    return birch_recomSongs

# Run the BIRCH recommender; the per-cluster tables are displayed by
# recommend_clusters and the combined list is kept in `restyty`.
restyty = BIRCH_clustering()

 ### Getting recommended songs from spotify and extracting the necessary information

In [None]:

def spotifyRecomend_extractData(tracks):
    """Reduce a Spotify recommendations response to name/artist pairs.

    Args:
        tracks: Response dict whose ``'tracks'`` entry is a list of track
            dicts, each with a ``'name'`` and a non-empty ``'artists'`` list.

    Returns:
        A list of dicts, one per track and in response order, with the
        track's ``'name'`` and, under ``'artists'``, the name of its first
        listed artist.
    """
    return [
        {'name': entry['name'], 'artists': entry['artists'][0]['name']}
        for entry in tracks['tracks']
    ]

In [None]:
# Number of seed songs sent to the Spotify recommendation function per request.
NUM_SpotifySng= 5
def getSpotifyRecomendedSongs(listSongs):  # listSongs must be a DataFrame
    """Collect Spotify's own recommendations for the given songs.

    The Spotify call takes a limited number of seed tracks, so the frame is
    walked in consecutive batches of ``NUM_SpotifySng`` rows (the last batch
    may be shorter) and one request is made per batch.

    Args:
        listSongs: DataFrame with an ``'id'`` column of Spotify track ids.

    Returns:
        A list of ``{'name', 'artists'}`` dicts: the recommendations of all
        batches, concatenated in batch order. Empty for an empty frame.
    """
    collected = []
    for start in range(0, len(listSongs), NUM_SpotifySng):
        batch = listSongs.iloc[start:start + NUM_SpotifySng]
        seed_ids = batch['id'].values.tolist()
        response = sp.recommendations(seed_tracks=seed_ids)
        collected += spotifyRecomend_extractData(response)
    return collected

# Ask Spotify for recommendations based on the input songs and show them.
# NOTE(review): this variable is spelled `spotify_recomended`; the comparison
# cell further down uses a separate variable named `spotify_recommended`.
spotify_recomended=getSpotifyRecomendedSongs(song_list_data)
table=pd.DataFrame.from_dict(spotify_recomended)
display(table)

### Testing and comparing recommended song lists

In [None]:
def DisplayAsTable(inputList):
    """Render a list of dicts (or any ``DataFrame.from_dict`` input) as a table."""
    display(pd.DataFrame.from_dict(inputList))

In [None]:
# Input songs for the comparison; change only this line to test other songs.
ls_input_songs= song_list #(CHANGE ONLY THIS IF YOU NEED)
# Data for the input songs. Input: list[dict]; returns: DataFrame.
df_input_songs = find_songs(ls_input_songs)  # list of songs based on which we are doing recomondation,return type: Dataframe, input: list[dict]
# Spotify's recommendations for the input songs. Input: DataFrame; returns: list[dict].
spotify_recommended = getSpotifyRecomendedSongs(df_input_songs) #recomended songs by spotify based on input songs,return type: list[dict], input type: Dataframe
# Our K-Means recommendations. Returns: list[dict].
# NOTE(review): K_clustering does not read its argument; it clusters the
# module-level `song_list_data`, so changing `ls_input_songs` alone does not
# change this result.
kCluster_recommended= K_clustering(ls_input_songs) #input type: list[dict], output type: list[dict]


# Display the input songs and both recommendation lists as tables.
print("List of input songs")
DisplayAsTable(ls_input_songs)
print("Recomended songs by K_cluster")
DisplayAsTable(kCluster_recommended)
print("Recomended songs by spotify")
DisplayAsTable(spotify_recommended)


In [None]:
def getSongsFromBothRecomList(ourRecomSongs, spotifyRcmSongs):
    """Return our recommendations that Spotify recommended as well.

    Songs are matched on their exact ``'name'`` value only.

    Args:
        ourRecomSongs: List of dicts with a ``'name'`` entry (our
            recommendations).
        spotifyRcmSongs: List of dicts with a ``'name'`` entry (Spotify's
            recommendations).

    Returns:
        The dicts of ``ourRecomSongs`` whose name also occurs in
        ``spotifyRcmSongs``, in their original order.
    """
    # Build the lookup once instead of rescanning Spotify's list per song.
    spotify_names = {song['name'] for song in spotifyRcmSongs}
    return [song for song in ourRecomSongs if song['name'] in spotify_names]
        
# Show the songs that both the K-Means recommender and Spotify recommended.
k_vs_spotify=getSongsFromBothRecomList(kCluster_recommended,spotify_recommended)
DisplayAsTable(k_vs_spotify)
