In [114]:
# Requires the spotipy, scikit-learn, pandas and numpy packages (scipy is also imported further down).

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [115]:
# Load the three CSV files of the Kaggle data set into DataFrames.
genreData = pd.read_csv('../data/data_by_genres.csv')
yearData = pd.read_csv('../data/data_by_year.csv')
data = pd.read_csv("../data/data.csv")

In [116]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Cluster the genre table on its standardized numeric columns.
# Any 'cluster' column left over from a previous run of this cell is dropped first,
# so re-running the cell does not feed the old labels back in as a feature.
X = genreData.drop(columns=['cluster'], errors='ignore').select_dtypes(np.number)

# random_state pins the k-means initialisation so the labels are reproducible
# across kernel restarts.
cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                             ('kmeans', KMeans(n_clusters=10, random_state=42))])
cluster_pipeline.fit(X)
genreData['cluster'] = cluster_pipeline.predict(X)

In [117]:
# Cluster the song table on its standardized numeric columns.
# 'cluster_label' (written at the end of this cell) is excluded so that re-running
# the cell keeps the feature set, and therefore the fitted scaler's width, unchanged.
X = data.drop(columns=['cluster_label'], errors='ignore').select_dtypes(np.number)
numberCols = list(X.columns)

# random_state pins the k-means initialisation so the labels are reproducible
# across kernel restarts.
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('kmeans', KMeans(n_clusters=20,
                                                    verbose=False,
                                                    random_state=42))
                                 ], verbose=False)

song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

In [118]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Credentials are deliberately not hard-coded. Called without arguments,
# SpotifyClientCredentials reads SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET from the
# environment, so export both before starting the kernel.
# Any key pair that was ever committed to this notebook should be considered leaked
# and rotated in the Spotify developer dashboard.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

def findSong(name, year):
    """Look up one track on Spotify and return its data as a one-row DataFrame.

    Uses the module-level ``sp`` client for both the search and the
    audio-features request.

    Parameters
    ----------
    name :
        Track title; inserted into the ``track:`` search filter and stored in
        the ``name`` column of the result.
    year :
        Year; inserted into the ``year:`` search filter and stored in the
        ``year`` column of the result.

    Returns
    -------
    pandas.DataFrame or None
        A single row with ``name``, ``year``, ``explicit`` (as int),
        ``duration_ms``, ``popularity`` plus every key of the audio-features
        payload. ``None`` when the search returns no track or when no audio
        features are available for the track that was found.
    """
    # Spotify field filters are written ``field:value`` with no space after the colon.
    results = sp.search(q='track:{} year:{}'.format(name, year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]

    # The audio-features response holds one entry per requested id, and that entry
    # can be None; treat a missing payload the same way as "song not found".
    features = sp.audio_features(track['id'])
    audio_features = features[0] if features else None
    if audio_features is None:
        return None

    songData = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    # Audio-feature values are scalars; pandas broadcasts them against the
    # one-element lists above. Keys present in both (e.g. duration_ms) take the
    # audio-features value, as before.
    songData.update(audio_features)

    return pd.DataFrame(songData)

In [119]:
from collections import defaultdict
from scipy.spatial.distance import cdist
import difflib

# Feature columns used to build song vectors. recommendSongs passes
# spotifyData[numberCols] straight to the scaler taken from song_cluster_pipeline,
# so the order here presumably has to match the columns that scaler was fitted on.
numberCols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]


def getSongData(song, spotifyData):
    """Return the data row for ``song``, preferring the local table over Spotify.

    ``song`` is a mapping with ``'name'`` and ``'year'`` entries. The first row
    of ``spotifyData`` whose ``name`` and ``year`` both equal those values is
    returned. When there is no such row the song is looked up with
    ``findSong``; its first row is returned, or ``None`` if that lookup also
    comes back empty.
    """
    sameName = spotifyData['name'] == song['name']
    sameYear = spotifyData['year'] == song['year']
    localMatches = spotifyData[sameName & sameYear]
    if len(localMatches) > 0:
        return localMatches.iloc[0]

    # Not in the local table: fall back to the Spotify lookup.
    fetched = findSong(song['name'], song['year'])
    return None if fetched is None else fetched.iloc[0]
        

def getMeanVector(songList, spotifyData):
    """Average the ``numberCols`` feature vectors of the songs in ``songList``.

    Each song is resolved with ``getSongData``. Songs that cannot be resolved,
    or whose data lacks one of the ``numberCols`` entries, are skipped with a
    printed warning.

    Parameters
    ----------
    songList :
        Iterable of mappings with ``'name'`` and ``'year'`` entries.
    spotifyData :
        Table handed through to ``getSongData``.

    Returns
    -------
    numpy.ndarray or None
        Float vector of length ``len(numberCols)`` holding the column-wise
        mean, or ``None`` when no song produced a usable vector.
    """
    songVectors = []
    for song in songList:
        songData = getSongData(song, spotifyData)
        if songData is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue

        # Selecting by label raises KeyError when a column is absent, so the
        # completeness check has to happen before the selection.
        if any(col not in songData.index for col in numberCols):
            print('Warning: Incomplete data for {}'.format(song['name']))
            continue

        # A row taken from a mixed-type frame is object dtype; cast to float so
        # the mean is a plain numeric array.
        songVectors.append(songData[numberCols].to_numpy(dtype=float))

    if not songVectors:
        return None
    return np.mean(np.array(songVectors), axis=0)

def flattenDict(dict):
    """Turn a list of dicts into a dict of lists.

    Parameters
    ----------
    dict :
        Sequence of mappings, e.g. ``[{'name': 'a', 'year': 1}, ...]``. The
        parameter keeps its original name for compatibility even though it
        shadows the builtin inside this function.

    Returns
    -------
    collections.defaultdict
        Maps every key seen in any of the mappings to the list of values
        collected for it, in input order. An empty input yields an empty
        mapping instead of raising ``IndexError``, and a key that first appears
        in a later mapping no longer raises ``KeyError``.
    """
    # A list factory creates each key's list on first use, which covers both the
    # empty-input case and keys that are missing from the first mapping.
    flattenedDict = defaultdict(list)
    for dictionary in dict:
        for key, value in dictionary.items():
            flattenedDict[key].append(value)

    return flattenedDict


def recommendSongs( songList, spotifyData, numSongs=10):
    """Recommend the songs in ``spotifyData`` closest to the mean of ``songList``.

    The mean feature vector of the seed songs and every row of
    ``spotifyData[numberCols]`` are scaled with the scaler step of the
    module-level ``song_cluster_pipeline`` and compared by cosine distance.

    Parameters
    ----------
    songList :
        List of mappings with ``'name'`` and ``'year'`` entries (the seeds).
    spotifyData :
        DataFrame containing the ``numberCols`` columns plus ``name``,
        ``year`` and ``artists``.
    numSongs :
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        Up to ``numSongs`` records with ``name``, ``year`` and ``artists``,
        nearest first. Rows whose ``name`` equals one of the seed names are
        excluded. Empty when ``songList`` is empty or none of the seeds could
        be resolved.
    """
    metadataCols = ['name', 'year', 'artists']
    if not songList:
        return []
    songDict = flattenDict(songList)

    songCenter = getMeanVector(songList, spotifyData)
    if songCenter is None:
        # No seed could be resolved, so there is nothing to measure distance from.
        return []

    scaler = song_cluster_pipeline.steps[0][1]
    scaledData = scaler.transform(spotifyData[numberCols])
    scaledSongCenter = scaler.transform(songCenter.reshape(1, -1))
    distances = cdist(scaledSongCenter, scaledData, 'cosine')[0]

    # Rank every row, remove the seeds, and only then truncate. Truncating first
    # would return fewer than numSongs results whenever a seed ranks near the top.
    ranked = np.argsort(distances)
    isSeed = spotifyData['name'].isin(songDict['name']).to_numpy()
    index = ranked[~isSeed[ranked]][:numSongs]

    # resSongs is the resulting list of recommended songs
    resSongs = spotifyData.iloc[index]
    return resSongs[metadataCols].to_dict(orient='records')


In [122]:
# Each seed song must be a dict of the form {'name': <track title>, 'year': <year>}.
# The function defined in this notebook is recommendSongs; calling recommend_songs
# raises NameError on a fresh kernel.
recommendSongs([{'name': 'Them Changes', 'year':2017},
                {'name': 'WEIGHT OFF', 'year': 2016},
                {'name': 'Ordinary Pleasure', 'year': 2019},
                {'name': 'Deep Down Body Thurst', 'year': 2017},
                {'name': 'Some', 'year': 2017}],  data)

[{'name': 'Alone, Pt. II',
  'year': 2019,
  'artists': "['Alan Walker', 'Ava Max']"},
 {'name': 'Canyon Moon', 'year': 2019, 'artists': "['Harry Styles']"},
 {'name': 'Ophelia', 'year': 2016, 'artists': "['The Lumineers']"},
 {'name': 'hole in the bottle - ballerini album version',
  'year': 2020,
  'artists': "['Kelsea Ballerini']"},
 {'name': 'El Envidioso', 'year': 2020, 'artists': "['Los Dos Carnales']"},
 {'name': 'Look Up Child', 'year': 2018, 'artists': "['Lauren Daigle']"},
 {'name': 'Hablemos',
  'year': 2015,
  'artists': "['Ariel Camacho y Los Plebes Del Rancho']"},
 {'name': 'Sunflower - Spider-Man: Into the Spider-Verse',
  'year': 2019,
  'artists': "['Post Malone', 'Swae Lee']"},
 {'name': 'Starving',
  'year': 2016,
  'artists': "['Hailee Steinfeld', 'Grey', 'Zedd']"},
 {'name': 'One Of Them Girls', 'year': 2020, 'artists': "['Lee Brice']"}]