In [88]:
import pandas as pd
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
import os
from os import getenv

In [89]:
df = pd.read_csv('../data/data.csv')
data_artist_df = pd.read_csv('../data/data_by_artist.csv')
data_genre_df = pd.read_csv('../data/data_by_genres.csv')
data_year_df = pd.read_csv('../data/data_by_year.csv')
data_w_genre_df = pd.read_csv('../data/data_w_genres.csv')

In [90]:
df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [91]:
data_artist_df.shape

(32539, 15)

In [92]:
df['artists'] = df['artists'].str.strip('[]')

In [93]:
df['artists'] = df['artists'].str.strip('"')

In [94]:
df['artists'] = df['artists'].str.strip("'")

In [95]:
df["artists"]

0                            Mamie Smith
1                  Screamin' Jay Hawkins
2                            Mamie Smith
3                        Oscar Velazquez
4                                   Mixe
                       ...              
174384    DJ Combo', 'Sander-7', 'Tony T
174385                      Alessia Cara
174386                         Roger Fly
174387                      Taylor Swift
174388                         Roger Fly
Name: artists, Length: 174389, dtype: object

In [96]:
df = df.drop(columns='release_date')
df = df.drop(columns='year')


In [115]:
df.head()
path = r'C:/Users/Megal/OneDrive/Documents/Github/build week spotify/ds-modeling/spotify_df.csv'

In [116]:
df.to_csv(path_or_buf=path)

In [125]:
number_cols = ['acousticness', 'danceability', 'duration_ms', 'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
               'popularity', 'speechiness', 'tempo', 'valence'] # year

In [100]:

cluster_pipeline = Pipeline([('scaler', StandardScaler()),('kmeans',
                                                           KMeans(n_clusters=10, n_jobs=-1))])
X = data_genre_df.select_dtypes(np.number)
cluster_pipeline.fit(X)
data_genre_df['cluster'] = cluster_pipeline.predict(X)

In [101]:

tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=2))])

genre_embedding = tsne_pipeline.fit_transform(X)

projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = data_genre_df['genres']
projection['cluster'] = data_genre_df['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3232 samples in 0.007s...
[t-SNE] Computed neighbors for 3232 samples in 0.318s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3232
[t-SNE] Computed conditional probabilities for sample 2000 / 3232
[t-SNE] Computed conditional probabilities for sample 3000 / 3232
[t-SNE] Computed conditional probabilities for sample 3232 / 3232
[t-SNE] Mean sigma: 0.789973
[t-SNE] Computed conditional probabilities in 0.097s
[t-SNE] Iteration 50: error = 82.9442444, gradient norm = 0.0185166 (50 iterations in 0.882s)
[t-SNE] Iteration 100: error = 76.4292450, gradient norm = 0.0033341 (50 iterations in 0.773s)
[t-SNE] Iteration 150: error = 76.1993027, gradient norm = 0.0009161 (50 iterations in 0.762s)
[t-SNE] Iteration 200: error = 76.1834183, gradient norm = 0.0003400 (50 iterations in 0.780s)
[t-SNE] Iteration 250: error = 76.1807785, gradient norm = 0.0002630 (50 iterations in 0.821s)
[t-SNE] KL divergence after 250 iterati

In [102]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('kmeans', KMeans(n_clusters=20,
                                                    verbose=2, n_jobs=-1))], verbose=True)
X = df.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_fitted = song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
df['cluster_label'] = song_cluster_labels

nertia 1025702.016910514
Iteration 76, inertia 1025691.9330898969
Iteration 77, inertia 1025684.3277408275
Iteration 78, inertia 1025677.3490785011
Iteration 79, inertia 1025671.7721301515
Iteration 80, inertia 1025667.3704461236
Iteration 81, inertia 1025664.7365463397
Converged at iteration 81: center shift 9.81894983330122e-05 within tolerance 9.999999999999879e-05.
Initialization complete
Iteration 0, inertia 1426790.294839426
Iteration 1, inertia 1112405.8491854244
Iteration 2, inertia 1083239.8088192588
Iteration 3, inertia 1069306.3091344286
Iteration 4, inertia 1062646.735167214
Iteration 5, inertia 1058941.4058433943
Iteration 6, inertia 1056203.2995425302
Iteration 7, inertia 1053897.2345584307
Iteration 8, inertia 1051736.9626903262
Iteration 9, inertia 1049157.9326859491
Iteration 10, inertia 1045961.9429076265
Iteration 11, inertia 1042634.4490922639
Iteration 12, inertia 1039304.7175589836
Iteration 13, inertia 1036880.2609517571
Iteration 14, inertia 1035598.988937173
It

In [118]:
import pickle
cluster_file = 'cluster.pickle'
pickle.dump(song_cluster_pipeline, open(cluster_file, 'wb'))

In [105]:


SPOTIPY_CLIENT_ID = getenv('SPOTIPY_CLIENT_ID')
SPOTIPY_SECRET_ID = getenv('SECRET_ID')

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID,
                                                           client_secret=SPOTIPY_SECRET_ID))

def find_song(name):
  song_data = defaultdict()
  results = sp.search(q='track: {}'.format(name), limit=1)
  if results['tracks']['items'] == []:
    return None

  results = results['tracks']['items'][0]
  track_id = results['id']
  audio_features = sp.audio_features(track_id)[0]

  song_data['name'] = [name]  
  # song_data['year'] = [year]
  song_data['explicit'] = [int(results['explicit'])]
  song_data['duration_ms'] = [results['duration_ms']]
  song_data['popularity'] = [results['popularity']]

  for key, value in audio_features.items():
    song_data[key] = value

  return pd.DataFrame(song_data)

In [106]:
xd = sp.search(q="track: michael jackson")

In [107]:
xd = xd['tracks']['items'][0]

In [108]:
t_id = xd['id']

In [109]:
find_song("Thriller")

Unnamed: 0,name,explicit,duration_ms,popularity,danceability,energy,key,loudness,mode,speechiness,...,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,time_signature
0,Thriller,0,358053,67,0.764,0.887,11,-3.726,1,0.0744,...,0.00011,0.847,0.72,118.427,audio_features,7azo4rpSUh8nXgtonC6Pkq,spotify:track:7azo4rpSUh8nXgtonC6Pkq,https://api.spotify.com/v1/tracks/7azo4rpSUh8n...,https://api.spotify.com/v1/audio-analysis/7azo...,4


In [84]:
audio_feat = sp.audio_features(t_id)[0]

In [85]:
audio_feat

{'danceability': 0.822,
 'energy': 0.318,
 'key': 10,
 'loudness': -12.391,
 'mode': 1,
 'speechiness': 0.0575,
 'acousticness': 0.493,
 'instrumentalness': 0.0128,
 'liveness': 0.11,
 'valence': 0.138,
 'tempo': 102.998,
 'type': 'audio_features',
 'id': '6G8kHiVZ1jW7vHMPVRNZU0',
 'uri': 'spotify:track:6G8kHiVZ1jW7vHMPVRNZU0',
 'track_href': 'https://api.spotify.com/v1/tracks/6G8kHiVZ1jW7vHMPVRNZU0',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/6G8kHiVZ1jW7vHMPVRNZU0',
 'duration_ms': 245787,
 'time_signature': 4}

In [140]:
def get_song_data(song, df):

  try:
    song_data = df[(df['name'] == song['name'])].iloc[0]
     
    return song_data
  
  except IndexError:
    return find_song(song['name']) # , song['year']


def get_mean_vector(song_list, df):
  
  song_vectors = []

  for song in song_list:
    song_data = get_song_data(song, df)
    if song_data is None:
      print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
      continue
    song_vector = song_data[number_cols].values
    song_vectors.append(song_vector)

  song_matrix = np.array(list(song_vectors))
  return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
  flattened_dict = defaultdict()
  for key in dict_list[0].keys():
    flattened_dict[key] = []
  
  for dictionary in dict_list:
    for key, value in dictionary.items():
      flattened_dict[key].append(value)
  return flattened_dict
  

def recommend_songs(song_list, df=df, n_songs=10):
  metadata_cols = ['name', 'artists']
  song_dict = flatten_dict_list(song_list)

  song_center = get_mean_vector(song_list, df)
  scaler = song_cluster_pipeline.steps[0][1]
  scaled_data = scaler.transform(df[number_cols])
  scaled_song_center = scaler.transform(song_center.reshape(1,-1))
  distances = cdist(scaled_song_center, scaled_data, 'cosine')
  index = list(np.argsort(distances)[:, :n_songs][0])

  rec_songs = df.iloc[index]
  rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
  ten_songs = rec_songs[metadata_cols].to_dict(orient='records')
  return ten_songs, scaled_song_center, scaled_data

In [141]:
ten_song, ssc, sd = recommend_songs([{'name': 'DVP'}])

In [188]:
ten_song

[{'name': 'Tired Of Sex', 'artists': 'Weezer'},
 {'name': 'How You Love Me Now', 'artists': 'Hey Monday'},
 {'name': 'Heaven Is Falling', 'artists': 'Bad Religion'},
 {'name': 'Lost in Space', 'artists': 'Misfits'},
 {'name': "You Be Tails, I'll Be Sonic", 'artists': 'A Day To Remember'},
 {'name': 'Drowning (Face Down)', 'artists': 'Saving Abel'},
 {'name': 'Angels Of The Silences', 'artists': 'Counting Crows'},
 {'name': 'Anarchy in the U.K.', 'artists': 'Sex Pistols'},
 {'name': 'Search and Destroy - Iggy Pop Mix', 'artists': 'The Stooges'},
 {'name': 'The Air That I Breathe', 'artists': 'All That Remains'}]

In [171]:
ssc

array([[-1.3023512 , -0.99848224, -0.56549629,  1.88598466, -0.27040128,
        -0.5895536 ,  0.22587578,  0.6475452 ,  1.84937665,  0.6509402 ,
         1.15699688, -0.03417752,  1.02315088, -1.12498921]])

In [173]:
import itertools
r = list(itertools.chain.from_iterable(ssc))

In [183]:
r = [abs(i) for i in r]

In [193]:
r1 = sd[5]
r1 = [abs(i) for i in r1]

In [148]:
df1 = pd.DataFrame(dict(r=[ssc], theta=[number_cols]))

In [114]:
user_input = input('enter a song, or artist and well suggest 10 new songs:')
input_dict = {'name': user_input[0]}
recommend_songs([input_dict])

[{'name': 'Good Days', 'artists': 'SZA'},
 {'name': 'Long Time - Intro', 'artists': 'Playboi Carti'},
 {'name': 'No Love', 'artists': "August Alsina', 'Nicki Minaj"},
 {'name': 'Love Is Only a Feeling', 'artists': 'Joey Bada$$'},
 {'name': 'Impossible', 'artists': 'Travis Scott'},
 {'name': 'POWER', 'artists': 'Kanye West'},
 {'name': 'Bleed It Out', 'artists': 'Linkin Park'},
 {'name': 'The Dope Show', 'artists': 'Marilyn Manson'},
 {'name': 'By Myself', 'artists': "Christian French', 'Triegy"},
 {'name': 'Who Gon Stop Me', 'artists': "JAY-Z', 'Kanye West"}]

In [149]:
df1

Unnamed: 0,r,theta
0,"[[-1.3023512023072747, -0.9984822438640804, -0...","[acousticness, danceability, duration_ms, ener..."


[{'artists': 'Curtis Mayfield',
  'name': "We're a Winner - Live @ Bitter End, NYC",
  'year': 1971},
 {'artists': 'Millenium',
  'name': 'My Life Domino - Live in Krakow 2009',
  'year': 2010},
 {'artists': 'Talking Heads',
  'name': "I'm Not in Love - 2004 Remaster",
  'year': 1982},
 {'artists': 'Millenium',
  'name': 'Drunken Angels - Live in Krakow 2009',
  'year': 2010},
 {'artists': 'Paul Carrack',
  'name': 'Inspire Me - Live at Bonn, Germany, 2005',
  'year': 2020},
 {'artists': 'The Cure',
  'name': 'Lullaby - Live at Wembley 07/89 - Remix 07/09',
  'year': 1989},
 {'artists': 'Bill Withers',
  'name': 'Let Us Love - Live at Carnegie Hall, New York, NY - October 1972',
  'year': 1973},
 {'artists': 'The Pharcyde', 'name': 'Officer', 'year': 1992},
 {'artists': 'Bob Marley & The Wailers',
  'name': "Jammin' - Live At The Pavillon De Paris, 1977",
  'year': 1978},
 {'artists': 'Paul Carrack',
  'name': 'Another Cup of Coffee - Live at Buxton Opera House, 2004',
  'year': 2020}]

In [194]:
import plotly.graph_objects as go 

categories = number_cols

fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=r,
    theta = categories,
    fill='toself',
    name='dvp'
))
fig.add_trace(go.Scatterpolar(
    r=r1,
    theta = categories,
    fill='toself',
    name=ten_song[5]['name']

))
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 2]
        )),
    showlegend=True
)
fig.show()