In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
# Load the Spotify track table; the CSV's first column becomes the row index.
df = pd.read_csv(filepath_or_buffer='../data/spotify_data.csv', index_col=0)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


In [32]:
# Total footprint in bytes: per-column usage plus the index, summed
# (object columns are measured shallowly, i.e. without inspecting the strings).
df.memory_usage(index=True, deep=False).sum()

190201296

In [12]:
# Track count per artist, most frequent first; show the 30 largest.
df.loc[:, 'artist_name'].value_counts().iloc[:30]

artist_name
Traditional                4058
Grateful Dead              2320
Johann Sebastian Bach      2125
Giacomo Meyerbeer          1345
Elvis Presley              1242
Wolfgang Amadeus Mozart    1084
Armin van Buuren           1061
Astor Piazzolla             932
Hans Zimmer                 863
Andrei Krylov               841
Ludwig van Beethoven        818
Andrew Lloyd Webber         806
Vybz Kartel                 806
Jim Brickman                766
Steven Halpern              762
Giuseppe Verdi              749
Nature Sounds               749
Denise Gagne                733
Francisco Canaro            699
Glee Cast                   698
Sonu Nigam                  673
Madhu Balakrishnan          664
Pritam                      630
Richard Wagner              621
Alan Tam                    619
Giacomo Puccini             616
Jack Hartmann               608
David Arkenstone            594
Frédéric Chopin             593
Alan Menken                 591
Name: count, dtype: int64

In [13]:
# Spot-check a single artist: first rows whose artist_name is exactly 'Migos'.
df.loc[df['artist_name'].eq('Migos')].head()

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
86000,Migos,Hannah Montana,3SnLc0bQjQeE3HgGdNfb8u,44,2013,hip-hop,0.902,0.743,0,-3.495,1,0.284,0.103,6e-06,0.0956,0.626,135.951,212712,4
86041,Migos,Versace,6xhblLundMJAiG9jF7nlxs,44,2013,hip-hop,0.841,0.729,6,-3.047,0,0.239,0.0384,0.0,0.276,0.444,132.073,195474,4
86284,Migos,China Town,4qK1wkHWFM6hyFKo0SaAra,35,2013,hip-hop,0.796,0.836,5,-4.271,1,0.335,0.326,0.0,0.337,0.503,136.855,197376,4
86291,Migos,Bando,5N0vzzKfd2yDw1nrdXrvPv,35,2013,hip-hop,0.839,0.923,5,-3.54,0,0.183,0.269,0.0,0.0927,0.692,135.006,286056,4
138952,Migos,Fight Night,6Xa2q0dapj03CsRzmD0Os5,66,2014,hip-hop,0.874,0.706,1,-5.132,1,0.207,0.182,0.0,0.334,0.895,89.961,216248,4


In [14]:
# Column dtypes and non-null counts for the full track table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1159764 entries, 0 to 1473395
Data columns (total 19 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_name       1159749 non-null  object 
 1   track_name        1159763 non-null  object 
 2   track_id          1159764 non-null  object 
 3   popularity        1159764 non-null  int64  
 4   year              1159764 non-null  int64  
 5   genre             1159764 non-null  object 
 6   danceability      1159764 non-null  float64
 7   energy            1159764 non-null  float64
 8   key               1159764 non-null  int64  
 9   loudness          1159764 non-null  float64
 10  mode              1159764 non-null  int64  
 11  speechiness       1159764 non-null  float64
 12  acousticness      1159764 non-null  float64
 13  instrumentalness  1159764 non-null  float64
 14  liveness          1159764 non-null  float64
 15  valence           1159764 non-null  float64
 16  tempo

In [15]:
# Feature columns for scaling/clustering: every numeric column of the table.
# `cluster` is excluded explicitly: a later cell writes the k-means label into
# df['cluster'], so without this guard re-running this cell after clustering
# would silently turn the label itself into an input feature.
numerical_cols = (
    df.select_dtypes(include=[np.number])
    .columns
    .drop('cluster', errors='ignore')
)

# Confirm which columns were selected and that none contain nulls.
df[numerical_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1159764 entries, 0 to 1473395
Data columns (total 15 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   popularity        1159764 non-null  int64  
 1   year              1159764 non-null  int64  
 2   danceability      1159764 non-null  float64
 3   energy            1159764 non-null  float64
 4   key               1159764 non-null  int64  
 5   loudness          1159764 non-null  float64
 6   mode              1159764 non-null  int64  
 7   speechiness       1159764 non-null  float64
 8   acousticness      1159764 non-null  float64
 9   instrumentalness  1159764 non-null  float64
 10  liveness          1159764 non-null  float64
 11  valence           1159764 non-null  float64
 12  tempo             1159764 non-null  float64
 13  duration_ms       1159764 non-null  int64  
 14  time_signature    1159764 non-null  int64  
dtypes: float64(9), int64(6)
memory usage: 141.6 MB


In [16]:
# Learn the scaling statistics from the numeric feature columns, then apply
# them to those same columns to obtain the matrix used for clustering.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
scaled_data = scaler.transform(df[numerical_cols])

In [17]:
# n_init is pinned to 10, the default this run was already using, so the
# clustering does not change when scikit-learn moves the default to 'auto'
# and the warning about the changing default is no longer emitted.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Store each track's cluster label next to its metadata for later filtering.
df['cluster'] = clusters

  super()._check_params_vs_input(X, default_n_init=10)


In [21]:
def create_playlist_vector(playlist, scaler, kmeans, data=None):
    """Summarise a playlist as its mean scaled feature vector plus a cluster label.

    Parameters
    ----------
    playlist : list-like of str
        Track ids to look up in the ``track_id`` column.
    scaler : fitted transformer
        Object whose ``transform`` maps the numeric feature columns into the
        scaled space (the notebook passes its fitted ``StandardScaler``).
    kmeans : fitted clusterer
        Object whose ``predict`` assigns a cluster to a scaled vector.
    data : pandas.DataFrame, optional
        Track table to search. Defaults to the notebook-level ``df``.

    Returns
    -------
    numpy.ndarray
        The mean scaled feature vector with the predicted cluster label
        appended as the final element.

    Raises
    ------
    ValueError
        If none of the ids in ``playlist`` occur in the track table.
    """
    if data is None:
        data = df  # fall back to the notebook-level track table

    playlist_subset = data[data['track_id'].isin(playlist)]
    if playlist_subset.empty:
        # Fail with a clear message instead of an obscure downstream error
        # from transforming / averaging zero rows.
        raise ValueError("None of the playlist's track ids were found in the track table.")

    # Prefer the exact columns (and order) the scaler was fitted on; fall back
    # to "all numeric columns except the cluster label" when the scaler does
    # not expose them.
    feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is not None:
        numerical_playlist = playlist_subset[list(feature_names)]
    else:
        numerical_playlist = (
            playlist_subset
            .drop(columns='cluster', errors='ignore')
            .select_dtypes(include=[np.number])
        )

    scaled_playlist = scaler.transform(numerical_playlist)
    playlist_vector = scaled_playlist.mean(axis=0)
    # predict expects a 2-D array: one row holding the playlist's mean vector.
    playlist_cluster = kmeans.predict(playlist_vector.reshape(1, -1))[0]
    return np.append(playlist_vector, playlist_cluster)

In [22]:
# Seed playlist: ten Drake tracks drawn at random with a fixed seed.
drake_songs = df.loc[df['artist_name'].eq('Drake')].sample(n=10, random_state=42)
playlist = drake_songs['track_id'].to_list()

# Collapse the playlist into one scaled feature vector (+ cluster label).
playlist_vector = create_playlist_vector(playlist, scaler=scaler, kmeans=kmeans)

In [23]:
# Candidate pool: tracks whose cluster label equals the playlist's label
# (the last element of playlist_vector), with the label column removed.
songs_in_cluster = df.loc[df['cluster'].eq(playlist_vector[-1])].drop(columns='cluster')
# Scale the candidates' numeric features with the already-fitted scaler.
numerical_songs = songs_in_cluster.select_dtypes(include='number')
scaled_songs = scaler.transform(numerical_songs)


In [27]:
# Show the full vector next to the same vector without its final element
# (the appended cluster label), i.e. the pure feature part.
playlist_vector, playlist_vector[:-1]

(array([ 2.67645188e+00,  3.88747721e-01,  2.14452549e-01, -1.21884681e-01,
         4.25355421e-01,  4.09145701e-01, -2.79637870e-01,  6.94454703e-01,
        -1.78758509e-01, -6.90941824e-01, -1.01302089e-01, -7.92359953e-01,
        -7.33291695e-04,  9.48095331e-02, -1.83622005e-01,  4.00000000e+00]),
 array([ 2.67645188e+00,  3.88747721e-01,  2.14452549e-01, -1.21884681e-01,
         4.25355421e-01,  4.09145701e-01, -2.79637870e-01,  6.94454703e-01,
        -1.78758509e-01, -6.90941824e-01, -1.01302089e-01, -7.92359953e-01,
        -7.33291695e-04,  9.48095331e-02, -1.83622005e-01]))

In [28]:
# Cosine similarity between the playlist's feature vector (cluster label
# stripped) and every candidate track; the result has a single row.
similarities = cosine_similarity(playlist_vector[:-1].reshape(1, -1), scaled_songs)

In [29]:
# Rank all candidates from most to least similar to the playlist vector.
ranked_positions = similarities.argsort()[0][::-1]
# Skip tracks that are already in the seed playlist so they cannot be
# "recommended" back, then keep the five best remaining candidates.
already_in_playlist = songs_in_cluster['track_id'].isin(playlist).to_numpy()
top_indices = ranked_positions[~already_in_playlist[ranked_positions]][:5]
top_songs = songs_in_cluster.iloc[top_indices]
top_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
506682,J. Cole,m y . l i f e (with 21 Savage & Morray),1D3z6HTiQsNmZxjl7F7eoG,77,2021,hip-hop,0.597,0.587,7,-7.026,0,0.137,0.0622,0.0,0.214,0.0753,140.021,218802,4
85749,Chance the Rapper,Cocoa Butter Kisses,0aMHIW1lqrulVCx0LLlr6a,66,2013,hip-hop,0.679,0.645,6,-7.777,0,0.222,0.0518,0.0,0.145,0.203,125.795,307188,4
350207,Summer Walker,Girls Need Love (with Drake) - Remix,14SaZBTjxlorHJQxXh01Hu,78,2018,pop,0.656,0.432,6,-9.481,0,0.217,0.443,0.0,0.132,0.0897,97.022,222374,4
433220,Tiësto,The Business,6f3Slt0GbA2bPZlz0aIFXN,80,2020,dance,0.798,0.62,8,-7.079,0,0.232,0.414,0.0192,0.112,0.235,120.031,164000,4
321049,Dynoro,In My Mind,0E9ZjEAyAwOXZ7wJC0PD33,77,2018,dance,0.694,0.77,6,-5.335,1,0.149,0.176,1.1e-05,0.118,0.163,125.905,184560,4


In [37]:
# Twenty most popular tracks tagged with the 'hip-hop' genre.
rap_songs = (
    df.loc[df['genre'].eq('hip-hop')]
    .sort_values(by='popularity', ascending=False)
    .iloc[:20]
)

In [38]:
# Display the selected hip-hop tracks (includes the `cluster` label column).
rap_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster
605178,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",4nrPB8O7Y7wsOCJdgXkthe,96,2023,hip-hop,0.778,0.632,2,-5.6,0,0.0493,0.274,0.0,0.0915,0.498,122.104,218289,4,4
560200,Bizarrap,"Quevedo: Bzrp Music Sessions, Vol. 52",2tTmW7RDtMQtBk7m2rYeSw,92,2022,hip-hop,0.621,0.782,2,-5.548,1,0.044,0.0125,0.033,0.23,0.55,128.033,198938,4,4
560194,Yandel,Yandel 150,2oiixB9QMIzhWaHGVlQx4g,91,2022,hip-hop,0.783,0.729,6,-3.549,0,0.0691,0.0492,0.000272,0.1,0.58,167.968,216148,4,4
952551,Eminem,Mockingbird,561jH07mF1jHuk7KlaeF0s,90,2004,hip-hop,0.637,0.678,0,-3.798,1,0.266,0.209,0.0,0.156,0.254,84.039,250760,4,4
560181,Drake,Rich Flex,1bDbXMyjaUIooNwFE9wn0N,90,2022,hip-hop,0.561,0.52,11,-9.342,0,0.244,0.0503,2e-06,0.355,0.424,153.15,239360,3,4
560190,Ñengo Flow,Gato de Noche,54ELExv56KCAB4UP9cOCzC,89,2022,hip-hop,0.892,0.662,8,-3.894,1,0.162,0.169,1e-06,0.363,0.607,93.976,227013,4,4
560192,Oliver Tree,Miss You,73vIOb4Q7YN6HeJTbscRx5,89,2022,hip-hop,0.587,0.742,6,-6.64,0,0.0529,0.0128,0.00107,0.146,0.199,145.007,206000,4,4
605182,Gorillaz,Tormenta (feat. Bad Bunny),38UYeBLfvpnDSG9GznZdnL,89,2023,hip-hop,0.637,0.768,0,-6.468,1,0.056,0.458,0.00192,0.378,0.297,94.982,193464,4,4
560203,Arcángel,La Jumpa,5MxFWjuqQIsbNWbMdMdbli,87,2022,hip-hop,0.713,0.703,8,-5.769,1,0.194,0.298,0.0,0.321,0.576,123.06,255693,4,4
825779,Eminem,Without Me,7lQ8MOhq6IN2w8EYcFNSUk,87,2002,hip-hop,0.908,0.669,7,-2.827,1,0.0738,0.00286,0.0,0.237,0.662,112.238,290320,4,4
