In [49]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import cluster

In [15]:
# Show every column when a DataFrame is displayed (no "..." truncation of wide frames).
pd.options.display.max_columns = None

In [39]:
# Load the Spotify CSV into `data` and display its (rows, columns) shape.
spotify_csv = 'CSV Files/spotify.csv'
data = pd.read_csv(spotify_csv)
data.shape

(18555, 21)

In [40]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,title,artist,uri,features_danceability,features_energy,features_key,features_loudness,features_mode,features_speechiness,features_acousticness,features_instrumentalness,features_liveness,features_valence,features_tempo,features_type,features_id,features_uri,features_track_href,features_analysis_url,features_duration_ms,features_time_signature
0,See You Again (feat. Kali Uchis),"Tyler, The Creator",spotify:track:7KA4W4McWYRpgf0fWsJZWB,0.558,0.559,6.0,-9.222,1.0,0.0959,0.371,7e-06,0.109,0.62,78.558,audio_features,7KA4W4McWYRpgf0fWsJZWB,spotify:track:7KA4W4McWYRpgf0fWsJZWB,https://api.spotify.com/v1/tracks/7KA4W4McWYRp...,https://api.spotify.com/v1/audio-analysis/7KA4...,180387.0,4.0
1,Hall of Fame (feat. will.i.am),The Script,spotify:track:7wMq5n8mYSKlQIGECKUgTX,0.421,0.873,10.0,-4.343,1.0,0.0564,0.0654,0.0,0.123,0.629,84.786,audio_features,7wMq5n8mYSKlQIGECKUgTX,spotify:track:7wMq5n8mYSKlQIGECKUgTX,https://api.spotify.com/v1/tracks/7wMq5n8mYSKl...,https://api.spotify.com/v1/audio-analysis/7wMq...,202533.0,4.0
2,Back Where I Belong (feat. Avicii),Otto Knows,spotify:track:78W8wiUIlQ2SnWY9TVowKZ,0.497,0.865,0.0,-3.19,1.0,0.0644,0.0306,0.0679,0.443,0.595,125.96,audio_features,78W8wiUIlQ2SnWY9TVowKZ,spotify:track:78W8wiUIlQ2SnWY9TVowKZ,https://api.spotify.com/v1/tracks/78W8wiUIlQ2S...,https://api.spotify.com/v1/audio-analysis/78W8...,174203.0,4.0
3,Bumpy Ride,Mohombi,spotify:track:0tBRo4P60DgKmg4jt48upm,0.709,0.854,0.0,-4.447,0.0,0.0399,0.047,0.0,0.0724,0.826,105.074,audio_features,0tBRo4P60DgKmg4jt48upm,spotify:track:0tBRo4P60DgKmg4jt48upm,https://api.spotify.com/v1/tracks/0tBRo4P60DgK...,https://api.spotify.com/v1/audio-analysis/0tBR...,224402.0,4.0
4,Danza Kuduro (feat. Don Omar),Lucenzo,spotify:track:1kAZhbcsXqfUjnVeqPywn2,0.706,0.89,0.0,-6.575,1.0,0.0847,0.0855,0.0,0.0465,0.896,130.051,audio_features,1kAZhbcsXqfUjnVeqPywn2,spotify:track:1kAZhbcsXqfUjnVeqPywn2,https://api.spotify.com/v1/tracks/1kAZhbcsXqfU...,https://api.spotify.com/v1/audio-analysis/1kAZ...,202347.0,4.0


#### Dropping Cols

In [41]:
# Print each column's non-null count and dtype to spot missing values
# before deciding which columns to keep.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18555 entries, 0 to 18554
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   title                      18554 non-null  object 
 1   artist                     18555 non-null  object 
 2   uri                        18555 non-null  object 
 3   features_danceability      18554 non-null  float64
 4   features_energy            18554 non-null  float64
 5   features_key               18554 non-null  float64
 6   features_loudness          18554 non-null  float64
 7   features_mode              18554 non-null  float64
 8   features_speechiness       18554 non-null  float64
 9   features_acousticness      18554 non-null  float64
 10  features_instrumentalness  18554 non-null  float64
 11  features_liveness          18554 non-null  float64
 12  features_valence           18554 non-null  float64
 13  features_tempo             18554 non-null  flo

In [42]:
# Count how many rows carry each distinct time-signature value.
data.loc[:, 'features_time_signature'].value_counts()

4.0    17295
3.0     1001
5.0      161
1.0       96
0.0        1
Name: features_time_signature, dtype: int64

In [43]:
# Drop the type / id / URI / URL columns; they identify the track rather than
# describe its audio, so they are not needed as features.
# errors="ignore" keeps this cell re-runnable: `data` is overwritten here, so on
# a second run the columns are already gone and a plain drop would raise KeyError.
metadata_cols = [
    'features_type',
    'features_id',
    'features_uri',
    'features_track_href',
    'features_analysis_url',
]
data = data.drop(columns=metadata_cols, errors="ignore")

In [44]:
# Remove the "features_" prefix from the column names.
# The pattern is anchored with ^ so only a leading prefix is stripped (never a
# match in the middle of a name), and regex=True is passed explicitly because
# the default of str.replace differs between pandas versions.
data.columns = data.columns.str.replace(r"^features_", "", regex=True)

In [45]:
# Show a single row to confirm the trimmed, renamed column set.
data.iloc[:1]

Unnamed: 0,title,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,See You Again (feat. Kali Uchis),"Tyler, The Creator",spotify:track:7KA4W4McWYRpgf0fWsJZWB,0.558,0.559,6.0,-9.222,1.0,0.0959,0.371,7e-06,0.109,0.62,78.558,180387.0,4.0


#### Scaling

In [46]:
# Build the feature matrix: everything except the text identifier columns.
# NOTE(review): data.info() reports 18554 non-null values out of 18555 rows for
# the feature columns, so X appears to carry one all-NaN row - confirm it is
# handled before clustering.
identifier_cols = ['title', 'artist', 'uri']
X = data.drop(identifier_cols, axis=1)

In [48]:
# Summary statistics with one row per feature, to compare ranges before scaling.
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
danceability,18554.0,0.585915,0.165983,0.0,0.474,0.598,0.706,0.988
energy,18554.0,0.629068,0.22795,2e-05,0.468,0.664,0.815,1.0
key,18554.0,5.293629,3.561703,0.0,2.0,5.0,9.0,11.0
loudness,18554.0,-8.527306,3.892723,-60.0,-10.779,-7.772,-5.613,1.342
mode,18554.0,0.646761,0.477989,0.0,0.0,1.0,1.0,1.0
speechiness,18554.0,0.078316,0.081423,0.0,0.0346,0.04665,0.0801,0.934
acousticness,18554.0,0.2743,0.308013,0.0,0.0214,0.131,0.472,0.996
instrumentalness,18554.0,0.128419,0.272614,0.0,0.0,0.000164,0.037775,0.993
liveness,18554.0,0.198341,0.173085,0.00829,0.0946,0.128,0.25,1.0
valence,18554.0,0.540253,0.24478,0.0,0.347,0.545,0.739,1.0


Not sure yet whether duration is going to be important; will keep it around for now. Time signature should be treated as categorical.

In [47]:
# Standardise every feature column and keep the fitted scaler in `scaler`
# so the same transform can be reused later.
scaler = StandardScaler().fit(X)

# transform() returns a bare ndarray; rebuild the DataFrame with X's column
# names AND X's index so the scaled rows stay aligned with `data` even if rows
# are ever filtered out of X (a fresh RangeIndex would silently misalign them).
X_train = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
X_train.head(4)

NameError: name 'export_scaler' is not defined