In [1]:
# Required imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load in data without genre category

In [2]:
# Read in the CSV file as a Pandas Dataframe
Standard_Spotify_df = pd.read_csv(
    Path("../Resources/spotify_songs_nogenre.csv")
)

Standard_Spotify_df.head()

Unnamed: 0.1,Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053
1,1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726
2,2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859
3,3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992
4,4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656


In [3]:
# Drop 'Unnamed' column
Standard_Spotify_df = Standard_Spotify_df.drop(['Unnamed: 0', "song"], axis=1)
Standard_Spotify_df.head()

Unnamed: 0,artist,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Britney Spears,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053
1,blink-182,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726
2,Faith Hill,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859
3,Bon Jovi,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992
4,*NSYNC,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656


# Experiment with plotting

In [4]:
# Plot the clusters using the "danceability" and "energy" columns
Standard_Spotify_df.hvplot.scatter(
    x="danceability",
    y="energy",
    by="key"
)

In [5]:
# Plot the clusters using the "loudness" and "liveness" columns
Standard_Spotify_df.hvplot.scatter(
    x="loudness",
    y="liveness",
    by="explicit"
)

# Scale Data

In [6]:
# Scale price data, return, and variance values
Spotify_data_scaled = StandardScaler().fit_transform(
    Standard_Spotify_df[["duration_ms", "popularity", "danceability", "energy", "key", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]]
)

In [7]:
# Create a DataFrame with the scaled data
df_spotify_scaled = pd.DataFrame(
    Spotify_data_scaled,
    columns=["duration_ms", "popularity", "danceability", "energy", "key", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]
)

# Copy the "explicit" from the original data
# df_spotify_scaled["explicit"] = Standard_Spotify_df["explicit"]

# Copy the tickers names from the original data
df_spotify_scaled["artist"] = Standard_Spotify_df["artist"]

# Set the Ticker column as index
df_spotify_scaled = df_spotify_scaled.set_index("artist")

# Display sample data
df_spotify_scaled.head()

Unnamed: 0_level_0,duration_ms,popularity,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Britney Spears,-0.449516,0.802968,0.595254,0.74413,-1.211348,0.035403,-0.622748,0.986974,-0.173316,1.235719,1.550256,-0.929867
blink-182,-1.576468,0.896731,-1.662882,1.156685,-1.488038,0.307519,-0.569697,-0.684669,-0.173518,3.063158,0.599207,1.060944
Faith Hill,0.557109,0.287268,-0.986153,-1.469257,0.448791,-1.807847,-0.775658,0.254151,-0.173518,0.496211,-1.239486,0.620779
Bon Jovi,-0.108752,0.84985,-0.829437,1.261461,-1.488038,0.749837,-0.592582,-0.592345,-0.173364,1.178834,-0.034825,-0.004843
*NSYNC,-0.72043,0.240386,-0.38066,1.359688,0.725481,0.36546,-0.540572,-0.508676,-0.161666,-0.687713,1.482324,1.948543


In [8]:
# Encode the "EnergyType" column to variables to categorize oil versus non-oil firms. 
spotify_dummies = pd.get_dummies(Standard_Spotify_df[["year", "explicit"]])
spotify_dummies.head()

# add artist index
# Copy the tickers names from the original data
spotify_dummies["artist"] = Standard_Spotify_df["artist"]

spotify_dummies = spotify_dummies.set_index("artist")

In [9]:
# Concatenate the scaled data DataFrame.
df_spotify_scaled = pd.concat([df_spotify_scaled, spotify_dummies], axis=1)

# Display the sample data
df_spotify_scaled.head()

Unnamed: 0_level_0,duration_ms,popularity,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,year,explicit
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Britney Spears,-0.449516,0.802968,0.595254,0.74413,-1.211348,0.035403,-0.622748,0.986974,-0.173316,1.235719,1.550256,-0.929867,2000,False
blink-182,-1.576468,0.896731,-1.662882,1.156685,-1.488038,0.307519,-0.569697,-0.684669,-0.173518,3.063158,0.599207,1.060944,1999,False
Faith Hill,0.557109,0.287268,-0.986153,-1.469257,0.448791,-1.807847,-0.775658,0.254151,-0.173518,0.496211,-1.239486,0.620779,1999,False
Bon Jovi,-0.108752,0.84985,-0.829437,1.261461,-1.488038,0.749837,-0.592582,-0.592345,-0.173364,1.178834,-0.034825,-0.004843,2000,False
*NSYNC,-0.72043,0.240386,-0.38066,1.359688,0.725481,0.36546,-0.540572,-0.508676,-0.161666,-0.687713,1.482324,1.948543,2000,False


# Initaite K-means

In [10]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

In [11]:
# Fit the model for the df_stocks_scaled DataFrame
model.fit(df_spotify_scaled)



In [12]:
# Predict the model segments (clusters)
music_clusters = model.predict(df_spotify_scaled)

# View the stock segments
print(music_clusters)

[1 1 1 ... 2 2 2]


In [13]:
# Create a new column in the DataFrame with the predicted clusters
df_spotify_scaled["MusicClusters"] = music_clusters

# Review the DataFrame
df_spotify_scaled.head()

Unnamed: 0_level_0,duration_ms,popularity,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,year,explicit,MusicClusters
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Britney Spears,-0.449516,0.802968,0.595254,0.74413,-1.211348,0.035403,-0.622748,0.986974,-0.173316,1.235719,1.550256,-0.929867,2000,False,1
blink-182,-1.576468,0.896731,-1.662882,1.156685,-1.488038,0.307519,-0.569697,-0.684669,-0.173518,3.063158,0.599207,1.060944,1999,False,1
Faith Hill,0.557109,0.287268,-0.986153,-1.469257,0.448791,-1.807847,-0.775658,0.254151,-0.173518,0.496211,-1.239486,0.620779,1999,False,1
Bon Jovi,-0.108752,0.84985,-0.829437,1.261461,-1.488038,0.749837,-0.592582,-0.592345,-0.173364,1.178834,-0.034825,-0.004843,2000,False,1
*NSYNC,-0.72043,0.240386,-0.38066,1.359688,0.725481,0.36546,-0.540572,-0.508676,-0.161666,-0.687713,1.482324,1.948543,2000,False,1


In [14]:
# Create the scatter plot with x="PC1" and y="PC2"
df_spotify_scaled.hvplot.scatter(
    x="danceability",
    y="popularity",
    by="MusicClusters",
    title = "Scatter Plot by Music Segment - PCA=2"
)

# Use PCA to reduce number of factors

In [15]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=2)

In [16]:
# Fit the df_spotify_scaled data to the PCA
spotify_pca_data = pca.fit_transform(df_spotify_scaled)

In [17]:
# Review the first five rose of the PCA data
# using bracket notation ([0:5])
spotify_pca_data[:5]

array([[ 9.50902196, -0.46152216],
       [10.4221388 , -1.97135112],
       [10.4066925 ,  2.43501753],
       [ 9.48445034, -1.3943639 ],
       [ 9.46755845, -1.76161418]])

In [18]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.7337746, 0.0469077])

* Variance is only ~77%, going to increase amount of PCAs

In [19]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=3)

In [20]:
# Fit the df_spotify_scaled data to the PCA
spotify_pca_data = pca.fit_transform(df_spotify_scaled)

In [21]:
# Review the first five rose of the PCA data
# using bracket notation ([0:5])
spotify_pca_data[:5]

array([[ 9.50902197e+00, -4.61791196e-01, -9.27524554e-01],
       [ 1.04221388e+01, -1.97193188e+00,  1.95103317e+00],
       [ 1.04066925e+01,  2.43487923e+00,  1.61987291e+00],
       [ 9.48445035e+00, -1.39466280e+00,  1.29860674e+00],
       [ 9.46755847e+00, -1.76226971e+00,  9.82243809e-03]])

In [22]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.7337746 , 0.04690771, 0.0322803 ])

* Variance is now ~81% 

In [23]:
# Creating a DataFrame with the PCA data
df_spotify_pca = pd.DataFrame(spotify_pca_data, columns=["PC1", "PC2", "PC3"])

# Copy the tickers names from the original data
df_spotify_pca["artist"] = Standard_Spotify_df["artist"]

# Set the Ticker column as index
df_spotify_pca = df_spotify_pca.set_index("artist")

# Review the DataFrame
df_spotify_pca.head()

Unnamed: 0_level_0,PC1,PC2,PC3
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Britney Spears,9.509022,-0.461791,-0.927525
blink-182,10.422139,-1.971932,1.951033
Faith Hill,10.406693,2.434879,1.619873
Bon Jovi,9.48445,-1.394663,1.298607
*NSYNC,9.467558,-1.76227,0.009822


__Rerun K-means with df_spotify_PCA DataFrame & create scatter plot using music cluster__

In [24]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

# Fit the model for the df_stocks_pca DataFrame
model.fit(df_spotify_pca)

# Predict the model segments (clusters)
music_clusters = model.predict(df_spotify_pca)

# Print the stock segments
print(music_clusters)

[1 1 1 ... 2 2 2]




In [25]:
# Create a copy of the df_stocks_pca DataFrame and name it as df_stocks_pca_predictions
df_spotify_pca_predictions = df_spotify_pca.copy()

# Create a new column in the DataFrame with the predicted clusters
df_spotify_pca_predictions["MusicCluster"] = music_clusters

# Review the DataFrame
df_spotify_pca_predictions.head()

Unnamed: 0_level_0,PC1,PC2,PC3,MusicCluster
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Britney Spears,9.509022,-0.461791,-0.927525,1
blink-182,10.422139,-1.971932,1.951033,1
Faith Hill,10.406693,2.434879,1.619873,1
Bon Jovi,9.48445,-1.394663,1.298607,1
*NSYNC,9.467558,-1.76227,0.009822,1


In [26]:
import plotly.express as px

# Create a 3D scatter plot
fig = px.scatter_3d(
    df_spotify_pca_predictions, 
    x='PC1', 
    y='PC2', 
    z='PC3', 
    color='MusicCluster', 
    title="3D Scatter Plot by Music Segment - PCA=3"
)

# Show the plot
fig.show()


# Intiate elbow method to determine Clusters

In [27]:
# Create a list with the number of k-values to try
# Use a range from 1 to 11
k = list(range(1, 11))

In [28]:
# Create an empy list to store the inertia values
inertia = []

In [29]:
# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_stocks_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(df_spotify_pca)
    inertia.append(model.inertia_)























In [30]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data_pca = {
    "k": k,
    "inertia": inertia
}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_pca = pd.DataFrame(elbow_data_pca)
df_elbow_pca.head()

Unnamed: 0,k,inertia
0,1,76693.853693
1,2,24656.069046
2,3,14690.49509
3,4,11373.348848
4,5,9831.932503


In [31]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
elbow_plot_pca = df_elbow_pca.hvplot.line(x="k", y="inertia", title="Elbow Curve Using PCA Data", xticks=k)
elbow_plot_pca

* 3 & 4 clusters are close, rerun model with 4 clusters

In [32]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=4)

# Fit the model for the df_stocks_pca DataFrame
model.fit(df_spotify_pca)

# Predict the model segments (clusters)
music_clusters = model.predict(df_spotify_pca)

# Print the stock segments
print(music_clusters)

[1 1 1 ... 0 0 0]






In [33]:
# Create a copy of the df_stocks_pca DataFrame and name it as df_stocks_pca_predictions
df_spotify_pca_predictions = df_spotify_pca.copy()

# Create a new column in the DataFrame with the predicted clusters
df_spotify_pca_predictions["MusicCluster"] = music_clusters

# Review the DataFrame
df_spotify_pca_predictions.head()

Unnamed: 0_level_0,PC1,PC2,PC3,MusicCluster
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Britney Spears,9.509022,-0.461791,-0.927525,1
blink-182,10.422139,-1.971932,1.951033,1
Faith Hill,10.406693,2.434879,1.619873,1
Bon Jovi,9.48445,-1.394663,1.298607,1
*NSYNC,9.467558,-1.76227,0.009822,1


In [34]:

# Create a 3D scatter plot
fig = px.scatter_3d(
    df_spotify_pca_predictions, 
    x='PC1', 
    y='PC2', 
    z='PC3', 
    color='MusicCluster', 
    title="3D Scatter Plot by Music Segment - PCA=3"
)

# Show the plot
fig.show()

# Analysis
* By conducting both dimension reduction & clustering, it seems like our model was able to recognize a structure that our spotify data can be organized by even without the inclusion of the "Genre" feature. 