In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np

# Read the Spotify track data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='spotify.csv')

In [None]:
# Explore the dataset

print(df.head())  # Print the first few rows of the dataframe
print(df.describe())   # Print summary statistics for each column
# DataFrame.info() writes its report (row/column counts and the data type of
# each column) straight to stdout and returns None, so it is called directly:
# wrapping it in print() would add a stray "None" line to the output.
df.info()

In [None]:
# Preprocess the dataset

# Select a subset of columns to use in the clustering process
columns_to_use = ['explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'year']

# Keep only the feature columns, then remove rows with missing values in them.
# Selecting the columns first means a gap in a column that is not used for
# clustering no longer throws away an otherwise usable row, and building a new
# frame (instead of dropna(inplace=True)) leaves the loaded frame unmodified.
# Every feature is cast to float so all columns share one numeric dtype
# before the arithmetic below.
df = df[columns_to_use].dropna().astype(float)

# Normalize the values of each column (z-score: zero mean, unit variance).
# A column with a single constant value has a standard deviation of 0; dividing
# by it would turn the whole column into NaN and make KMeans fail, so such a
# column is divided by 1 instead and simply becomes all zeros after centering.
column_std = df.std().replace(0, 1.0)
df = (df - df.mean()) / column_std

In [None]:
# Clustering with the K-means algorithm

# Import the necessary libraries
from sklearn.cluster import KMeans

# Decide on the appropriate number of clusters

# One way to determine the appropriate number of clusters is to use the elbow method
# This involves fitting the model to the data with a range of different numbers of clusters
# and then examining the resulting sum of squared distances for each number of clusters

sse = []  # Sum of squared distances (inertia) for each number of clusters, starting at k = 1

# Fit the KMeans model to the data with a range of different numbers of clusters
for k in range(1, 30):
    # K-means starts from randomly chosen centroids, so the seed is fixed to make
    # the elbow curve identical on every run; the value 1 matches the seed used
    # for the final 8-cluster model later in the notebook.
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(df)  # Fit the model to the data
    sse.append(kmeans.inertia_)  # Record the sum of squared distances for this k


In [None]:
# Import the necessary libraries
import matplotlib.pyplot as plt

# Plot the sum squared distances for each number of clusters for elbow method

# The x-axis is derived from the number of recorded values instead of repeating
# the loop's range, so the plot stays in step with `sse` if the range of k
# tried above is changed. The first entry of `sse` corresponds to k = 1.
cluster_counts = range(1, len(sse) + 1)

plt.plot(cluster_counts, sse)
plt.title('Elbow Method for Clustering')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances')
plt.show()


<img src='./images/elbow_method.png' style='width:600px;height:500px;'>

In [None]:
# Train the final clustering model on the preprocessed data

# The number of clusters comes from the elbow method applied in the
# previous step, which is taken here to point to 8 clusters

kmeans = KMeans(
    random_state=1,  # Fixed seed so the cluster assignments are repeatable
    n_clusters=8,    # Number of clusters chosen from the elbow plot
)
kmeans.fit(df)  # Learn the 8 centroids from the data

In [None]:
# Assign every song to a cluster

# Calling predict() on the fitted KMeans model returns, for each row,
# the index of the centroid that row is assigned to

clusters = kmeans.predict(X=df)  # One cluster label per data point

# Show the labels of the first ten data points as a quick sanity check
print(clusters[0:10])

[5 4 5 4 5 5 0 1 0 0]

In [None]:
# Assess how well separated the clusters are

# The silhouette score compares, for every data point, how close it is to the
# points of its own cluster with how close it is to the points of other clusters

from sklearn.metrics import silhouette_score

# Left as the last expression so the notebook displays the score as the cell output
silhouette_score(X=df, labels=clusters)


The silhouette score is 0.12484136813463163 (about 0.12). The score ranges from -1 to 1, and a value close to 0 indicates that the clusters overlap: a typical data point is about as close to a neighboring cluster as it is to its own, so the clusters are not well-defined. This can be due to a variety of factors, but one possibility is that the features used for the clustering are not sufficiently descriptive of the underlying data.

For example, songs can belong to genres and subgenres, but these labels may not be very informative for certain songs: genres and subgenres can be somewhat interchangeable and do not always accurately reflect the characteristics of a particular song.

In [None]:
# Visualize the generated clusters

# To visualize the generated clusters, we will first need to reduce the data to two dimensions
# so that we can plot it on a scatter plot

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np


pca = PCA(n_components=2)  # Initialize a PCA model with 2 components
df_2d = pca.fit_transform(df)  # Reduce the data to two dimensions using the PCA model

# Plot the data points on a scatter plot, colored by cluster assignment.
# Cluster ids are categories rather than a continuous quantity, so a
# qualitative colormap is used, and a legend maps every color back to its
# cluster id -- without it the colors cannot be matched to clusters.
fig, ax = plt.subplots()
points = ax.scatter(df_2d[:, 0], df_2d[:, 1], c=clusters, cmap='tab10')
# legend_elements() builds one legend handle per distinct value passed as `c`
ax.legend(*points.legend_elements(), title='Cluster', bbox_to_anchor=(1.0, 1.0), loc='upper left')
ax.set_title('Clustering Of Spotify Songs')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
plt.show()

<img src='./images/spotify_cluster.png' style='width:600px;height:500px;'>

In [None]:
# Get the unique cluster assignments
unique_clusters = np.unique(clusters)

# Create a grid of subplots sized from the number of clusters, so the figure
# keeps working if the model is refitted with a different cluster count
# (a fixed 4x2 grid raises IndexError with more than 8 clusters and leaves
# empty panels with fewer). With 8 clusters this is the same 4x2, 8x8-inch grid.
n_cols = 2
n_rows = int(np.ceil(len(unique_clusters) / n_cols))
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(8, 2 * n_rows),
                        sharex=True, sharey=True, squeeze=False)

# Flatten the array of subplots to make it easier to iterate over
# (squeeze=False guarantees a 2-D array even for a single row)
axs = axs.flatten()

# Draw one panel per cluster
for ax, cluster in zip(axs, unique_clusters):
    # Boolean mask of the data points belonging to the current cluster
    in_cluster = clusters == cluster

    # Plot the data points belonging to other clusters in gray as background
    ax.scatter(df_2d[~in_cluster, 0], df_2d[~in_cluster, 1], c='gray', label='Other clusters', alpha=0.5)

    # Plot the data points belonging to the current cluster on top, in red
    ax.scatter(df_2d[in_cluster, 0], df_2d[in_cluster, 1], c='red', label='Cluster {}'.format(cluster))

    # Set the x and y labels for the current subplot
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')

    # Add a legend to the current subplot
    ax.legend()

# Hide any panel left over when the cluster count does not fill the grid
for unused_ax in axs[len(unique_clusters):]:
    unused_ax.set_visible(False)

plt.show()


<img src='./images/indiv_cluster.png' style='width:500px;height:600px;'>

In [None]:
# Attach the cluster label to every row; assign() returns a new dataframe
# and leaves the preprocessed features in `df` unchanged
clustered_df = df.assign(cluster=clusters)

# Summarize each cluster by the mean of every feature column
cluster_means = clustered_df.groupby('cluster').mean()
print(cluster_means)

# Draw the per-cluster means as grouped bars (one group per cluster,
# one bar per feature) to show how the clusters differ from each other
means_ax = cluster_means.plot.bar()
means_ax.set_title('Cluster Characteristics')
means_ax.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()

<img src='./images/mean_cluster.png' style='width:700px;height:400px;'>