In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
# Read the cleaned song table; the CSV's first column becomes the DataFrame index
data_path = '../data/clean_data.csv'
songs = pd.read_csv(data_path, index_col=0)

In [3]:
# Columns used as inputs to the clustering model
features = [
    'Acousticness',
    'Danceability',
    'Energy',
    'Instrumentalness',
    'Popularity',
    'Speechiness',
    'Release Year',
]

# Feature matrix: only the columns listed above, in that order
X = songs.loc[:, features]

In [4]:
# Min-max scale each feature column so that all inputs share a common range
# before distances are computed by the clustering step.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [5]:
# Trackers for the search over the number of clusters:
# best_score starts at -1 and best_k stays None until a candidate beats it.
best_score, best_k = -1, None

In [6]:
# Search k = 2..10 and keep the k with the highest silhouette score.
#
# The trackers are (re)initialised here so that re-running this cell on its
# own always starts from a clean slate instead of comparing against a
# best_score left over from an earlier run on different data.
best_score = -1
best_k = None
silhouette_by_k = {}  # k -> silhouette score, kept for later inspection

for k in range(2, 11):
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto' in version 1.4, so relying on the default makes the result
    # depend on the installed version (and emits a FutureWarning on 1.2/1.3).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)

    # Evaluate this clustering with the silhouette score (higher is better)
    score = silhouette_score(X_scaled, labels)
    silhouette_by_k[k] = score

    # Strict '>' keeps the smallest k when two candidates tie
    if score > best_score:
        best_score = score
        best_k = k

In [7]:
# Re-fit K-means with the best number of clusters found by the silhouette search.
# n_init is set explicitly because scikit-learn changed its default from 10 to
# 'auto' in version 1.4; pinning it keeps the final clustering reproducible
# across library versions (random_state alone does not fix the number of restarts).
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_scaled)

In [8]:
# Store each song's cluster label on the songs DataFrame under 'Cluster'
cluster_column = 'Cluster'
songs[cluster_column] = labels

In [10]:
# Preview the first rows: track, artist and the cluster each song was assigned to
preview_columns = ['Track', 'Artist', 'Cluster']
cluster_preview = songs[preview_columns].head()
print(cluster_preview)

                       Track                Artist  Cluster
0         Shook Ones, Pt. II             Mobb Deep        1
1  Hypnotize - 2014 Remaster  The Notorious B.I.G.        1
2       Ambitionz Az A Ridah                  2Pac        1
3         N.Y. State of Mind                   Nas        1
4          It Was A Good Day              Ice Cube        1


In [11]:
# Cluster sizes: number of songs per cluster label, largest cluster first
cluster_labels = songs['Cluster']
cluster_counts = cluster_labels.value_counts()
print(cluster_counts)

1    5451
2    2475
0     790
Name: Cluster, dtype: int64
