In [30]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings

warnings.filterwarnings("ignore")

# Load the data
data = pd.read_csv("Data/data.csv")

# Clustering pipeline for song data
song_cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=19, verbose=False, n_init='auto', random_state=42))
])

# Select numerical features
X_song = data.select_dtypes(np.number)

# Fit and predict clusters
song_cluster_pipeline.fit(X_song)
song_cluster_labels = song_cluster_pipeline.predict(X_song)
data['cluster_label'] = song_cluster_labels

In [31]:
# 1. Calculate Internal Validation Metrics
silhouette_avg = silhouette_score(X_song, song_cluster_labels)
ch_score = calinski_harabasz_score(X_song, song_cluster_labels)
db_score = davies_bouldin_score(X_song, song_cluster_labels)

print("\nClustering Evaluation Metrics:")
print(f"Silhouette Score: {silhouette_avg:.3f}")
print(f"Calinski-Harabasz Score: {ch_score:.3f}")
print(f"Davies-Bouldin Score: {db_score:.3f}")



Clustering Evaluation Metrics:
Silhouette Score: -0.225
Calinski-Harabasz Score: 6343.014
Davies-Bouldin Score: 83.138


In [17]:
# 3. Cluster Size Distribution
cluster_sizes = data['cluster_label'].value_counts().sort_index()
print("\nCluster Size Distribution:")
print(cluster_sizes)


Cluster Size Distribution:
cluster_label
0      3048
1      7101
2     10876
3     10536
4      6766
5      7117
6     10369
7     11167
8      4519
9      9898
10    12898
11    13512
12     5017
13     7168
14     2659
15     2555
16      169
17     9819
18     5794
19    13119
20     6264
21    10282
Name: count, dtype: int64


In [18]:
# 4. Additional Cluster Analysis
print("\nCluster Statistics:")
# Calculate mean values of features for each cluster
cluster_means = data.groupby('cluster_label')[X_song.columns].mean()
print("\nMean values of features for each cluster:")
print(cluster_means.round(2))


Cluster Statistics:

Mean values of features for each cluster:
               valence     year  acousticness  danceability  duration_ms  \
cluster_label                                                              
0                 0.22  1957.28          0.81          0.32    690466.06   
1                 0.66  1972.82          0.56          0.44    200847.92   
2                 0.26  1954.91          0.90          0.35    204000.42   
3                 0.65  1946.45          0.90          0.53    182952.15   
4                 0.50  2007.41          0.15          0.67    227447.30   
5                 0.53  1978.08          0.41          0.48    262363.54   
6                 0.59  2003.09          0.14          0.60    230725.31   
7                 0.79  1979.74          0.36          0.67    211049.32   
8                 0.26  1954.84          0.89          0.38    209355.22   
9                 0.17  1965.11          0.91          0.31    226864.08   
10                0.70  

In [19]:
# 5. Feature Importance Analysis
# Get the KMeans component from the pipeline
kmeans = song_cluster_pipeline.named_steps['kmeans']
scaler = song_cluster_pipeline.named_steps['scaler']

# Calculate the relative importance of features
feature_importance = np.abs(kmeans.cluster_centers_).mean(axis=0)
feature_importance = pd.DataFrame(
    feature_importance,
    columns=['Importance'],
    index=X_song.columns
)
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)



Feature Importance:
                  Importance
duration_ms         1.068907
year                0.763002
energy              0.744481
popularity          0.741259
acousticness        0.727763
explicit            0.706130
mode                0.703370
speechiness         0.690211
loudness            0.662405
danceability        0.657544
valence             0.607666
instrumentalness    0.542602
liveness            0.394425
tempo               0.349136
key                 0.231723
