# Notebook for Clustering Spotify Data

## Analysis for Clustering

### All Data Scatter Plot with Genre as color

In [None]:
from custom_utils import load_and_concatenate_parquet_files
from sklearn.decomposition import PCA
import plotly_express as px
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Load the preprocessed tracks and project their audio features onto two
# principal components, coloured by genre.
preprocessed_data = load_and_concatenate_parquet_files("data/preprocessed_spotify_data")

metric_columns = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

# Only tracks with a known genre can be coloured by genre.
data_for_analysis = preprocessed_data[preprocessed_data["genre"].notna()]
# Cap the sample at the number of available rows so a smaller data set does not raise.
sample_size = min(200000, len(data_for_analysis))
# Reset the index: the scaled array below is positional, so both sides of the
# merge must share the same 0..n-1 index. Without this, the merge would match
# scaled rows to unrelated tracks (and drop rows whose label is >= sample_size).
data_for_analysis = data_for_analysis.sample(n=sample_size, random_state=42).reset_index(drop=True)

scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_for_analysis[metric_columns])
scaled_data_for_analysis = pd.merge(
    pd.DataFrame(data_scaled, columns=metric_columns),
    data_for_analysis.drop(columns=metric_columns),
    left_index=True,
    right_index=True,
)

pca = PCA(n_components=2)
pca.fit(scaled_data_for_analysis[metric_columns])
scaled_data_for_analysis[["PCA1", "PCA2"]] = pca.transform(scaled_data_for_analysis[metric_columns])

px.scatter(
    scaled_data_for_analysis,
    x="PCA1",
    y="PCA2",
    color="genre",
    hover_data={'PCA1': False, 'PCA2': False, 'genre': True, 'artist_name': True, 'track_name': True,
                'danceability': True, 'energy': True, 'valence': True},
).show()

The plot above shows that genres cannot easily be distinguished from one another using these audio features alone.

### Clusters with HDBSCAN

In [None]:
from custom_utils import load_and_concatenate_parquet_files
from sklearn.decomposition import PCA
import plotly_express as px
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Load the HDBSCAN-labelled subset and project its audio features onto two
# principal components, coloured by HDBSCAN cluster.
hdbscan_cluster_df = load_and_concatenate_parquet_files("data/hdbscan_clustered_subset")

metric_columns = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

# Plot at most 200,000 points; use every row when the subset is smaller.
sample_size = min(200000, len(hdbscan_cluster_df))
# Reset the index: the scaled array below is positional, so both sides of the
# merge must share the same 0..n-1 index. Without this, cluster labels and
# track metadata would be attached to the wrong feature rows.
data_for_analysis = hdbscan_cluster_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_for_analysis[metric_columns])
scaled_data_for_analysis = pd.merge(
    pd.DataFrame(data_scaled, columns=metric_columns),
    data_for_analysis.drop(columns=metric_columns),
    left_index=True,
    right_index=True,
)

pca = PCA(n_components=2)
pca.fit(scaled_data_for_analysis[metric_columns])
scaled_data_for_analysis[["PCA1", "PCA2"]] = pca.transform(scaled_data_for_analysis[metric_columns])

px.scatter(
    scaled_data_for_analysis,
    x="PCA1",
    y="PCA2",
    color="hdbscan_cluster",
    hover_data={'PCA1': False, 'PCA2': False, 'genre': True, 'artist_name': True, 'track_name': True,
                'danceability': True, 'energy': True, 'valence': True},
).show()

In [None]:
# Number of tracks per HDBSCAN cluster label (value_counts lists the largest first).
hdbscan_cluster_sizes = hdbscan_cluster_df["hdbscan_cluster"].value_counts()
print(hdbscan_cluster_sizes)

### Clusters with KMEANS

#### Silhouette Score

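Calculate the silhouette score of K-Means for k = 2 to 10 clusters on a sample of the data, and store the scores so they can be plotted below.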

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from custom_utils import load_and_concatenate_parquet_files, save_dataframe_as_parquet

def calculate_silhouette_score():
    """Compute K-Means silhouette scores for k = 2..10 and persist them.

    Loads the preprocessed Spotify data, min-max scales the audio metric
    columns of a fixed-seed sample (at most 20,000 rows), fits K-Means once
    per k and records the silhouette score of each fit. The scores are
    printed and saved to ``data/silhouette_scores`` (overwriting any
    previous result).

    Returns:
        None. The result is written to disk via ``save_dataframe_as_parquet``.
    """
    preprocessed_data = load_and_concatenate_parquet_files("data/preprocessed_spotify_data")
    metric_columns = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

    # Cap the sample at the number of available rows; `sample(n=...)` raises
    # when asked for more rows than exist.
    sample_size = min(20000, len(preprocessed_data))
    data_for_analysis = preprocessed_data.sample(n=sample_size, random_state=42)

    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data_for_analysis[metric_columns])

    k_range = range(2, 11)
    silhouette_scores = []

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        cluster_labels = kmeans.fit_predict(data_scaled)
        score = silhouette_score(data_scaled, cluster_labels)
        silhouette_scores.append(score)
        print(f"Number of clusters: {k}, Silhouette Score: {score}")

    silhouette_df = pd.DataFrame({
        'Number of Clusters (k)': list(k_range),
        'Silhouette Score': silhouette_scores
    })

    save_dataframe_as_parquet(silhouette_df, "data", "silhouette_scores", always_overwrite=True)

# calculate_silhouette_score()

In [None]:
from custom_utils import load_and_concatenate_parquet_files

# Read the stored silhouette scores and draw them as a line chart over k.
silhouette_scores = load_and_concatenate_parquet_files("data/silhouette_scores")

k_values = silhouette_scores['Number of Clusters (k)']
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, silhouette_scores['Silhouette Score'], marker='o')
ax.set_title('Silhouette Scores for Different Number of Clusters')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Silhouette Score')
# One tick per evaluated k so each marker lines up with a labelled value.
ax.set_xticks(k_values)
ax.grid(True)
plt.show()

#### Elbow Method

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from custom_utils import load_and_concatenate_parquet_files, save_dataframe_as_parquet

def calculate_elbow_score():
    """Compute the K-Means within-cluster sum of squares for k = 2..29 and persist it.

    Loads the preprocessed Spotify data, min-max scales the audio metric
    columns of a fixed-seed sample (at most 20,000 rows), fits K-Means once
    per k and records ``inertia_`` of each fit. The values are printed and
    saved to ``data/elbow_method_scores`` (overwriting any previous result).

    Returns:
        None. The result is written to disk via ``save_dataframe_as_parquet``.
    """
    preprocessed_data = load_and_concatenate_parquet_files("data/preprocessed_spotify_data")
    metric_columns = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

    # Cap the sample at the number of available rows; `sample(n=...)` raises
    # when asked for more rows than exist.
    sample_size = min(20000, len(preprocessed_data))
    data_for_analysis = preprocessed_data.sample(n=sample_size, random_state=42)

    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data_for_analysis[metric_columns])

    k_range = range(2, 30)
    wcss = []

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(data_scaled)
        wcss.append(kmeans.inertia_)
        print(f"Number of clusters: {k}, WCSS: {kmeans.inertia_}")

    wcss_df = pd.DataFrame({
        'Number of Clusters (k)': list(k_range),
        'Elbow Method Score': wcss
    })

    save_dataframe_as_parquet(wcss_df, "data", "elbow_method_scores", always_overwrite=True)

# calculate_elbow_score()

In [None]:
from custom_utils import load_and_concatenate_parquet_files

# Read the stored WCSS values and draw them as a line chart over k.
elbow_method_scores = load_and_concatenate_parquet_files("data/elbow_method_scores")

k_values = elbow_method_scores['Number of Clusters (k)']
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, elbow_method_scores['Elbow Method Score'], marker='o')
ax.set_title('Elbow Method Scores for Different Number of Clusters')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Elbow Method Score')
# One tick per evaluated k so each marker lines up with a labelled value.
ax.set_xticks(k_values)
ax.grid(True)
plt.show()

### Plotting KMeans Cluster

In [None]:
from custom_utils import load_and_concatenate_parquet_files
from sklearn.decomposition import PCA
import plotly_express as px
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Load the K-Means-labelled subset and project its audio features onto two
# principal components, coloured by K-Means cluster.
kmeans_cluster_df = load_and_concatenate_parquet_files("data/kmeans_clustered_subset")

metric_columns = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

# Plot at most 200,000 points; use every row when the subset is smaller
# (`sample(n=...)` raises when asked for more rows than exist).
sample_size = min(200000, len(kmeans_cluster_df))
# Reset the index: the scaled array below is positional, so both sides of the
# merge must share the same 0..n-1 index. Without this, cluster labels and
# track metadata would be attached to the wrong feature rows.
data_for_analysis = kmeans_cluster_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_for_analysis[metric_columns])
scaled_data_for_analysis = pd.merge(
    pd.DataFrame(data_scaled, columns=metric_columns),
    data_for_analysis.drop(columns=metric_columns),
    left_index=True,
    right_index=True,
)

pca = PCA(n_components=2)
pca.fit(scaled_data_for_analysis[metric_columns])
scaled_data_for_analysis[["PCA1", "PCA2"]] = pca.transform(scaled_data_for_analysis[metric_columns])

px.scatter(
    scaled_data_for_analysis,
    x="PCA1",
    y="PCA2",
    color="kmeans_cluster",
    hover_data={'PCA1': False, 'PCA2': False, 'genre': True, 'artist_name': True, 'track_name': True,
                'danceability': True, 'energy': True, 'valence': True},
).show()

In [None]:
# Number of tracks per K-Means cluster label (value_counts lists the largest first).
kmeans_cluster_sizes = kmeans_cluster_df["kmeans_cluster"].value_counts()
print(kmeans_cluster_sizes)

## Cluster with KMEANS and HDBSCAN

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from custom_utils import load_and_concatenate_parquet_files

# One PCA scatter per K-Means cluster, with points coloured by the HDBSCAN
# sub-cluster found inside that K-Means cluster.
kmeans_hdbscan_cluster_df = load_and_concatenate_parquet_files("data/kmeans_hdbscan_clustered_subset")
metric_columns = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo"]

# Reset the index so the positional scaled array and the metadata columns are
# merged row-for-row, whatever index the loaded frame carries.
data_for_analysis = kmeans_hdbscan_cluster_df.reset_index(drop=True)
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_for_analysis[metric_columns])
scaled_data_for_analysis = pd.merge(pd.DataFrame(data_scaled, columns=metric_columns),
                                    data_for_analysis.drop(columns=metric_columns),
                                    left_index=True, right_index=True)

pca = PCA(n_components=2)
pca_result = pca.fit_transform(data_scaled)
scaled_data_for_analysis[["PCA1", "PCA2"]] = pca_result

# Sort the labels so the subplots appear in a stable, readable order.
kmeans_clusters = sorted(data_for_analysis['kmeans_cluster'].unique())
num_clusters = len(kmeans_clusters)
num_rows = int(np.ceil(num_clusters / 2))

fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(15, 5 * num_rows))
axes = axes.flatten()

for i, cluster in enumerate(kmeans_clusters):
    ax = axes[i]

    cluster_data = scaled_data_for_analysis[scaled_data_for_analysis['kmeans_cluster'] == cluster]
    # Plot a 10% fixed-seed sample of each cluster to keep the figure light.
    cluster_data_sampled = cluster_data.sample(frac=0.1, random_state=42)

    scatter = ax.scatter(cluster_data_sampled['PCA1'], cluster_data_sampled['PCA2'],
                         c=cluster_data_sampled['hdbscan_cluster'], cmap='tab20', alpha=0.5, s=10)
    ax.set_title(f'K-Means Cluster {cluster}')
    ax.set_xlabel('PCA1')
    ax.set_ylabel('PCA2')

    cbar = plt.colorbar(scatter, ax=ax, orientation='vertical')
    cbar.set_label('HDBSCAN Cluster')

# With an odd number of clusters the last grid cell is unused; remove it.
for j in range(num_clusters, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()


In [None]:
# Build a "<kmeans>_<hdbscan>" label per track on a copy of the clustered
# frame, then print how many tracks fall under each combined label.
new_kmeans_hdbscan_cluster_df = kmeans_hdbscan_cluster_df.assign(
    combined_cluster=lambda frame: frame["kmeans_cluster"].astype(str) + "_" + frame["hdbscan_cluster"].astype(str)
)
print(new_kmeans_hdbscan_cluster_df["combined_cluster"].value_counts())

# Code for Clustering

In [None]:
from custom_utils import load_and_concatenate_parquet_files, save_dataframe_as_parquet
import pandas as pd
from sklearn.cluster import KMeans
import hdbscan
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import joblib

def reduce_data(data, dimensions, metric_columns):
    """Project the metric columns of ``data`` onto principal components.

    Args:
        data: Frame containing at least the columns in ``metric_columns``.
        dimensions: Passed to ``PCA`` as ``n_components``.
        metric_columns: Names of the columns fed into the PCA.

    Returns:
        The array produced by ``PCA.fit_transform`` for the selected columns
        (one row per input row).
    """
    projector = PCA(n_components=dimensions)
    return projector.fit_transform(data[metric_columns])

def normalize_data(data:pd.DataFrame, metric_columns):
    """Min-max scale the metric columns of ``data``.

    Note that only the scaled metric columns are returned: every other
    column of ``data`` is left out, and the result carries a fresh
    0..n-1 index rather than the index of ``data``. Callers that need the
    remaining columns must re-attach them by row position.

    Args:
        data: Frame containing at least the columns in ``metric_columns``.
        metric_columns: Names of the columns to scale.

    Returns:
        A new DataFrame with one scaled column per entry of
        ``metric_columns``, in the same row order as ``data``.
    """
    min_max_scaler = MinMaxScaler()
    numeric_columns = data[metric_columns]
    normalized_values = min_max_scaler.fit_transform(numeric_columns)
    # The previous implementation additionally merged the non-metric columns
    # back in and then discarded that merged frame; the unused (and, on large
    # data, expensive) merge has been removed without changing the result.
    return pd.DataFrame(normalized_values, columns=numeric_columns.columns)

def cluster_with_kmeans(data_for_clustering:pd.DataFrame, n_clusters=5, random_state=42):
    """Fit K-Means on ``data_for_clustering`` and label every row.

    Args:
        data_for_clustering: Feature matrix; every column is used as a feature.
        n_clusters: Number of clusters to fit. Defaults to 5, the value that
            was previously hard-coded.
        random_state: Seed passed to ``KMeans`` so repeated runs agree.

    Returns:
        A tuple ``(labels, model)`` with the per-row cluster labels and the
        fitted ``KMeans`` instance.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans_labels = kmeans.fit_predict(data_for_clustering)
    return kmeans_labels, kmeans

def cluster_with_hdbscan(data_for_clustering:pd.DataFrame, min_cluster_size=10):
    """Fit HDBSCAN on ``data_for_clustering`` and label every row.

    The clusterer is created with ``prediction_data=True`` so the fitted
    model can be stored and reused later.

    Args:
        data_for_clustering: Feature matrix; every column is used as a feature.
        min_cluster_size: Passed to ``hdbscan.HDBSCAN``. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(labels, model)`` with the per-row cluster labels and the
        fitted ``hdbscan.HDBSCAN`` instance.
    """
    hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
    hdbscan_labels = hdbscan_clusterer.fit_predict(data_for_clustering)
    return hdbscan_labels, hdbscan_clusterer

def run_kmeans_and_hdbscan(original_data:pd.DataFrame, cluster_data:pd.DataFrame):
    """Cluster with K-Means, then run HDBSCAN separately inside each K-Means cluster.

    ``original_data`` and ``cluster_data`` are matched by row position, so
    they must contain the same rows in the same order. Neither argument is
    modified.

    Args:
        original_data: Rows to annotate with the cluster labels.
        cluster_data: Features to cluster on. A DataFrame is expected; other
            2-D array-likes (e.g. PCA output) are wrapped in a DataFrame.

    Returns:
        A tuple ``(all_cluster_results, kmeans, hdbscan_models)``:
        a copy of ``original_data`` (grouped by K-Means cluster, fresh index)
        with ``kmeans_cluster`` and ``hdbscan_cluster`` columns, the fitted
        K-Means model, and a dict mapping each K-Means label to the HDBSCAN
        model fitted on that cluster.

    Raises:
        ValueError: If the two inputs do not have the same number of rows.
    """
    if len(original_data) != len(cluster_data):
        raise ValueError(
            f"original_data has {len(original_data)} rows but cluster_data has {len(cluster_data)}; "
            "they must describe the same rows in the same order."
        )

    # Positional (0..n-1) copies of both inputs: labels come back from the
    # clusterers as plain arrays, so everything is aligned by row position.
    features = pd.DataFrame(cluster_data).reset_index(drop=True)
    labelled_data = original_data.reset_index(drop=True)

    kmeans_labels, kmeans = cluster_with_kmeans(features)
    labelled_data["kmeans_cluster"] = kmeans_labels
    print(f"KMeans_cluster distro: {labelled_data['kmeans_cluster'].value_counts()}")

    hdbscan_cluster_df = []
    hdbscan_models = {}
    # Group the feature rows by their K-Means label without adding a label
    # column to the features themselves.
    for cluster, data in features.groupby(kmeans_labels):
        hdbscan_cluster_labels, hdbscan_clusterer = cluster_with_hdbscan(data)
        # `data.index` holds the row positions of this cluster's members.
        current_original_data = labelled_data.loc[data.index].reset_index(drop=True)
        current_original_data["hdbscan_cluster"] = hdbscan_cluster_labels
        hdbscan_cluster_df.append(current_original_data)
        hdbscan_models[cluster] = hdbscan_clusterer
        print(f"HDBSCAN_cluster distro for kmeans {cluster}: {current_original_data['hdbscan_cluster'].value_counts()}")

    all_cluster_results = pd.concat(hdbscan_cluster_df).reset_index(drop=True)

    return all_cluster_results, kmeans, hdbscan_models

def cluster_data(original_data:pd.DataFrame, subset_fraction=None, kmeans=True, use_hdbscan=True, should_normalize_data=True, pca_dimensions=None, columns_to_use=None, random_state=42):
    """Cluster tracks with K-Means, HDBSCAN, or HDBSCAN nested inside K-Means.

    The labelled rows and the fitted model(s) are written below ``data/``
    via ``save_dataframe_as_parquet``; the labelled rows are also returned.
    The caller's ``original_data`` is never modified.

    Args:
        original_data: Tracks to cluster.
        subset_fraction: If truthy, cluster only this fraction of the rows
            (sampled with ``random_state``).
        kmeans: Run K-Means.
        use_hdbscan: Run HDBSCAN. Combined with ``kmeans`` it runs HDBSCAN
            separately inside every K-Means cluster.
        should_normalize_data: Min-max scale the feature columns first.
        pca_dimensions: If truthy, reduce the features with PCA to this many
            components before clustering.
        columns_to_use: Feature columns; defaults to the full audio feature set.
        random_state: Seed for the subset sampling, so repeated runs cluster
            the same rows. Pass ``None`` for a different sample on every call.

    Returns:
        A DataFrame of the clustered rows with a ``kmeans_cluster`` and/or
        ``hdbscan_cluster`` column.

    Raises:
        ValueError: If both ``kmeans`` and ``use_hdbscan`` are False.
    """
    if not (kmeans or use_hdbscan):
        # Previously this fell through every branch and failed with an
        # UnboundLocalError on the final `return result`.
        raise ValueError("At least one of `kmeans` or `use_hdbscan` must be True.")

    if subset_fraction:
        original_data = original_data.sample(frac=subset_fraction, random_state=random_state)
    else:
        # Work on a copy so the label columns added below do not leak into
        # the caller's frame.
        original_data = original_data.copy()

    # Positional copy used for feature extraction; labels are assigned back
    # to `original_data` by row position.
    data_for_clustering = original_data.reset_index(drop=True)

    metric_columns = columns_to_use if columns_to_use else ["danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"]

    if should_normalize_data:
        data_for_clustering = normalize_data(data_for_clustering, metric_columns=metric_columns)

    if pca_dimensions:
        data_for_clustering = reduce_data(data_for_clustering, pca_dimensions, metric_columns)
    else:
        data_for_clustering = data_for_clustering[metric_columns]

    if kmeans and use_hdbscan:
        # PCA yields a plain array; wrap it so the per-cluster grouping in
        # run_kmeans_and_hdbscan always receives a DataFrame.
        result_df, kmeans_model, hdbscan_models = run_kmeans_and_hdbscan(original_data, pd.DataFrame(data_for_clustering))
        folder_path = save_dataframe_as_parquet(result_df, folder_path="data", folder_name="kmeans_hdbscan_clustered_subset", always_overwrite=False, model_object={"kmeans": kmeans_model})
        # One HDBSCAN model per K-Means cluster, stored next to the parquet output.
        for key, value in hdbscan_models.items():
            joblib.dump(value, f'{folder_path}/hdbscan_model_{key}.pkl')
        result = result_df.copy()

    elif kmeans:
        kmeans_labels, kmeans_model = cluster_with_kmeans(data_for_clustering)
        original_data["kmeans_cluster"] = kmeans_labels
        save_dataframe_as_parquet(original_data, folder_path="data", folder_name="kmeans_clustered_subset", always_overwrite=False, model_object={"kmeans": kmeans_model})
        print(f"KMeans_cluster distro: {original_data['kmeans_cluster'].value_counts()}")
        result = original_data.copy()

    else:
        hdbscan_labels, hdbscan_clusterer = cluster_with_hdbscan(data_for_clustering)
        original_data["hdbscan_cluster"] = hdbscan_labels
        save_dataframe_as_parquet(original_data, folder_path="data", folder_name="hdbscan_clustered_subset", always_overwrite=False, model_object={"hdbscan": hdbscan_clusterer})
        print(f"HDBSCAN_cluster distro: {original_data['hdbscan_cluster'].value_counts()}")
        result = original_data.copy()

    return result

# preprocessed_data = load_and_concatenate_parquet_files("data/preprocessed_spotify_data")
# clustered_data = cluster_data(preprocessed_data, subset_fraction=0.02, kmeans=False, use_hdbscan=True,  should_normalize_data=True, pca_dimensions=None, columns_to_use=None)
# display(clustered_data)