In [None]:
# import libraries

import pandas as pd

from sklearn import set_config
set_config(transform_output='pandas') 

from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.cluster import KMeans

import plotly.express as px

import random

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from sklearn.metrics import silhouette_score, silhouette_samples
import numpy as np

from sklearn import set_config
set_config(transform_output='pandas')

# Exploring Data

In [None]:
# Load the cleaned audio-feature table; rows are indexed by (name, artist)
df_audio_features_5000 = pd.read_csv(
    "df_audio_features_5000_cleaned_whitespaces.csv",
    index_col=["name", "artist"],
)

# Remove the type/id/html columns before exploring the data
df_audio_features_5000.drop(columns=["type", "id", "html"], inplace=True)
df_audio_features_5000.head()

In [None]:
# Pairwise correlation between all columns of the exploration frame
correlation_matrix = df_audio_features_5000.corr()

# Render the correlations as an annotated heatmap on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Per-column variance (DataFrame.var); the result is shown as the cell output
variation = df_audio_features_5000.var()
variation

In [None]:
# Histograms (20 bins each) of every column -- note these are histograms, not boxplots
df_audio_features_5000.hist(bins=20, figsize=(10, 8))
plt.tight_layout()
plt.show()

# Final DataFrame

In [None]:
# Reload the cleaned audio-feature table, indexed by (name, artist)
df_audio_features_5000 = pd.read_csv(
    "df_audio_features_5000_cleaned_whitespaces.csv",
    index_col=["name", "artist"],
)

# Keep only the columns used for clustering by dropping everything listed here
columns_to_drop = [
    "key", "loudness", "mode", "liveness", "tempo",
    "duration_ms", "time_signature", "type", "id", "html",
]
df_audio_features_5000.drop(columns=columns_to_drop, inplace=True)
df_audio_features_5000.sample(50)



In [None]:
# Column dtypes and non-null counts of the reduced frame
df_audio_features_5000.info()

In [None]:
# Summary statistics of the reduced frame (before any scaling)
df_audio_features_5000.describe()

In [None]:
# Histograms (20 bins each) of the remaining columns -- histograms, not boxplots
df_audio_features_5000.hist(bins=20, figsize=(10, 8))
plt.tight_layout()
plt.show()

In [None]:
# Scatterplot grid: one panel for every ordered pair of attributes (unscaled data)
attributes = [
    'danceability',
    'energy',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
]

num_rows = num_cols = len(attributes)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(25, 25))

# The row index picks the x attribute, the column index picks the y attribute
for (i, j), ax in np.ndenumerate(axes):
    x_attribute, y_attribute = attributes[i], attributes[j]
    ax.scatter(x=df_audio_features_5000[x_attribute], y=df_audio_features_5000[y_attribute])
    ax.set_xlabel(x_attribute)
    ax.set_ylabel(y_attribute)

plt.tight_layout()
plt.show()



In [None]:
# Correlation between the columns kept for clustering
correlation_matrix = df_audio_features_5000.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, ax=ax)
plt.show()

In [None]:
# Per-column variance (DataFrame.var) of the reduced, unscaled frame
variation = df_audio_features_5000.var()
variation

In [None]:
# Song-to-song distance matrix on the unscaled features, labelled on both
# axes with the (name, artist) index
song_index = df_audio_features_5000.index
eucl = pd.DataFrame(
    pairwise_distances(df_audio_features_5000),
    index=song_index,
    columns=song_index,
)
eucl

In [None]:
# Heatmap of the full distance matrix (unscaled features)
fig, ax = plt.subplots(figsize=(40, 25))
sns.heatmap(eucl, cmap='coolwarm', ax=ax);

# Data Scaling

## StandardScaler

In [None]:
# Standardise every column, then rebuild a frame carrying the original
# column names and (name, artist) index
standardscaler = StandardScaler().fit_transform(df_audio_features_5000)
df_audio_features_5000_norm = pd.DataFrame(
    standardscaler,
    columns=df_audio_features_5000.columns,
    index=df_audio_features_5000.index,
)
df_audio_features_5000_norm

In [None]:
# Histograms (20 bins each) after scaling -- histograms, not boxplots
df_audio_features_5000_norm.hist(bins=20, figsize=(10, 8))
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the standardised frame
df_audio_features_5000_norm.describe()

In [None]:
# Correlation between the standardised columns
correlation_matrix = df_audio_features_5000_norm.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Scatterplots after scaling: one panel per ordered pair of attributes
attributes = [
    'danceability', 'energy', 'speechiness',
    'acousticness', 'instrumentalness', 'valence',
]

num_rows = len(attributes)
num_cols = num_rows

fig, axes = plt.subplots(num_rows, num_cols, figsize=(25, 25))

# Walk the grid row by row; the row fixes the x attribute, the column the y attribute
for i, axes_row in enumerate(axes):
    x_attribute = attributes[i]
    for j, ax in enumerate(axes_row):
        y_attribute = attributes[j]
        ax.scatter(
            x=df_audio_features_5000_norm[x_attribute],
            y=df_audio_features_5000_norm[y_attribute],
        )
        ax.set_xlabel(x_attribute)
        ax.set_ylabel(y_attribute)

plt.tight_layout()
plt.show()


In [None]:
# Song-to-song distance matrix, now computed on the standardised features
scaled_distances = pairwise_distances(df_audio_features_5000_norm)
eucl = pd.DataFrame(scaled_distances,
                    columns=df_audio_features_5000_norm.index,
                    index=df_audio_features_5000_norm.index)
eucl

In [None]:
# Heatmap of the distance matrix computed on the standardised features
fig, ax = plt.subplots(figsize=(40, 25))
sns.heatmap(data=eucl, ax=ax, cmap='coolwarm');

# Choosing the right number of clusters

## Inertia

In [None]:
# Report the number of logical CPUs in a platform-independent way.
# The previous `!wmic ...` shell call only works on Windows.
import os

os.cpu_count()

In [None]:
# Limit the number of OpenMP threads to 7 via the process environment.
# NOTE(review): this runs after numpy/scikit-learn were imported in the first
# cell; the variable is presumably only read when the OpenMP runtime loads, so
# it may need to be set before those imports to take effect -- confirm.
import os
os.environ['OMP_NUM_THREADS']="7"

In [None]:
# Smoke test: fit a 2-cluster model and display its inertia
kmeans_2 = KMeans(n_clusters=2, n_init="auto", random_state=42).fit(df_audio_features_5000_norm)
inertia_2 = kmeans_2.inertia_
inertia_2

In [None]:
# Inertia for k = 1 .. max_k - 1 (the upper bound of range() is exclusive)
max_k = 100

inertia_list = []

for i in range(1, max_k):
    # Fit one model per candidate k and keep its rounded inertia
    kmeans_all = KMeans(n_clusters=i, n_init="auto", random_state=42).fit(df_audio_features_5000_norm)
    inertia_list += [round(kmeans_all.inertia_)]

inertia_list

In [None]:
# Elbow plot: inertia against the number of clusters that were actually fitted.
# The x values are derived from inertia_list itself, so the plot stays correct
# even if max_k is changed elsewhere in the notebook, and the title reports the
# real last k (the fitting loop stops at max_k - 1).
fitted_ks = list(range(1, len(inertia_list) + 1))

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=fitted_ks, y=inertia_list, marker='o', ax=ax)
ax.set_title(f'Inertia evolution from 1 cluster to {len(inertia_list)} clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

## Silhouette Score

In [None]:
# Silhouette score for k = 2 .. max_k - 1 (range() excludes its upper bound)
k_values = list(range(2, max_k))
sil_score = []

for k in k_values:
    labels = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(df_audio_features_5000_norm).labels_
    sil_score.append(silhouette_score(df_audio_features_5000_norm, labels))

# The title names the last k that was really evaluated (max_k - 1, not max_k)
fig, ax = plt.subplots()
sns.lineplot(x=k_values, y=sil_score, marker='o', ax=ax)
ax.set_title(f'Silhouette score evolution from {k_values[0]} clusters to {k_values[-1]} clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette score')
plt.show()

In [None]:
# Silhouette score zoomed in on k = 20 .. max_k - 1.
# random_state matches the value used for the final model (42), so the scores
# shown here describe the same clusterings that are used later in the notebook.
k_values = list(range(20, max_k))
sil_score = []

for k in k_values:
    labels = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(df_audio_features_5000_norm).labels_
    sil_score.append(silhouette_score(df_audio_features_5000_norm, labels))

# The title names the last k that was really evaluated (max_k - 1, not max_k)
fig, ax = plt.subplots()
sns.lineplot(x=k_values, y=sil_score, marker='o', ax=ax)
ax.set_title(f'Silhouette score evolution from {k_values[0]} clusters to {k_values[-1]} clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette score')
plt.show()

In [None]:
# Knife (silhouette) graphs for k = 7 .. 35.
# A dedicated range is used instead of reassigning max_k, so re-running the
# inertia / silhouette cells above keeps using their own upper bound.
knife_k_values = range(7, 36)

# A later cell adds a "cluster" label column to this frame in place; drop it
# if present so that re-running this cell never clusters on old labels.
data = df_audio_features_5000_norm.drop(columns="cluster", errors="ignore")

for k in knife_k_values:
    kmean_model = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(data)
    labels = kmean_model.labels_

    sil_avg = silhouette_score(data, labels)
    print(f"* For k = {k} the average to silhouette is: {round(sil_avg,4)}")
    sample_sil_val = silhouette_samples(data, labels)

    # One panel per k (the original created a second, never-used axes)
    fig, axis1 = plt.subplots(figsize=(20, 8))

    y_lower = 10

    for i in range(k):
        # Sorted silhouette values of the samples assigned to cluster i
        ith_cluster_sv = np.sort(sample_sil_val[labels == i])
        print(f"\t- For cluster = {i} the silhouette value is: {round(np.mean(ith_cluster_sv),2)}")

        # Vertical extent of this cluster's "knife" on the y axis
        ith_cluster_size = ith_cluster_sv.shape[0]
        y_upper = y_lower + ith_cluster_size

        # Paint the cluster
        axis1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_sv, alpha=0.7)

        # Label the cluster at the vertical middle of its knife
        axis1.text(-0.05, y_lower + 0.5 * ith_cluster_size, str(i))

        # Leave a gap of 10 rows before the next cluster
        y_lower = y_upper + 10

    # Mark the average silhouette so each knife can be compared against it
    axis1.axvline(sil_avg, color="red", linestyle="--")

    axis1.set_title("Silhouette score for k = %s"%str(k))
    axis1.set_xlabel("S(i)")
    axis1.set_ylabel("Cluster ID")
    # The y axis only stacks samples; its tick values carry no meaning
    axis1.set_yticks([])

    plt.show()

# Choose k = 30

In [None]:
# Fit the final 30-cluster model on the feature columns only.
# A later cell adds a "cluster" label column to this frame in place; dropping
# it here (if present) keeps this cell idempotent, so a re-run never feeds the
# previous labels back into the clustering.
kmeans_30 = KMeans(n_clusters=30, n_init="auto", random_state=42)
kmeans_30.fit(df_audio_features_5000_norm.drop(columns="cluster", errors="ignore"))

In [None]:
# Centroid coordinates of the fitted 30-cluster model (one row per cluster)
kmeans_30.cluster_centers_

In [None]:
# Attach each song's cluster label to the scaled frame.
# NOTE: this mutates df_audio_features_5000_norm in place; from here on the
# frame carries a non-feature "cluster" column that the cells below rely on.
df_audio_features_5000_norm['cluster'] = kmeans_30.labels_
df_audio_features_5000_norm.sample(50)

### Parallel Coordinates

In [None]:
# Parallel-coordinates view of the six features, coloured by cluster label.
# The diverging colour scale is centred on the middle of the label range
# (0 .. n_clusters - 1); the previous hard-coded midpoint of 2 left a large
# part of the scale unused for 30 clusters.
fig = px.parallel_coordinates(
    df_audio_features_5000_norm,
    color="cluster",
    dimensions=['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence'],
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=(kmeans_30.n_clusters - 1) / 2,
)
fig.show()

### Exploring the Clusters

In [None]:
# Mean of every scaled column within each cluster
df_audio_features_5000_norm.groupby(by="cluster").mean()

In [None]:
# Number of non-null values per column within each cluster
df_audio_features_5000_norm.groupby(by="cluster").count()

In [None]:
# Random sample of up to 50 songs from cluster 0.
# The sample size is capped at the cluster size because DataFrame.sample
# raises when asked for more rows than exist (without replacement).
cluster_0_songs = df_audio_features_5000_norm.query("cluster == 0")
cluster_0_songs.sample(min(50, len(cluster_0_songs)))

In [None]:
# Centroids as a labelled frame: one row per cluster, one column per feature
# the model was fitted on. The column names come from the fitted model itself
# rather than a hard-coded list: the old list had 7 names (including
# "cluster") for the 6 fitted features, which raises a length-mismatch
# ValueError on a fresh run.
centroids = kmeans_30.cluster_centers_
centroids_df = pd.DataFrame(centroids, columns=kmeans_30.feature_names_in_)
# The row position is the cluster label
centroids_df = centroids_df.rename_axis("cluster")
centroids_df

In [None]:
# Distances between cluster centroids.
# Only feature coordinates may enter the distance computation; a "cluster"
# label column (if one is present in centroids_df) is excluded first.
centroid_features = centroids_df.drop(columns="cluster", errors="ignore")
eucl_centroids = pd.DataFrame(
    pairwise_distances(centroid_features),
    index=centroid_features.index,
    columns=centroid_features.index,
)
eucl_centroids

In [None]:
# Heatmap of the centroid-to-centroid distance matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(eucl_centroids, ax=ax);

In [None]:
# Songs coloured by cluster in the danceability/energy plane, centroids in red
fig, ax = plt.subplots(figsize=(10, 8))

plot_data = pd.DataFrame(df_audio_features_5000_norm)
plot_data.columns = [
    'danceability',
    'energy',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'valence',
    "cluster",
]
plot_data["cluster"] = kmeans_30.labels_

# Both layers are drawn on the axes created above
sns.scatterplot(x='danceability', y='energy', hue='cluster', data=plot_data, palette='Set2', s=75, ax=ax)
sns.scatterplot(x='danceability', y='energy', data=centroids_df, color='red', s=250, ax=ax)
plt.show()

In [None]:
# Songs coloured by cluster in the danceability/speechiness plane, centroids in red
fig, ax = plt.subplots(figsize=(10, 8))

plot_data = pd.DataFrame(df_audio_features_5000_norm)
plot_data.columns = ['danceability', 'energy',
    'speechiness', 'acousticness', 'instrumentalness',
    'valence', "cluster"]
plot_data["cluster"] = kmeans_30.labels_

# Both layers must use the same pair of features: the centroids were
# previously drawn with y='energy' on top of a speechiness axis.
x_feature, y_feature = 'danceability', 'speechiness'
sns.scatterplot(data=plot_data, x=x_feature, y=y_feature, hue='cluster', palette='Set2', s=75, ax=ax)
sns.scatterplot(data=centroids_df, x=x_feature, y=y_feature, color='red', s=250, ax=ax)
plt.show()

In [None]:
# Scatterplot grid of every ordered attribute pair, coloured by cluster,
# with the centroids overlaid in red
attributes = [
    'danceability', 'energy', 'speechiness',
    'acousticness', 'instrumentalness', 'valence',
]

num_rows = num_cols = len(attributes)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(25, 25))

plot_data = pd.DataFrame(df_audio_features_5000_norm)
plot_data.columns = attributes + ["cluster"]
plot_data["cluster"] = kmeans_30.labels_

# The row fixes the x attribute, the column fixes the y attribute
for i, axes_row in enumerate(axes):
    x_attribute = attributes[i]
    for j, ax in enumerate(axes_row):
        y_attribute = attributes[j]

        sns.scatterplot(x=x_attribute, y=y_attribute, hue='cluster', data=plot_data, palette='Set2', s=75, ax=ax)
        sns.scatterplot(x=x_attribute, y=y_attribute, data=centroids_df, color='red', s=250, ax=ax)

        ax.set_xlabel(x_attribute)
        ax.set_ylabel(y_attribute)
        # Drop the per-panel legend added by the hue layer
        ax.get_legend().remove()

plt.tight_layout()
plt.show()

In [None]:
# 3D plot (danceability / energy / speechiness) with centroids
markers = ['o', '^', 's']
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(projection='3d')

for cluster_id, d in plot_data.groupby('cluster'):
    # There are more clusters than marker shapes, so cycle through the shapes
    # instead of indexing past the end of the list.
    ax.scatter(d['danceability'], d['energy'], d['speechiness'],
               marker=markers[cluster_id % len(markers)],
               label=f'cluster {cluster_id}')

# Centroids are taken straight from the fitted model (the name used here
# before, clusters_test_2, is not defined anywhere in the notebook).
centroids_3d = pd.DataFrame(kmeans_30.cluster_centers_, columns=kmeans_30.feature_names_in_)
ax.scatter(centroids_3d['danceability'], centroids_3d['energy'], centroids_3d['speechiness'],
           color='red', s=250)
ax.set_xlabel('Danceability')
ax.set_ylabel('Energy')
ax.set_zlabel('speechiness')
plt.show()

In [None]:
# Two-feature subset used for a small clustering experiment
df_test = df_audio_features_5000.loc[:, ['danceability', 'energy']]
df_test.head()

In [None]:
# Rescale the two selected features with MinMaxScaler and label the columns
df_test_norm = pd.DataFrame(MinMaxScaler().fit_transform(df_test))
df_test_norm.columns = ['danceability', 'energy']

# First five observations after rescaling
df_test_norm.head(5)

In [None]:
# 30-cluster model on the two rescaled features.
# A fixed random_state (the same value used by the other models in this
# notebook) makes the centroids reproducible across runs.
kmeans_test = KMeans(n_clusters=30, n_init="auto", random_state=42)
kmeans_test.fit(df_test_norm)

In [None]:
# Centroid coordinates of the two-feature test model
kmeans_test.cluster_centers_

In [None]:
# Centroids of the two-feature test model as a labelled frame
clusters_test = pd.DataFrame(
    kmeans_test.cluster_centers_,
    columns=['danceability', 'energy'],
)
clusters_test

In [None]:
# Rescaled songs with the centroids of the two-feature test model in red
fig, ax = plt.subplots(figsize=(10, 8))
plot_data = pd.DataFrame(df_test_norm)
plot_data.columns = ['danceability', 'energy']

# Draw both layers on the axes created above, then label the figure
sns.scatterplot(x='danceability', y='energy', data=plot_data, ax=ax)
sns.scatterplot(x='danceability', y='energy', data=clusters_test, color='red', s=250, ax=ax)
ax.set_title('Comparing our centroids and our dataset')
ax.set_xlabel('danceability normalised')
ax.set_ylabel('energy normalised')
plt.show()

# Posting Clusters to Spotify

In [None]:
# Reload the table, this time keeping the "id" column (needed for Spotify)
# and dropping every other column that is not used for clustering
df_audio_features_5000 = pd.read_csv(
    "df_audio_features_5000_cleaned_whitespaces.csv",
    index_col=["name", "artist"],
).drop(
    columns=["key", "loudness", "mode", "liveness", "tempo", "duration_ms", "time_signature", "type", "html"],
)
df_audio_features_5000.sample(50)

In [None]:
# Feature columns to standardise; every other column is left untouched
to_scale = [
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "valence",
]

# Scale from the original frame, then write the result into a copy
scaled_features = StandardScaler().fit_transform(df_audio_features_5000[to_scale])
scaled_songs = df_audio_features_5000.copy()
scaled_songs[to_scale] = scaled_features
scaled_songs

In [None]:
# Final model for the playlists: cluster on exactly the standardised feature
# columns. Selecting them by name (to_scale) replaces the positional slice
# iloc[:, 1:-2], which skips the first column and the last two columns and
# therefore does not line up with the feature list. A fixed random_state
# keeps the cluster labels reproducible between runs.
kmeans = KMeans(n_clusters=30, n_init="auto", random_state=42)
kmeans.fit(scaled_songs[to_scale])

In [None]:
# Copy of the unscaled frame with each song's cluster label attached
labeled_songs = df_audio_features_5000.assign(cluster=kmeans.labels_)
labeled_songs

In [None]:
!pip install spotipy

In [None]:
import spotipy
import pickle
from spotipy.oauth2 import SpotifyOAuth


In [None]:
# Spotify authentication.
# Credentials are read from the environment so that real secrets never have to
# be pasted into (and saved with) the notebook; the placeholders remain as
# fallbacks when a variable is not set.
import os

scope = 'playlist-modify-public'
username = os.environ.get("SPOTIPY_CLIENT_USERNAME", "YOUR_USERNAME")
redirectUri = os.environ.get("SPOTIPY_REDIRECT_URI", "YOUR_HOST")
client_id = os.environ.get("SPOTIPY_CLIENT_ID", "YOUR_CLIENT_ID")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "YOUR_CLIENT_SECRET")

token = SpotifyOAuth(scope=scope,
                     username=username,
                     client_id=client_id,
                     client_secret=client_secret,
                     redirect_uri=redirectUri,
#                      open_browser=False                    # this line is needed in Colab, but not on a local machine
                     )
spotifyObject = spotipy.Spotify(auth_manager = token)

In [None]:
# Create one public playlist per cluster and fill it with up to 50 random songs.
# NOTE(review): only clusters 0..19 are posted although the model has 30
# clusters -- confirm whether that limit is intended.
playlist_collection = {}
for i in range(20):
    cluster_songs = labeled_songs.loc[labeled_songs["cluster"] == i]
    if cluster_songs.empty:
        # Nothing to post for this label; do not create an empty playlist
        continue

    # Pick the tracks BEFORE creating the playlist, and cap the sample at the
    # cluster size: sample(50) raises for clusters with fewer than 50 songs,
    # which used to leave an empty playlist behind on the account.
    sample_size = min(50, len(cluster_songs))
    id_list = list(cluster_songs.sample(sample_size)["id"])
    uris = [f'spotify:track:{str(track_id).strip()}' for track_id in id_list]

    playlist_name = f'my_playlist_cluster_{i}'
    playlist_description= 'this is a test list'
    playlist_id = spotifyObject.user_playlist_create(user=username,
                                                     name=playlist_name,
                                                     public=True,
                                                     description=playlist_description)['id']
    spotifyObject.user_playlist_add_tracks(user=username,playlist_id=playlist_id,tracks=uris)
    playlist_collection[playlist_name] = playlist_id

In [None]:
# First 20 songs that were assigned to cluster 6
labeled_songs.query("cluster == 6").head(20)
