# Naming the Clusters

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import seaborn as sns
from sklearn.metrics.pairwise import pairwise_distances
import matplotlib.pyplot as plt


In [2]:
url = "https://drive.google.com/file/d/1oYQSNxfvw6kFr6-N9rKLRAnLXlp0osEt/view?usp=drive_link"
path = f"https://drive.google.com/uc?export=download&id={url.split('/')[-2]}"
original_songs_df = pd.read_csv(path)

# create a copy
songs_df = original_songs_df.copy()

# data cleaning
songs_df.columns = songs_df.columns.str.strip()
songs_df = songs_df.drop(["type", "Unnamed: 0"], axis=1)

songs_df.head()

Unnamed: 0,name,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,id,html
0,Se Eu Quiser Falar Com Deus ...,Gilberto Gil,0.658,0.259,11,-13.141,0,0.0705,0.694,5.9e-05,0.975,0.306,110.376,256213,4,1n7JnwviZ7zf0LR1tcGFq7,https://open.spotify.com/track/1n7JnwviZ7zf0LR...
1,Saudade De Bahia ...,Antônio Carlos Jobim,0.742,0.399,2,-12.646,1,0.0346,0.217,2e-06,0.107,0.693,125.039,191867,4,5QGM1U0eCYrQuwSJwTm5Zq,https://open.spotify.com/track/5QGM1U0eCYrQuwS...
2,"Canta Canta, Minha Gente ...",Martinho Da Vila,0.851,0.73,2,-11.048,1,0.347,0.453,6.3e-05,0.124,0.905,93.698,152267,4,0NLIFSZxPzQhCwnkn5PJYs,https://open.spotify.com/track/0NLIFSZxPzQhCwn...
3,Mulher Eu Sei ...,Chico César,0.705,0.0502,4,-18.115,1,0.0471,0.879,4.1e-05,0.386,0.524,106.802,186227,4,3mXqOdlLE1k67WsAxryPFs,https://open.spotify.com/track/3mXqOdlLE1k67Ws...
4,Rosa Morena ...,Kurt Elling,0.651,0.119,6,-19.807,1,0.038,0.916,0.000343,0.104,0.402,120.941,273680,4,7bSzjzjTkWT2CkIPPdp0eA,https://open.spotify.com/track/7bSzjzjTkWT2CkI...


In [3]:
# These are features to scale
to_scale = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature']

# Now I take the columns I wish to scale, fit and transform them, and inject them back into my dataframe
songs_df[to_scale] = MinMaxScaler().fit_transform(songs_df[to_scale])

songs_df.head()

Unnamed: 0,name,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,id,html
0,Se Eu Quiser Falar Com Deus ...,Gilberto Gil,0.680455,0.259,1.0,0.763897,0.0,0.076797,0.696787,6e-05,0.987842,0.31066,0.5158,0.059067,0.8,1n7JnwviZ7zf0LR1tcGFq7,https://open.spotify.com/track/1n7JnwviZ7zf0LR...
1,Saudade De Bahia ...,Antônio Carlos Jobim,0.767322,0.399,0.181818,0.771967,1.0,0.037691,0.217871,2e-06,0.108409,0.703553,0.584322,0.042058,0.8,5QGM1U0eCYrQuwSJwTm5Zq,https://open.spotify.com/track/5QGM1U0eCYrQuwS...
2,"Canta Canta, Minha Gente ...",Martinho Da Vila,0.880041,0.73,0.181818,0.798018,1.0,0.377996,0.454819,6.4e-05,0.125633,0.918782,0.437862,0.03159,0.8,0NLIFSZxPzQhCwnkn5PJYs,https://open.spotify.com/track/0NLIFSZxPzQhCwn...
3,Mulher Eu Sei ...,Chico César,0.729059,0.0502,0.363636,0.682811,1.0,0.051307,0.88253,4.1e-05,0.391084,0.53198,0.499098,0.040567,0.8,3mXqOdlLE1k67WsAxryPFs,https://open.spotify.com/track/3mXqOdlLE1k67Ws...
4,Rosa Morena ...,Kurt Elling,0.673216,0.119,0.545455,0.655228,1.0,0.041394,0.919679,0.000348,0.10537,0.408122,0.565171,0.063684,0.8,7bSzjzjTkWT2CkIPPdp0eA,https://open.spotify.com/track/7bSzjzjTkWT2CkI...


In [4]:
# Initialise the model
n_clusters = 25
my_kmeans = KMeans(n_clusters=n_clusters,
                   n_init="auto",
                   random_state = 123)

# Fit the model to the data
my_kmeans.fit(songs_df[to_scale])

# Obtain the cluster output
clusters = my_kmeans.labels_

# Attach the cluster output to our original DataFrame
songs_df["cluster"] = clusters

songs_df.head()

Unnamed: 0,name,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,id,html,cluster
0,Se Eu Quiser Falar Com Deus ...,Gilberto Gil,0.680455,0.259,1.0,0.763897,0.0,0.076797,0.696787,6e-05,0.987842,0.31066,0.5158,0.059067,0.8,1n7JnwviZ7zf0LR1tcGFq7,https://open.spotify.com/track/1n7JnwviZ7zf0LR...,15
1,Saudade De Bahia ...,Antônio Carlos Jobim,0.767322,0.399,0.181818,0.771967,1.0,0.037691,0.217871,2e-06,0.108409,0.703553,0.584322,0.042058,0.8,5QGM1U0eCYrQuwSJwTm5Zq,https://open.spotify.com/track/5QGM1U0eCYrQuwS...,7
2,"Canta Canta, Minha Gente ...",Martinho Da Vila,0.880041,0.73,0.181818,0.798018,1.0,0.377996,0.454819,6.4e-05,0.125633,0.918782,0.437862,0.03159,0.8,0NLIFSZxPzQhCwnkn5PJYs,https://open.spotify.com/track/0NLIFSZxPzQhCwn...,7
3,Mulher Eu Sei ...,Chico César,0.729059,0.0502,0.363636,0.682811,1.0,0.051307,0.88253,4.1e-05,0.391084,0.53198,0.499098,0.040567,0.8,3mXqOdlLE1k67WsAxryPFs,https://open.spotify.com/track/3mXqOdlLE1k67Ws...,10
4,Rosa Morena ...,Kurt Elling,0.673216,0.119,0.545455,0.655228,1.0,0.041394,0.919679,0.000348,0.10537,0.408122,0.565171,0.063684,0.8,7bSzjzjTkWT2CkIPPdp0eA,https://open.spotify.com/track/7bSzjzjTkWT2CkI...,10


In [5]:
from sklearn.metrics import pairwise_distances

# Get the cluster centers
centroids = my_kmeans.cluster_centers_

# Calculate the pairwise distances for each song to each cluster centroid
all_distances = pairwise_distances(songs_df, centroids, metric='euclidean')

def summarize_clusters(songs_df, centroids, all_distances):
   
    clusters_summary = pd.DataFrame()

    for cluster_num in range(len(centroids)):
        # Filter songs in the current cluster
        cluster_songs = songs_df[songs_df['cluster'] == cluster_num]
        
        # Calculate distances for the current cluster's songs
        indices = [songs_df.index.get_loc(idx) for idx in cluster_songs.index]
        cluster_distances = all_distances[indices, cluster_num]
        cluster_songs = cluster_songs.copy()  # Make a copy to avoid SettingWithCopyWarning
        cluster_songs['distance_to_centroid'] = cluster_distances
        
        # Calculate mean of the specified features for the cluster
        means = cluster_songs[['danceability', 'valence', 'acousticness', 'energy', 'instrumentalness']].mean()
        
        # Find the closest and farthest songs
        closest_song = cluster_songs.nsmallest(1, 'distance_to_centroid')
        farthest_song = cluster_songs.nlargest(1, 'distance_to_centroid')
        # Prepare the summary for this cluster
        summary_data = {
            'Cluster': cluster_num,
            'Mean Danceability': means['danceability'],
            'Mean Valence': means['valence'],
            'Mean Acousticness': means['acousticness'],
            'Mean Energy': means['energy'],
            'Mean Instrumentalness': means['instrumentalness'],
            'Closest Song Name': closest_song.index.get_level_values('name')[0],
            'Closest Song Artist': closest_song.index.get_level_values('artist')[0],
            'Closest Song Distance': closest_song['distance_to_centroid'].iloc[0],
            'Farthest Song Name': farthest_song.index.get_level_values('name')[0],
            'Farthest Song Artist': farthest_song.index.get_level_values('artist')[0],
            'Farthest Song Distance': farthest_song['distance_to_centroid'].iloc[0]
        }
        
        # Append the summary data to the clusters_summary DataFrame
        clusters_summary = pd.concat([clusters_summary, pd.DataFrame([summary_data])], ignore_index=True)

    return clusters_summary

clusters_summary = summarize_clusters(songs_df, centroids, all_distances)
clusters_summary


ValueError: could not convert string to float: 'Se Eu Quiser Falar Com Deus                                                                                                                                                      '