# Clustering Music Genres

In [19]:
import pandas as pd
import numpy as np
from sklearn import cluster

data = pd.read_csv(r"C:\Users\bhara\Downloads\VK/Spotify-2000.csv")
print(data.head())

   Index                   Title             Artist            Top Genre  \
0      1                 Sunrise        Norah Jones      adult standards   
1      2             Black Night        Deep Purple           album rock   
2      3          Clint Eastwood           Gorillaz  alternative hip hop   
3      4           The Pretender       Foo Fighters    alternative metal   
4      5  Waitin' On A Sunny Day  Bruce Springsteen         classic rock   

   Year  Beats Per Minute (BPM)  Energy  Danceability  Loudness (dB)  \
0  2004                     157      30            53            -14   
1  2000                     135      79            50            -11   
2  2001                     168      69            66             -9   
3  2007                     173      96            43             -4   
4  2002                     106      82            58             -5   

   Liveness  Valence Length (Duration)  Acousticness  Speechiness  Popularity  
0        11       68          

In [20]:
data.isnull().sum()

Index                     0
Title                     0
Artist                    0
Top Genre                 0
Year                      0
Beats Per Minute (BPM)    0
Energy                    0
Danceability              0
Loudness (dB)             0
Liveness                  0
Valence                   0
Length (Duration)         0
Acousticness              0
Speechiness               0
Popularity                0
dtype: int64

In [21]:
#Any drop_duplicates values from the data frame
data.drop_duplicates().any()

Index                     True
Title                     True
Artist                    True
Top Genre                 True
Year                      True
Beats Per Minute (BPM)    True
Energy                    True
Danceability              True
Loudness (dB)             True
Liveness                  True
Valence                   True
Length (Duration)         True
Acousticness              True
Speechiness               True
Popularity                True
dtype: bool

In [22]:
data = data.drop("Index", axis=1)

In [23]:
#the correlation between all the audio features in the dataset
print(data.dtypes)


Title                     object
Artist                    object
Top Genre                 object
Year                       int64
Beats Per Minute (BPM)     int64
Energy                     int64
Danceability               int64
Loudness (dB)              int64
Liveness                   int64
Valence                    int64
Length (Duration)         object
Acousticness               int64
Speechiness                int64
Popularity                 int64
dtype: object


Clustering Analysis of Audio Features

In [24]:
data2 = data[["Beats Per Minute (BPM)", "Loudness (dB)", 
              "Liveness", "Valence", "Acousticness", 
              "Speechiness"]]

from sklearn.preprocessing import MinMaxScaler
for i in data.columns:
    MinMaxScaler(i)

In [25]:
#The KMeans class in the sklearn module can be used to cluster data into a specified number of clusters.The code then fits the K-Means model to the audio features
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(data2)

In [26]:
print(data.head())

                    Title             Artist            Top Genre  Year  \
0                 Sunrise        Norah Jones      adult standards  2004   
1             Black Night        Deep Purple           album rock  2000   
2          Clint Eastwood           Gorillaz  alternative hip hop  2001   
3           The Pretender       Foo Fighters    alternative metal  2007   
4  Waitin' On A Sunny Day  Bruce Springsteen         classic rock  2002   

   Beats Per Minute (BPM)  Energy  Danceability  Loudness (dB)  Liveness  \
0                     157      30            53            -14        11   
1                     135      79            50            -11        17   
2                     168      69            66             -9         7   
3                     173      96            43             -4         3   
4                     106      82            58             -5        10   

   Valence Length (Duration)  Acousticness  Speechiness  Popularity  
0       68            

In [27]:
data[['Beats Per Minute (BPM)']]

Unnamed: 0,Beats Per Minute (BPM)
0,157
1,135
2,168
3,173
4,106
...,...
1989,94
1990,175
1991,168
1992,174


In [28]:
import plotly
!pip install --upgrade nbformat




In [29]:
import plotly.graph_objects as go

PLOT = go.Figure()

for i in data["Top Genre"].unique():
    PLOT.add_trace(go.Scatter3d(
        x=data[data["Top Genre"] == i]['Beats Per Minute (BPM)'],
        y=data[data["Top Genre"] == i]['Energy'],
        z=data[data["Top Genre"] == i]['Danceability'],
        mode='markers',
        marker=dict(size=6, line=dict(width=1)),  # Corrected marker syntax
        name=str(i)
    ))

PLOT.update_traces(hovertemplate='Beats Per Minute (BPM): %{x} <br>Energy: %{y} <br>Danceability: %{z}')

PLOT.update_layout(
    width=800, height=800, autosize=True, showlegend=True,
    scene=dict(
        xaxis=dict(title=dict(text='Beats Per Minute (BPM)', font=dict(color='black'))),
        yaxis=dict(title=dict(text='Energy', font=dict(color='black'))),
        zaxis=dict(title=dict(text='Danceability', font=dict(color='black')))
    ),
    font=dict(family="Gilroy", color='black', size=12)
)

PLOT.show(renderer="browser")

