<a href="https://colab.research.google.com/github/RepoNotAvailable/Machine-Learning-/blob/main/Clustering_Music_Genres_with_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn import cluster

In [3]:
data = pd.read_csv("Spotify-2000.csv")
print(data.head())

   Index                   Title             Artist            Top Genre  \
0      1                 Sunrise        Norah Jones      adult standards   
1      2             Black Night        Deep Purple           album rock   
2      3          Clint Eastwood           Gorillaz  alternative hip hop   
3      4           The Pretender       Foo Fighters    alternative metal   
4      5  Waitin' On A Sunny Day  Bruce Springsteen         classic rock   

   Year  Beats Per Minute (BPM)  Energy  Danceability  Loudness (dB)  \
0  2004                     157      30            53            -14   
1  2000                     135      79            50            -11   
2  2001                     168      69            66             -9   
3  2007                     173      96            43             -4   
4  2002                     106      82            58             -5   

   Liveness  Valence Length (Duration)  Acousticness  Speechiness  Popularity  
0        11       68          

In [4]:
data=data.drop("Index", axis=1)

In [5]:
# Convert relevant columns to numeric, handling errors
for col in data.columns:
    if data[col].dtype == 'object':  # Check if column is of object type (likely string)
        try:
            # Attempt to convert to numeric, replacing errors with NaN
            data[col] = pd.to_numeric(data[col], errors='coerce')
        except ValueError:
            print(f"Could not convert column '{col}' to numeric. Skipping for correlation.")

In [6]:
print(data.corr())

                        Title  Artist  Top Genre      Year  \
Title                     NaN     NaN        NaN       NaN   
Artist                    NaN     NaN        NaN       NaN   
Top Genre                 NaN     NaN        NaN       NaN   
Year                      NaN     NaN        NaN  1.000000   
Beats Per Minute (BPM)    NaN     NaN        NaN  0.012570   
Energy                    NaN     NaN        NaN  0.147235   
Danceability              NaN     NaN        NaN  0.077493   
Loudness (dB)             NaN     NaN        NaN  0.343764   
Liveness                  NaN     NaN        NaN  0.019017   
Valence                   NaN     NaN        NaN -0.166163   
Length (Duration)         NaN     NaN        NaN -0.023915   
Acousticness              NaN     NaN        NaN -0.132946   
Speechiness               NaN     NaN        NaN  0.054097   
Popularity                NaN     NaN        NaN -0.158962   

                        Beats Per Minute (BPM)    Energy  Danceabilit

In [7]:
print(data.isnull().sum())  # Check for missing values (NaN) in each column

Title                     1993
Artist                    1994
Top Genre                 1994
Year                         0
Beats Per Minute (BPM)       0
Energy                       0
Danceability                 0
Loudness (dB)                0
Liveness                     0
Valence                      0
Length (Duration)            4
Acousticness                 0
Speechiness                  0
Popularity                   0
dtype: int64


In [8]:
print(data.dtypes)

Title                     float64
Artist                    float64
Top Genre                 float64
Year                        int64
Beats Per Minute (BPM)      int64
Energy                      int64
Danceability                int64
Loudness (dB)               int64
Liveness                    int64
Valence                     int64
Length (Duration)         float64
Acousticness                int64
Speechiness                 int64
Popularity                  int64
dtype: object


In [9]:
for col in data.columns:
       if data[col].nunique() <= 1:  # Check for constant columns (only 1 unique value)
           print(f"Column '{col}' is constant or has zero variance.")

Column 'Title' is constant or has zero variance.
Column 'Artist' is constant or has zero variance.
Column 'Top Genre' is constant or has zero variance.


Clustering Analysis of Audio Features
#K-means clustering algorithm to find the similarities between all the audio features

In [10]:
data2 = data[["Beats Per Minute (BPM)", "Loudness (dB)",
              "Liveness", "Valence", "Acousticness",
              "Speechiness","Energy", "Danceability"]]

from sklearn.preprocessing import MinMaxScaler
for i in data.columns:
    MinMaxScaler(i)

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(data2)

In [11]:
data["Music Segments"] = clusters
MinMaxScaler(data["Music Segments"])
data["Music Segments"] = data["Music Segments"].map({1: "Cluster 1", 2:
    "Cluster 2", 3: "Cluster 3", 4: "Cluster 4", 5: "Cluster 5",
    6: "Cluster 6", 7: "Cluster 7", 8: "Cluster 8",
    9: "Cluster 9", 10: "Cluster 10"})

In [12]:
print(data.head())

   Title  Artist  Top Genre  Year  Beats Per Minute (BPM)  Energy  \
0    NaN     NaN        NaN  2004                     157      30   
1    NaN     NaN        NaN  2000                     135      79   
2    NaN     NaN        NaN  2001                     168      69   
3    NaN     NaN        NaN  2007                     173      96   
4    NaN     NaN        NaN  2002                     106      82   

   Danceability  Loudness (dB)  Liveness  Valence  Length (Duration)  \
0            53            -14        11       68              201.0   
1            50            -11        17       81              207.0   
2            66             -9         7       52              341.0   
3            43             -4         3       37              269.0   
4            58             -5        10       87              256.0   

   Acousticness  Speechiness  Popularity Music Segments  
0            94            3          71      Cluster 9  
1            17            7        

In [13]:
import plotly.graph_objects as go
PLOT = go.Figure()
for i in list(data["Music Segments"].unique()):


    PLOT.add_trace(go.Scatter3d(x = data[data["Music Segments"]== i]['Beats Per Minute (BPM)'],
                                y = data[data["Music Segments"] == i]['Energy'],
                                z = data[data["Music Segments"] == i]['Danceability'],
                                mode = 'markers',marker_size = 6, marker_line_width = 1,
                                name = str(i)))
PLOT.update_traces(hovertemplate='Beats Per Minute (BPM): %{x} <br>Energy: %{y} <br>Danceability: %{z}')


PLOT.update_layout(width = 800, height = 800, autosize = True, showlegend = True,
                   scene = dict(xaxis=dict(title = 'Beats Per Minute (BPM)', titlefont_color = 'black'),
                                yaxis=dict(title = 'Energy', titlefont_color = 'black'),
                                zaxis=dict(title = 'Danceability', titlefont_color = 'black')),
                   font = dict(family = "Gilroy", color  = 'black', size = 12))