In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn import tree, metrics
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from yellowbrick.classifier import ROCAUC
import torch
from torch import nn, optim
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '0' ]

In [2]:
data = pd.read_csv("musicData.csv")
data.head()

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,4-Apr,0.27,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50005 entries, 0 to 50004
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       50000 non-null  float64
 1   artist_name       50000 non-null  object 
 2   track_name        50000 non-null  object 
 3   popularity        50000 non-null  float64
 4   acousticness      50000 non-null  float64
 5   danceability      50000 non-null  float64
 6   duration_ms       50000 non-null  float64
 7   energy            50000 non-null  float64
 8   instrumentalness  50000 non-null  float64
 9   key               50000 non-null  object 
 10  liveness          50000 non-null  float64
 11  loudness          50000 non-null  float64
 12  mode              50000 non-null  object 
 13  speechiness       50000 non-null  float64
 14  tempo             50000 non-null  object 
 15  obtained_date     50000 non-null  object 
 16  valence           50000 non-null  float6

<font size="5"> <font color='blue'> Find rows that don't contain any data and discard them

In [4]:
missing = data.isna().all(axis=1)
print(data[missing])

       instance_id artist_name track_name  popularity  acousticness  \
10000          NaN         NaN        NaN         NaN           NaN   
10001          NaN         NaN        NaN         NaN           NaN   
10002          NaN         NaN        NaN         NaN           NaN   
10003          NaN         NaN        NaN         NaN           NaN   
10004          NaN         NaN        NaN         NaN           NaN   

       danceability  duration_ms  energy  instrumentalness  key  liveness  \
10000           NaN          NaN     NaN               NaN  NaN       NaN   
10001           NaN          NaN     NaN               NaN  NaN       NaN   
10002           NaN          NaN     NaN               NaN  NaN       NaN   
10003           NaN          NaN     NaN               NaN  NaN       NaN   
10004           NaN          NaN     NaN               NaN  NaN       NaN   

       loudness mode  speechiness tempo obtained_date  valence music_genre  
10000       NaN  NaN          NaN

In [5]:
data1 = data[~missing].reset_index(drop=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   instance_id       50000 non-null  float64
 1   artist_name       50000 non-null  object 
 2   track_name        50000 non-null  object 
 3   popularity        50000 non-null  float64
 4   acousticness      50000 non-null  float64
 5   danceability      50000 non-null  float64
 6   duration_ms       50000 non-null  float64
 7   energy            50000 non-null  float64
 8   instrumentalness  50000 non-null  float64
 9   key               50000 non-null  object 
 10  liveness          50000 non-null  float64
 11  loudness          50000 non-null  float64
 12  mode              50000 non-null  object 
 13  speechiness       50000 non-null  float64
 14  tempo             50000 non-null  object 
 15  obtained_date     50000 non-null  object 
 16  valence           50000 non-null  float6

<font size="5"> <font color='blue'> Count the number of songs with missing duration ("-1") or tempo data ("?")

In [6]:
print("{} songs have missing duration data.".format(sum(data1['duration_ms']==-1)))
print("{} songs have missing tempo data.".format(sum(data1['tempo']=="?")))
print("{} songs have both data missing.".format(sum((data1["duration_ms"]==-1) & (data1['tempo']=="?"))))

4939 songs have missing duration data.
4980 songs have missing tempo data.
479 songs have both data missing.


<font size="5"> <font color='blue'> Convert these values to NaN and then replace NaN with the average value of duration and tempo of songs of the same genre

In [7]:
# Replace -1 in 'duration_ms' with NaN
data1['duration_ms'] = data1['duration_ms'].replace(-1, np.nan)

# Replace '?' in 'tempo' with NaN
data1['tempo'] = data1['tempo'].replace('?', np.nan)

# Convert 'tempo' to numeric in case it is still an object type (after replacing '?')
data1['tempo'] = pd.to_numeric(data1['tempo'], errors='coerce')

# Verify that the replacement worked
print(data1[['duration_ms', 'tempo']].isna().sum())  # Check count of NaN values

duration_ms    4939
tempo          4980
dtype: int64


In [8]:
data1[data1['duration_ms'].isna()].head()

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
13,62039.0,DJ Shadow,Broken Levee Blues,31.0,0.86,0.737,,0.405,0.0361,A,0.173,-10.536,Minor,0.0424,154.745,4-Apr,0.647,Electronic
16,83926.0,San Holo,One Thing,59.0,0.136,0.336,,0.746,0.0,C#,0.737,-4.315,Minor,0.0685,151.756,4-Apr,0.21,Electronic
24,40033.0,The Prodigy,Diesel Power,56.0,0.068,0.725,,0.877,3.6e-05,C,0.09,-3.496,Major,0.0646,105.919,4-Apr,0.0977,Electronic
35,27048.0,Fabian Mazur,If U Wanted To,33.0,0.108,0.493,,0.682,0.0,A,0.196,-5.781,Minor,0.287,,4-Apr,0.239,Electronic


In [9]:
# Fill missing 'duration_ms' with the average 'duration_ms' for each genre
data1['duration_ms'] = data1.groupby('music_genre')['duration_ms'].transform(
    lambda x: x.fillna(x.mean())
)

# Fill missing 'tempo' with the average 'tempo' for each genre
data1['tempo'] = data1.groupby('music_genre')['tempo'].transform(
    lambda x: x.fillna(x.mean())
)

data1.head()

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,270703.43126,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,4-Apr,0.27,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic


In [10]:
data1[data1['music_genre']=='Electronic']['duration_ms'].mean()

270703.43125968566

In [11]:
print(data1[['duration_ms', 'tempo']].isna().sum())  # Check count of NaN values

duration_ms    0
tempo          0
dtype: int64


In [12]:
data1.head()

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,270703.43126,0.941,0.792,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.0127,0.622,218293.0,0.89,0.95,D,0.124,-7.043,Minor,0.03,115.002,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.62,215613.0,0.755,0.0118,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.0254,0.774,166875.0,0.7,0.00253,C#,0.157,-4.498,Major,0.239,128.014,4-Apr,0.27,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.909,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic


<font size="5"> <font color='blue'> Train test split: 4500 songs from each genre as the training set and 500 songs from each genre make up the testing set

In [13]:
# Group the data by 'music_genre'
grouped = data1.groupby('music_genre')

# Initialize empty lists to store train/test data
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

# Split each genre group into train/test sets
for _, group in grouped:
    dropped = ['instance_id', 'artist_name','track_name','obtained_date','music_genre']
    X_group = group.drop(columns = dropped)  # Features (all columns except 'music_genre')
    y_group = group['music_genre']  # Labels (only the 'music_genre' column)
    X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(X_group, y_group, train_size=4500, test_size=500, random_state=42)
    X_train_list.append(X_train_group)
    X_test_list.append(X_test_group)
    y_train_list.append(y_train_group)
    y_test_list.append(y_test_group)

# Concatenate the train/test sets from all genre groups
X_train = pd.concat(X_train_list)
X_test = pd.concat(X_test_list)
y_train = pd.concat(y_train_list)
y_test = pd.concat(y_test_list)

# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (45000, 13)
X_test shape: (5000, 13)
y_train shape: (45000,)
y_test shape: (5000,)


<font size="5"> <font color='blue'> Standardize the numerical non-categorical data

In [14]:
numerical_columns = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness','speechiness','tempo','valence']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X_train[numerical_columns])
X_train[numerical_columns] = scaled_data
scaled_data1 = scaler.fit_transform(X_test[numerical_columns])
X_test[numerical_columns] = scaled_data1

In [15]:
X_train.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
18716,0.371618,-0.890603,-1.658939,-0.681362,1.010771,-0.55156,G#,-0.669772,0.756035,Major,-0.447967,2.058628,0.117462
18779,-0.144277,-0.058545,-0.415942,-0.41725,1.014555,-0.556853,G,0.144285,1.056728,Minor,-0.565129,0.086139,0.060814
15135,0.565078,-0.878437,1.991665,-0.116852,0.371341,-0.555977,A,-0.845366,0.083086,Major,-0.388894,-0.716494,0.635383
19480,1.790329,-0.834788,0.793461,0.480572,0.99942,-0.556884,G,-0.608966,0.892832,Major,-0.642909,-0.580636,1.744059
16437,0.500592,-0.86293,1.96367,-0.73469,-0.30214,-0.556876,D,-0.782078,0.278139,Major,-0.324898,-0.003747,0.485672


In [16]:
X_test.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
16501,0.941997,-0.89742,0.587927,-0.636147,0.45396,-0.568783,D#,0.981737,0.593691,Major,-0.60621,0.522918,0.603908
17586,0.058772,-0.903512,0.196637,-0.203161,1.229618,-0.168284,C#,-0.465995,0.912139,Major,-0.140277,-0.631398,-0.109013
17653,0.184947,-0.895584,1.074245,-0.193192,0.165431,-0.560197,E,-0.297093,0.280269,Major,-0.403368,-0.45495,-0.469525
16055,-0.256665,-0.841238,-0.166703,1.690881,0.641317,-0.568777,F,-0.091998,0.595961,Major,-0.551985,0.336597,-0.202179
15705,0.752735,0.262019,1.035116,-0.096284,-0.745124,-0.56784,B,-0.508221,-0.520554,Major,-0.310985,-1.387317,-0.931304


In [23]:
from catboost import CatBoostClassifier
cat_features = ['key','mode']
model = CatBoostClassifier(iterations=500,cat_features=cat_features, verbose=0, random_state=42)
model.fit(X_train,y_train)
proba = model.predict_proba(X_test)
from sklearn.metrics import roc_auc_score, classification_report
test_roc_auc = roc_auc_score(y_test, proba, multi_class='ovr', average='macro')
print(test_roc_auc)

0.9352220444444445
