In [39]:
# import libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from matplotlib.colors import LinearSegmentedColormap
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, adjusted_rand_score, normalized_mutual_info_score
from sklearn.impute import KNNImputer, SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import cluster, datasets
!pip install kneed
from kneed import KneeLocator



In [41]:
# Read the Billboard top-songs CSV (path relative to the notebook's working
# directory) into a DataFrame, then show it as the cell's output.
songs_billboard = "df_song_dict_billboard.csv"
df_songs_billboard = pd.read_csv(filepath_or_buffer=songs_billboard)
df_songs_billboard

Unnamed: 0,title,artist,rank
0,Luther,Kendrick Lamar & SZA,1
1,Die With A Smile,Lady Gaga & Bruno Mars,2
2,Not Like Us,Kendrick Lamar,3
3,TV Off,Kendrick Lamar Featuring Lefty Gunplay,4
4,A Bar Song (Tipsy),Shaboozey,5
...,...,...,...
95,No Pole,Don Toliver,96
96,Small Town Fame,Drake,97
97,Que Pasaria...,Rauw Alejandro & Bad Bunny,98
98,Greenlight,Tate McRae,99


In [43]:
# Read the cleaned audio-features CSV (path relative to the notebook's working
# directory) into a DataFrame, then show it as the cell's output.
audio_features = "df_audio_features_clean.csv"
df_audio_features = pd.read_csv(filepath_or_buffer=audio_features)
df_audio_features

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.4430,2,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90455,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,False,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5
90456,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,False,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4
90457,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,False,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4
90458,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,False,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4


In [45]:
# Summary of the data: one row per column of df_audio_features, reporting its
# dtype, non-null / null counts, percentage of nulls, and number of distinct values.
# Each dict key appears exactly once; a repeated key in a dict literal would be
# silently overwritten by its last occurrence.
column_summary_df_audio_features = pd.DataFrame({
    'Column Name': df_audio_features.columns,
    'Data Type': df_audio_features.dtypes.values,
    'Non-Null Count': df_audio_features.notnull().sum().values,
    'Null Count': df_audio_features.isnull().sum().values,
    # mean of the boolean null mask is the fraction of nulls; scale to percent
    'Percentage Null': df_audio_features.isnull().mean().values * 100,
    'Unique Values': df_audio_features.nunique().values
})

print(column_summary_df_audio_features)

         Column Name Data Type  Non-Null Count  Null Count  Percentage Null  \
0           track_id    object           90460           0              0.0   
1            artists    object           90460           0              0.0   
2         album_name    object           90460           0              0.0   
3         track_name    object           90460           0              0.0   
4         popularity     int64           90460           0              0.0   
5        duration_ms     int64           90460           0              0.0   
6           explicit      bool           90460           0              0.0   
7       danceability   float64           90460           0              0.0   
8             energy   float64           90460           0              0.0   
9                key     int64           90460           0              0.0   
10          loudness   float64           90460           0              0.0   
11              mode     int64           90460      