In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import logging

from sklearn.preprocessing import StandardScaler

from scipy.stats import skew, kurtosis

from utils.general import *

import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
random_state = 42
rng = np.random.default_rng(random_state)

# Data Loading And Preprocessing

In [4]:
billboard_df = pd.read_csv("../data/billboard/hot-100_all.csv")
spotify_df = pd.read_csv("../data/spofity/songs.csv")
audio_analysis_df = pd.read_csv("../data/audio/audio_features_full.csv")

In [5]:
billboard_df.drop(axis=1, inplace=True, labels=['image', 'artist'])
billboard_df['date'] = pd.to_datetime(billboard_df['date'])
spotify_df.drop(axis=1, inplace=True, labels=['spotify_name',
                                              'artist',
                                              'artist_genres',
                                              'spotify_id',
                                              'spotify_id',
                                              'spotify_uri',
                                              'spotify_external_url',
                                              'spotify_artist_popularity',
                                              'preview_url',
                                              'preview_url_audio',
                                              'full_audio',
                                              'full_audio_duration_s'
                                              ])
audio_analysis_df.drop(axis=1, inplace=True, labels=['name'])

In [6]:
songs_df = spotify_df.merge(audio_analysis_df, how='inner', on='billboard_name')
songs_df

Unnamed: 0,billboard_name,duration_ms,spotify_popularity,spotify_artist_popularity_mean,explicit,danceability,energy,key,loudness,mode,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,All I Want For Christmas Is You,241106.0,69.0,81.000000,False,0.336,0.627,7.0,-7.463,1.0,...,1.619618,99.735660,-3.865835,99.024666,2.729783,112.219580,-7.488522,122.569650,2.323859,141.572560
1,Rockin' Around The Christmas Tree,126266.0,62.0,59.000000,False,0.589,0.472,8.0,-8.749,1.0,...,-1.039626,78.420586,-4.437555,55.536427,3.890496,70.359543,0.014326,77.899239,6.889563,93.610161
2,Jingle Bell Rock,130973.0,62.0,50.000000,False,0.754,0.424,2.0,-8.463,1.0,...,1.430321,58.685158,-4.030815,67.332291,1.802275,58.469532,-5.335912,53.423290,0.133941,58.774597
3,A Holly Jolly Christmas,135533.0,54.0,48.000000,False,0.683,0.375,0.0,-13.056,1.0,...,-1.355817,60.197350,-6.695084,52.782772,-4.325858,66.221947,-3.533713,50.849602,-1.266797,90.991325
4,Circles,215280.0,86.0,91.000000,False,0.695,0.762,0.0,-3.497,1.0,...,2.563944,78.141319,-12.359889,83.661438,4.207565,65.643173,-5.280680,54.441185,-0.751733,59.799530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418,Christmas Isn't Canceled (Just You),231549.0,41.0,77.000000,False,0.580,0.789,1.0,-4.918,0.0,...,5.321839,70.412506,1.213545,77.698616,4.693950,85.284431,-2.604682,76.687698,5.805956,77.433144
1419,Moved To Miami,222225.0,66.0,88.500000,True,0.717,0.444,1.0,-11.126,1.0,...,4.598643,180.801086,-4.373017,110.878738,4.545245,111.550697,4.918246,63.780304,8.114554,75.806396
1420,Hibachi,170413.0,69.0,86.333333,True,0.681,0.522,5.0,-8.740,0.0,...,8.091636,79.854568,-1.646704,123.382797,5.309008,98.507568,0.213412,80.767159,2.804790,73.490234
1421,Thailand,200958.0,70.0,84.000000,True,0.875,0.478,7.0,-10.562,1.0,...,10.420262,91.743813,-2.071233,75.112267,8.341298,97.730263,-0.260812,69.875168,1.959964,62.722679


In [7]:
# print(songs_df.apply(lambda x: x.nunique()))
songs_df.describe()

Unnamed: 0,duration_ms,spotify_popularity,spotify_artist_popularity_mean,danceability,energy,key,loudness,mode,speechiness,acousticness,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,...,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,200696.557976,67.404076,83.302977,0.666045,0.622016,5.153197,-6.747289,0.622628,0.139837,0.223816,...,2.398114,84.513799,-3.647905,82.115186,2.668189,80.81822,-2.539943,79.075432,1.677588,80.37177
std,50770.975125,17.661345,12.018877,0.151579,0.162841,3.60034,2.57996,0.4849,0.125819,0.253148,...,3.631675,24.471115,3.398574,24.63778,3.253561,24.665541,3.023733,24.767819,3.186344,26.740148
min,32000.0,0.0,0.0,0.15,0.0076,0.0,-33.663,0.0,0.0232,3e-06,...,-13.240079,32.670311,-18.392536,31.161884,-10.105947,31.325922,-14.229393,28.560005,-13.992401,28.597084
25%,170322.0,64.0,78.0,0.57,0.525,1.0,-7.8415,0.0,0.0428,0.03155,...,0.246056,68.009289,-5.805181,65.208897,0.716595,64.160015,-4.468847,62.009989,-0.239844,61.936447
50%,195428.0,70.0,86.0,0.68,0.633,5.0,-6.36,1.0,0.0798,0.121,...,2.442511,81.295982,-3.595455,78.402687,2.753472,77.774918,-2.510375,75.437561,1.633557,75.101379
75%,223599.0,77.0,91.0,0.776,0.7335,8.0,-5.0775,1.0,0.218,0.3215,...,4.722763,98.500587,-1.416037,94.838615,4.810408,93.11882,-0.625247,91.067223,3.652482,93.510078
max,613026.0,95.0,100.0,0.965,0.984,11.0,-1.321,1.0,0.699,0.995,...,13.629806,207.025589,6.79554,219.371109,12.521308,229.869766,7.686097,229.040588,14.772246,226.710175


In [8]:
songs_df.isna().sum()

billboard_name                    0
duration_ms                       0
spotify_popularity                0
spotify_artist_popularity_mean    0
explicit                          0
                                 ..
mfcc18_var                        0
mfcc19_mean                       0
mfcc19_var                        0
mfcc20_mean                       0
mfcc20_var                        0
Length: 76, dtype: int64

In [9]:
songs_df = pd.get_dummies(songs_df, prefix=['explicit'], columns=['explicit'])

# Testing Different Popularity Metrics

In [31]:
def popularity_metrics(df, score_type='basic'):
    metrics = ['peak_rank', 'lifetime_peak_rank', 'debut_rank', 'sensationality', 'avg_rank_score', 'std_rank_score', 'time_on_chart', 'num_occurrences', 'rank_sum', 'skewness', 'kurtosis']
    metric_vals = []

    for i in range(len(df)):
        song = df.iloc[i]
        billboard_entries = billboard_df[billboard_df['title'] == song['billboard_name']].sort_values(by='date')
        value_counts = billboard_entries['rank'].value_counts()
        ranks = value_counts.index
        rank_counts = value_counts.values


        # max rank
        peak = min(ranks)
        if score_type == 'classic':
            ranks = [rank_score_classic(peak, ra) for ra in ranks]
        elif score_type == 'score_01':
            ranks = [rank_score_01(peak, ra) for ra in ranks]
        elif score_type == 'score_02':
            ranks = [rank_score_02(peak, ra) for ra in ranks]
        else:
            ranks = [rank_score_basic(ra) for ra in ranks]

        # sensationality
        sensation = squiggle(rank_counts, ranks, scaled=True)

        # mean rank
        avg_rank = np.mean(ranks)

        # rank std
        std_rank = np.std(ranks)

        # length
        time_on_chart = np.max(billboard_entries['weeks'])

        # lifetime_peak
        lifetime_peak = np.max(billboard_entries['peakPos'])

        # debut rank
        debut_rank = billboard_entries['rank'].iloc[0]

        # number of occurrences
        num_occurrences = len(billboard_entries)

        rank_sum = sum(ranks)

        skewness = skew(ranks)

        kurt = kurtosis(ranks)

        metric_vals.append([peak, debut_rank, lifetime_peak, sensation, avg_rank, std_rank, time_on_chart, num_occurrences, rank_sum, skewness, kurt])

    return pd.DataFrame(data=metric_vals, columns=metrics)

In [32]:
pop_metrics = popularity_metrics(songs_df, 'basic')
pop_metrics.describe()

Unnamed: 0,peak_rank,lifetime_peak_rank,debut_rank,sensationality,avg_rank_score,std_rank_score,time_on_chart,num_occurrences,rank_sum,skewness,kurtosis
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,50.872804,60.858749,59.452565,0.173917,0.025285,0.01506,8.41532,7.326072,0.221581,0.450854,-1.18876
std,28.872677,28.879139,29.739192,0.264423,0.03115,0.045298,10.98796,9.710895,0.485595,0.822457,2.763996
min,1.0,1.0,1.0,0.01,0.01,0.0,1.0,1.0,0.01,-1.361391,-3.0
25%,26.0,39.0,36.5,0.014492,0.012616,0.0,1.0,1.0,0.014493,0.0,-3.0
50%,53.0,66.0,65.0,0.040918,0.015764,0.001422,2.0,2.0,0.040831,0.0,-2.0
75%,76.0,86.0,86.0,0.221774,0.02381,0.008715,14.0,11.0,0.196657,0.701393,-0.694673
max,100.0,100.0,100.0,1.0,0.409608,0.424401,90.0,87.0,3.69796,4.800006,22.324941


In [33]:
pop_metrics = popularity_metrics(songs_df, 'classic')
pop_metrics.describe()

Unnamed: 0,peak_rank,lifetime_peak_rank,debut_rank,sensationality,avg_rank_score,std_rank_score,time_on_chart,num_occurrences,rank_sum,skewness,kurtosis
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,50.872804,60.858749,59.452565,0.898278,14.673815,9.749975,8.41532,7.326072,146.21785,0.032422,-1.819822
std,28.872677,28.879139,29.739192,0.117932,15.124559,10.401953,10.98796,9.710895,217.31022,0.468476,1.306573
min,1.0,1.0,1.0,0.761594,1.0,0.0,1.0,1.0,1.0,-2.398583,-3.0
25%,26.0,39.0,36.5,0.761594,1.0,0.0,1.0,1.0,1.0,0.0,-3.0
50%,53.0,66.0,65.0,1.0,10.857143,7.537241,2.0,2.0,38.0,0.0,-2.0
75%,76.0,86.0,86.0,1.0,25.657143,17.808576,14.0,11.0,232.0,0.0,-1.025932
max,100.0,100.0,100.0,1.0,65.333333,40.0,90.0,87.0,1252.0,2.719581,7.127719


In [36]:
pop_metrics = popularity_metrics(songs_df, 'score_01')
pop_metrics.describe()

Unnamed: 0,peak_rank,lifetime_peak_rank,debut_rank,sensationality,avg_rank_score,std_rank_score,time_on_chart,num_occurrences,rank_sum,skewness,kurtosis
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,50.872804,60.858749,59.452565,0.900328,1.780139,0.575463,8.41532,7.326072,17.299562,0.415488,-1.320215
std,28.872677,28.879139,29.739192,0.10891,4.671507,3.534214,10.98796,9.710895,73.512941,0.762767,2.497012
min,1.0,1.0,1.0,0.770033,1.020408,0.0,1.0,1.0,1.020408,-1.360677,-3.0
25%,26.0,39.0,36.5,0.773848,1.026667,0.0,1.0,1.0,1.029851,0.0,-3.0
50%,53.0,66.0,65.0,0.970869,1.036364,0.001515,2.0,2.0,2.103609,0.0,-2.0
75%,76.0,86.0,86.0,1.0,1.066667,0.009947,14.0,11.0,10.965784,0.68799,-0.715733
max,100.0,100.0,100.0,1.0,65.333333,37.124415,90.0,87.0,927.0,4.636364,21.196639


In [37]:
pop_metrics = popularity_metrics(songs_df, 'score_02')
pop_metrics.describe()

Unnamed: 0,peak_rank,lifetime_peak_rank,debut_rank,sensationality,avg_rank_score,std_rank_score,time_on_chart,num_occurrences,rank_sum,skewness,kurtosis
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,50.872804,60.858749,59.452565,0.132618,0.016987,0.011042,8.41532,7.326072,0.169956,0.345212,-1.374249
std,28.872677,28.879139,29.739192,0.239601,0.030406,0.043514,10.98796,9.710895,0.450653,0.750308,2.464867
min,1.0,1.0,1.0,0.005025,0.005025,0.0,1.0,1.0,0.005025,-1.474878,-3.0
25%,26.0,39.0,36.5,0.007299,0.006536,0.0,1.0,1.0,0.007299,0.0,-3.0
50%,53.0,66.0,65.0,0.022614,0.008691,0.000406,2.0,2.0,0.022286,0.0,-2.0
75%,76.0,86.0,86.0,0.135852,0.014067,0.003357,14.0,11.0,0.120634,0.470999,-0.87388
max,100.0,100.0,100.0,1.0,0.409608,0.424401,90.0,87.0,3.69796,4.443465,19.928467


In [13]:
# data_df is whats going to be used for all
data_df = songs_df.copy()
song_billboard_names = data_df.pop('billboard_name')
song_audio_analysis_files = data_df.pop('audio_analysis_file')
data_df[data_df.columns] = StandardScaler().fit_transform(data_df)

In [14]:
# y_col = 'sensationality'
# y = data_df.pop(y_col)