In [75]:
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from utils.general import squiggle

In [76]:
# Load the three raw CSV inputs: Billboard chart entries, Spotify song
# metadata, and per-song audio features.
billboard_df = pd.read_csv("../data/billboard/hot-100_all.csv")
# NOTE(review): "spofity" looks like a misspelling of "spotify" -- confirm the
# directory really has this name on disk before changing the path.
spotify_df = pd.read_csv("../data/spofity/songs.csv")
audio_analysis_df = pd.read_csv("../data/audio/audio_features_full.csv")

In [77]:
# Columns removed from each frame right after loading.
BILLBOARD_DROP_COLUMNS = ['image', 'artist']
# 'spotify_id' was listed twice in the original label list; it is named once here.
SPOTIFY_DROP_COLUMNS = [
    'spotify_name',
    'artist',
    'artist_genres',
    'spotify_id',
    'spotify_uri',
    'spotify_external_url',
    'spotify_artist_popularity',
    'preview_url',
    'preview_url_audio',
    'full_audio',
    'full_audio_duration_s',
]
AUDIO_DROP_COLUMNS = ['name']

# Rebind the names to the reduced frames rather than mutating with
# inplace=True; the resulting frames hold the same columns as before.
# A missing column still raises KeyError, so a typo in the lists is not hidden.
billboard_df = billboard_df.drop(columns=BILLBOARD_DROP_COLUMNS)
spotify_df = spotify_df.drop(columns=SPOTIFY_DROP_COLUMNS)
audio_analysis_df = audio_analysis_df.drop(columns=AUDIO_DROP_COLUMNS)

In [78]:
# Inner-join the Spotify metadata with the audio features on the song title key.
songs_df = pd.merge(spotify_df, audio_analysis_df, how='inner', on='billboard_name')

# Every column except these two is passed to the scaler.
numeric_features = songs_df.drop(columns=['billboard_name', 'audio_analysis_file'])
scaled = StandardScaler().fit_transform(numeric_features)
scaled

array([[ 0.79619602,  0.09039431, -0.19168066, ...,  1.7566952 ,
         0.20289671,  2.28952756],
       [-1.46652146, -0.30609083, -2.02277807, ..., -0.04750548,
         1.63629755,  0.4952495 ],
       [-1.37377841, -0.30609083, -2.77186338, ..., -1.03606864,
        -0.48462733, -0.80795235],
       ...,
       [-0.59668352,  0.09039431,  0.25222174, ...,  0.06832741,
         0.35388483, -0.25743892],
       [ 0.00515125,  0.14703505,  0.05801444, ..., -0.37159101,
         0.08865204, -0.66025422],
       [-0.84342691,  1.33649048, -0.44137576, ..., -1.37463759,
        -1.13589577, -1.06906583]])

In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) for the merged frame.
songs_df.describe()

Unnamed: 0,duration_ms,spotify_popularity,spotify_artist_popularity_mean,danceability,energy,key,loudness,mode,speechiness,acousticness,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,...,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,200696.557976,67.404076,83.302977,0.666045,0.622016,5.153197,-6.747289,0.622628,0.139837,0.223816,...,2.398114,84.513799,-3.647905,82.115186,2.668189,80.81822,-2.539943,79.075432,1.677588,80.37177
std,50770.975125,17.661345,12.018877,0.151579,0.162841,3.60034,2.57996,0.4849,0.125819,0.253148,...,3.631675,24.471115,3.398574,24.63778,3.253561,24.665541,3.023733,24.767819,3.186344,26.740148
min,32000.0,0.0,0.0,0.15,0.0076,0.0,-33.663,0.0,0.0232,3e-06,...,-13.240079,32.670311,-18.392536,31.161884,-10.105947,31.325922,-14.229393,28.560005,-13.992401,28.597084
25%,170322.0,64.0,78.0,0.57,0.525,1.0,-7.8415,0.0,0.0428,0.03155,...,0.246056,68.009289,-5.805181,65.208897,0.716595,64.160015,-4.468847,62.009989,-0.239844,61.936447
50%,195428.0,70.0,86.0,0.68,0.633,5.0,-6.36,1.0,0.0798,0.121,...,2.442511,81.295982,-3.595455,78.402687,2.753472,77.774918,-2.510375,75.437561,1.633557,75.101379
75%,223599.0,77.0,91.0,0.776,0.7335,8.0,-5.0775,1.0,0.218,0.3215,...,4.722763,98.500587,-1.416037,94.838615,4.810408,93.11882,-0.625247,91.067223,3.652482,93.510078
max,613026.0,95.0,100.0,0.965,0.984,11.0,-1.321,1.0,0.699,0.995,...,13.629806,207.025589,6.79554,219.371109,12.521308,229.869766,7.686097,229.040588,14.772246,226.710175


In [54]:
# Number of missing values in each column of the merged frame.
songs_df.isna().sum()

billboard_name                    0
duration_ms                       0
spotify_popularity                0
spotify_artist_popularity_mean    0
explicit                          0
                                 ..
mfcc18_var                        0
mfcc19_mean                       0
mfcc19_var                        0
mfcc20_mean                       0
mfcc20_var                        0
Length: 76, dtype: int64

In [55]:
# Replace the 'explicit' column with indicator columns prefixed 'explicit'.
songs_df = pd.get_dummies(data=songs_df, columns=['explicit'], prefix='explicit')

In [9]:
def _sensationality(title):
    """Return the `squiggle` score for one song title.

    Selects the Billboard rows whose 'title' equals `title`, counts how many
    times each chart rank occurs among them, and passes the counts and the
    corresponding ranks to `squiggle` with ``scaled=True``.

    NOTE(review): rows are matched on title alone, so different songs that
    share a title would be pooled together -- confirm this is acceptable.
    """
    occurrences = billboard_df.loc[billboard_df['title'] == title, 'rank'].value_counts()
    return squiggle(occurrences.values, occurrences.index, scaled=True)


# One score per song, in the row order of songs_df.
sens = [_sensationality(title) for title in songs_df['billboard_name']]
songs_df['sensationality'] = np.array(sens)
songs_df['sensationality']

0       0.999999
1       0.998958
2       0.981122
3       0.931350
4       0.999999
          ...   
1418    0.012658
1419    0.011764
1420    0.010989
1421    0.010526
1422    0.010309
Name: sensationality, Length: 1423, dtype: float64

In [56]:
# Load one track and compute its MFCC matrix.
# (`librosa` is imported in the notebook's top import cell.)
y, sr = librosa.load('../data/audio/full/Ariana Grande  obvious  Lyrics.wav')

# Pass the signal by keyword: the positional form used previously emits a
# warning on older librosa releases and is rejected by librosa >= 0.10,
# where `y` is keyword-only.
mfccs = librosa.feature.mfcc(y=y, sr=sr)
mfccs

  return f(*args, **kwargs)
  mfccs = librosa.feature.mfcc(y, sr=sr)


array([[-479.9394, -479.9394, -479.9394, ..., -479.9394, -479.9394,
        -479.9394],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       ...,
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ],
       [   0.    ,    0.    ,    0.    , ...,    0.    ,    0.    ,
           0.    ]], dtype=float32)