In [6]:
from utils.general import *
from utils.billboard_data import get_chart
from utils.spotify import get_song_data
from utils.youtube import *
from utils.audio import extract_audio_features

import pandas as pd
import numpy as np

import animation
import os
import glob
import json
import wget
import time
from math import floor

import warnings

# import matplotlib.pyplot as plt
# import cupy as cp
# import cudf

In [7]:
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter (no category/module given), so
# deprecation and dtype warnings raised by the cells below are hidden too.
warnings.filterwarnings('ignore')

In [8]:
@animation.wait('spinner', text='Fetching Billboard Charts', speed=0.2)
def get_billboard_data(start_year, end_year, chart_title='hot-100', output_dir=os.getcwd()):
    """Fetch weekly charts from ``end_year`` back to ``start_year`` and save them as CSVs.

    One ``{chart_title}_{week}.csv`` file is written per week into
    ``{output_dir}/{chart_title}``, next to a one-row ``state.csv`` holding the
    running totals (last week fetched, last/next song id, number of weeks and
    number of songs).

    Args:
        start_year: Earliest year to fetch (inclusive).
        end_year: Latest year to fetch (inclusive); years are walked downwards.
        chart_title: Chart identifier forwarded to ``get_chart``; also used as
            the sub-directory name and the file-name prefix.
        output_dir: Parent directory for the chart folder.
    """
    tracked_fields = ['last_date', 'last_id', 'next_id', 'num_weeks', 'num_songs']
    output_path = mkdir(f"{output_dir}/{chart_title}")
    state_file = f'{output_path}/state.csv'

    current_year = end_year
    while current_year >= start_year:
        # presumably yields one date per week of the year; the meaning of the
        # literal 4 is defined by the helper -- TODO confirm
        for week in all_day_in_year(4, current_year):
            # Reload the persisted counters on every pass so the totals keep
            # accumulating across separate runs.
            if os.path.exists(state_file):
                state_df = pd.read_csv(state_file)
            else:
                state_df = pd.DataFrame(data=[[None, 0, 0, 0, 0]], columns=tracked_fields)

            counters = state_df.iloc[0]
            first_id = counters['next_id']
            weeks_so_far = counters['num_weeks']
            songs_so_far = counters['num_songs']

            chart_df, last_id = get_chart(chart_title=chart_title, week=week, starting_id=first_id)

            refreshed = pd.DataFrame(
                data=[[week, last_id - 1, last_id, weeks_so_far + 1, songs_so_far + (last_id - first_id)]],
                columns=tracked_fields,
            )
            state_df.update(refreshed)
            state_df.to_csv(f"{output_path}/state.csv", index=False)
            chart_df.to_csv(f"{output_path}/{chart_title}_{week}.csv", index_label='title')
        current_year -= 1

    print(f"Data written to {output_path}")

In [10]:
def _song_metadata_frame(song, song_data, columns, preview_audio_dir, preview_format):
    """Build the one-row song-metadata frame for a single chart entry.

    When ``song_data`` is None only the Billboard title is filled in and every
    other column is None. Otherwise the preview clip (if a preview URL is
    present) is downloaded into ``preview_audio_dir`` and its file name stored.
    """
    if song_data is None:
        return pd.DataFrame(data=[[song['title']] + [None] * (len(columns) - 1)], columns=columns)

    artist_popularity = song_data['artist_popularity']
    # An empty popularity list would otherwise raise ZeroDivisionError.
    if len(artist_popularity):
        popularity_mean = sum(artist_popularity) / len(artist_popularity)
    else:
        popularity_mean = None

    row = [
        song['title'],
        song_data['name'],
        song['artist'],
        song_data['duration_ms'],
        song_data['id'],
        song_data['uri'],
        song_data['external_urls']['spotify'],
        song_data['popularity'],
        artist_popularity,
        popularity_mean,
        song_data['explicit'],
        song_data['preview_url'],
    ]

    preview_file = None
    if song_data['preview_url']:
        target = f"{preview_audio_dir}/{remove_punctuation(song['title'])}.{preview_format}"
        # Store only the file name; consumers join it with their own directory.
        preview_file = os.path.basename(wget.download(song_data['preview_url'], out=target))

    # "not_fetched" / -1 are placeholders until fetch_audio_data fills them in.
    row += [preview_file, "not_fetched", -1, song_data['artist_genres']]
    return pd.DataFrame(data=[row], columns=columns)


def _audio_features_frame(audio_features, columns):
    """Build the one-row audio-features frame (all None when no features were returned)."""
    if audio_features is None:
        values = [None] * len(columns)
    else:
        values = [audio_features[column] for column in columns]
    return pd.DataFrame(data=[values], columns=columns)


def _audio_analysis_frame(song_title, audio_analysis, audio_analysis_dir, columns):
    """Dump the audio analysis to ``<title>.json`` and return a one-row frame naming that file."""
    if audio_analysis is None:
        return pd.DataFrame(data=[[None] * len(columns)], columns=columns)
    file_stem = remove_punctuation(song_title)
    with open(f"{audio_analysis_dir}/{file_stem}.json", "w+", encoding="utf-8") as json_file:
        json.dump(audio_analysis, json_file, ensure_ascii=False, indent=4)
    return pd.DataFrame(data=[[f"{file_stem}.json"]], columns=columns)


@animation.wait('spinner', text='Fetching Spotify Song Data', speed=0.2)
def fetch_spotify_songs(song_dir, glob_pattern='*', output_dir='./', output_file='songs.csv', audio_analysis_dir=None, preview_audio_dir='../data/audio/previews', preview_format="m4a", verbose=False):
    """Look up every charted song on Spotify and append the results to a CSV.

    Each file matching ``{song_dir}/{glob_pattern}`` is read as a chart CSV with
    ``title`` and ``artist`` columns. Songs whose title is not yet present in
    ``{output_dir}/{output_file}`` are passed to ``get_song_data``; the returned
    metadata, audio features and audio analysis are merged into one row and the
    CSV is rewritten after every song so an interrupted run keeps its progress.
    A per-song success record is appended to ``{output_dir}/state.csv``.

    Args:
        song_dir: Directory holding the chart CSV files.
        glob_pattern: Glob used to select chart files inside ``song_dir``.
        output_dir: Directory for ``output_file`` and ``state.csv``.
        output_file: Name of the accumulated songs CSV.
        audio_analysis_dir: Where audio-analysis JSON files go; defaults to
            ``{output_dir}/audio_analysis``.
        preview_audio_dir: Where downloaded preview clips are stored.
        preview_format: File extension given to downloaded previews.
        verbose: When True, print a line for every song that is skipped.
    """
    state_cols = ['billboard_name', 'spotify_name', 'song_data', 'audio_features', 'audio_analysis']
    spotify_song_cols = ['billboard_name', 'spotify_name', 'artist', 'duration_ms', 'spotify_id', 'spotify_uri', 'spotify_external_url', 'spotify_popularity', 'spotify_artist_popularity', 'spotify_artist_popularity_mean', 'explicit', 'preview_url', 'preview_url_audio', 'full_audio', 'full_audio_duration_s', 'artist_genres']
    audio_feature_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
    audio_analysis_cols = ['audio_analysis_file']

    song_dir = os.path.abspath(song_dir)
    audio_analysis_dir = os.path.abspath(audio_analysis_dir) if audio_analysis_dir is not None else f"{output_dir}/audio_analysis"
    audio_analysis_dir = mkdir(audio_analysis_dir)
    path_glob = glob.glob(f"{song_dir}/{glob_pattern}")
    output_dir = mkdir(output_dir)
    preview_audio_dir = mkdir(preview_audio_dir)

    for file in path_glob:
        weeks_songs = pd.read_csv(file)
        spotify_songs_df = open_or_create_csv(f'{output_dir}/{output_file}', spotify_song_cols)
        for _, song in weeks_songs.iterrows():
            # NOTE(review): songs are de-duplicated by title only, so two
            # different songs sharing a title are treated as the same entry.
            spotify_song = spotify_songs_df.loc[spotify_songs_df['billboard_name'] == song['title']]
            if not spotify_song.empty:
                if verbose:
                    print(f"{spotify_song.iloc[0]['billboard_name']} - {song['artist']} skipped...")
                continue

            song_data, audio_features, audio_analysis = get_song_data(song['title'], artist=song['artist'])

            # One record per song: which of the three lookups succeeded.
            state = [
                song['title'],
                song_data['name'] if song_data is not None else None,
                song_data is not None,
                audio_features is not None,
                audio_analysis is not None,
            ]

            s_df = _song_metadata_frame(song, song_data, spotify_song_cols, preview_audio_dir, preview_format)
            af_df = _audio_features_frame(audio_features, audio_feature_cols)
            aa_df = _audio_analysis_frame(song['title'], audio_analysis, audio_analysis_dir, audio_analysis_cols)

            s_df = pd.concat([s_df, af_df, aa_df], axis=1)
            spotify_songs_df = pd.concat([spotify_songs_df, s_df])
            # Persist after every song so an interrupted run can resume.
            spotify_songs_df.to_csv(f'{output_dir}/{output_file}', index=False)
            state_df = open_or_create_csv(f'{output_dir}/state.csv', cols=state_cols)
            state_df = pd.concat([state_df, pd.DataFrame(data=[state], columns=state_cols)])
            state_df.to_csv(f'{output_dir}/state.csv', index=False)
    print(f"Data written to {output_file}")

In [11]:
@animation.wait('spinner', text='Fetching Audio Files', speed=0.2)
def fetch_audio_data(songs_csv_filepath="../data/spotify/songs.csv", output_dir="../data/audio/full"):
    """Download full-length audio for every song still marked ``"not_fetched"``.

    For each such row a YouTube search for "<title> <artist> lyrics" is run and
    the audio of the returned video id is downloaded into ``output_dir``. The
    ``full_audio`` and ``full_audio_duration_s`` columns of the songs CSV are
    updated in memory and the CSV is rewritten once at the end, whether or not
    the loop finished.

    Songs for which no video id or no audio file is returned are left as
    ``"not_fetched"`` so a later run can retry them. Any other error stops the
    loop, is reported on stdout, and the progress made so far is still saved.

    Args:
        songs_csv_filepath: Path of the songs CSV to read and update in place.
            NOTE(review): this default uses "spotify" while the other cells of
            this notebook use "spofity" -- confirm which directory exists.
        output_dir: Directory that receives the downloaded audio files.
    """
    songs_csv_filepath = os.path.abspath(songs_csv_filepath)
    output_dir = mkdir(output_dir)
    songs_df_full = pd.read_csv(songs_csv_filepath)
    songs_df = songs_df_full[['billboard_name', 'artist', 'full_audio']]
    try:
        audio_col = songs_df_full.columns.get_loc('full_audio')
        duration_col = songs_df_full.columns.get_loc('full_audio_duration_s')
        for i in range(len(songs_df)):
            song = songs_df.iloc[i]
            if song['full_audio'] != "not_fetched":
                continue
            query_string = f"{song['billboard_name']} {song['artist']} lyrics"
            yt_id = yt_query(query_string, all_ids=False)
            if yt_id is None:
                # No search result: keep the placeholder instead of failing on None.
                continue
            audio_file, duration_s = yt_download_audio(yt_id, output_dir=output_dir)
            if audio_file is None:
                continue
            # Store only the file name; consumers join it with their own directory.
            songs_df_full.iat[i, audio_col] = os.path.basename(audio_file)
            songs_df_full.iat[i, duration_col] = duration_s
    except Exception as exc:
        # Best effort: keep what was fetched, but do not hide why the run stopped.
        print(f"Audio download stopped early: {type(exc).__name__}: {exc}")
    finally:
        songs_df_full.to_csv(songs_csv_filepath, index=False)
    print(f"Data written to {output_dir}")

### Features to extract (modelled on the GTZAN dataset):
0. length of analysed segments
1. Chroma STFT (short-time Fourier transform)
2. rms (root mean square)
3. spectral centroid
4. spectral bandwidth
5. rolloff
6. zero crossing rate
7. harmony
8. perceptr (percussive component)
9. tempo
10. mfccs

In [12]:
@animation.wait('spinner', text='Fetching Audio Features', speed=0.2)
def fetch_audio_features(song_path='../data/spofity/songs.csv', output_path='../data/audio/audio_features_full.csv', audio_dir='../data/audio/full', audio_type='full', batch_size=None):
    """Extract audio features for songs not processed yet and append them to a CSV.

    The audio file name of each song is read from the ``full_audio`` or
    ``preview_url_audio`` column of the songs CSV (depending on ``audio_type``).
    Files already listed in ``{audio_type}_state.csv`` (kept next to
    ``output_path``) are skipped, as are rows with no file name or with the
    ``"not_fetched"`` placeholder. Both the output CSV and the state CSV are
    rewritten after every song so an interrupted run can resume.

    Args:
        song_path: Path of the songs CSV.
            NOTE(review): "spofity" looks like a typo for "spotify", but the
            calls in this notebook use the same spelling -- confirm against the
            directory on disk before renaming.
        output_path: CSV file that accumulates the extracted features.
        audio_dir: Directory containing the audio files.
        audio_type: ``'full'`` or ``'preview'``; selects the file-name column
            and the state-file name.
        batch_size: Maximum number of songs to process in this call; ``None``
            (or 0) processes every remaining song.

    Returns:
        The accumulated features table as re-read from ``output_path``.

    Raises:
        AssertionError: If ``audio_type`` is neither ``'full'`` nor ``'preview'``.
    """
    assert audio_type in ('full', 'preview'), 'audio_type must either be full or preview'
    audio_field = "full_audio" if audio_type == "full" else 'preview_url_audio'
    song_path = os.path.abspath(song_path)
    output_path = os.path.abspath(output_path)
    audio_dir = os.path.abspath(audio_dir)
    songs_df = pd.read_csv(song_path)

    status_dir = os.path.dirname(output_path)
    # The state column is called 'billboard_name' but stores audio file names.
    state_cols = ['billboard_name']
    status_file = f"{status_dir}/{audio_type}_state.csv"
    status_df = open_or_create_csv(status_file, state_cols)
    processed = set(status_df['billboard_name'])
    count = 0

    for i in range(len(songs_df)):
        song = songs_df.iloc[i]
        filename = song[audio_field]
        if pd.isna(filename) or not filename:
            continue
        # "not_fetched" is the placeholder written before any audio has been
        # downloaded; it does not name a file on disk.
        if filename == "not_fetched" or filename in processed:
            continue
        filepath = f"{audio_dir}/{filename}"
        audio_features, cols = extract_audio_features(filepath, song_name=song['billboard_name'])
        # Re-read, append and rewrite so every processed song is persisted immediately.
        output_df = open_or_create_csv(output_path, cols)
        output_df = pd.concat([output_df, audio_features])
        output_df.to_csv(output_path, index=False)
        status_df = pd.concat([status_df, pd.DataFrame(data=[[filename]], columns=state_cols)])
        status_df.to_csv(status_file, index=False)
        processed.add(filename)
        count += 1
        if batch_size and count >= batch_size:
            break
    return open_or_create_csv(output_path, cols=[])

In [None]:
# start_time = time.time()
# get_billboard_data(2020, 2021, output_dir="../data/billboard")
# end_time = time.time() - start_time
# print(end_time)

In [16]:
# merge_csvs_in_path('../data/billboard/hot-100', glob_pattern='hot-100_*.csv', output_path='../data/billboard', output_filename='hot-100_all')

In [None]:
# fetch_spotify_songs(song_dir="../data/billboard/hot-100/", glob_pattern="hot-100_*.csv", output_dir='../data/spofity', output_file='songs.csv', audio_analysis_dir='../data/spofity/audio_analysis', preview_audio_dir="../data/audio/previews")

In [None]:
# start_time = time.time()
# fetch_audio_data("../data/spofity/songs.csv", output_dir="../data/audio/full")
# execution_time = time.time() - start_time
# print(f"{execution_time} seconds\n{floor(execution_time/60)} mins, {execution_time%60} seconds")
# execution time to get Billboard charts, Spotify data and YouTube audio data for 2020 and 2021
#                = 4776.834360599518 seconds
#                = 79 mins, 36.83436059951782 seconds

In [14]:
# Process at most one not-yet-processed song (batch_size=1) and display the
# accumulated feature table that the function returns.
fetch_audio_features('../data/spofity/songs.csv', batch_size=1)

[Kching Audio Features	


Unnamed: 0,billboard_name,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,All I Want For Christmas Is You,Mariah Carey All I Want For Christmas Is You ...,241.696508,0.317660,0.091317,0.160702,0.005551,2963.679826,4.258230e+05,2887.388904,...,1.619618,99.735657,-3.865835,99.024666,2.729783,112.219582,-7.488522,122.569649,2.323859,141.572556
1,Rockin' Around The Christmas Tree,Brenda Lee Rockin Around The Christmas Tree O...,127.523991,0.333602,0.087056,0.099516,0.002398,2337.289099,3.094718e+05,2206.637701,...,-1.039626,78.420586,-4.437555,55.536427,3.890496,70.359535,0.014326,77.899239,6.889563,93.610161
2,Jingle Bell Rock,Bobby Helms Jingle Bell Rock Lyrics.m4a,131.169524,0.299308,0.089412,0.129217,0.002749,2307.602960,5.159224e+05,2374.071872,...,2.863054,54.716408,-3.496011,67.803917,3.036057,54.096622,-4.979988,54.835514,1.543585,60.288670
3,A Holly Jolly Christmas,Burl Ives Holly Jolly Christmas Lyrics.m4a,153.553560,0.328846,0.089120,0.069804,0.000859,2200.466281,2.688014e+05,2378.395310,...,-1.355817,60.197353,-6.695084,52.782776,-4.325858,66.221947,-3.533713,50.849602,-1.266797,90.991325
4,Circles,Post Malone Circles Lyrics.m4a,215.411519,0.340946,0.093090,0.329962,0.017911,1821.167852,6.991870e+05,2252.989888,...,2.563944,78.141327,-12.359889,83.661438,4.207565,65.643173,-5.280680,54.441189,-0.751733,59.799530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Loyal,PARTYNEXTDOOR Drake Loyal LYRICS.m4a,198.089433,0.388931,0.083847,0.230641,0.019024,1538.706183,1.033882e+06,1896.777562,...,-9.299193,131.676193,-14.050437,115.656837,-6.608180,92.221558,-10.323092,92.573296,-7.643232,81.850471
196,KEII,Anuel AA Keii LetraLyrics.m4a,239.699592,0.326721,0.092567,0.273150,0.017297,2030.518374,5.556884e+05,2266.121330,...,9.003270,100.250137,-4.268690,80.482101,-0.368161,76.360161,-4.739402,66.883820,-0.469320,78.236832
197,Believe,Meek Mill Justin Timberlake Believe Lyrics.m4a,211.440907,0.381199,0.093838,0.278260,0.011884,2254.896524,8.066539e+05,2483.954274,...,1.469345,70.120537,-6.670409,77.969902,4.237351,65.954094,-5.991199,67.876350,1.282665,77.939034
198,Whats Poppin,Jack Harlow WHATS POPPIN feat Dababy Tory Lan...,227.694875,0.429759,0.095006,0.154810,0.007709,3536.144878,2.245250e+06,2906.832664,...,0.405196,85.309494,-6.410496,91.954247,2.467787,86.325760,-4.378901,74.595940,-1.017326,73.496964
