In [1]:
from utils.general import *
from utils.billboard_data import get_chart
from utils.spotify import get_song_data
from utils.youtube import *
from utils.audio import extract_audio_features

import pandas as pd

import animation
import os
import glob
import json
import wget

import warnings

# import matplotlib.pyplot as plt
# import cupy as cp
# import cudf

In [2]:
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this also hides deprecation/dtype warnings raised by pandas and
# the audio libraries used below; consider narrowing the filter to the specific
# categories that are known to be noisy.
warnings.filterwarnings('ignore')

In [3]:
@animation.wait('spinner', text='Fetching Billboard Charts', speed=0.2)
@execution_time(round_to=2)
def get_billboard_data(start_year, end_year, chart_title='hot-100', output_dir=os.getcwd()):
    """Download one chart per week, newest year first, and save each as a CSV.

    Years are walked from ``end_year`` down to ``start_year`` (both inclusive).
    For every value yielded by ``all_day_in_year(4, year)`` the chart is fetched
    with ``get_chart`` and written to ``<output_dir>/<chart_title>/<chart_title>_<week>.csv``.
    A one-row ``state.csv`` in the same folder is re-read and rewritten on every
    iteration so the running counters survive an interrupted run.

    Args:
        start_year: First (oldest) year to fetch, as an integer.
        end_year: Last (newest) year to fetch, as an integer.
        chart_title: Chart identifier passed to ``get_chart`` and used in file names.
        output_dir: Parent directory of the chart folder. NOTE: the default is
            evaluated once, when this function is defined, not at call time.

    Returns:
        None. Progress is reported with a final ``print``.
    """
    tracked_fields = ['last_date', 'last_id', 'next_id', 'num_weeks', 'num_songs']
    output_path = mkdir(f"{output_dir}/{chart_title}")
    state_file = f'{output_path}/state.csv'

    for year in range(end_year, start_year - 1, -1):
        # presumably every occurrence of weekday index 4 in `year` — TODO confirm in utils.general
        for week in all_day_in_year(4, year):
            # Reload the checkpoint each time so the ids continue from the last saved week.
            if os.path.exists(state_file):
                state_df = pd.read_csv(state_file)
            else:
                state_df = pd.DataFrame(data=[[None, 0, 0, 0, 0]], columns=tracked_fields)

            checkpoint = state_df.iloc[0]
            next_id = checkpoint['next_id']
            num_weeks = checkpoint['num_weeks']
            num_songs = checkpoint['num_songs']

            chart_df, last_id = get_chart(chart_title=chart_title, week=week, starting_id=next_id)

            refreshed = pd.DataFrame(
                data=[[week, last_id - 1, last_id, num_weeks + 1, num_songs + (last_id - next_id)]],
                columns=tracked_fields,
            )
            state_df.update(refreshed)
            state_df.to_csv(state_file, index=False)
            chart_df.to_csv(f"{output_path}/{chart_title}_{week}.csv", index_label='title')

    print(f"Data written to {output_path}")

In [4]:
@animation.wait('spinner', text='Fetching Spotify Song Data', speed=0.2)
@execution_time(round_to=2)
def fetch_spotify_songs(song_dir, glob_pattern='*', output_dir='./', output_file='songs.csv', audio_analysis_dir=None, preview_audio_dir='../data/audio/previews', preview_format="wav", verbose=False):
    """Look up charted songs on Spotify and append one row per new song to a CSV.

    Every CSV in ``song_dir`` matching ``glob_pattern`` is read; its ``title`` and
    ``artist`` columns are passed to ``get_song_data``. For each title not yet
    present in the output CSV a row is appended containing the song metadata,
    the audio features and the name of the audio-analysis JSON file. The output
    CSV and a ``state.csv`` log are rewritten after every song so an interrupted
    run can resume where it stopped.

    Args:
        song_dir: Directory holding the weekly chart CSVs.
        glob_pattern: Glob used to select chart files inside ``song_dir``.
        output_dir: Directory that receives ``output_file`` and ``state.csv``.
        output_file: File name of the combined songs CSV.
        audio_analysis_dir: Directory for audio-analysis JSON files; defaults to
            ``<output_dir>/audio_analysis``.
        preview_audio_dir: Directory the preview clips are downloaded into.
        preview_format: File extension given to downloaded preview clips.
        verbose: When true, print a line for every song that is skipped because
            it is already in the output CSV.

    Returns:
        None. Progress is reported with a final ``print``.
    """
    state_cols = ['billboard_name', 'spotify_name', 'song_data', 'audio_features', 'audio_analysis']

    spotify_song_cols = ['billboard_name', 'spotify_name', 'artist', 'duration_ms', 'spotify_id', 'spotify_uri', 'spotify_external_url', 'spotify_popularity', 'spotify_artist_popularity', 'spotify_artist_popularity_mean', 'explicit', 'preview_url', 'preview_url_audio', 'full_audio', 'full_audio_duration_s', 'artist_genres']

    audio_feature_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

    audio_analysis_cols = ['audio_analysis_file']

    row_cols = spotify_song_cols + audio_feature_cols + audio_analysis_cols

    song_dir = os.path.abspath(song_dir)
    audio_analysis_dir = os.path.abspath(audio_analysis_dir) if audio_analysis_dir is not None else f"{output_dir}/audio_analysis"
    audio_analysis_dir = mkdir(audio_analysis_dir)
    path_glob = glob.glob(f"{song_dir}/{glob_pattern}")
    output_dir = mkdir(output_dir)
    preview_audio_dir = mkdir(preview_audio_dir)

    songs_csv = f'{output_dir}/{output_file}'
    state_csv = f'{output_dir}/state.csv'

    for file in path_glob:
        weeks_songs = pd.read_csv(file)
        spotify_songs_df = open_or_create_csv(songs_csv, spotify_song_cols)
        for i in range(len(weeks_songs)):
            song = weeks_songs.iloc[i]
            title = song['title']

            # NOTE(review): songs are matched on title only, so two different
            # songs sharing a title are treated as the same entry.
            known = spotify_songs_df.loc[spotify_songs_df['billboard_name'] == title]
            if not known.empty:
                if verbose:
                    print(f"{known.iloc[0]['billboard_name']} - {song['artist']} skipped...")
                continue

            song_data, audio_features, audio_analysis = get_song_data(title, artist=song['artist'])

            # --- song metadata -------------------------------------------------
            if song_data is not None:
                artist_popularity = song_data['artist_popularity']
                # Guard the mean against an empty popularity list.
                mean_popularity = sum(artist_popularity) / len(artist_popularity) if artist_popularity else None

                preview_url = song_data['preview_url']
                preview_file = None
                if preview_url:
                    preview_target = f"{preview_audio_dir}/{remove_punctuation(title)}.{preview_format}"
                    # basename handles both "/" and the platform separator, unlike
                    # split(os.sep), which fails on Windows for a "/"-joined path.
                    preview_file = os.path.basename(wget.download(preview_url, out=preview_target))

                song_row = [
                    title,
                    song_data['name'],
                    song['artist'],
                    song_data['duration_ms'],
                    song_data['id'],
                    song_data['uri'],
                    song_data['external_urls']['spotify'],
                    song_data['popularity'],
                    artist_popularity,
                    mean_popularity,
                    song_data['explicit'],
                    preview_url,
                    preview_file,
                    "not_fetched",  # sentinel later replaced by fetch_audio_data
                    -1,
                    song_data['artist_genres'],
                ]
                spotify_name = song_data['name']
            else:
                song_row = [title] + [None] * (len(spotify_song_cols) - 1)
                spotify_name = None

            # --- audio features ------------------------------------------------
            if audio_features is not None:
                feature_row = [audio_features[col] for col in audio_feature_cols]
            else:
                feature_row = [None] * len(audio_feature_cols)

            # --- audio analysis (stored as a side-car JSON file) ---------------
            if audio_analysis is not None:
                analysis_file = f"{remove_punctuation(title)}.json"
                with open(f"{audio_analysis_dir}/{analysis_file}", "w+", encoding="utf-8") as json_file:
                    json.dump(audio_analysis, json_file, ensure_ascii=False, indent=4)
            else:
                analysis_file = None

            # Persist after every song so a crash loses at most one lookup.
            new_row = pd.DataFrame(data=[song_row + feature_row + [analysis_file]], columns=row_cols)
            spotify_songs_df = pd.concat([spotify_songs_df, new_row])
            spotify_songs_df.to_csv(songs_csv, index=False)

            state = [title, spotify_name, song_data is not None, audio_features is not None, audio_analysis is not None]
            state_df = open_or_create_csv(state_csv, cols=state_cols)
            state_df = pd.concat([state_df, pd.DataFrame(data=[state], columns=state_cols)])
            state_df.to_csv(state_csv, index=False)
    print(f"Data written to {output_file}")

In [5]:
@animation.wait('spinner', text='Fetching Audio Files', speed=0.2)
@execution_time(round_to=2)
def fetch_audio_data(songs_csv_filepath="../data/spotify/songs.csv", output_dir="../data/audio/full"):
    """Download full-length audio for every song still marked "not_fetched".

    Each pending row of the songs CSV is turned into a YouTube search
    ("<billboard_name> <artist> lyrics"); the first hit is downloaded with
    ``yt_download_audio`` and the row's ``full_audio`` / ``full_audio_duration_s``
    cells are updated. The CSV is rewritten once, when the loop ends or is
    interrupted.

    A song with no search result (or no downloaded file) has both cells set to
    missing, so it is not retried. A song whose lookup or download raises is
    reported and left as "not_fetched", so a later run retries it; the
    remaining songs are still processed.

    Args:
        songs_csv_filepath: Path of the songs CSV to read and update in place.
        output_dir: Directory the audio files are downloaded into.

    Returns:
        None. Progress is reported with a final ``print``.
    """
    songs_csv_filepath = os.path.abspath(songs_csv_filepath)
    output_dir = mkdir(output_dir)
    songs_df_full = pd.read_csv(songs_csv_filepath)
    songs_df = songs_df_full[['billboard_name', 'artist', 'full_audio']]
    audio_col = songs_df_full.columns.get_loc('full_audio')
    duration_col = songs_df_full.columns.get_loc('full_audio_duration_s')
    try:
        for i in range(len(songs_df)):
            song = songs_df.iloc[i]
            if song['full_audio'] != "not_fetched":
                continue
            query_string = f"{song['billboard_name']} {song['artist']} lyrics"
            try:
                yt_id = yt_query(query_string, all_ids=False)
                if yt_id is not None:
                    audio_file, duration_s = yt_download_audio(yt_id, output_dir=output_dir)
                else:
                    audio_file, duration_s = None, None
            except Exception as e:
                # Report and move on: one failing song must not stop the batch.
                print(f"Exception: {e}")
                continue
            # Only take the basename when a file was actually produced; calling a
            # string method on None used to abort the whole run.
            songs_df_full.iat[i, audio_col] = os.path.basename(audio_file) if audio_file else None
            songs_df_full.iat[i, duration_col] = duration_s
    finally:
        # Runs on normal completion, on errors and on Ctrl+C, so progress is kept.
        songs_df_full.to_csv(songs_csv_filepath, index=False)
    print(f"Data written to {output_dir}")

### Features to extract (modelled on the GTZAN dataset):
0. length of analysed segments
1. Chroma STFT (short-time Fourier transform)
2. rms (root mean square)
3. spectral centroid
4. spectral bandwidth
5. rolloff
6. zero crossing rate
7. harmony (harmonic component)
8. perceptr (percussive component)
9. tempo
10. mfccs

In [28]:
@animation.wait('spinner', text='Fetching Audio Features', speed=0.2)
@execution_time(round_to=2)
def fetch_audio_features(song_path='../data/spofity/songs.csv', output_path='../data/audio/audio_features_full.csv', audio_dir='../data/audio/full', audio_type='full', batch_size=None):
    """Extract audio features for each downloaded song and append them to a CSV.

    For every row of the songs CSV that names an audio file not yet listed in
    ``<audio_type>_state.csv`` (stored next to ``output_path``), the file is
    passed to ``extract_audio_features`` and the resulting frame is appended to
    ``output_path``. Both CSVs are rewritten after every song so the work can
    be resumed.

    If Ctrl+C arrives while the two CSVs are being written, the same in-memory
    frames are written once more, a message is printed and processing stops.
    A Ctrl+C at any other point propagates to the caller.

    Args:
        song_path: Path of the songs CSV.
        output_path: Path of the CSV that accumulates the extracted features.
        audio_dir: Directory containing the audio files named in the songs CSV.
        audio_type: ``'full'`` reads the ``full_audio`` column, ``'preview'``
            reads ``preview_url_audio``.
        batch_size: Stop after this many songs have been processed in this call;
            ``None`` (or 0) means no limit.

    Returns:
        The contents of ``output_path`` as loaded by ``open_or_create_csv``.

    Raises:
        AssertionError: If ``audio_type`` is neither ``'full'`` nor ``'preview'``.
    """
    assert audio_type == 'full' or audio_type == 'preview', 'audio_type must either be full or preview'
    audio_field = "full_audio" if audio_type == "full" else 'preview_url_audio'
    song_path = os.path.abspath(song_path)
    output_path = os.path.abspath(output_path)
    audio_dir = os.path.abspath(audio_dir)
    songs_df = pd.read_csv(song_path)

    # The state file lists (under the 'billboard_name' header) the audio file
    # names that have already been processed.
    state_cols = ['billboard_name']
    status_file = os.path.join(os.path.dirname(output_path), f"{audio_type}_state.csv")
    status_df = open_or_create_csv(status_file, state_cols)
    count = 0

    for i in range(len(songs_df)):
        song = songs_df.iloc[i]
        filename = song[audio_field] if not pd.isna(song[audio_field]) else None
        # "not_fetched" is the placeholder written before any audio is downloaded;
        # it is not a file name.
        if not filename or filename == "not_fetched":
            continue
        if filename in status_df['billboard_name'].unique():
            continue

        filepath = f"{audio_dir}/{filename}"
        audio_features, cols = extract_audio_features(filepath, song_name=song['billboard_name'])

        # Build both frames completely in memory first, so that a retry after an
        # interrupted write re-writes the same data instead of appending twice.
        output_df = pd.concat([open_or_create_csv(output_path, cols), audio_features])
        status_df = pd.concat([status_df, pd.DataFrame(data=[[filename]], columns=state_cols)])

        interrupted = False
        try:
            output_df.to_csv(output_path, index=False)
            status_df.to_csv(status_file, index=False)
        except KeyboardInterrupt:
            output_df.to_csv(output_path, index=False)
            status_df.to_csv(status_file, index=False)
            print('KeyboardInterrupt during save. Data successfully saved.')
            interrupted = True

        count += 1
        # Honour the interrupt once the data is safely on disk.
        if interrupted or (batch_size and count >= batch_size):
            break

    return open_or_create_csv(output_path, cols=[])

# Dataset Construction

In [7]:
# get_billboard_data(2020, 2021, output_dir="../data/billboard")

In [8]:
# merge_csvs_in_path('../data/billboard/hot-100', glob_pattern='hot-100_*.csv', output_path='../data/billboard', output_filename='hot-100_all')

In [9]:
# fetch_spotify_songs(song_dir="../data/billboard/hot-100/", glob_pattern="hot-100_*.csv", output_dir='../data/spofity', output_file='songs.csv', audio_analysis_dir='../data/spofity/audio_analysis', preview_audio_dir="../data/audio/previews")

In [31]:
# NOTE (17/04/22): PyTube bug — YouTube changed the JavaScript that contains the throttling function to the form: a.D&&(b=a.get("n"))&&(b=$x[0](b)
# Temporary fix until an official PyTube release includes the update:
#           go to pytube/cipher.py and change function_patterns to:
#                     r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
#                     r'\([a-z]\s*=\s*([a-zA-Z0-9$]{2,3})(\[\d+\])?\([a-z]\)'
# and change line 288 in that file to: nfunc=re.escape(function_match.group(1))),
# Enter these values exactly.

# Download full-length audio for every song in the CSV still marked "not_fetched".
fetch_audio_data("../data/spofity/songs.csv", output_dir="../data/audio/full")

Fetching Audio Files	-Data written to C:\msc_data_science_uwi_sta\semester_2\classes\comp_6940\project\comp-6940-project\data\audio\full

 func:fetch_audio_data args:[('../data/spofity/songs.csv',), {'output_dir': '../data/audio/full'}] took: 0.14 sec
[K


In [32]:
fetch_audio_features('../data/spofity/songs.csv', batch_size=500)

Fetching Audio Features	-
 func:fetch_audio_features args:[('../data/spofity/songs.csv',), {'batch_size': 500}] took: 0.87 sec
[K


Unnamed: 0,billboard_name,name,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,All I Want For Christmas Is You,Mariah Carey All I Want For Christmas Is You ...,241.696508,0.317660,0.091317,0.160702,0.005551,2963.679826,4.258230e+05,2887.388904,...,1.619618,99.735660,-3.865835,99.024666,2.729783,112.219580,-7.488522,122.569650,2.323859,141.572560
1,Rockin' Around The Christmas Tree,Brenda Lee Rockin Around The Christmas Tree O...,127.523991,0.333602,0.087056,0.099516,0.002398,2337.289099,3.094718e+05,2206.637701,...,-1.039626,78.420586,-4.437555,55.536427,3.890496,70.359543,0.014326,77.899239,6.889563,93.610161
2,Jingle Bell Rock,Bobby Helms Jingle Bell Rock Official Lyric V...,131.378503,0.278278,0.089981,0.227821,0.008592,1825.951163,3.851751e+05,2046.698502,...,1.430321,58.685158,-4.030815,67.332291,1.802275,58.469532,-5.335912,53.423290,0.133941,58.774597
3,A Holly Jolly Christmas,Burl Ives Holly Jolly Christmas Lyrics.wav,153.553560,0.328846,0.089120,0.069804,0.000859,2200.466281,2.688014e+05,2378.395310,...,-1.355817,60.197350,-6.695084,52.782772,-4.325858,66.221947,-3.533713,50.849602,-1.266797,90.991325
4,Circles,Post Malone Circles Lyrics.wav,215.411519,0.340946,0.093090,0.329962,0.017911,1821.167852,6.991870e+05,2252.989888,...,2.563944,78.141319,-12.359889,83.661438,4.207565,65.643173,-5.280680,54.441185,-0.751733,59.799530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418,Christmas Isn't Canceled (Just You),Kelly Clarkson Christmas Isnt Canceled Just Y...,231.619048,0.366283,0.088413,0.246724,0.010484,2656.592627,3.736893e+05,2724.528599,...,5.321839,70.412506,1.213545,77.698616,4.693950,85.284431,-2.604682,76.687698,5.805956,77.433144
1419,Moved To Miami,Roddy Ricch moved to miami Lyrics feat Lil Ba...,222.400726,0.496971,0.093589,0.263491,0.032075,2153.282361,1.736386e+06,2375.496133,...,4.598643,180.801086,-4.373017,110.878738,4.545245,111.550697,4.918246,63.780304,8.114554,75.806396
1420,Hibachi,Roddy Ricch hibachi feat Kodak Black 21 Sava...,170.480907,0.542494,0.082148,0.359854,0.023363,1839.916629,1.082738e+06,2360.276751,...,8.091636,79.854568,-1.646704,123.382797,5.309008,98.507568,0.213412,80.767159,2.804790,73.490234
1421,Thailand,Roddy Ricch Thailand Lyrics.wav,201.131247,0.482211,0.090467,0.299949,0.026273,1990.337814,1.505351e+06,2379.269351,...,10.420262,91.743813,-2.071233,75.112267,8.341298,97.730263,-0.260812,69.875168,1.959964,62.722679


# Testing Metric

In [9]:
# Hand-made example for the `squiggle` metric with scaled=False.
# NOTE(review): `squiggle` is not defined in this notebook; presumably it comes
# from the wildcard import of utils.general — confirm.
rank_counts = [10, 4, 1]
ranks = [11, 13, 14]
squiggle(rank_counts, ranks, scaled=False)

1.2882117882117883

In [10]:
# Second hand-made example for the `squiggle` metric, this time with scaled=True.
# NOTE(review): this cell rebinds the module-level names `rank_counts` and
# `ranks`, so its result depends on cell execution order.
rank_counts = [4, 4, 1]
ranks = [11, 13, 4]
squiggle(rank_counts, ranks, scaled=True)

0.7265253672884535

In [8]:
# Diagnostic: print where the installed pytube package lives.
# NOTE(review): presumably used to locate pytube/cipher.py for the manual patch
# described in the download cell — confirm, and move the import to the top
# import cell or drop this cell once it is no longer needed.
import pytube
print(pytube.__file__)

C:\Anaconda3\lib\site-packages\pytube\__init__.py
