In [30]:
import io
import pandas as pd
import requests
import time
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

## Create datasets
The datasets are obtained by querying the Spotify API using this script. First we obtain the lists of the top 100 songs by region. Then we find the track IDs and use those to query the audio features. 

### Rankings query

In [31]:
# get a day/week's list of top 200 songs
def get_chart(date, region='en', freq='daily', chart='top200', timeout=30):
    """Download one chart from spotifycharts.com and parse it as CSV.

    Parameters
    ----------
    date : anything accepted by ``pd.to_datetime``
        Day of the chart (for weekly charts: a day used to locate the week).
    region : str
        Region code inserted into the download URL.
    freq : str
        'weekly' requests a ``start--end`` date range; any other value
        requests the single day.
    chart : str
        'top200' selects the "regional" chart, any other value the "viral" one.
    timeout : float
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV, or None when the download fails or cannot be parsed
        (a short diagnostic is printed in that case).

    Raises
    ------
    ValueError
        If ``date`` lies before 2017.
    """
    chart = 'regional' if chart == 'top200' else 'viral'
    date = pd.to_datetime(date)
    if date.year < 2017:
        raise ValueError('No chart data available from before 2017')
    if freq == 'weekly':
        # NOTE(review): week_dates is not defined in the visible part of the
        # notebook; presumably it returns the (start, end) days of the chart
        # week containing `date` -- confirm it is defined before this cell.
        start, end = week_dates(date, weekday=4)
        date = f'{start.date()}--{end.date()}'
    else:
        date = f'{date.date()}'
    url = f'https://spotifycharts.com/{chart}/{region}/{freq}/{date}/download'

    # A timeout keeps one stalled request from blocking a whole download loop.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f'Request failed for {url}: {err}')
        return None
    if not response.ok:
        # Error pages are HTML, not CSV; do not try to parse them.
        print(f'HTTP {response.status_code} for {url}')
        return None

    try:
        return pd.read_csv(io.StringIO(response.text))
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        # Show the beginning of the payload (not the StringIO object) so the
        # reason for the failure is visible.
        print(response.text[:500])
        return None
# get a range of days/weeks' top 200 songs
def get_charts(start, end, region='en', freq='daily', chart='top200', sleep=1):
    """Download every chart between ``start`` and ``end`` and stack them.

    Parameters
    ----------
    start, end : anything accepted by ``pd.date_range``
        First and last date of the range (inclusive).
    region, freq, chart :
        Passed through to ``get_chart``. ``freq == 'daily'`` samples every
        day, any other value samples once per week.
    sleep : float
        Seconds to pause after every request.

    Returns
    -------
    pandas.DataFrame
        All successfully downloaded charts, each with an added ``date``
        column. The per-chart row labels are kept as they are (the index is
        not reset), so labels repeat across charts.

    Raises
    ------
    ValueError
        If no chart could be downloaded for the whole range.
    """
    sample = 'D' if freq == 'daily' else 'W'
    dfs = []
    for date in tqdm.tqdm(pd.date_range(start=start, end=end, freq=sample)):
        df = get_chart(date, region=region, freq=freq, chart=chart)
        if df is not None:
            df['date'] = date
            dfs.append(df)
        # Pause after failed requests too, so a failing server is not hammered.
        time.sleep(sleep)
    if not dfs:
        raise ValueError(
            'No chart data could be downloaded between {} and {}'.format(start, end))
    # No ignore_index here: `formatting` removes the per-chart header rows by
    # their shared row label, which only works while labels repeat.
    return pd.concat(dfs)
def formatting(df):
    """Aggregate stacked chart downloads into total streams per song name.

    The first row of ``df`` is used as the column header, and every row that
    shares that row's index label is removed. After removing the 'URL',
    'Artist' and 'Position' columns, the remaining three columns are treated
    as name, streams and date.

    The input frame is left untouched, so the function can safely be called
    again on the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked charts as returned by ``get_charts``.

    Returns
    -------
    pandas.DataFrame
        Indexed by song name, with a single integer ``streams`` column holding
        the summed streams, sorted in descending order.
    """
    # first row holds the real column names
    header = df.iloc[0]
    # drop() returns a new frame and removes *every* row carrying that label
    charts = df.drop(df.index[0])
    charts.columns = header
    # drop useless columns
    charts = charts.drop(['URL', 'Artist', 'Position'], axis=1)
    charts.columns = ['name', 'streams', 'date']
    # explicit copy so the dtype conversion below does not write into a slice
    charts = charts[['name', 'streams']].copy()
    # streams datatype from string to int
    charts['streams'] = charts['streams'].astype(int)
    # group by name and sort by aggregated streams
    return charts.groupby('name').sum().sort_values(by=['streams'], ascending=False)

In [None]:
# download the daily top-200 charts for 2 years and 4 regions and keep the
# 100 most-streamed songs of each (region, year) pair
regions = ['global','fi','se','us']
years = [2017,2018]
for year in years:
    first_day = f'{year}-01-01'
    # NOTE(review): the range stops at Dec 30, so Dec 31 is not part of the
    # yearly totals -- confirm this is intended.
    last_day = f'{year}-12-30'
    for region in regions:
        df = get_charts(first_day, last_day, region=region, freq='daily', chart='top200')
        data = formatting(df).iloc[:100]
        data.to_csv(f"{region}_{year}.csv")

### Audio features query

In [22]:
# Spotify API authorization
# The credentials are no longer stored in the notebook. When called without
# arguments, SpotifyClientCredentials reads them from the environment
# variables SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET, so set those before
# starting the kernel.
# SECURITY: the client secret that used to be hard-coded here has been
# published with the notebook and should be rotated in the Spotify dashboard.
client_credentials_manager = SpotifyClientCredentials()
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [23]:
# request audio features from song name
def get_audio_features(name):
    """Look up a track by title and return its audio features.

    The title is searched with the module-level ``spotify`` client and only
    the first search hit is used.

    Parameters
    ----------
    name : str
        Song title to search for.

    Returns
    -------
    pandas.DataFrame
        One row with the column ``name`` (set to the given title) followed by
        the audio feature columns. The feature values are NaN when the search
        finds nothing or when no audio features are returned for the track.
    """
    columns = ['name','acousticness','danceability','duration_ms',
               'energy','instrumentalness','key','liveness',
               'loudness','mode','speechiness','tempo',
               'time_signature','valence']

    # search for song ID from song name
    search_result = spotify.search(q='track:' + name, type='track', market='gb')
    items = search_result['tracks']['items']

    features = None
    if items:
        track_id = items[0]['id']
        # request audio features; the entry for a track may be None
        response = spotify.audio_features(tracks=track_id)
        if response and response[0] is not None:
            features = response[0]

    if features is None:
        # nothing usable found: return a row of NaN
        raw = pd.DataFrame(np.nan, index=[0], columns=columns)
    else:
        raw = pd.DataFrame([features])
    raw['name'] = name
    # reindex keeps the column order fixed and fills any absent feature with NaN
    return raw.reindex(columns=columns)
# example: look up the audio features of a single song by its title
get_audio_features('shape of you')

Unnamed: 0,name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,shape of you,0.581,0.825,233713,0.652,0,1,0.0931,-3.183,0,0.0802,95.977,4,0.931


In [24]:
# get all the audio features for a list of songs
def audio_features(rankings, country_name, year=2018):
    """Attach audio features to a ranking table and save the result as CSV.

    Parameters
    ----------
    rankings : pandas.DataFrame
        Must contain a ``name`` column with the song titles.
    country_name : str
        Used in the output file name.
    year : int or str
        Used in the output file name (defaults to 2018).

    Returns
    -------
    pandas.DataFrame
        ``rankings`` left-merged with the audio features on ``name``; it has
        exactly one row per row of ``rankings``. The same frame is written to
        ``./data/{year}_{country_name}_complete.csv``.
    """
    # query each distinct title once; this saves API calls and keeps the merge
    # below from multiplying rows when a title occurs more than once
    names = rankings['name'].drop_duplicates()
    per_song = [get_audio_features(song) for song in names]
    if per_song:
        # a single concat instead of growing a frame inside the loop
        features = pd.concat(per_song).reset_index(drop=True)
    else:
        # no songs: empty feature table with a matching key column
        features = rankings[['name']].iloc[:0]

    # combine ranking and audio features
    result = pd.merge(rankings, features, on='name', how='left')

    # save to csv file
    result.to_csv("./data/{}_{}_complete.csv".format(year, country_name))
    return result

## Checking data quality

In [None]:
# number of missing values in each column
# NOTE(review): global_data_2017 is not created in the visible part of the
# notebook -- confirm it is loaded before this cell runs.
missing_per_column = global_data_2017.isna().sum()
print(missing_per_column)

## Feature Engineering

### Feature selection
1. Remove instrumentalness, mode, and time_signature because they are not informative (low variation: most songs belong to one category, or all categories are quite evenly distributed).
2. Remove speechiness, because we are mostly interested in music, not audiobooks or talk shows (values above 0.66 describe tracks that are probably made entirely of spoken words; values below 0.33 most likely represent music and other non-speech-like tracks).
3. (undecided) Remove duration_ms, because it may not be a deciding factor.
4. (undecided) Remove liveness (detects the presence of an audience in the recording), because it may not be a deciding factor.

In [106]:
# list the columns available before feature selection
global_data_2017.columns

Index(['name', 'streams', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

In [107]:
from sklearn.ensemble import ExtraTreesClassifier

In [108]:
# feature importances for one region's dataset
def feature_importance(region, n_estimators=20, random_state=None):
    """Fit an extra-trees model on one dataset and return feature importances.

    Every column except ``streams`` and ``name`` is used as a feature and
    ``streams`` is the target.

    Parameters
    ----------
    region : pandas.DataFrame
        Dataset with ``name``, ``streams`` and the feature columns.
    n_estimators : int
        Number of trees in the ensemble.
    random_state : int or None
        Seed passed to the classifier; set it to make the result repeatable.

    Returns
    -------
    pandas.DataFrame
        One row per feature (indexed by feature name) with a single column
        ``0`` holding the importance.
    """
    features = region.drop(['streams','name'], axis=1)
    X, y = features.values, region['streams'].values
    # NOTE(review): `streams` is a count, so a classifier treats every distinct
    # value as its own class; a regressor may be the better fit -- confirm.
    clf = ExtraTreesClassifier(n_estimators=n_estimators, random_state=random_state)
    clf = clf.fit(X, y)
    # label with the columns that were actually fed to the model instead of
    # assuming `name` and `streams` are the first two columns
    importance = pd.DataFrame(clf.feature_importances_, index=features.columns)
    return importance
# feature importances for the global 2017 data
feature_importance(global_data_2017)

Unnamed: 0,0
acousticness,0.094388
danceability,0.095918
duration_ms,0.089796
energy,0.086735
instrumentalness,0.064286
key,0.077551
liveness,0.083163
loudness,0.080612
mode,0.039286
speechiness,0.079082


In [109]:
# remove suboptimal features
# remove key as it's less important, and it's not a continuous feature
# which adds more difficulty to feature scaling and interpretation
# NOTE(review): loudness is dropped here as well although the feature
# selection notes above do not list it -- confirm this is intended.
datasets = [global_data_2017,finland_data_2017,sweden_data_2017,us_data_2017,global_data_2018,finland_data_2018,sweden_data_2018,us_data_2018]
dropped_columns = ['name','loudness','instrumentalness','mode','time_signature','speechiness',
                   'duration_ms','liveness','key']
for data in datasets:
    # The frames are modified in place because the following cells read them
    # through their original variable names. errors='ignore' lets this cell be
    # re-run without a KeyError once the columns are already gone.
    data.drop(dropped_columns, axis=1, inplace=True, errors='ignore')

### Feature scaling

In [111]:
# columns that remain after feature selection
us_data_2017.columns

Index(['streams', 'acousticness', 'danceability', 'energy', 'tempo',
       'valence'],
      dtype='object')

In [113]:
# standardise the selected features of every dataset and save one csv per
# (year, region): first column = streams, remaining columns = scaled features
np.set_printoptions(precision=5,suppress=True)
from sklearn.preprocessing import StandardScaler
names = ['global','finland','sweden','us']
dataset_years = [2017, 2018]
# `datasets` holds all regions of 2017 followed by all regions of 2018, so the
# labels are generated in the same order. Previously the 8 datasets were
# indexed against the 4 names (IndexError) and all written to 2018 files.
labels = [(year_label, name) for year_label in dataset_years for name in names]
if len(labels) != len(datasets):
    raise ValueError('expected {} datasets (years x regions) but got {}'.format(
        len(labels), len(datasets)))
scaler = StandardScaler()
for (year_label, name), data in zip(labels, datasets):
    features = data.drop(['streams'],axis=1).values
    # the scaler is re-fitted for every dataset
    scaled_features = scaler.fit_transform(features)
    # column_stack adapts to however many feature columns are present
    result = np.column_stack([data['streams'].values, scaled_features])
    # save final processed data to csv
    np.savetxt("./data/{}_{}_selected.csv".format(year_label, name),
               result,delimiter=',')