In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.types as t
import pyspark.sql.functions as f
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from datetime import datetime
from datetime import date, timedelta
import numpy as np

In [4]:
# import dataframe
recent_tracks_df = pd.read_csv('./Top_200_Binary_Classification_Dataset.csv')
top_200_recent_tracks_only = pd.read_csv('./Top_200_Stream_Count_Regression_Dataset.csv')

### One Hot Encodings

In [6]:
recent_tracks_df['release_date'] = pd.to_datetime(recent_tracks_df['release_date'])
recent_tracks_df['month_of_release'] = pd.DatetimeIndex(recent_tracks_df['release_date']).month
recent_tracks_df['year_of_release'] = pd.DatetimeIndex(recent_tracks_df['release_date']).year
recent_tracks_df['day_of_week_release'] = pd.DatetimeIndex(recent_tracks_df['release_date']).dayofweek

In [9]:
## create one-hot-encodings ##
##############################

# for year
years = ['2017', '2018', '2019', '2020']
y_d = pd.get_dummies(
    recent_tracks_df['year_of_release'],
    prefix='',
    prefix_sep='',
    drop_first = False
) 
y_d = y_d.T.reindex(years).T.fillna(0)

# for months
months = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
m_d = pd.get_dummies(
    recent_tracks_df['month_of_release'],
    prefix='',
    prefix_sep='',
    drop_first = False
) 
m_d = m_d.T.reindex(months).T.fillna(0)

# join one-hot-encodings
dummies = y_d.join(m_d)

# add to initial df
recent_tracks_df = pd.concat(
    [recent_tracks_df, dummies],
    axis=1
)

In [10]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
recent_tracks_df.rename(columns = {'1':'m1', '2':'m2', '3':'m3', '4':'m4', '5':'m5', '6':'m6', '7':'m7', '8':'m8', '9':'m9', '10':'m10', '11':'m11', '12':'m12'}, inplace = True)
recent_tracks_df.head(1)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_200_presence,total_follower_count,log_total_follower_count,month_of_release,year_of_release,day_of_week_release,2017,2018,2019,2020,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12
0,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3,False,4677919.0,22.157435,5,2018,4,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [11]:
## create one-hot-encodings ##
##############################

# for day of week
days = ['1', '2', '3', '4', '5', '6'] #dropped the first day == "0" (as a result, you don't need an intercept in the model)
doy_of_w_d = pd.get_dummies(
    recent_tracks_df['day_of_week_release'],
    prefix='',
    prefix_sep='',
    drop_first = True
) 
dummies = doy_of_w_d.T.reindex(days).T.fillna(0)

# add to initial df
recent_tracks_df = pd.concat(
    [recent_tracks_df, dummies],
    axis=1
)

In [12]:
recent_tracks_df.rename(columns = {'1':'d1', '2':'d2', '3':'d3', '4':'d4', '5':'d5', '6':'d6'}, inplace = True)
recent_tracks_df.head(1)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_200_presence,total_follower_count,log_total_follower_count,month_of_release,year_of_release,day_of_week_release,2017,2018,2019,2020,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,d1,d2,d3,d4,d5,d6
0,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3,False,4677919.0,22.157435,5,2018,4,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [13]:
## create one-hot-encodings ##
##############################

# for key
keys = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'] #dropped the first key == "0" (as a result, you don't need an intercept in the model)
key_d = pd.get_dummies(
    recent_tracks_df['key'],
    prefix='',
    prefix_sep='',
    drop_first = True
) 
dummies = key_d.T.reindex(keys).T.fillna(0)

# add to initial df
recent_tracks_df = pd.concat(
    [recent_tracks_df, dummies],
    axis=1
)

In [14]:
recent_tracks_df.rename(columns = {'1':'k1', '2':'k2', '3':'k3', '4':'k4', '5':'k5', '6':'k6', '7':'k7', '8':'k8', '9':'k9', '10':'k10', '11':'k11'}, inplace = True)
recent_tracks_df.head(1)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_200_presence,total_follower_count,log_total_follower_count,month_of_release,year_of_release,day_of_week_release,2017,2018,2019,2020,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,d1,d2,d3,d4,d5,d6,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11
0,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3,False,4677919.0,22.157435,5,2018,4,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [15]:
## create one-hot-encodings ##
##############################

# for time_signature
times = ['1', '2', '3', '4', '5'] #dropped the first key == "0" (as a result, you don't need an intercept in the model)
time_d = pd.get_dummies(
    recent_tracks_df['time_signature'],
    prefix='',
    prefix_sep='',
    drop_first = True
) 
dummies = time_d.T.reindex(times).T.fillna(0)

# add to initial df
recent_tracks_df = pd.concat(
    [recent_tracks_df, dummies],
    axis=1
)

In [16]:
recent_tracks_df.rename(columns = {'1':'t1', '2':'t2', '3':'t3', '4':'t4', '5':'t5'}, inplace = True)
recent_tracks_df.head(1)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_200_presence,total_follower_count,log_total_follower_count,month_of_release,year_of_release,day_of_week_release,2017,2018,2019,2020,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,d1,d2,d3,d4,d5,d6,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,t1,t2,t3,t4,t5
0,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['Frank Sinatra'],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3,False,4677919.0,22.157435,5,2018,4,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,1.0,0.0,0.0


### Balance Dataset

In [18]:
recent_tracks_df['top_200_presence'].value_counts()

False    41021
True      2305
Name: top_200_presence, dtype: int64

In [20]:
# randomly draw 2300 examples from each class
temp_false = recent_tracks_df[recent_tracks_df['top_200_presence'] == False].sample(
    n=2298,
    replace=False
)

temp_true = recent_tracks_df[recent_tracks_df['top_200_presence'] == True].sample(
    n=2298,
    replace=False
)

df_balanced = pd.concat(
    [temp_false, temp_true],
    axis=0)

# shuffle df_balanced
df_balanced.sample(frac=1) # frac=1 retains all the data
df_balanced.reset_index(drop=True, inplace=True) # reset index

print('After preprocessing, our data contains', df_balanced.shape[0], 'songs')

After preprocessing, our data contains 4596 songs


In [21]:
df_balanced['top_200_presence'].value_counts()

False    2298
True     2298
Name: top_200_presence, dtype: int64

In [22]:
df_balanced['top_200_binary'] = np.where(df_balanced['top_200_presence'] == True, 1, 0)
df_balanced.head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_200_presence,total_follower_count,log_total_follower_count,month_of_release,year_of_release,day_of_week_release,2017,2018,2019,2020,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,d1,d2,d3,d4,d5,d6,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,t1,t2,t3,t4,t5,top_200_binary
0,0X9IjFMHXsNGWvqhLxDa1X,Pour Me Another One - Conducta Remix,54,197735,1,"['Krept & Konan', 'Tabitha', 'Conducta']","['31lnFZEM6ysvjOx59VyxRE', '7iBY1RLWDV5zX9NDNQ...",2018-11-20,0.819,0.767,5,-9.83,1,0.126,0.131,0.000413,0.0685,0.669,130.033,4,False,508374.0,18.955531,11,2018,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0
1,0lLEhRMrGKQeosZCpV2zsj,Peço Perdão (feat. Matias Damasio),46,210300,0,"['Rui Orlando', 'Matias Damásio']","['1g00QP1vSwdi3mnn0PmzPa', '0E0XPqa6BzxSkhmhvz...",2020-02-05,0.636,0.837,7,-2.95,0,0.124,0.247,0.0,0.099,0.362,92.99,4,False,166730.0,17.347154,2,2020,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0
2,1EuI95kVxZYLbENy8d6AOI,I'm Good?,64,160187,0,['Hilltop Hoods'],['7dlqUnjoF2U2DkNDMhcgG4'],2020-05-01,0.901,0.761,4,-5.986,0,0.0704,0.311,0.0,0.065,0.96,119.984,4,False,659586.0,19.331201,5,2020,4,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0
3,4Kp9z7pGmZCqABDJgYsKK4,愛近在眼前 - 劇集 “踩過界” 片尾曲,40,217115,0,['Stephanie Ho'],['0eXGbuvMWBpHQ5GE56OCq2'],2017-06-24,0.646,0.564,5,-9.332,0,0.0375,0.646,0.0,0.114,0.395,142.098,4,False,12782.0,13.641826,6,2017,5,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0
4,7gINxzz4ErVMK3FAli7Bt7,そうぎゃらんBAM,48,231735,0,"['ヒプノシスマイク -D.R.B- (Bad Ass Temple)', 'ヒプノシスマイ...","['3fokOZQsXMeMTyvGHofqup', '6c1w45xLPDcBpx1O1I...",2019-11-27,0.75,0.782,8,-6.996,1,0.224,0.0742,0.0,0.0961,0.368,144.009,4,False,39525.0,15.270478,11,2019,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,1.0,0.0,0


### Split Training and Test

In [23]:
## BASELINE MODEL - Followers & Time-Based Parameters

# define outcome #
##################
y_base = df_balanced['top_200_binary'].values

# define features #
###################
X_base = df_balanced[['log_total_follower_count']].values

# split #
#########
# set seed for consistent behavior
np.random.seed(1234)

split = (0.7,0.3) #70% training and 30% test
shuffle = np.random.permutation(np.arange(y_base.shape[0])) # very important to shuffle the data. Why?
X_base, y_base = X_base[shuffle], y_base[shuffle]

splits = np.multiply(len(y_base), split).astype(int) 
X_train_base, X_test_base = np.split(X_base, [splits[0]])
y_train_base, y_test_base = np.split(y_base, [splits[0]])

print('Size of X_train', X_train_base.shape)
print('Size of y_train', y_train_base.shape)
print('Size of X_test', X_test_base.shape)
print('Size of y_test', y_test_base.shape)


# df for y_train and X_train #
##############################
# easier for EDA later on
df_train_base = pd.concat(
    [pd.DataFrame(y_train_base), pd.DataFrame(X_train_base)],
    axis=1
)

# rename columns
df_train_base.columns=['top_200_binary', 'log_total_follower_count']
df_train_base.head(5)

Size of X_train (3217, 1)
Size of y_train (3217,)
Size of X_test (1379, 1)
Size of y_test (1379,)


Unnamed: 0,top_200_binary,log_total_follower_count
0,0,18.876124
1,1,22.178546
2,0,8.396605
3,0,17.175335
4,1,24.760585


In [25]:
## Spotify Music Features MODEL - Followers & Time-Based Parameters & Music Features

# define outcome #
##################
y = df_balanced['top_200_binary'].values

# define features #
###################
X = df_balanced[['duration_ms', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo', 'log_total_follower_count',  '2017', '2018', '2019', '2020', 'm1', 'm2', 'm3', 'm4',
        'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'k1', 'k2', 'k3', 'k4', 'k5',
        'k6', 'k7', 'k8', 'k9', 'k10','k11']].values

# split #
#########
# set seed for consistent behavior
np.random.seed(1234)

split = (0.7,0.3) #70% training and 30% test
shuffle = np.random.permutation(np.arange(y.shape[0])) # very important to shuffle the data. Why?
X, y = X[shuffle], y[shuffle]

splits = np.multiply(len(y), split).astype(int) 
X_train, X_test = np.split(X, [splits[0]])
y_train, y_test = np.split(y, [splits[0]])

print('Size of X_train', X_train.shape)
print('Size of y_train', y_train.shape)
print('Size of X_test', X_test.shape)
print('Size of y_test', y_test.shape)


# df for y_train and X_train #
##############################
# easier for EDA later on
df_train = pd.concat(
    [pd.DataFrame(y_train), pd.DataFrame(X_train)],
    axis=1
)

# rename columns
df_train.columns=['top_200_binary', 'duration_ms', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo', 'log_total_follower_count',  '2017', '2018', '2019', '2020', 'm1', 'm2', 'm3', 'm4',
        'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'k1', 'k2', 'k3', 'k4', 'k5',
        'k6', 'k7', 'k8', 'k9', 'k10','k11']
df_train.head(5)

Size of X_train (3217, 43)
Size of y_train (3217,)
Size of X_test (1379, 43)
Size of y_test (1379,)


Unnamed: 0,top_200_binary,duration_ms,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,log_total_follower_count,2017,2018,2019,2020,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,d1,d2,d3,d4,d5,d6,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11
0,0,194213.0,0.529,0.484,-6.632,0.0308,0.133,0.104,0.281,125.931,18.876124,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,193293.0,0.712,0.556,-7.214,0.0531,0.084,0.527,0.22,90.494,22.178546,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,73360.0,0.474,0.551,-7.803,0.0696,0.844,0.596,0.254,108.109,8.396605,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,200975.0,0.666,0.777,-5.889,0.0324,0.575,0.213,0.574,139.936,17.175335,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,248472.0,0.603,0.602,-7.083,0.23,0.049,0.279,0.595,119.703,24.760585,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# standardize
sc_x = StandardScaler()

## Apply Standardization to Base + Music Features Model Continuous features and append back to One-Hot Encodings

X_train_cont_features = x_train_df_cont_features.values
X_test_cont_features = x_test_df_cont_features.values

X_train_one_hot_features = x_train_one_hot_features.values
X_test_one_hot_features = x_test_one_hot_features.values

X_train_cont_features_std = sc_x.fit(X_train_cont_features).transform(X_train_cont_features)
X_test_cont_features_std = sc_x.fit(X_train_cont_features).transform(X_test_cont_features)


X_train_std = pd.concat(
    [pd.DataFrame(X_train_cont_features_std), pd.DataFrame(X_train_one_hot_features)],
    axis=1
).values

X_test_std = pd.concat(
    [pd.DataFrame(X_test_cont_features_std), pd.DataFrame(X_test_one_hot_features)],
    axis=1
).values

## Apply Standardization to Baseline Model

X_train_std_base = sc_x.fit(X_train_base).transform(X_train_base)
X_test_std_base = sc_x.fit(X_train_base).transform(X_test_base)

NameError: name 'StandardScaler' is not defined