# Predicting Song Popularity

Classifying songs as present in BillBoard hot-100 charts or not.

In [33]:
#Importing required headers.
import copy
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

#Disabling warnings
import warnings
warnings.filterwarnings('ignore')

In [34]:
#Data.
data = pd.read_csv("../Data/MSD_BB.csv")
data.head()

#Encode artist id into quantitative.
le = preprocessing.LabelEncoder()
data['artist_id'] = le.fit_transform(data['artist_id'].astype('str'))

#Keeping required attributes.
drop_list = ['artist_location', 'artist_latitude',\
             'artist_longitude','artist_name', 'release',\
             'title' ,'song_hotttnesss', 'start_of_fade_out']
train = data.drop(drop_list, axis=1)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 15 columns):
artist_familiarity           9997 non-null float64
artist_hotttnesss            10001 non-null float64
artist_id                    10001 non-null int64
duration                     10001 non-null float64
end_of_fade_in               10001 non-null float64
key                          10001 non-null int64
key_confidence               10001 non-null float64
loudness                     10001 non-null float64
mode                         10001 non-null int64
mode_confidence              10001 non-null float64
tempo                        10001 non-null float64
time_signature               10001 non-null int64
time_signature_confidence    10001 non-null float64
year                         10001 non-null int64
bbhot                        10001 non-null int64
dtypes: float64(9), int64(6)
memory usage: 1.1 MB


In [52]:
#Actual BillBoard hotness values.
Y = copy.deepcopy(train.bbhot)

#Imputing values.
train["artist_familiarity"] = train["artist_familiarity"].fillna(train["artist_familiarity"].median())

#Training set without target.
train1 = train.drop("bbhot", axis=1)
train1.info()

X_train, X_test, y_train, y_test = train_test_split(train1, train['bbhot'], test_size=0.2, random_state=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 14 columns):
artist_familiarity           10001 non-null float64
artist_hotttnesss            10001 non-null float64
artist_id                    10001 non-null int64
duration                     10001 non-null float64
end_of_fade_in               10001 non-null float64
key                          10001 non-null int64
key_confidence               10001 non-null float64
loudness                     10001 non-null float64
mode                         10001 non-null int64
mode_confidence              10001 non-null float64
tempo                        10001 non-null float64
time_signature               10001 non-null int64
time_signature_confidence    10001 non-null float64
year                         10001 non-null int64
dtypes: float64(9), int64(5)
memory usage: 1.1 MB


In [49]:
#Function to evaluate my model with Cross validation.
from sklearn.model_selection import cross_val_score
def testingModel(model, X_train, Y_train):
    scores = cross_val_score(model, X_train, Y_train, cv=10, scoring = "roc_auc")
    print("Scores:\n", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())
    return scores.mean()

In [55]:
%%time
#Random Forest Classification
rf = RandomForestClassifier(n_estimators=100, oob_score = True)
rf.fit(X_train, y_train)
acc_random_forest = testingModel(rf, X_train, y_train)

print()
print(classification_report(y_test, rf.predict(X_test)))

Scores:
 [0.59727196 0.60979301 0.65262125 0.61071315 0.57932959 0.61413515
 0.6027665  0.63449225 0.66676552 0.65107831]
Mean: 0.6218966692623541
Standard Deviation: 0.026614917249448207

             precision    recall  f1-score   support

          0       0.87      1.00      0.93      1740
          1       0.62      0.02      0.04       261

avg / total       0.84      0.87      0.81      2001

Wall time: 24.3 s


In [56]:
%%time
#Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
acc_log = testingModel(logreg, X_train, y_train)

print()
print(classification_report(y_test,logreg.predict(X_test)))

Scores:
 [0.64004334 0.59083512 0.59443963 0.6118386  0.52606044 0.59043969
 0.58890359 0.61851531 0.63307022 0.57173536]
Mean: 0.5965881290775867
Standard Deviation: 0.031079028086737456

             precision    recall  f1-score   support

          0       0.87      1.00      0.93      1740
          1       0.00      0.00      0.00       261

avg / total       0.76      0.87      0.81      2001

Wall time: 1.22 s


In [57]:
%%time
#KNN
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, y_train)
acc_knn = testingModel(knn, X_train, y_train)

print()
print(classification_report(y_test, knn.predict(X_test)))

Scores:
 [0.51698065 0.58819638 0.49741449 0.56467582 0.52098067 0.61294125
 0.59537498 0.54649359 0.55742118 0.56648086]
Mean: 0.5566959854609023
Standard Deviation: 0.03504724300029208

             precision    recall  f1-score   support

          0       0.87      1.00      0.93      1740
          1       0.50      0.00      0.01       261

avg / total       0.82      0.87      0.81      2001

Wall time: 556 ms


In [60]:
%%time
#Decision trees
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
acc_decision_tree = testingModel(decision_tree, X_train, y_train)

print()
print(classification_report(y_test, decision_tree.predict(X_test)))

Scores:
 [0.50694424 0.52279813 0.52944442 0.56227282 0.49379477 0.52590835
 0.52704141 0.52406808 0.53100333 0.50373907]
Mean: 0.5227014618169199
Standard Deviation: 0.01776157687304187

             precision    recall  f1-score   support

          0       0.87      0.88      0.88      1740
          1       0.16      0.16      0.16       261

avg / total       0.78      0.78      0.78      2001

Wall time: 1.6 s


In [59]:
#Consolidating the results.
results = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Random Forest', 'Decision Tree'],
    'Score': [acc_knn, acc_log, acc_random_forest, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.621897,Random Forest
0.596588,Logistic Regression
0.556696,KNN
0.52445,Decision Tree


In [61]:
%%time
#Gradient boosted decision trees.
X_train, X_test, y_train, y_test = train_test_split(train1, Y, test_size=0.33, random_state=7)
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions for test data
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, decision_tree.predict(X_test)))

Accuracy: 87.79%
             precision    recall  f1-score   support

          0       0.97      0.98      0.97      2900
          1       0.82      0.79      0.80       401

avg / total       0.95      0.95      0.95      3301

Wall time: 1.03 s


-------------------------------------------------------------------------------------------------------------------

Predicting song hotness using regression techniques.

In [16]:
#Dataset.
data = pd.read_csv("../Data/Cleaned_MSD10k.csv")

#Droping NaN rows in song_hotness.
index = data['song_hotttnesss'].index[data['song_hotttnesss'].apply(np.isnan)]
data = data.drop(index, axis = 0).reset_index(drop=True)

#Imputing values.
train["artist_familiarity"] = train["artist_familiarity"].fillna(train["artist_familiarity"].median())

#Encode artist id into quantitative
le = preprocessing.LabelEncoder()
data['artist_id'] = le.fit_transform(data['artist_id'].astype('str'))
data['release'] = le.fit_transform(data['release'].astype('str'))

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 42 columns):
Unnamed: 0                    5647 non-null int64
artist_familiarity            5646 non-null float64
artist_hotttnesss             5647 non-null float64
artist_id                     5647 non-null int64
artist_latitude               2210 non-null float64
artist_location               5647 non-null object
artist_longitude              2210 non-null float64
artist_name                   5647 non-null object
artist_terms                  5647 non-null object
artist_terms_freq             5647 non-null object
artist_terms_weight           5647 non-null object
bars_confidence               5647 non-null object
bars_start                    5647 non-null object
beats_confidence              5647 non-null object
beats_start                   5647 non-null object
duration                      5647 non-null float64
end_of_fade_in                5647 non-null float64
key                   

In [17]:
#Dropping attributes that cannot be processed (eg. 2D arrays etc).
drop_list = ['Unnamed: 0','artist_location', 'artist_latitude', 'artist_longitude','artist_name', 'artist_id', 'artist_terms', 'artist_terms_freq', 'beats_start', 'artist_terms_weight', 'bars_confidence',
             'bars_start', 'beats_confidence', 'release', 'sections_confidence', 'sections_start', 'segments_confidence', 'segments_loudness_max', 
             'segments_loudness_max_time', 'segments_loudness_start', 'segments_pitches','segments_start', 'segments_timbre', 'tatums_confidence', 
             'tatums_start','title', 'track_id']
train = data.drop(drop_list, axis=1)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 15 columns):
artist_familiarity           5646 non-null float64
artist_hotttnesss            5647 non-null float64
duration                     5647 non-null float64
end_of_fade_in               5647 non-null float64
key                          5647 non-null float64
key_confidence               5647 non-null float64
loudness                     5647 non-null float64
mode                         5647 non-null float64
mode_confidence              5647 non-null float64
song_hotttnesss              5647 non-null float64
start_of_fade_out            5647 non-null float64
tempo                        5647 non-null float64
time_signature               5647 non-null float64
time_signature_confidence    5647 non-null float64
year                         5647 non-null float64
dtypes: float64(15)
memory usage: 661.8 KB


In [18]:
#Actual Values.
Y = copy.deepcopy(train.song_hotttnesss)
Y.shape

#New dataframe without target.
train = train.drop("song_hotttnesss", axis=1)

In [19]:
%%time
#Regression model.
X_train, X_test, y_train, y_test = train_test_split(train, Y, test_size=0.2, random_state = 5)
model = XGBRegressor()
model.fit(X_train, y_train)

#Make predictions for test data
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)

df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})  

print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 47.75%
Wall time: 741 ms


In [20]:
#Looking at results
df.head(10)

Unnamed: 0,Actual,Predicted
855,0.754321,0.568642
1157,0.0,0.351981
1472,0.0,0.244505
5098,0.0,0.256196
3797,0.650042,0.619274
3791,0.270776,0.191372
1307,0.0,0.231711
1092,0.0,0.287741
3532,0.482138,0.388209
2059,0.0,0.297604


In [21]:
#Performance of the model.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 

Mean Absolute Error: 0.15124598213844945
Mean Squared Error: 0.03313731195256913
Root Mean Squared Error: 0.18203656762466472


In [26]:
#Random number generator.
import random
array = [random.random() for i in range(len(y_test))]

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, array))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, array))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, array))) 

Mean Absolute Error: 0.325601883035631
Mean Squared Error: 0.1613577634361271
Root Mean Squared Error: 0.4016936188640879


Comparing the results we see that our model preforms better than an random predictor.