# Modeling

In [None]:
import os

import pandas as pd
import numpy as np

import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, cross_val_score, RandomizedSearchCV, validation_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler


from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report


from bayes_opt import BayesianOptimization


from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
from sklearn.utils import resample, shuffle

from sklearn.preprocessing import LabelEncoder

random_state=42

In [108]:
# To suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [86]:
df= pd.read_csv('/Users/sayalinagarkar/Documents/Sayali/INFO-6105/Project/US_1921_2021_normalized.csv')

In [87]:
df.shape

(587927, 15)

In [88]:
df.head()

Unnamed: 0,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration
0,0.06,1922,0.645,0.445,C,-13.338,major,0.451,0.674,0.744,0.151,0.127,104.851,3,126.903
1,0.0,1922,0.695,0.263,C,-22.136,major,0.957,0.797,0.0,0.148,0.655,102.009,1,98.2
2,0.0,1922,0.434,0.177,C#,-21.18,major,0.0512,0.994,0.0218,0.212,0.457,130.418,5,181.64
3,0.0,1922,0.321,0.0946,G,-27.961,major,0.0504,0.995,0.918,0.104,0.397,169.98,3,176.907
4,0.0,1922,0.402,0.158,D#,-16.9,minor,0.039,0.989,0.13,0.311,0.196,103.22,4,163.08


We won't need years so dropping it.

In [89]:
df_no_year= df.drop('year', axis=1)

In [90]:
df_no_year.head()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration
0,0.06,0.645,0.445,C,-13.338,major,0.451,0.674,0.744,0.151,0.127,104.851,3,126.903
1,0.0,0.695,0.263,C,-22.136,major,0.957,0.797,0.0,0.148,0.655,102.009,1,98.2
2,0.0,0.434,0.177,C#,-21.18,major,0.0512,0.994,0.0218,0.212,0.457,130.418,5,181.64
3,0.0,0.321,0.0946,G,-27.961,major,0.0504,0.995,0.918,0.104,0.397,169.98,3,176.907
4,0.0,0.402,0.158,D#,-16.9,minor,0.039,0.989,0.13,0.311,0.196,103.22,4,163.08


We will encode keys into numbers 0-11 and minor and major into 0 and 1, respectively as using oneHotEncoder will introduce more features which is not efficient.

In [91]:
keys= ['C','C#','D','D#', 'E', 'F','F#','G', 'Ab','A','Bb','B' ]
numbers=[x for x in range(12)]
key_dict= dict(zip(keys, numbers))
key_dict.update({'minor':0,'major':1})
df=df_no_year.replace(key_dict)

In [92]:
df.head()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration
0,0.06,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,126.903
1,0.0,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,98.2
2,0.0,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,181.64
3,0.0,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,176.907
4,0.0,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,163.08


We have a lot of features to work with. Having too much features can lead to overfitting. We will try to reduce the number of features. For that we will see the correlation of each feature with popularity.

In [93]:
df_corr= df.corr()['popularity'].abs().sort_values(ascending=False)
df_corr

popularity          1.000000
acousticness        0.370666
loudness            0.327682
energy              0.302016
instrumentalness    0.236540
danceability        0.187727
time_signature      0.086832
tempo               0.071998
liveness            0.049070
speechiness         0.047311
mode                0.033414
duration            0.027130
key                 0.015163
valence             0.004451
Name: popularity, dtype: float64

In [94]:
df.head()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration
0,0.06,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,126.903
1,0.0,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,98.2
2,0.0,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,181.64
3,0.0,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,176.907
4,0.0,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,163.08


It can be seen that mode, key and valence are less correlated with popularity. So dropping thses features.

In [95]:
least_corr= ['key','mode','valence']
df= df.drop(least_corr, axis=1)
features= df.drop('popularity', axis=1).columns

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587927 entries, 0 to 587926
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        587927 non-null  float64
 1   danceability      587927 non-null  float64
 2   energy            587927 non-null  float64
 3   loudness          587927 non-null  float64
 4   speechiness       587927 non-null  float64
 5   acousticness      587927 non-null  float64
 6   instrumentalness  587927 non-null  float64
 7   liveness          587927 non-null  float64
 8   tempo             587927 non-null  float64
 9   time_signature    587927 non-null  int64  
 10  duration          587927 non-null  float64
dtypes: float64(10), int64(1)
memory usage: 49.3 MB


# Classification Model

We'll try to predict songs' popularities by categorizing popularities(into "high","mid","low"), and classifying songs into those categories.

In [97]:
pd.cut(df['popularity'], bins=3)

0         (-0.001, 0.333]
1         (-0.001, 0.333]
2         (-0.001, 0.333]
3         (-0.001, 0.333]
4         (-0.001, 0.333]
               ...       
587922    (-0.001, 0.333]
587923    (-0.001, 0.333]
587924    (-0.001, 0.333]
587925    (-0.001, 0.333]
587926    (-0.001, 0.333]
Name: popularity, Length: 587927, dtype: category
Categories (3, interval[float64, right]): [(-0.001, 0.333] < (0.333, 0.667] < (0.667, 1.0]]

In [98]:
labels=['low','medium', 'high']
df['popularity']= pd.cut(df['popularity'], bins=3, labels=labels, right=True)

In [99]:
df.popularity.value_counts()

low       363052
medium    213090
high       11785
Name: popularity, dtype: int64

The dataset is unbalanced. This is intuitive, because there aren't as many popular songs as there are non-popular songs. However,imbalance in dataset will tamper with the accuracy of our model. One way to counteract this is by upsampling songs with high popularity. We will then perform K-Nearest Neighbor classification, because they are good at handling noisy data.

Up-sampling songs

In [100]:
df_high= df[df.popularity=='high']
df_mid= df[df.popularity=='medium']
df_low= df[df.popularity=='low']

In [101]:
df_mid_upsampled= resample(df_mid, replace=True, n_samples= 363052, random_state=42)
df_high_upsampled= resample(df_high, replace=True, n_samples= 363052, random_state=42)

list_df_upsampled_tohigh=[df_high_upsampled, df_mid_upsampled, df_low]

df_resampled= pd.concat(list_df_upsampled_tohigh)

In [102]:
df_resampled.popularity.value_counts()

low       363052
medium    363052
high      363052
Name: popularity, dtype: int64

In [103]:
X_re= df_resampled.drop('popularity', axis=1)
y_re= df_resampled['popularity']

In [104]:
X_train, X_test, y_train, y_test= train_test_split(X_re, y_re, random_state= 42, test_size=0.3)

# K-Nearest Neighbors (KNN) Classifier

In [77]:
KNN= KNeighborsClassifier()

KNN.fit(X_train, y_train)

y_pred_classification= KNN.predict(X_test)
print(classification_report(y_test, y_pred_classification))

              precision    recall  f1-score   support

        high       0.88      1.00      0.94    108764
         low       0.70      0.58      0.63    108706
      medium       0.65      0.67      0.66    109277

    accuracy                           0.75    326747
   macro avg       0.74      0.75      0.74    326747
weighted avg       0.74      0.75      0.74    326747



Because KNN is not a tree-based algorithm, it requires standardization. We will put it as part of our function.

In [78]:
def fit_model(n_neighbors):
    StandardScaler()
    n_neighbors= round(n_neighbors)

    regressor= KNeighborsClassifier(n_neighbors= n_neighbors)
    

    return np.mean(cross_validate(regressor, X_train, y_train, scoring='accuracy', error_score= 'raise', cv=5)['test_score'])

In [79]:
KNN_BO = BayesianOptimization(fit_model,{
        'n_neighbors': (1,80)
    })

KNN_BO.maximize(n_iter=10, init_points=2, allow_duplicate_points=True)

|   iter    |  target   | n_neig... |
-------------------------------------


Passing acquisition function parameters or gaussian process parameters to maximize
is no longer supported, and will cause an error in future releases. Instead,
please use the "set_gp_params" method to set the gp params, and pass an instance
 of bayes_opt.util.UtilityFunction using the acquisition_function argument

  KNN_BO.maximize(n_iter=10, init_points=2, allow_duplicate_points=True)


| [0m1        [0m | [0m0.5758   [0m | [0m54.09    [0m |
| [95m2        [0m | [95m0.6508   [0m | [95m20.63    [0m |
| [95m3        [0m | [95m0.6595   [0m | [95m19.32    [0m |
| [95m4        [0m | [95m0.7078   [0m | [95m9.207    [0m |
| [95m5        [0m | [95m0.8104   [0m | [95m1.0      [0m |
| [0m6        [0m | [0m0.559    [0m | [0m80.0     [0m |
| [0m7        [0m | [0m0.7628   [0m | [0m2.178    [0m |
| [0m8        [0m | [0m0.5956   [0m | [0m36.4     [0m |
| [0m9        [0m | [0m0.5676   [0m | [0m66.87    [0m |
| [0m10       [0m | [0m0.6817   [0m | [0m13.56    [0m |
| [0m11       [0m | [0m0.5802   [0m | [0m45.02    [0m |
| [0m12       [0m | [0m0.6226   [0m | [0m28.46    [0m |


In [80]:
print(KNN_BO.max)

{'target': 0.8103603164973979, 'params': {'n_neighbors': 1.0}}


In [81]:
n_neighbors= KNN_BO.max['params']['n_neighbors']

In [82]:
model = KNeighborsClassifier(n_neighbors= round(n_neighbors))
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

        high       0.99      0.96      0.98    112218
         low       0.67      0.81      0.74     89912
      medium       0.83      0.73      0.77    124617

    accuracy                           0.83    326747
   macro avg       0.83      0.83      0.83    326747
weighted avg       0.84      0.83      0.83    326747



KNN classifier performs better, with accuracy score of 0.83 (83%).