In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import multilabel_confusion_matrix, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.combine import SMOTEENN



In [2]:
#Import our data
songs_df = pd.read_csv('spotify_songs.csv')

songs_df['playlist_genre'].unique()  

array(['rock', 'r&b', 'pop', 'edm', 'latin', 'rap'], dtype=object)

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['playlist_genre'] == 'edm']

In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['key','playlist_genre','playlist_name','track_artist','lyrics','track_album_name','track_name','track_album_id','track_id','track_album_release_date','playlist_id','language','playlist_subgenre','liveness','acousticness','speechiness'])

In [5]:
#bin target variable (popularity) into 3 classes: 'Unpopular', 'Somewhat popular', 'Popular'
#bin target variable relative to genre of interest
#drop old popularity 
bins_1 = [0,60,75,100]
labels_1 = ['UP','SWP','P']


genre_df['popularity_binned'] = pd.cut(genre_df['track_popularity'], bins = bins_1, labels = labels_1)
    
genre_df = genre_df.drop(columns = ['track_popularity'])

genre_df['popularity_binned'].unique()

['UP', NaN, 'SWP', 'P']
Categories (3, object): ['UP' < 'SWP' < 'P']

In [6]:
#encode categorical variables
#Check bin evenness and drop NaN
#genre_df = pd.get_dummies(genre_df, columns = ['mode','key','explicit','release_season'])
print(genre_df['popularity_binned'].value_counts())

genre_df = genre_df.dropna(axis=0)


UP     1543
SWP     182
P        23
Name: popularity_binned, dtype: int64


In [7]:
#Split our data into target and feature variables
X = genre_df.drop(columns=['popularity_binned']).values
y= genre_df['popularity_binned'].dropna().values


In [8]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, train_size = 0.50, test_size = 0.50, random_state=1)


In [9]:
# Below is two resampling methods, some resampling methods work best 
# for certain genres so I have included code for both, simply comment out
# whatever methods you don't wish to use

In [10]:
#Fix class imbalance by using undersampling
#from imblearn.under_sampling import RandomUnderSampler
#ros = RandomUnderSampler(random_state=1)
#X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [11]:
##Fix class imbalance by using oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [12]:
#Encode our categorical variables
encoder = LabelEncoder()
encoder.fit(y_test)
y_test = encoder.transform(y_test)

encoder.fit(y_resampled)
y_train = encoder.transform(y_resampled)

print(np.unique(y_train, return_counts = True))

(array([0, 1, 2]), array([763, 763, 763], dtype=int64))


In [13]:
#Scale and Transform our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_resampled)
X_test = scaler.fit_transform(X_test)


In [14]:
#Set up parameters
n_estimators = [5,25,50,100]
max_features = ['auto','sqrt']
max_depth = [2,3,4,5]
criterion = ['gain','entropy']
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,3]
bootstrap = [True,False]

In [15]:
#create Random Forest Classifier model and optimize parameters
#rfc = RandomForestClassifier()
#parameters = {
#    "n_estimators": n_estimators,
#    "max_depth" : max_features,
#    "max_depth" : max_depth,
#   "min_samples_split" : min_samples_split,
#    "min_samples_leaf" : min_samples_leaf,
#    "bootstrap" : bootstrap
#    }
#cv = GridSearchCV(rfc,parameters,cv=5)
#cv.fit(X_train,y_train)

In [16]:
# Show best model hyperparameters
#print(f'The best parameters are {cv.best_params_}')

In [17]:
#Fit the model with best parameters
rfc = RandomForestClassifier(n_estimators = 50, 
                           max_depth = 5, 
                           min_samples_split = 5,
                           min_samples_leaf =  3,
                           bootstrap = True,
                           random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=5,
                       n_estimators=50, random_state=0)

In [18]:
#predictions
y_pred = rfc.predict(X_test)
y_pred


array([1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 0, 1, 0, 1, 1, 1, 2, 1,
       1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 2, 1, 2,
       2, 1, 1, 2, 2, 2, 2, 2, 1, 0, 0, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2,
       1, 2, 1, 2, 2, 1, 2, 1, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 2, 0, 1, 2,
       1, 2, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2,
       1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 0, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 0, 2, 1, 0, 1, 1, 1, 2,
       2, 2, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1,
       2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 0, 2, 2, 1, 0, 2, 2, 0,
       0, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 0, 2, 2,
       2, 2, 2, 1, 2, 1, 1, 0, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2,
       0, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2,
       1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2,

In [19]:
# Creating confusion matrices
mcm = multilabel_confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)


In [20]:
# Turn our confusion matrices into a dataframe
cm_df = pd.DataFrame(cm, index = ['Actual_U','Actual_A','Actual_P'], columns = ['Pred_U','Actual_A','Pred_P'])

cm_df

Unnamed: 0,Pred_U,Actual_A,Pred_P
Actual_U,1,5,2
Actual_A,13,49,24
Actual_P,72,328,380


In [21]:
# Create a dictionary of confusion matrices for our multilabel confusion matrix
mcm_dict = {}
for i in np.arange(0,3):
    mcm_dict.update({f'Confusion Matrix {i+1}': pd.DataFrame(mcm[i], columns = ['Pred_Neg','Pred_Pos'], index = ['Acc_Neg','Acc_Pos'])})


In [22]:
mcm_dict['Confusion Matrix 1']


Unnamed: 0,Pred_Neg,Pred_Pos
Acc_Neg,781,85
Acc_Pos,7,1


In [23]:
mcm_dict[f'Confusion Matrix 1']['Pred_Pos'].values[1]

1

In [24]:
classes = ['Unpopular','Average','Popular']


for i in np.arange(1,4):
    print(f'Model Scoring for {classes[i-1]} songs')
    TP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[1]                
    TN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[0]
    FP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[0]
    FN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[1]
    print(mcm_dict[f'Confusion Matrix {i}'])
    precision = TP / (FP + TP)
    recall = TP / (FN + TP)
    accuracy = (TP + TN)/ (TP + FN + TN + FP)
    
    print(f'The precision score is {precision}')
    print(f'The recall score is {recall}')
    print(f'The accuracy score is {accuracy}')
    print('-------------------------------------')
          

    
    
   

Model Scoring for Unpopular songs
         Pred_Neg  Pred_Pos
Acc_Neg       781        85
Acc_Pos         7         1
The precision score is 0.011627906976744186
The recall score is 0.125
The accuracy score is 0.8947368421052632
-------------------------------------
Model Scoring for Average songs
         Pred_Neg  Pred_Pos
Acc_Neg       455       333
Acc_Pos        37        49
The precision score is 0.12827225130890052
The recall score is 0.5697674418604651
The accuracy score is 0.5766590389016019
-------------------------------------
Model Scoring for Popular songs
         Pred_Neg  Pred_Pos
Acc_Neg        68        26
Acc_Pos       400       380
The precision score is 0.9359605911330049
The recall score is 0.48717948717948717
The accuracy score is 0.5125858123569794
-------------------------------------


In [25]:
# Show overall accuracy score
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.4919908466819222

In [26]:
# Create a dataframe representing X_train so that the columns array can be used for creating a feature importances dataframe
X_df = genre_df.drop(columns=['popularity_binned'])
X_df.columns

Index(['danceability', 'energy', 'loudness', 'mode', 'instrumentalness',
       'valence', 'tempo', 'duration_ms'],
      dtype='object')

In [27]:
#Evaluate the model
cr = classification_report(y_test, y_pred, target_names = classes)
print(cr)

              precision    recall  f1-score   support

   Unpopular       0.01      0.12      0.02         8
     Average       0.13      0.57      0.21        86
     Popular       0.94      0.49      0.64       780

    accuracy                           0.49       874
   macro avg       0.36      0.39      0.29       874
weighted avg       0.85      0.49      0.59       874



In [28]:
#Check feature importance
feat_imp = rfc.feature_importances_

# We can sort the features by their importance.
feat_imp = sorted(zip(rfc.feature_importances_, X_df.columns), reverse=True)

In [29]:
#Create feature importances dataframe
feat_imp_df = pd.DataFrame(feat_imp, columns = ['Importance','Feature'])
feat_imp_df

Unnamed: 0,Importance,Feature
0,0.201743,tempo
1,0.167661,valence
2,0.15964,instrumentalness
3,0.136021,danceability
4,0.117233,duration_ms
5,0.110934,loudness
6,0.086335,energy
7,0.020433,mode
