In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import multilabel_confusion_matrix, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.combine import SMOTEENN


In [2]:
#Import our data
songs_df = pd.read_csv('spotify_songs.csv')

songs_df['playlist_genre'].unique()  

array(['rock', 'r&b', 'pop', 'edm', 'latin', 'rap'], dtype=object)

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['playlist_genre'] == 'rock']

genre_df['track_popularity'].count()



3521

In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['key','playlist_genre','playlist_name','track_artist','lyrics','track_album_name','track_name','track_album_id','track_id','track_album_release_date','playlist_id','language','playlist_subgenre','liveness','acousticness','speechiness'])

genre_df['track_popularity'].hist

<bound method hist_series of 0        41
1        28
15       41
50       50
54       56
         ..
18430     7
18437    50
18439    57
18447    64
18453    61
Name: track_popularity, Length: 3521, dtype: int64>

In [5]:
#bin target variable (popularity) into 5 classes: 'Very Unpopular', 'Unpopular', 'Somewhat popular', 'Popular', 'Very Popular'
#bin target variable relative to genre of interest
#drop old popularity 
bins_1 = [0,20,40,60,80,100]
labels_1 = ['vu','u','swp','p','vp']

bins_2 = [0,20,40,60,80,100]
labels_2 = ['vu','u','swp','p','vp']

if genre_df['track_popularity'][genre_df['track_popularity'] > 80].count() <= 100:
        genre_df['popularity_binned'] = pd.cut(genre_df['track_popularity'], bins = bins_2, labels = labels_2)
        
else:
    genre_df['popularity_binned'] = pd.cut(genre_df['popularity'], bins = bins_1, labels = labels_1)
    
genre_df = genre_df.drop(columns = ['track_popularity'])

genre_df['popularity_binned'].unique()

['swp', 'u', 'vu', NaN, 'p', 'vp']
Categories (5, object): ['vu' < 'u' < 'swp' < 'p' < 'vp']

In [6]:
#encode categorical variables
#genre_df = pd.get_dummies(genre_df, columns = ['mode'])
genre_df['popularity_binned'].value_counts()

genre_df = genre_df.dropna(axis=0)

genre_df.shape

(3233, 9)

In [7]:
#Split our data into target and feature variables
X = genre_df.drop(columns=['popularity_binned']).values
y= genre_df['popularity_binned'].dropna().values


In [8]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, train_size = 0.70, test_size = 0.30, random_state=1)


In [9]:
#Fix class imbalance by using SMOTEENN resampling
smote_enn = SMOTEENN(random_state = 0)
X_resampled, y_resampled = smote_enn.fit_resample(X,y)

In [10]:
#Encode our categorical variables
encoder = LabelEncoder()
encoder.fit(y_test)
y_test = encoder.transform(y_test)

encoder.fit(y_resampled)
y_train = encoder.transform(y_resampled)

print(np.unique(y_train, return_counts = True))

(array([0, 1, 2, 3, 4]), array([ 45,   9, 101, 115, 150], dtype=int64))


In [11]:
#Scale and Transform our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_resampled)
X_test = scaler.fit_transform(X_test)


In [12]:
#Set up parameters
n_estimators = [5,25,50]
max_features = ['auto','sqrt']
max_depth = [2,3,4]
criterion = ['gain','entropy']
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,3]
bootstrap = [True,False]

In [13]:
#create Random Forest Classifier model and optimize parameters
rfc = RandomForestClassifier()
parameters = {
    "n_estimators": n_estimators,
    "max_depth" : max_features,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "bootstrap" : bootstrap
    }
cv = GridSearchCV(rfc,parameters,cv=5)
cv.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 3, 4],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [5, 25, 50]})

In [14]:
print(f'The best parameters are {cv.best_params_}')

The best parameters are {'bootstrap': False, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [15]:
#Fit the model with best parameters
rfc = RandomForestClassifier(n_estimators = cv.best_params_['n_estimators'], 
                           max_depth = cv.best_params_['max_depth'], 
                           min_samples_split = cv.best_params_['min_samples_split'],
                           min_samples_leaf =  cv.best_params_['min_samples_leaf'],
                           bootstrap = cv.best_params_['bootstrap'],
                           random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, max_depth=4, n_estimators=50,
                       random_state=0)

In [16]:
#predictions
y_pred = rfc.predict(X_test)
y_pred


array([[0.1922142 , 0.01297833, 0.28387639, 0.11184635, 0.39908473],
       [0.08053922, 0.01258648, 0.30553014, 0.13075722, 0.47058694],
       [0.07670898, 0.0159369 , 0.17297404, 0.22024728, 0.51413279],
       ...,
       [0.10030446, 0.07457692, 0.51935521, 0.0081208 , 0.29764261],
       [0.16702219, 0.03842725, 0.263653  , 0.12125471, 0.40964285],
       [0.11408851, 0.02505103, 0.379354  , 0.02362404, 0.45788243]])

In [17]:
mcm = multilabel_confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm

array([[  0,   0, 164, 153, 339],
       [  0,   0, 230, 206, 395],
       [  0,   0, 100,  86, 170],
       [  0,   0,   1,  10,   3],
       [  0,   0,  88,  87, 231]], dtype=int64)

In [18]:
mcm_dict = {}
for i in np.arange(0,4):
    mcm_dict.update({f'Confusion Matrix {i+1}': pd.DataFrame(mcm[i], columns = ['Pred_Pos','Pred_Neg'], index = ['Acc_Pos','Acc_Neg'])})


In [19]:
classes = ['vu','u','swp','p','vp']


for i in np.arange(1,5):
    print(f'Model Scoring for {classes[i-1]} songs')
    TP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[0]                
    TN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[1]
    FP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[1]
    FN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[0]
    print(mcm_dict[f'Confusion Matrix {i}'])
    precision = TP / (FP + TP)
    recall = TP / (FN + TP)
    accuracy = (TP + TN)/ (TP + FN + TN + FP)
    
    print(f'The precision score is {precision}')
    print(f'The recall score is {recall}')
    print(f'The accuracy score is {accuracy}')
    print('-------------------------------------')
          

    
    
   

Model Scoring for vu songs
         Pred_Pos  Pred_Neg
Acc_Pos      1607         0
Acc_Neg       656         0
The precision score is 0.7101193106495802
The recall score is 1.0
The accuracy score is 0.7101193106495802
-------------------------------------
Model Scoring for u songs
         Pred_Pos  Pred_Neg
Acc_Pos      1432         0
Acc_Neg       831         0
The precision score is 0.6327883340698188
The recall score is 1.0
The accuracy score is 0.6327883340698188
-------------------------------------
Model Scoring for swp songs
         Pred_Pos  Pred_Neg
Acc_Pos      1424       483
Acc_Neg       256       100
The precision score is 0.8476190476190476
The recall score is 0.746722600943891
The accuracy score is 0.6734423331860362
-------------------------------------
Model Scoring for p songs
         Pred_Pos  Pred_Neg
Acc_Pos      1717       532
Acc_Neg         4        10
The precision score is 0.9976757699012202
The recall score is 0.76345042240996
The accuracy score is 0.76314

In [20]:
X_df = genre_df.drop(columns=['popularity_binned'])
X_df.columns
pd.DataFrame(y_pred).value_counts()

4    1138
2     583
3     542
dtype: int64

In [21]:
#Evaluate the model
cr = classification_report(y_test, y_pred, target_names = classes)
print(cr)

              precision    recall  f1-score   support

          vu       0.00      0.00      0.00       656
           u       0.00      0.00      0.00       831
         swp       0.17      0.28      0.21       356
           p       0.02      0.71      0.04        14
          vp       0.20      0.57      0.30       406

    accuracy                           0.15      2263
   macro avg       0.08      0.31      0.11      2263
weighted avg       0.06      0.15      0.09      2263



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
#Check feature importance
feat_imp = rfc.feature_importances_

# We can sort the features by their importance.
sorted(zip(rfc.feature_importances_, X_df.columns), reverse=True)

[(0.271748926042901, 'duration_ms'),
 (0.23806271885893637, 'danceability'),
 (0.2026314292119361, 'tempo'),
 (0.08186871753593997, 'valence'),
 (0.07728623476855973, 'instrumentalness'),
 (0.06599211462180031, 'loudness'),
 (0.04890070068383018, 'energy'),
 (0.0135091582760964, 'mode')]