In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import multilabel_confusion_matrix, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [2]:
#Import our data
songs_df = pd.read_csv('SpotifyFeatures.csv')
songs_df.shape

(232725, 18)

In [3]:
#Create subsetted dataframe for our genre
genre_df = songs_df[songs_df['genre'] == 'Country']
genre_df.sort_values('popularity')
genre_df['popularity'][genre_df['popularity'] > 80].count()

2

In [4]:
#Start prepping data for our model
#Drop columns not relevant to the model
genre_df = genre_df.drop(columns=['genre','artist_name','track_name','track_id','time_signature','liveness','acousticness','speechiness'])

genre_df.head()

Unnamed: 0,popularity,danceability,duration_ms,energy,instrumentalness,key,loudness,mode,tempo,valence
688,45,0.551,200013,0.704,2e-06,G#,-5.426,Major,97.075,0.541
689,42,0.375,208187,0.859,0.0,G,-3.243,Major,74.059,0.597
690,46,0.699,123360,0.408,0.000219,B,-11.46,Major,171.922,0.842
691,54,0.555,238600,0.726,0.000178,D,-8.939,Major,107.719,0.505
692,42,0.64,243000,0.478,0.0,B,-6.96,Major,136.717,0.315


In [5]:
#bin target variable (popularity) into 5 classes: 'Very Unpopular', 'Unpopular', 'Somewhat popular', 'Popular', 'Very Popular'
#bin target variable relative to genre of interest
#drop old popularity 
bins_1 = [0,50,65,80,100]
labels_1 = ['Unpopular','Somewhat Popular','Popular', 'Very Popular']

bins_2 = [0,55,65,70,100]
labels_2 = ['Unpoopular','Somewhat Popular', 'Popular','Very Popular']

if genre_df['popularity'][genre_df['popularity'] > 80].count() <= 100:
        genre_df['popularity_binned'] = pd.cut(genre_df['popularity'], bins = bins_2, labels = labels_2)
        
else:
    genre_df['popularity_binned'] = pd.cut(genre_df['popularity'], bins = bins_1, labels = labels_1)
    
genre_df = genre_df.drop(columns = ['popularity'])

genre_df['popularity_binned'].value_counts()

Unpoopular          7221
Somewhat Popular    1063
Popular              200
Very Popular         116
Name: popularity_binned, dtype: int64

In [6]:
#encode categorical variables
genre_df = pd.get_dummies(genre_df, columns = ['mode','key'])
genre_df['popularity_binned'].value_counts()

Unpoopular          7221
Somewhat Popular    1063
Popular              200
Very Popular         116
Name: popularity_binned, dtype: int64

In [7]:
#Split our data into target and feature variables
X = genre_df.drop(columns=['popularity_binned']).values
y= genre_df['popularity_binned'].values


In [8]:
#Create Test/Train Splits
from sklearn.model_selection import train_test_split
X_test, X_train, y_test, y_train = train_test_split(X, y, train_size = 0.85, test_size = 0.15, random_state=1)


In [9]:
#Encode our categorical variables
encoder = LabelEncoder()
encoder.fit(y_test)
y_test = encoder.transform(y_test)

encoder.fit(y_train)
y_train = encoder.transform(y_train)

print(np.unique(y_train, return_counts = True))

(array([0, 1, 2, 3, 4]), array([  33,  171, 1063,   25,    8], dtype=int64))


In [10]:
#Scale and Transform our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


In [11]:
#Set up parameters
n_estimators = [5,25,50]
max_features = ['auto','sqrt']
max_depth = [2,3,4]
criterion = ['gain','entropy']
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,3]
bootstrap = [True,False]

In [12]:
#create Random Forest Classifier model and optimize parameters
rfc = RandomForestClassifier()
parameters = {
    "n_estimators": n_estimators,
    "max_depth" : max_features,
    "max_depth" : max_depth,
    "min_samples_split" : min_samples_split,
    "min_samples_leaf" : min_samples_leaf,
    "bootstrap" : bootstrap
    }
cv = GridSearchCV(rfc,parameters,cv=5)
cv.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 3, 4],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [5, 25, 50]})

In [13]:
print(f'The best parameters are {cv.best_params_}')

The best parameters are {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 5}


In [14]:
#Fit the model with best parameters
rfc = RandomForestClassifier(n_estimators = cv.best_params_['n_estimators'], 
                           max_depth = cv.best_params_['max_depth'], 
                           min_samples_split = cv.best_params_['min_samples_split'],
                           min_samples_leaf =  cv.best_params_['min_samples_leaf'],
                           bootstrap = cv.best_params_['bootstrap'],
                           random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, n_estimators=5, random_state=0)

In [15]:
#predictions
y_pred = rfc.predict(X_test)
y_pred
y_probs = rfc.predict_proba(X_test)
y_probs

array([[0.02219463, 0.13132319, 0.81938647, 0.02138048, 0.00571523],
       [0.03147014, 0.11918449, 0.82658417, 0.01370806, 0.00905313],
       [0.03520307, 0.17565998, 0.74840839, 0.03013898, 0.01058957],
       ...,
       [0.02156276, 0.1562133 , 0.79240845, 0.017864  , 0.01195149],
       [0.0295058 , 0.17502756, 0.76412783, 0.02600141, 0.0053374 ],
       [0.04215866, 0.13894726, 0.78464881, 0.02520091, 0.00904436]])

In [16]:
mcm = multilabel_confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm

array([[   0,    0,  167,    0,    0],
       [   0,    0,  892,    0,    0],
       [   0,    0, 6158,    0,    0],
       [   0,    0,   91,    0,    0],
       [   0,    0,   56,    0,    0]], dtype=int64)

In [17]:
mcm_dict = {}
for i in np.arange(0,4):
    mcm_dict.update({f'Confusion Matrix {i+1}': pd.DataFrame(mcm[i], columns = ['Pred_Pos','Pred_Neg'], index = ['Acc_Pos','Acc_Neg'])})


In [18]:
classes = ['Unpopular','Somewhat Popular','Popular', 'Very Popular']


for i in np.arange(1,5):
    print(f'Model Scoring for {classes[i-1]} songs')
    TP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[0]                
    TN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[1]
    FP = mcm_dict[f'Confusion Matrix {i}']['Pred_Pos'].values[1]
    FN = mcm_dict[f'Confusion Matrix {i}']['Pred_Neg'].values[0]
    print(mcm_dict[f'Confusion Matrix {i}'])
    precision = TP / (FP + TP)
    recall = TP / (FN + TP)
    accuracy = (TP + TN)/ (TP + FN + TN + FP)
    
    print(f'The precision score is {precision}')
    print(f'The recall score is {recall}')
    print(f'The accuracy score is {accuracy}')
    print('-------------------------------------')
          

    
    
   

Model Scoring for Unpopular songs
         Pred_Pos  Pred_Neg
Acc_Pos      7197         0
Acc_Neg       167         0
The precision score is 0.9773221075502444
The recall score is 1.0
The accuracy score is 0.9773221075502444
-------------------------------------
Model Scoring for Somewhat Popular songs
         Pred_Pos  Pred_Neg
Acc_Pos      6472         0
Acc_Neg       892         0
The precision score is 0.8788701792504073
The recall score is 1.0
The accuracy score is 0.8788701792504073
-------------------------------------
Model Scoring for Popular songs
         Pred_Pos  Pred_Neg
Acc_Pos         0      1206
Acc_Neg         0      6158
The precision score is nan
The recall score is 0.0
The accuracy score is 0.8362303096143401
-------------------------------------
Model Scoring for Very Popular songs
         Pred_Pos  Pred_Neg
Acc_Pos      7273         0
Acc_Neg        91         0
The precision score is 0.9876425855513308
The recall score is 1.0
The accuracy score is 0.9876425855

  precision = TP / (FP + TP)


In [19]:
X_df = genre_df.drop(columns=['popularity_binned'])
X_df.columns

Index(['danceability', 'duration_ms', 'energy', 'instrumentalness', 'loudness',
       'tempo', 'valence', 'mode_Major', 'mode_Minor', 'key_A', 'key_A#',
       'key_B', 'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E', 'key_F',
       'key_F#', 'key_G', 'key_G#'],
      dtype='object')

In [20]:
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.8362303096143401

In [21]:
#Check feature importance
feat_imp = rfc.feature_importances_

# We can sort the features by their importance.
sorted(zip(rfc.feature_importances_, X_df.columns), reverse=True)

[(0.2729317680179007, 'energy'),
 (0.25946943966518493, 'loudness'),
 (0.1246329713226541, 'duration_ms'),
 (0.0924810444992286, 'tempo'),
 (0.07688337399228949, 'valence'),
 (0.06835239965778477, 'danceability'),
 (0.056116949503348605, 'instrumentalness'),
 (0.023004075651060656, 'mode_Minor'),
 (0.01892770939572558, 'key_D#'),
 (0.007200268294822696, 'key_C'),
 (0.0, 'mode_Major'),
 (0.0, 'key_G#'),
 (0.0, 'key_G'),
 (0.0, 'key_F#'),
 (0.0, 'key_F'),
 (0.0, 'key_E'),
 (0.0, 'key_D'),
 (0.0, 'key_C#'),
 (0.0, 'key_B'),
 (0.0, 'key_A#'),
 (0.0, 'key_A')]